2 Copyright (C) 2003-2024 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #define INCLUDE_ALGORITHM
23 #define INCLUDE_MEMORY
26 #include "coretypes.h"
33 #include "tree-pass.h"
35 #include "optabs-tree.h"
38 #include "diagnostic-core.h"
39 #include "fold-const.h"
40 #include "stor-layout.h"
43 #include "gimple-iterator.h"
44 #include "gimplify-me.h"
45 #include "tree-ssa-loop-ivopts.h"
46 #include "tree-ssa-loop-manip.h"
47 #include "tree-ssa-loop-niter.h"
48 #include "tree-ssa-loop.h"
50 #include "tree-scalar-evolution.h"
51 #include "tree-vectorizer.h"
52 #include "gimple-fold.h"
55 #include "tree-if-conv.h"
56 #include "internal-fn.h"
57 #include "tree-vector-builder.h"
58 #include "vec-perm-indices.h"
60 #include "case-cfn-macros.h"
61 #include "langhooks.h"
63 /* Loop Vectorization Pass.
65 This pass tries to vectorize loops.
67 For example, the vectorizer transforms the following simple loop:
69 short a[N]; short b[N]; short c[N]; int i;
75 as if it was manually vectorized by rewriting the source code into:
77 typedef int __attribute__((mode(V8HI))) v8hi;
78 short a[N]; short b[N]; short c[N]; int i;
79 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
82 for (i=0; i<N/8; i++){
89 The main entry to this pass is vectorize_loops(), in which
90 the vectorizer applies a set of analyses on a given set of loops,
91 followed by the actual vectorization transformation for the loops that
92 had successfully passed the analysis phase.
93 Throughout this pass we make a distinction between two types of
94 data: scalars (which are represented by SSA_NAMES), and memory references
95 ("data-refs"). These two types of data require different handling both
96 during analysis and transformation. The types of data-refs that the
97 vectorizer currently supports are ARRAY_REFS which base is an array DECL
98 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
99 accesses are required to have a simple (consecutive) access pattern.
103 The driver for the analysis phase is vect_analyze_loop().
104 It applies a set of analyses, some of which rely on the scalar evolution
105 analyzer (scev) developed by Sebastian Pop.
107 During the analysis phase the vectorizer records some information
108 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
109 loop, as well as general information about the loop as a whole, which is
110 recorded in a "loop_vec_info" struct attached to each loop.
112 Transformation phase:
113 =====================
114 The loop transformation phase scans all the stmts in the loop, and
115 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
116 the loop that needs to be vectorized. It inserts the vector code sequence
117 just before the scalar stmt S, and records a pointer to the vector code
118 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
119 attached to S). This pointer will be used for the vectorization of following
120 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
121 otherwise, we rely on dead code elimination for removing it.
123 For example, say stmt S1 was vectorized into stmt VS1:
126 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
129 To vectorize stmt S2, the vectorizer first finds the stmt that defines
130 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
131 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
132 resulting sequence would be:
135 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
137 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
139 Operands that are not SSA_NAMEs, are data-refs that appear in
140 load/store operations (like 'x[i]' in S1), and are handled differently.
144 Currently the only target specific information that is used is the
145 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
146 Targets that can support different sizes of vectors, for now will need
147 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
148 flexibility will be added in the future.
150 Since we only vectorize operations which vector form can be
151 expressed using existing tree codes, to verify that an operation is
152 supported, the vectorizer checks the relevant optab at the relevant
153 machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
154 the value found is CODE_FOR_nothing, then there's no target support, and
155 we can't vectorize the stmt.
157 For additional information on this project see:
158 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
161 static void vect_estimate_min_profitable_iters (loop_vec_info
, int *, int *,
163 static stmt_vec_info
vect_is_simple_reduction (loop_vec_info
, stmt_vec_info
,
164 bool *, bool *, bool);
166 /* Subroutine of vect_determine_vf_for_stmt that handles only one
167 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
168 may already be set for general statements (not just data refs). */
171 vect_determine_vf_for_stmt_1 (vec_info
*vinfo
, stmt_vec_info stmt_info
,
172 bool vectype_maybe_set_p
,
175 gimple
*stmt
= stmt_info
->stmt
;
177 if ((!STMT_VINFO_RELEVANT_P (stmt_info
)
178 && !STMT_VINFO_LIVE_P (stmt_info
))
179 || gimple_clobber_p (stmt
))
181 if (dump_enabled_p ())
182 dump_printf_loc (MSG_NOTE
, vect_location
, "skip.\n");
183 return opt_result::success ();
186 tree stmt_vectype
, nunits_vectype
;
187 opt_result res
= vect_get_vector_types_for_stmt (vinfo
, stmt_info
,
195 if (STMT_VINFO_VECTYPE (stmt_info
))
196 /* The only case when a vectype had been already set is for stmts
197 that contain a data ref, or for "pattern-stmts" (stmts generated
198 by the vectorizer to represent/replace a certain idiom). */
199 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info
)
200 || vectype_maybe_set_p
)
201 && STMT_VINFO_VECTYPE (stmt_info
) == stmt_vectype
);
203 STMT_VINFO_VECTYPE (stmt_info
) = stmt_vectype
;
207 vect_update_max_nunits (vf
, nunits_vectype
);
209 return opt_result::success ();
212 /* Subroutine of vect_determine_vectorization_factor. Set the vector
213 types of STMT_INFO and all attached pattern statements and update
214 the vectorization factor VF accordingly. Return true on success
215 or false if something prevented vectorization. */
218 vect_determine_vf_for_stmt (vec_info
*vinfo
,
219 stmt_vec_info stmt_info
, poly_uint64
*vf
)
221 if (dump_enabled_p ())
222 dump_printf_loc (MSG_NOTE
, vect_location
, "==> examining statement: %G",
224 opt_result res
= vect_determine_vf_for_stmt_1 (vinfo
, stmt_info
, false, vf
);
228 if (STMT_VINFO_IN_PATTERN_P (stmt_info
)
229 && STMT_VINFO_RELATED_STMT (stmt_info
))
231 gimple
*pattern_def_seq
= STMT_VINFO_PATTERN_DEF_SEQ (stmt_info
);
232 stmt_info
= STMT_VINFO_RELATED_STMT (stmt_info
);
234 /* If a pattern statement has def stmts, analyze them too. */
235 for (gimple_stmt_iterator si
= gsi_start (pattern_def_seq
);
236 !gsi_end_p (si
); gsi_next (&si
))
238 stmt_vec_info def_stmt_info
= vinfo
->lookup_stmt (gsi_stmt (si
));
239 if (dump_enabled_p ())
240 dump_printf_loc (MSG_NOTE
, vect_location
,
241 "==> examining pattern def stmt: %G",
242 def_stmt_info
->stmt
);
243 res
= vect_determine_vf_for_stmt_1 (vinfo
, def_stmt_info
, true, vf
);
248 if (dump_enabled_p ())
249 dump_printf_loc (MSG_NOTE
, vect_location
,
250 "==> examining pattern statement: %G",
252 res
= vect_determine_vf_for_stmt_1 (vinfo
, stmt_info
, true, vf
);
257 return opt_result::success ();
260 /* Function vect_determine_vectorization_factor
262 Determine the vectorization factor (VF). VF is the number of data elements
263 that are operated upon in parallel in a single iteration of the vectorized
264 loop. For example, when vectorizing a loop that operates on 4byte elements,
265 on a target with vector size (VS) 16byte, the VF is set to 4, since 4
266 elements can fit in a single vector register.
268 We currently support vectorization of loops in which all types operated upon
269 are of the same size. Therefore this function currently sets VF according to
270 the size of the types operated upon, and fails if there are multiple sizes
273 VF is also the factor by which the loop iterations are strip-mined, e.g.:
280 for (i=0; i<N; i+=VF){
281 a[i:VF] = b[i:VF] + c[i:VF];
286 vect_determine_vectorization_factor (loop_vec_info loop_vinfo
)
288 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
289 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
290 unsigned nbbs
= loop
->num_nodes
;
291 poly_uint64 vectorization_factor
= 1;
292 tree scalar_type
= NULL_TREE
;
295 stmt_vec_info stmt_info
;
298 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
300 for (i
= 0; i
< nbbs
; i
++)
302 basic_block bb
= bbs
[i
];
304 for (gphi_iterator si
= gsi_start_phis (bb
); !gsi_end_p (si
);
308 stmt_info
= loop_vinfo
->lookup_stmt (phi
);
309 if (dump_enabled_p ())
310 dump_printf_loc (MSG_NOTE
, vect_location
, "==> examining phi: %G",
313 gcc_assert (stmt_info
);
315 if (STMT_VINFO_RELEVANT_P (stmt_info
)
316 || STMT_VINFO_LIVE_P (stmt_info
))
318 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info
));
319 scalar_type
= TREE_TYPE (PHI_RESULT (phi
));
321 if (dump_enabled_p ())
322 dump_printf_loc (MSG_NOTE
, vect_location
,
323 "get vectype for scalar type: %T\n",
326 vectype
= get_vectype_for_scalar_type (loop_vinfo
, scalar_type
);
328 return opt_result::failure_at (phi
,
329 "not vectorized: unsupported "
332 STMT_VINFO_VECTYPE (stmt_info
) = vectype
;
334 if (dump_enabled_p ())
335 dump_printf_loc (MSG_NOTE
, vect_location
, "vectype: %T\n",
338 if (dump_enabled_p ())
340 dump_printf_loc (MSG_NOTE
, vect_location
, "nunits = ");
341 dump_dec (MSG_NOTE
, TYPE_VECTOR_SUBPARTS (vectype
));
342 dump_printf (MSG_NOTE
, "\n");
345 vect_update_max_nunits (&vectorization_factor
, vectype
);
349 for (gimple_stmt_iterator si
= gsi_start_bb (bb
); !gsi_end_p (si
);
352 if (is_gimple_debug (gsi_stmt (si
)))
354 stmt_info
= loop_vinfo
->lookup_stmt (gsi_stmt (si
));
356 = vect_determine_vf_for_stmt (loop_vinfo
,
357 stmt_info
, &vectorization_factor
);
363 /* TODO: Analyze cost. Decide if worth while to vectorize. */
364 if (dump_enabled_p ())
366 dump_printf_loc (MSG_NOTE
, vect_location
, "vectorization factor = ");
367 dump_dec (MSG_NOTE
, vectorization_factor
);
368 dump_printf (MSG_NOTE
, "\n");
371 if (known_le (vectorization_factor
, 1U))
372 return opt_result::failure_at (vect_location
,
373 "not vectorized: unsupported data-type\n");
374 LOOP_VINFO_VECT_FACTOR (loop_vinfo
) = vectorization_factor
;
375 return opt_result::success ();
379 /* Function vect_is_simple_iv_evolution.
381 FORNOW: A simple evolution of an induction variables in the loop is
382 considered a polynomial evolution. */
385 vect_is_simple_iv_evolution (unsigned loop_nb
, tree access_fn
, tree
* init
,
390 tree evolution_part
= evolution_part_in_loop_num (access_fn
, loop_nb
);
393 /* When there is no evolution in this loop, the evolution function
395 if (evolution_part
== NULL_TREE
)
398 /* When the evolution is a polynomial of degree >= 2
399 the evolution function is not "simple". */
400 if (tree_is_chrec (evolution_part
))
403 step_expr
= evolution_part
;
404 init_expr
= unshare_expr (initial_condition_in_loop_num (access_fn
, loop_nb
));
406 if (dump_enabled_p ())
407 dump_printf_loc (MSG_NOTE
, vect_location
, "step: %T, init: %T\n",
408 step_expr
, init_expr
);
413 if (TREE_CODE (step_expr
) != INTEGER_CST
414 && (TREE_CODE (step_expr
) != SSA_NAME
415 || ((bb
= gimple_bb (SSA_NAME_DEF_STMT (step_expr
)))
416 && flow_bb_inside_loop_p (get_loop (cfun
, loop_nb
), bb
))
417 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr
))
418 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr
))
419 || !flag_associative_math
)))
420 && (TREE_CODE (step_expr
) != REAL_CST
421 || !flag_associative_math
))
423 if (dump_enabled_p ())
424 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
432 /* Function vect_is_nonlinear_iv_evolution
434 Only support nonlinear induction for integer type
437 3. lshift/rshift by constant.
439 For neg induction, return a fake step as integer -1. */
441 vect_is_nonlinear_iv_evolution (class loop
* loop
, stmt_vec_info stmt_info
,
442 gphi
* loop_phi_node
, tree
*init
, tree
*step
)
444 tree init_expr
, ev_expr
, result
, op1
, op2
;
447 if (gimple_phi_num_args (loop_phi_node
) != 2)
450 init_expr
= PHI_ARG_DEF_FROM_EDGE (loop_phi_node
, loop_preheader_edge (loop
));
451 ev_expr
= PHI_ARG_DEF_FROM_EDGE (loop_phi_node
, loop_latch_edge (loop
));
453 /* Support nonlinear induction only for integer type. */
454 if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr
)))
458 result
= PHI_RESULT (loop_phi_node
);
460 if (TREE_CODE (ev_expr
) != SSA_NAME
461 || ((def
= SSA_NAME_DEF_STMT (ev_expr
)), false)
462 || !is_gimple_assign (def
))
465 enum tree_code t_code
= gimple_assign_rhs_code (def
);
469 if (gimple_assign_rhs1 (def
) != result
)
471 *step
= build_int_cst (TREE_TYPE (init_expr
), -1);
472 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info
) = vect_step_op_neg
;
478 op1
= gimple_assign_rhs1 (def
);
479 op2
= gimple_assign_rhs2 (def
);
480 if (TREE_CODE (op2
) != INTEGER_CST
484 if (t_code
== LSHIFT_EXPR
)
485 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info
) = vect_step_op_shl
;
486 else if (t_code
== RSHIFT_EXPR
)
487 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info
) = vect_step_op_shr
;
488 /* NEGATE_EXPR and MULT_EXPR are both vect_step_op_mul. */
490 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info
) = vect_step_op_mul
;
497 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info
) = *init
;
498 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info
) = *step
;
503 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
504 what we are assuming is a double reduction. For example, given
505 a structure like this:
508 x_1 = PHI <x_4(outer2), ...>;
512 x_2 = PHI <x_1(outer1), ...>;
518 x_4 = PHI <x_3(inner)>;
521 outer loop analysis would treat x_1 as a double reduction phi and
522 this function would then return true for x_2. */
525 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo
, gphi
*phi
)
529 FOR_EACH_PHI_ARG (use_p
, phi
, op_iter
, SSA_OP_USE
)
530 if (stmt_vec_info def_info
= loop_vinfo
->lookup_def (USE_FROM_PTR (use_p
)))
531 if (STMT_VINFO_DEF_TYPE (def_info
) == vect_double_reduction_def
)
536 /* Returns true if Phi is a first-order recurrence. A first-order
537 recurrence is a non-reduction recurrence relation in which the value of
538 the recurrence in the current loop iteration equals a value defined in
539 the previous iteration. */
542 vect_phi_first_order_recurrence_p (loop_vec_info loop_vinfo
, class loop
*loop
,
545 /* A nested cycle isn't vectorizable as first order recurrence. */
546 if (LOOP_VINFO_LOOP (loop_vinfo
) != loop
)
549 /* Ensure the loop latch definition is from within the loop. */
550 edge latch
= loop_latch_edge (loop
);
551 tree ldef
= PHI_ARG_DEF_FROM_EDGE (phi
, latch
);
552 if (TREE_CODE (ldef
) != SSA_NAME
553 || SSA_NAME_IS_DEFAULT_DEF (ldef
)
554 || is_a
<gphi
*> (SSA_NAME_DEF_STMT (ldef
))
555 || !flow_bb_inside_loop_p (loop
, gimple_bb (SSA_NAME_DEF_STMT (ldef
))))
558 tree def
= gimple_phi_result (phi
);
560 /* Ensure every use_stmt of the phi node is dominated by the latch
562 imm_use_iterator imm_iter
;
564 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, def
)
565 if (!is_gimple_debug (USE_STMT (use_p
))
566 && (SSA_NAME_DEF_STMT (ldef
) == USE_STMT (use_p
)
567 || !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (ldef
),
571 /* First-order recurrence autovectorization needs shuffle vector. */
572 tree scalar_type
= TREE_TYPE (def
);
573 tree vectype
= get_vectype_for_scalar_type (loop_vinfo
, scalar_type
);
580 /* Function vect_analyze_scalar_cycles_1.
582 Examine the cross iteration def-use cycles of scalar variables
583 in LOOP. LOOP_VINFO represents the loop that is now being
584 considered for vectorization (can be LOOP, or an outer-loop
585 enclosing LOOP). SLP indicates there will be some subsequent
586 slp analyses or not. */
589 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo
, class loop
*loop
,
592 basic_block bb
= loop
->header
;
594 auto_vec
<stmt_vec_info
, 64> worklist
;
596 bool double_reduc
, reduc_chain
;
598 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
600 /* First - identify all inductions. Reduction detection assumes that all the
601 inductions have been identified, therefore, this order must not be
603 for (gsi
= gsi_start_phis (bb
); !gsi_end_p (gsi
); gsi_next (&gsi
))
605 gphi
*phi
= gsi
.phi ();
606 tree access_fn
= NULL
;
607 tree def
= PHI_RESULT (phi
);
608 stmt_vec_info stmt_vinfo
= loop_vinfo
->lookup_stmt (phi
);
610 if (dump_enabled_p ())
611 dump_printf_loc (MSG_NOTE
, vect_location
, "Analyze phi: %G",
614 /* Skip virtual phi's. The data dependences that are associated with
615 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
616 if (virtual_operand_p (def
))
619 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_unknown_def_type
;
621 /* Analyze the evolution function. */
622 access_fn
= analyze_scalar_evolution (loop
, def
);
625 STRIP_NOPS (access_fn
);
626 if (dump_enabled_p ())
627 dump_printf_loc (MSG_NOTE
, vect_location
,
628 "Access function of PHI: %T\n", access_fn
);
629 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo
)
630 = initial_condition_in_loop_num (access_fn
, loop
->num
);
631 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo
)
632 = evolution_part_in_loop_num (access_fn
, loop
->num
);
636 || vect_inner_phi_in_double_reduction_p (loop_vinfo
, phi
)
637 || !vect_is_simple_iv_evolution (loop
->num
, access_fn
,
639 || (LOOP_VINFO_LOOP (loop_vinfo
) != loop
640 && TREE_CODE (step
) != INTEGER_CST
))
641 /* Only handle nonlinear iv for same loop. */
642 && (LOOP_VINFO_LOOP (loop_vinfo
) != loop
643 || !vect_is_nonlinear_iv_evolution (loop
, stmt_vinfo
,
646 worklist
.safe_push (stmt_vinfo
);
650 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo
)
652 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo
) != NULL_TREE
);
654 if (dump_enabled_p ())
655 dump_printf_loc (MSG_NOTE
, vect_location
, "Detected induction.\n");
656 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_induction_def
;
660 /* Second - identify all reductions and nested cycles. */
661 while (worklist
.length () > 0)
663 stmt_vec_info stmt_vinfo
= worklist
.pop ();
664 gphi
*phi
= as_a
<gphi
*> (stmt_vinfo
->stmt
);
665 tree def
= PHI_RESULT (phi
);
667 if (dump_enabled_p ())
668 dump_printf_loc (MSG_NOTE
, vect_location
, "Analyze phi: %G",
671 gcc_assert (!virtual_operand_p (def
)
672 && STMT_VINFO_DEF_TYPE (stmt_vinfo
) == vect_unknown_def_type
);
674 stmt_vec_info reduc_stmt_info
675 = vect_is_simple_reduction (loop_vinfo
, stmt_vinfo
, &double_reduc
,
679 STMT_VINFO_REDUC_DEF (stmt_vinfo
) = reduc_stmt_info
;
680 STMT_VINFO_REDUC_DEF (reduc_stmt_info
) = stmt_vinfo
;
683 if (dump_enabled_p ())
684 dump_printf_loc (MSG_NOTE
, vect_location
,
685 "Detected double reduction.\n");
687 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_double_reduction_def
;
688 STMT_VINFO_DEF_TYPE (reduc_stmt_info
) = vect_double_reduction_def
;
689 /* Make it accessible for SLP vectorization. */
690 LOOP_VINFO_REDUCTIONS (loop_vinfo
).safe_push (reduc_stmt_info
);
694 if (loop
!= LOOP_VINFO_LOOP (loop_vinfo
))
696 if (dump_enabled_p ())
697 dump_printf_loc (MSG_NOTE
, vect_location
,
698 "Detected vectorizable nested cycle.\n");
700 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_nested_cycle
;
704 if (dump_enabled_p ())
705 dump_printf_loc (MSG_NOTE
, vect_location
,
706 "Detected reduction.\n");
708 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_reduction_def
;
709 STMT_VINFO_DEF_TYPE (reduc_stmt_info
) = vect_reduction_def
;
710 /* Store the reduction cycles for possible vectorization in
711 loop-aware SLP if it was not detected as reduction
714 LOOP_VINFO_REDUCTIONS (loop_vinfo
).safe_push
719 else if (vect_phi_first_order_recurrence_p (loop_vinfo
, loop
, phi
))
720 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_first_order_recurrence
;
722 if (dump_enabled_p ())
723 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
724 "Unknown def-use cycle pattern.\n");
729 /* Function vect_analyze_scalar_cycles.
731 Examine the cross iteration def-use cycles of scalar variables, by
732 analyzing the loop-header PHIs of scalar variables. Classify each
733 cycle as one of the following: invariant, induction, reduction, unknown.
734 We do that for the loop represented by LOOP_VINFO, and also to its
735 inner-loop, if exists.
736 Examples for scalar cycles:
751 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo
, bool slp
)
753 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
755 vect_analyze_scalar_cycles_1 (loop_vinfo
, loop
, slp
);
757 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
758 Reductions in such inner-loop therefore have different properties than
759 the reductions in the nest that gets vectorized:
760 1. When vectorized, they are executed in the same order as in the original
761 scalar loop, so we can't change the order of computation when
763 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
764 current checks are too strict. */
767 vect_analyze_scalar_cycles_1 (loop_vinfo
, loop
->inner
, slp
);
770 /* Transfer group and reduction information from STMT_INFO to its
774 vect_fixup_reduc_chain (stmt_vec_info stmt_info
)
776 stmt_vec_info firstp
= STMT_VINFO_RELATED_STMT (stmt_info
);
778 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp
)
779 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
));
780 REDUC_GROUP_SIZE (firstp
) = REDUC_GROUP_SIZE (stmt_info
);
783 stmtp
= STMT_VINFO_RELATED_STMT (stmt_info
);
784 gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp
)
785 == STMT_VINFO_DEF_TYPE (stmt_info
));
786 REDUC_GROUP_FIRST_ELEMENT (stmtp
) = firstp
;
787 stmt_info
= REDUC_GROUP_NEXT_ELEMENT (stmt_info
);
789 REDUC_GROUP_NEXT_ELEMENT (stmtp
)
790 = STMT_VINFO_RELATED_STMT (stmt_info
);
795 /* Fixup scalar cycles that now have their stmts detected as patterns. */
798 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo
)
803 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo
), i
, first
)
805 stmt_vec_info next
= REDUC_GROUP_NEXT_ELEMENT (first
);
808 if ((STMT_VINFO_IN_PATTERN_P (next
)
809 != STMT_VINFO_IN_PATTERN_P (first
))
810 || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next
)) == -1)
812 next
= REDUC_GROUP_NEXT_ELEMENT (next
);
814 /* If all reduction chain members are well-formed patterns adjust
815 the group to group the pattern stmts instead. */
817 && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first
)) != -1)
819 if (STMT_VINFO_IN_PATTERN_P (first
))
821 vect_fixup_reduc_chain (first
);
822 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo
)[i
]
823 = STMT_VINFO_RELATED_STMT (first
);
826 /* If not all stmt in the chain are patterns or if we failed
827 to update STMT_VINFO_REDUC_IDX dissolve the chain and handle
828 it as regular reduction instead. */
831 stmt_vec_info vinfo
= first
;
832 stmt_vec_info last
= NULL
;
835 next
= REDUC_GROUP_NEXT_ELEMENT (vinfo
);
836 REDUC_GROUP_FIRST_ELEMENT (vinfo
) = NULL
;
837 REDUC_GROUP_NEXT_ELEMENT (vinfo
) = NULL
;
841 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first
))
843 loop_vinfo
->reductions
.safe_push (vect_stmt_to_vectorize (last
));
844 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo
).unordered_remove (i
);
850 /* Function vect_get_loop_niters.
852 Determine how many iterations the loop is executed and place it
853 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
854 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
855 niter information holds in ASSUMPTIONS.
857 Return the loop exit conditions. */
861 vect_get_loop_niters (class loop
*loop
, const_edge main_exit
, tree
*assumptions
,
862 tree
*number_of_iterations
, tree
*number_of_iterationsm1
)
864 auto_vec
<edge
> exits
= get_loop_exit_edges (loop
);
866 conds
.create (exits
.length ());
867 class tree_niter_desc niter_desc
;
868 tree niter_assumptions
, niter
, may_be_zero
;
870 *assumptions
= boolean_true_node
;
871 *number_of_iterationsm1
= chrec_dont_know
;
872 *number_of_iterations
= chrec_dont_know
;
874 DUMP_VECT_SCOPE ("get_loop_niters");
876 if (exits
.is_empty ())
879 if (dump_enabled_p ())
880 dump_printf_loc (MSG_NOTE
, vect_location
, "Loop has %d exits.\n",
885 FOR_EACH_VEC_ELT (exits
, i
, exit
)
887 gcond
*cond
= get_loop_exit_condition (exit
);
889 conds
.safe_push (cond
);
891 if (dump_enabled_p ())
892 dump_printf_loc (MSG_NOTE
, vect_location
, "Analyzing exit %d...\n", i
);
894 if (exit
!= main_exit
)
897 may_be_zero
= NULL_TREE
;
898 if (!number_of_iterations_exit_assumptions (loop
, exit
, &niter_desc
, NULL
)
899 || chrec_contains_undetermined (niter_desc
.niter
))
902 niter_assumptions
= niter_desc
.assumptions
;
903 may_be_zero
= niter_desc
.may_be_zero
;
904 niter
= niter_desc
.niter
;
906 if (may_be_zero
&& integer_zerop (may_be_zero
))
907 may_be_zero
= NULL_TREE
;
911 if (COMPARISON_CLASS_P (may_be_zero
))
913 /* Try to combine may_be_zero with assumptions, this can simplify
914 computation of niter expression. */
915 if (niter_assumptions
&& !integer_nonzerop (niter_assumptions
))
916 niter_assumptions
= fold_build2 (TRUTH_AND_EXPR
, boolean_type_node
,
918 fold_build1 (TRUTH_NOT_EXPR
,
922 niter
= fold_build3 (COND_EXPR
, TREE_TYPE (niter
), may_be_zero
,
923 build_int_cst (TREE_TYPE (niter
), 0),
924 rewrite_to_non_trapping_overflow (niter
));
926 may_be_zero
= NULL_TREE
;
928 else if (integer_nonzerop (may_be_zero
))
930 *number_of_iterationsm1
= build_int_cst (TREE_TYPE (niter
), 0);
931 *number_of_iterations
= build_int_cst (TREE_TYPE (niter
), 1);
938 /* Loop assumptions are based off the normal exit. */
939 *assumptions
= niter_assumptions
;
940 *number_of_iterationsm1
= niter
;
942 /* We want the number of loop header executions which is the number
943 of latch executions plus one.
944 ??? For UINT_MAX latch executions this number overflows to zero
945 for loops like do { n++; } while (n != 0); */
946 if (niter
&& !chrec_contains_undetermined (niter
))
948 niter
= fold_build2 (PLUS_EXPR
, TREE_TYPE (niter
),
949 unshare_expr (niter
),
950 build_int_cst (TREE_TYPE (niter
), 1));
951 if (TREE_CODE (niter
) == INTEGER_CST
952 && TREE_CODE (*number_of_iterationsm1
) != INTEGER_CST
)
954 /* If we manage to fold niter + 1 into INTEGER_CST even when
955 niter is some complex expression, ensure back
956 *number_of_iterationsm1 is an INTEGER_CST as well. See
958 *number_of_iterationsm1
959 = fold_build2 (PLUS_EXPR
, TREE_TYPE (niter
), niter
,
960 build_minus_one_cst (TREE_TYPE (niter
)));
963 *number_of_iterations
= niter
;
966 if (dump_enabled_p ())
967 dump_printf_loc (MSG_NOTE
, vect_location
, "All loop exits successfully analyzed.\n");
972 /* Determine the main loop exit for the vectorizer. */
975 vec_init_loop_exit_info (class loop
*loop
)
977 /* Before we begin we must first determine which exit is the main one and
978 which are auxilary exits. */
979 auto_vec
<edge
> exits
= get_loop_exit_edges (loop
);
980 if (exits
.length () == 1)
983 /* If we have multiple exits we only support counting IV at the moment.
984 Analyze all exits and return the last one we can analyze. */
985 class tree_niter_desc niter_desc
;
986 edge candidate
= NULL
;
987 for (edge exit
: exits
)
989 if (!get_loop_exit_condition (exit
))
992 if (number_of_iterations_exit_assumptions (loop
, exit
, &niter_desc
, NULL
)
993 && !chrec_contains_undetermined (niter_desc
.niter
))
995 tree may_be_zero
= niter_desc
.may_be_zero
;
996 if ((integer_zerop (may_be_zero
)
997 /* As we are handling may_be_zero that's not false by
998 rewriting niter to may_be_zero ? 0 : niter we require
1000 || (single_pred_p (loop
->latch
)
1001 && exit
->src
== single_pred (loop
->latch
)
1002 && (integer_nonzerop (may_be_zero
)
1003 || COMPARISON_CLASS_P (may_be_zero
))))
1005 || dominated_by_p (CDI_DOMINATORS
, exit
->src
,
1014 /* Function bb_in_loop_p
1016 Used as predicate for dfs order traversal of the loop bbs. */
1019 bb_in_loop_p (const_basic_block bb
, const void *data
)
1021 const class loop
*const loop
= (const class loop
*)data
;
1022 if (flow_bb_inside_loop_p (loop
, bb
))
1028 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
1029 stmt_vec_info structs for all the stmts in LOOP_IN. */
1031 _loop_vec_info::_loop_vec_info (class loop
*loop_in
, vec_info_shared
*shared
)
1032 : vec_info (vec_info::loop
, shared
),
1034 num_itersm1 (NULL_TREE
),
1035 num_iters (NULL_TREE
),
1036 num_iters_unchanged (NULL_TREE
),
1037 num_iters_assumptions (NULL_TREE
),
1038 vector_costs (nullptr),
1039 scalar_costs (nullptr),
1041 versioning_threshold (0),
1042 vectorization_factor (0),
1043 main_loop_edge (nullptr),
1044 skip_main_loop_edge (nullptr),
1045 skip_this_loop_edge (nullptr),
1046 reusable_accumulators (),
1047 suggested_unroll_factor (1),
1048 max_vectorization_factor (0),
1049 mask_skip_niters (NULL_TREE
),
1050 rgroup_compare_type (NULL_TREE
),
1051 simd_if_cond (NULL_TREE
),
1052 partial_vector_style (vect_partial_vectors_none
),
1053 unaligned_dr (NULL
),
1054 peeling_for_alignment (0),
1058 slp_unrolling_factor (1),
1059 inner_loop_cost_factor (param_vect_inner_loop_cost_factor
),
1060 vectorizable (false),
1061 can_use_partial_vectors_p (param_vect_partial_vector_usage
!= 0),
1062 using_partial_vectors_p (false),
1063 using_decrementing_iv_p (false),
1064 using_select_vl_p (false),
1065 epil_using_partial_vectors_p (false),
1066 partial_load_store_bias (0),
1067 peeling_for_gaps (false),
1068 peeling_for_niter (false),
1069 early_breaks (false),
1070 no_data_dependencies (false),
1071 has_mask_store (false),
1072 scalar_loop_scaling (profile_probability::uninitialized ()),
1074 orig_loop_info (NULL
),
1075 vec_loop_iv_exit (NULL
),
1076 vec_epilogue_loop_iv_exit (NULL
),
1077 scalar_loop_iv_exit (NULL
)
1079 /* CHECKME: We want to visit all BBs before their successors (except for
1080 latch blocks, for which this assertion wouldn't hold). In the simple
1081 case of the loop forms we allow, a dfs order of the BBs would the same
1082 as reversed postorder traversal, so we are safe. */
1084 bbs
= XCNEWVEC (basic_block
, loop
->num_nodes
);
1085 nbbs
= dfs_enumerate_from (loop
->header
, 0, bb_in_loop_p
, bbs
,
1086 loop
->num_nodes
, loop
);
1087 gcc_assert (nbbs
== loop
->num_nodes
);
1089 for (unsigned int i
= 0; i
< nbbs
; i
++)
1091 basic_block bb
= bbs
[i
];
1092 gimple_stmt_iterator si
;
1094 for (si
= gsi_start_phis (bb
); !gsi_end_p (si
); gsi_next (&si
))
1096 gimple
*phi
= gsi_stmt (si
);
1097 gimple_set_uid (phi
, 0);
1101 for (si
= gsi_start_bb (bb
); !gsi_end_p (si
); gsi_next (&si
))
1103 gimple
*stmt
= gsi_stmt (si
);
1104 gimple_set_uid (stmt
, 0);
1105 if (is_gimple_debug (stmt
))
1108 /* If .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
1109 third argument is the #pragma omp simd if (x) condition, when 0,
1110 loop shouldn't be vectorized, when non-zero constant, it should
1111 be vectorized normally, otherwise versioned with vectorized loop
1112 done if the condition is non-zero at runtime. */
1113 if (loop_in
->simduid
1114 && is_gimple_call (stmt
)
1115 && gimple_call_internal_p (stmt
)
1116 && gimple_call_internal_fn (stmt
) == IFN_GOMP_SIMD_LANE
1117 && gimple_call_num_args (stmt
) >= 3
1118 && TREE_CODE (gimple_call_arg (stmt
, 0)) == SSA_NAME
1119 && (loop_in
->simduid
1120 == SSA_NAME_VAR (gimple_call_arg (stmt
, 0))))
1122 tree arg
= gimple_call_arg (stmt
, 2);
1123 if (integer_zerop (arg
) || TREE_CODE (arg
) == SSA_NAME
)
1126 gcc_assert (integer_nonzerop (arg
));
1131 epilogue_vinfos
.create (6);
1134 /* Free all levels of rgroup CONTROLS. */
1137 release_vec_loop_controls (vec
<rgroup_controls
> *controls
)
1139 rgroup_controls
*rgc
;
1141 FOR_EACH_VEC_ELT (*controls
, i
, rgc
)
1142 rgc
->controls
.release ();
1143 controls
->release ();
1146 /* Free all memory used by the _loop_vec_info, as well as all the
1147 stmt_vec_info structs of all the stmts in the loop. */
1149 _loop_vec_info::~_loop_vec_info ()
1153 release_vec_loop_controls (&masks
.rgc_vec
);
1154 release_vec_loop_controls (&lens
);
1157 epilogue_vinfos
.release ();
1158 delete scalar_costs
;
1159 delete vector_costs
;
1161 /* When we release an epiloge vinfo that we do not intend to use
1162 avoid clearing AUX of the main loop which should continue to
1163 point to the main loop vinfo since otherwise we'll leak that. */
1164 if (loop
->aux
== this)
1168 /* Return an invariant or register for EXPR and emit necessary
1169 computations in the LOOP_VINFO loop preheader. */
1172 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo
, tree expr
)
1174 if (is_gimple_reg (expr
)
1175 || is_gimple_min_invariant (expr
))
1178 if (! loop_vinfo
->ivexpr_map
)
1179 loop_vinfo
->ivexpr_map
= new hash_map
<tree_operand_hash
, tree
>;
1180 tree
&cached
= loop_vinfo
->ivexpr_map
->get_or_insert (expr
);
1183 gimple_seq stmts
= NULL
;
1184 cached
= force_gimple_operand (unshare_expr (expr
),
1185 &stmts
, true, NULL_TREE
);
1188 edge e
= loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo
));
1189 gsi_insert_seq_on_edge_immediate (e
, stmts
);
1195 /* Return true if we can use CMP_TYPE as the comparison type to produce
1196 all masks required to mask LOOP_VINFO. */
1199 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo
, tree cmp_type
)
1201 rgroup_controls
*rgm
;
1203 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo
).rgc_vec
, i
, rgm
)
1204 if (rgm
->type
!= NULL_TREE
1205 && !direct_internal_fn_supported_p (IFN_WHILE_ULT
,
1206 cmp_type
, rgm
->type
,
1207 OPTIMIZE_FOR_SPEED
))
1212 /* Calculate the maximum number of scalars per iteration for every
1213 rgroup in LOOP_VINFO. */
1216 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo
)
1218 unsigned int res
= 1;
1220 rgroup_controls
*rgm
;
1221 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo
).rgc_vec
, i
, rgm
)
1222 res
= MAX (res
, rgm
->max_nscalars_per_iter
);
1226 /* Calculate the minimum precision necessary to represent:
1230 as an unsigned integer, where MAX_NITERS is the maximum number of
1231 loop header iterations for the original scalar form of LOOP_VINFO. */
1234 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo
, unsigned int factor
)
1236 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1238 /* Get the maximum number of iterations that is representable
1239 in the counter type. */
1240 tree ni_type
= TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo
));
1241 widest_int max_ni
= wi::to_widest (TYPE_MAX_VALUE (ni_type
)) + 1;
1243 /* Get a more refined estimate for the number of iterations. */
1244 widest_int max_back_edges
;
1245 if (max_loop_iterations (loop
, &max_back_edges
))
1246 max_ni
= wi::smin (max_ni
, max_back_edges
+ 1);
1248 /* Work out how many bits we need to represent the limit. */
1249 return wi::min_precision (max_ni
* factor
, UNSIGNED
);
1252 /* True if the loop needs peeling or partial vectors when vectorized. */
1255 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo
)
1257 unsigned HOST_WIDE_INT const_vf
;
1258 HOST_WIDE_INT max_niter
1259 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo
));
1261 unsigned th
= LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo
);
1262 if (!th
&& LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo
))
1263 th
= LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1266 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
1267 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
) >= 0)
1269 /* Work out the (constant) number of iterations that need to be
1270 peeled for reasons other than niters. */
1271 unsigned int peel_niter
= LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
);
1272 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
))
1274 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo
) - peel_niter
,
1275 LOOP_VINFO_VECT_FACTOR (loop_vinfo
)))
1278 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
)
1279 /* ??? When peeling for gaps but not alignment, we could
1280 try to check whether the (variable) niters is known to be
1281 VF * N + 1. That's something of a niche case though. */
1282 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
)
1283 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo
).is_constant (&const_vf
)
1284 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo
))
1285 < (unsigned) exact_log2 (const_vf
))
1286 /* In case of versioning, check if the maximum number of
1287 iterations is greater than th. If they are identical,
1288 the epilogue is unnecessary. */
1289 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo
)
1290 || ((unsigned HOST_WIDE_INT
) max_niter
1291 /* We'd like to use LOOP_VINFO_VERSIONING_THRESHOLD
1292 but that's only computed later based on our result.
1293 The following is the most conservative approximation. */
1294 > (std::max ((unsigned HOST_WIDE_INT
) th
,
1295 const_vf
) / const_vf
) * const_vf
))))
1301 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1302 whether we can actually generate the masks required. Return true if so,
1303 storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
1306 vect_verify_full_masking (loop_vec_info loop_vinfo
)
1308 unsigned int min_ni_width
;
1310 /* Use a normal loop if there are no statements that need masking.
1311 This only happens in rare degenerate cases: it means that the loop
1312 has no loads, no stores, and no live-out values. */
1313 if (LOOP_VINFO_MASKS (loop_vinfo
).is_empty ())
1316 /* Produce the rgroup controls. */
1317 for (auto mask
: LOOP_VINFO_MASKS (loop_vinfo
).mask_set
)
1319 vec_loop_masks
*masks
= &LOOP_VINFO_MASKS (loop_vinfo
);
1320 tree vectype
= mask
.first
;
1321 unsigned nvectors
= mask
.second
;
1323 if (masks
->rgc_vec
.length () < nvectors
)
1324 masks
->rgc_vec
.safe_grow_cleared (nvectors
, true);
1325 rgroup_controls
*rgm
= &(*masks
).rgc_vec
[nvectors
- 1];
1326 /* The number of scalars per iteration and the number of vectors are
1327 both compile-time constants. */
1328 unsigned int nscalars_per_iter
1329 = exact_div (nvectors
* TYPE_VECTOR_SUBPARTS (vectype
),
1330 LOOP_VINFO_VECT_FACTOR (loop_vinfo
)).to_constant ();
1332 if (rgm
->max_nscalars_per_iter
< nscalars_per_iter
)
1334 rgm
->max_nscalars_per_iter
= nscalars_per_iter
;
1335 rgm
->type
= truth_type_for (vectype
);
1340 unsigned int max_nscalars_per_iter
1341 = vect_get_max_nscalars_per_iter (loop_vinfo
);
1343 /* Work out how many bits we need to represent the limit. */
1345 = vect_min_prec_for_max_niters (loop_vinfo
, max_nscalars_per_iter
);
1347 /* Find a scalar mode for which WHILE_ULT is supported. */
1348 opt_scalar_int_mode cmp_mode_iter
;
1349 tree cmp_type
= NULL_TREE
;
1350 tree iv_type
= NULL_TREE
;
1351 widest_int iv_limit
= vect_iv_limit_for_partial_vectors (loop_vinfo
);
1352 unsigned int iv_precision
= UINT_MAX
;
1355 iv_precision
= wi::min_precision (iv_limit
* max_nscalars_per_iter
,
1358 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter
, MODE_INT
)
1360 unsigned int cmp_bits
= GET_MODE_BITSIZE (cmp_mode_iter
.require ());
1361 if (cmp_bits
>= min_ni_width
1362 && targetm
.scalar_mode_supported_p (cmp_mode_iter
.require ()))
1364 tree this_type
= build_nonstandard_integer_type (cmp_bits
, true);
1366 && can_produce_all_loop_masks_p (loop_vinfo
, this_type
))
1368 /* Although we could stop as soon as we find a valid mode,
1369 there are at least two reasons why that's not always the
1372 - An IV that's Pmode or wider is more likely to be reusable
1373 in address calculations than an IV that's narrower than
1376 - Doing the comparison in IV_PRECISION or wider allows
1377 a natural 0-based IV, whereas using a narrower comparison
1378 type requires mitigations against wrap-around.
1380 Conversely, if the IV limit is variable, doing the comparison
1381 in a wider type than the original type can introduce
1382 unnecessary extensions, so picking the widest valid mode
1383 is not always a good choice either.
1385 Here we prefer the first IV type that's Pmode or wider,
1386 and the first comparison type that's IV_PRECISION or wider.
1387 (The comparison type must be no wider than the IV type,
1388 to avoid extensions in the vector loop.)
1390 ??? We might want to try continuing beyond Pmode for ILP32
1391 targets if CMP_BITS < IV_PRECISION. */
1392 iv_type
= this_type
;
1393 if (!cmp_type
|| iv_precision
> TYPE_PRECISION (cmp_type
))
1394 cmp_type
= this_type
;
1395 if (cmp_bits
>= GET_MODE_BITSIZE (Pmode
))
1403 LOOP_VINFO_MASKS (loop_vinfo
).rgc_vec
.release ();
1407 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo
) = cmp_type
;
1408 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo
) = iv_type
;
1409 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo
) = vect_partial_vectors_while_ult
;
1413 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1414 whether we can actually generate AVX512 style masks. Return true if so,
1415 storing the type of the scalar IV in LOOP_VINFO_RGROUP_IV_TYPE. */
1418 vect_verify_full_masking_avx512 (loop_vec_info loop_vinfo
)
1420 /* Produce differently organized rgc_vec and differently check
1421 we can produce masks. */
1423 /* Use a normal loop if there are no statements that need masking.
1424 This only happens in rare degenerate cases: it means that the loop
1425 has no loads, no stores, and no live-out values. */
1426 if (LOOP_VINFO_MASKS (loop_vinfo
).is_empty ())
1429 /* For the decrementing IV we need to represent all values in
1430 [0, niter + niter_skip] where niter_skip is the elements we
1431 skip in the first iteration for prologue peeling. */
1432 tree iv_type
= NULL_TREE
;
1433 widest_int iv_limit
= vect_iv_limit_for_partial_vectors (loop_vinfo
);
1434 unsigned int iv_precision
= UINT_MAX
;
1436 iv_precision
= wi::min_precision (iv_limit
, UNSIGNED
);
1438 /* First compute the type for the IV we use to track the remaining
1439 scalar iterations. */
1440 opt_scalar_int_mode cmp_mode_iter
;
1441 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter
, MODE_INT
)
1443 unsigned int cmp_bits
= GET_MODE_BITSIZE (cmp_mode_iter
.require ());
1444 if (cmp_bits
>= iv_precision
1445 && targetm
.scalar_mode_supported_p (cmp_mode_iter
.require ()))
1447 iv_type
= build_nonstandard_integer_type (cmp_bits
, true);
1455 /* Produce the rgroup controls. */
1456 for (auto const &mask
: LOOP_VINFO_MASKS (loop_vinfo
).mask_set
)
1458 vec_loop_masks
*masks
= &LOOP_VINFO_MASKS (loop_vinfo
);
1459 tree vectype
= mask
.first
;
1460 unsigned nvectors
= mask
.second
;
1462 /* The number of scalars per iteration and the number of vectors are
1463 both compile-time constants. */
1464 unsigned int nscalars_per_iter
1465 = exact_div (nvectors
* TYPE_VECTOR_SUBPARTS (vectype
),
1466 LOOP_VINFO_VECT_FACTOR (loop_vinfo
)).to_constant ();
1468 /* We index the rgroup_controls vector with nscalars_per_iter
1469 which we keep constant and instead have a varying nvectors,
1470 remembering the vector mask with the fewest nV. */
1471 if (masks
->rgc_vec
.length () < nscalars_per_iter
)
1472 masks
->rgc_vec
.safe_grow_cleared (nscalars_per_iter
, true);
1473 rgroup_controls
*rgm
= &(*masks
).rgc_vec
[nscalars_per_iter
- 1];
1475 if (!rgm
->type
|| rgm
->factor
> nvectors
)
1477 rgm
->type
= truth_type_for (vectype
);
1478 rgm
->compare_type
= NULL_TREE
;
1479 rgm
->max_nscalars_per_iter
= nscalars_per_iter
;
1480 rgm
->factor
= nvectors
;
1481 rgm
->bias_adjusted_ctrl
= NULL_TREE
;
1485 /* There is no fixed compare type we are going to use but we have to
1486 be able to get at one for each mask group. */
1487 unsigned int min_ni_width
1488 = wi::min_precision (vect_max_vf (loop_vinfo
), UNSIGNED
);
1491 for (auto &rgc
: LOOP_VINFO_MASKS (loop_vinfo
).rgc_vec
)
1493 tree mask_type
= rgc
.type
;
1497 /* For now vect_get_loop_mask only supports integer mode masks
1498 when we need to split it. */
1499 if (GET_MODE_CLASS (TYPE_MODE (mask_type
)) != MODE_INT
1500 || TYPE_PRECISION (TREE_TYPE (mask_type
)) != 1)
1506 /* If iv_type is usable as compare type use that - we can elide the
1507 saturation in that case. */
1508 if (TYPE_PRECISION (iv_type
) >= min_ni_width
)
1511 = build_vector_type (iv_type
, TYPE_VECTOR_SUBPARTS (mask_type
));
1512 if (expand_vec_cmp_expr_p (cmp_vectype
, mask_type
, LT_EXPR
))
1513 rgc
.compare_type
= cmp_vectype
;
1515 if (!rgc
.compare_type
)
1516 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter
, MODE_INT
)
1518 unsigned int cmp_bits
= GET_MODE_BITSIZE (cmp_mode_iter
.require ());
1519 if (cmp_bits
>= min_ni_width
1520 && targetm
.scalar_mode_supported_p (cmp_mode_iter
.require ()))
1522 tree cmp_type
= build_nonstandard_integer_type (cmp_bits
, true);
1526 /* Check whether we can produce the mask with cmp_type. */
1528 = build_vector_type (cmp_type
, TYPE_VECTOR_SUBPARTS (mask_type
));
1529 if (expand_vec_cmp_expr_p (cmp_vectype
, mask_type
, LT_EXPR
))
1531 rgc
.compare_type
= cmp_vectype
;
1536 if (!rgc
.compare_type
)
1544 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo
).rgc_vec
);
1548 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo
) = error_mark_node
;
1549 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo
) = iv_type
;
1550 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo
) = vect_partial_vectors_avx512
;
1554 /* Check whether we can use vector access with length based on precison
1555 comparison. So far, to keep it simple, we only allow the case that the
1556 precision of the target supported length is larger than the precision
1557 required by loop niters. */
1560 vect_verify_loop_lens (loop_vec_info loop_vinfo
)
1562 if (LOOP_VINFO_LENS (loop_vinfo
).is_empty ())
1565 machine_mode len_load_mode
, len_store_mode
;
1566 if (!get_len_load_store_mode (loop_vinfo
->vector_mode
, true)
1567 .exists (&len_load_mode
))
1569 if (!get_len_load_store_mode (loop_vinfo
->vector_mode
, false)
1570 .exists (&len_store_mode
))
1573 signed char partial_load_bias
= internal_len_load_store_bias
1574 (IFN_LEN_LOAD
, len_load_mode
);
1576 signed char partial_store_bias
= internal_len_load_store_bias
1577 (IFN_LEN_STORE
, len_store_mode
);
1579 gcc_assert (partial_load_bias
== partial_store_bias
);
1581 if (partial_load_bias
== VECT_PARTIAL_BIAS_UNSUPPORTED
)
1584 /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
1585 len_loads with a length of zero. In order to avoid that we prohibit
1586 more than one loop length here. */
1587 if (partial_load_bias
== -1
1588 && LOOP_VINFO_LENS (loop_vinfo
).length () > 1)
1591 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo
) = partial_load_bias
;
1593 unsigned int max_nitems_per_iter
= 1;
1595 rgroup_controls
*rgl
;
1596 /* Find the maximum number of items per iteration for every rgroup. */
1597 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo
), i
, rgl
)
1599 unsigned nitems_per_iter
= rgl
->max_nscalars_per_iter
* rgl
->factor
;
1600 max_nitems_per_iter
= MAX (max_nitems_per_iter
, nitems_per_iter
);
1603 /* Work out how many bits we need to represent the length limit. */
1604 unsigned int min_ni_prec
1605 = vect_min_prec_for_max_niters (loop_vinfo
, max_nitems_per_iter
);
1607 /* Now use the maximum of below precisions for one suitable IV type:
1608 - the IV's natural precision
1609 - the precision needed to hold: the maximum number of scalar
1610 iterations multiplied by the scale factor (min_ni_prec above)
1611 - the Pmode precision
1613 If min_ni_prec is less than the precision of the current niters,
1614 we perfer to still use the niters type. Prefer to use Pmode and
1615 wider IV to avoid narrow conversions. */
1617 unsigned int ni_prec
1618 = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo
)));
1619 min_ni_prec
= MAX (min_ni_prec
, ni_prec
);
1620 min_ni_prec
= MAX (min_ni_prec
, GET_MODE_BITSIZE (Pmode
));
1622 tree iv_type
= NULL_TREE
;
1623 opt_scalar_int_mode tmode_iter
;
1624 FOR_EACH_MODE_IN_CLASS (tmode_iter
, MODE_INT
)
1626 scalar_mode tmode
= tmode_iter
.require ();
1627 unsigned int tbits
= GET_MODE_BITSIZE (tmode
);
1629 /* ??? Do we really want to construct one IV whose precision exceeds
1631 if (tbits
> BITS_PER_WORD
)
1634 /* Find the first available standard integral type. */
1635 if (tbits
>= min_ni_prec
&& targetm
.scalar_mode_supported_p (tmode
))
1637 iv_type
= build_nonstandard_integer_type (tbits
, true);
1644 if (dump_enabled_p ())
1645 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1646 "can't vectorize with length-based partial vectors"
1647 " because there is no suitable iv type.\n");
1651 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo
) = iv_type
;
1652 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo
) = iv_type
;
1653 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo
) = vect_partial_vectors_len
;
1658 /* Calculate the cost of one scalar iteration of the loop. */
1660 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo
)
1662 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1663 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
1664 int nbbs
= loop
->num_nodes
, factor
;
1665 int innerloop_iters
, i
;
1667 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1669 /* Gather costs for statements in the scalar loop. */
1672 innerloop_iters
= 1;
1674 innerloop_iters
= LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo
);
1676 for (i
= 0; i
< nbbs
; i
++)
1678 gimple_stmt_iterator si
;
1679 basic_block bb
= bbs
[i
];
1681 if (bb
->loop_father
== loop
->inner
)
1682 factor
= innerloop_iters
;
1686 for (si
= gsi_start_bb (bb
); !gsi_end_p (si
); gsi_next (&si
))
1688 gimple
*stmt
= gsi_stmt (si
);
1689 stmt_vec_info stmt_info
= loop_vinfo
->lookup_stmt (stmt
);
1691 if (!is_gimple_assign (stmt
) && !is_gimple_call (stmt
))
1694 /* Skip stmts that are not vectorized inside the loop. */
1695 stmt_vec_info vstmt_info
= vect_stmt_to_vectorize (stmt_info
);
1696 if (!STMT_VINFO_RELEVANT_P (vstmt_info
)
1697 && (!STMT_VINFO_LIVE_P (vstmt_info
)
1698 || !VECTORIZABLE_CYCLE_DEF
1699 (STMT_VINFO_DEF_TYPE (vstmt_info
))))
1702 vect_cost_for_stmt kind
;
1703 if (STMT_VINFO_DATA_REF (stmt_info
))
1705 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info
)))
1708 kind
= scalar_store
;
1710 else if (vect_nop_conversion_p (stmt_info
))
1715 /* We are using vect_prologue here to avoid scaling twice
1716 by the inner loop factor. */
1717 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo
),
1718 factor
, kind
, stmt_info
, 0, vect_prologue
);
1722 /* Now accumulate cost. */
1723 loop_vinfo
->scalar_costs
= init_cost (loop_vinfo
, true);
1724 add_stmt_costs (loop_vinfo
->scalar_costs
,
1725 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo
));
1726 loop_vinfo
->scalar_costs
->finish_cost (nullptr);
1729 /* Function vect_analyze_loop_form.
1731 Verify that certain CFG restrictions hold, including:
1732 - the loop has a pre-header
1733 - the loop has a single entry
1734 - nested loops can have only a single exit.
1735 - the loop exit condition is simple enough
1736 - the number of iterations can be analyzed, i.e, a countable loop. The
1737 niter could be analyzed under some assumptions. */
1740 vect_analyze_loop_form (class loop
*loop
, gimple
*loop_vectorized_call
,
1741 vect_loop_form_info
*info
)
1743 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1745 edge exit_e
= vec_init_loop_exit_info (loop
);
1747 return opt_result::failure_at (vect_location
,
1749 " could not determine main exit from"
1750 " loop with multiple exits.\n");
1751 if (loop_vectorized_call
)
1753 tree arg
= gimple_call_arg (loop_vectorized_call
, 1);
1754 class loop
*scalar_loop
= get_loop (cfun
, tree_to_shwi (arg
));
1755 edge scalar_exit_e
= vec_init_loop_exit_info (scalar_loop
);
1757 return opt_result::failure_at (vect_location
,
1759 " could not determine main exit from"
1760 " loop with multiple exits.\n");
1763 info
->loop_exit
= exit_e
;
1764 if (dump_enabled_p ())
1765 dump_printf_loc (MSG_NOTE
, vect_location
,
1766 "using as main loop exit: %d -> %d [AUX: %p]\n",
1767 exit_e
->src
->index
, exit_e
->dest
->index
, exit_e
->aux
);
1769 /* Check if we have any control flow that doesn't leave the loop. */
1770 basic_block
*bbs
= get_loop_body (loop
);
1771 for (unsigned i
= 0; i
< loop
->num_nodes
; i
++)
1772 if (EDGE_COUNT (bbs
[i
]->succs
) != 1
1773 && (EDGE_COUNT (bbs
[i
]->succs
) != 2
1774 || !loop_exits_from_bb_p (bbs
[i
]->loop_father
, bbs
[i
])))
1777 return opt_result::failure_at (vect_location
,
1779 " unsupported control flow in loop.\n");
1783 /* Different restrictions apply when we are considering an inner-most loop,
1784 vs. an outer (nested) loop.
1785 (FORNOW. May want to relax some of these restrictions in the future). */
1787 info
->inner_loop_cond
= NULL
;
1790 /* Inner-most loop. */
1792 if (empty_block_p (loop
->header
))
1793 return opt_result::failure_at (vect_location
,
1794 "not vectorized: empty loop.\n");
1798 class loop
*innerloop
= loop
->inner
;
1801 /* Nested loop. We currently require that the loop is doubly-nested,
1802 contains a single inner loop with a single exit to the block
1803 with the single exit condition in the outer loop.
1804 Vectorizable outer-loops look like this:
1816 The inner-loop also has the properties expected of inner-most loops
1817 as described above. */
1819 if ((loop
->inner
)->inner
|| (loop
->inner
)->next
)
1820 return opt_result::failure_at (vect_location
,
1822 " multiple nested loops.\n");
1824 entryedge
= loop_preheader_edge (innerloop
);
1825 if (entryedge
->src
!= loop
->header
1826 || !single_exit (innerloop
)
1827 || single_exit (innerloop
)->dest
!= EDGE_PRED (loop
->latch
, 0)->src
)
1828 return opt_result::failure_at (vect_location
,
1830 " unsupported outerloop form.\n");
1832 /* Analyze the inner-loop. */
1833 vect_loop_form_info inner
;
1834 opt_result res
= vect_analyze_loop_form (loop
->inner
, NULL
, &inner
);
1837 if (dump_enabled_p ())
1838 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1839 "not vectorized: Bad inner loop.\n");
1843 /* Don't support analyzing niter under assumptions for inner
1845 if (!integer_onep (inner
.assumptions
))
1846 return opt_result::failure_at (vect_location
,
1847 "not vectorized: Bad inner loop.\n");
1849 if (!expr_invariant_in_loop_p (loop
, inner
.number_of_iterations
))
1850 return opt_result::failure_at (vect_location
,
1851 "not vectorized: inner-loop count not"
1854 if (dump_enabled_p ())
1855 dump_printf_loc (MSG_NOTE
, vect_location
,
1856 "Considering outer-loop vectorization.\n");
1857 info
->inner_loop_cond
= inner
.conds
[0];
1860 if (EDGE_COUNT (loop
->header
->preds
) != 2)
1861 return opt_result::failure_at (vect_location
,
1863 " too many incoming edges.\n");
1865 /* We assume that the latch is empty. */
1866 basic_block latch
= loop
->latch
;
1869 if (!empty_block_p (latch
)
1870 || !gimple_seq_empty_p (phi_nodes (latch
)))
1871 return opt_result::failure_at (vect_location
,
1872 "not vectorized: latch block not "
1874 latch
= single_pred (latch
);
1876 while (single_succ_p (latch
));
1878 /* Make sure there is no abnormal exit. */
1879 auto_vec
<edge
> exits
= get_loop_exit_edges (loop
);
1880 for (edge e
: exits
)
1882 if (e
->flags
& EDGE_ABNORMAL
)
1883 return opt_result::failure_at (vect_location
,
1885 " abnormal loop exit edge.\n");
1889 = vect_get_loop_niters (loop
, exit_e
, &info
->assumptions
,
1890 &info
->number_of_iterations
,
1891 &info
->number_of_iterationsm1
);
1892 if (info
->conds
.is_empty ())
1893 return opt_result::failure_at
1895 "not vectorized: complicated exit condition.\n");

  /* Determine what the primary and alternate exit conds are.  */
  for (unsigned i = 0; i < info->conds.length (); i++)
    {
      gcond *cond = info->conds[i];
      if (exit_e->src == gimple_bb (cond))
        std::swap (info->conds[0], info->conds[i]);
    }

  if (integer_zerop (info->assumptions)
      || !info->number_of_iterations
      || chrec_contains_undetermined (info->number_of_iterations))
    return opt_result::failure_at
      (info->conds[0],
       "not vectorized: number of iterations cannot be computed.\n");

  if (integer_zerop (info->number_of_iterations))
    return opt_result::failure_at
      (info->conds[0],
       "not vectorized: number of iterations = 0.\n");

  if (!(tree_fits_shwi_p (info->number_of_iterations)
        && tree_to_shwi (info->number_of_iterations) > 0))
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location,
                           "Symbolic number of iterations is ");
          dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
          dump_printf (MSG_NOTE, "\n");
        }
    }

  return opt_result::success ();
}

/* Create a loop_vec_info for LOOP with SHARED and the
   vect_analyze_loop_form result.  */

loop_vec_info
vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
                        const vect_loop_form_info *info,
                        loop_vec_info main_loop_info)
{
  loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
  LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
  LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
  LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
  LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = main_loop_info;
  /* Also record the assumptions for versioning.  */
  if (!integer_onep (info->assumptions) && !main_loop_info)
    LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;

  for (gcond *cond : info->conds)
    {
      stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (cond);
      STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
      /* Mark the statement as a condition.  */
      STMT_VINFO_DEF_TYPE (loop_cond_info) = vect_condition_def;
    }

  for (unsigned i = 1; i < info->conds.length (); i++)
    LOOP_VINFO_LOOP_CONDS (loop_vinfo).safe_push (info->conds[i]);
  LOOP_VINFO_LOOP_IV_COND (loop_vinfo) = info->conds[0];

  LOOP_VINFO_IV_EXIT (loop_vinfo) = info->loop_exit;

  /* Check to see if we're vectorizing multiple exits.  */
  LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
    = !LOOP_VINFO_LOOP_CONDS (loop_vinfo).is_empty ();

  if (info->inner_loop_cond)
    {
      stmt_vec_info inner_loop_cond_info
        = loop_vinfo->lookup_stmt (info->inner_loop_cond);
      STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
      /* If we have an estimate on the number of iterations of the inner
         loop use that to limit the scale for costing, otherwise use
         --param vect-inner-loop-cost-factor literally.  */
      widest_int nit;
      if (estimated_stmt_executions (loop->inner, &nit))
        LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
          = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
    }

  return loop_vinfo;
}
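
/* As an illustrative example (numbers made up for this comment): with the
   default --param vect-inner-loop-cost-factor=50, an inner loop whose
   estimated statement-execution count is 8 makes the clamp above use
   min (8, 50) = 8, so statements in the outer-loop body are costed as if
   the inner loop ran 8 times rather than 50.  */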

/* Scan the loop stmts and, depending on whether there are any (non-)SLP
   statements, update the vectorization factor.  */

static void
vect_update_vf_for_slp (loop_vec_info loop_vinfo)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  poly_uint64 vectorization_factor;
  int i;

  DUMP_VECT_SCOPE ("vect_update_vf_for_slp");

  vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  gcc_assert (known_ne (vectorization_factor, 0U));

  /* If all the stmts in the loop can be SLPed, we perform only SLP, and
     the vectorization factor of the loop is the unrolling factor required
     by the SLP instances.  If that unrolling factor is 1, we say that we
     perform pure SLP on the loop - cross-iteration parallelism is not
     exploited.  */
  bool only_slp_in_loop = true;
  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];
      for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
           gsi_next (&si))
        {
          stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
          if (!stmt_info)
            continue;
          if ((STMT_VINFO_RELEVANT_P (stmt_info)
               || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
              && !PURE_SLP_STMT (stmt_info))
            /* STMT needs both SLP and loop-based vectorization.  */
            only_slp_in_loop = false;
        }
      for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
           gsi_next (&si))
        {
          if (is_gimple_debug (gsi_stmt (si)))
            continue;
          stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
          stmt_info = vect_stmt_to_vectorize (stmt_info);
          if ((STMT_VINFO_RELEVANT_P (stmt_info)
               || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
              && !PURE_SLP_STMT (stmt_info))
            /* STMT needs both SLP and loop-based vectorization.  */
            only_slp_in_loop = false;
        }
    }

  if (only_slp_in_loop)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "Loop contains only SLP stmts\n");
      vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
    }
  else
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "Loop contains SLP and non-SLP stmts\n");
      /* Both the vectorization factor and unroll factor have the form
         GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
         so they must have a common multiple.  */
      vectorization_factor
        = force_common_multiple (vectorization_factor,
                                 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
    }

  LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
                       "Updating vectorization factor to ");
      dump_dec (MSG_NOTE, vectorization_factor);
      dump_printf (MSG_NOTE, ".\n");
    }
}

/* Return true if STMT_INFO describes a double reduction phi and if
   the other phi in the reduction is also relevant for vectorization.
   This rejects cases such as:

      outer1:
        x_1 = PHI <x_3(outer2), ...>;
        ...

      inner:
        x_2 = ...;
        ...

      outer2:
        x_3 = PHI <x_2(inner)>;

   if nothing in x_2 or elsewhere makes x_1 relevant.  */

static bool
vect_active_double_reduction_p (stmt_vec_info stmt_info)
{
  if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
    return false;

  return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
}

/* Function vect_analyze_loop_operations.

   Scan the loop stmts and make sure they are all vectorizable.  */

static opt_result
vect_analyze_loop_operations (loop_vec_info loop_vinfo)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  int i;
  stmt_vec_info stmt_info;
  bool need_to_vectorize = false;
  bool ok;

  DUMP_VECT_SCOPE ("vect_analyze_loop_operations");

  auto_vec<stmt_info_for_cost> cost_vec;

  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];

      for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
           gsi_next (&si))
        {
          gphi *phi = si.phi ();
          ok = true;

          stmt_info = loop_vinfo->lookup_stmt (phi);
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G",
                             (gimple *) phi);
          if (virtual_operand_p (gimple_phi_result (phi)))
            continue;

          /* Inner-loop loop-closed exit phi in outer-loop vectorization
             (i.e., a phi in the tail of the outer-loop).  */
          if (! is_loop_header_bb_p (bb))
            {
              /* FORNOW: we currently don't support the case that these phis
                 are not used in the outerloop (unless it is double reduction,
                 i.e., this phi is vect_reduction_def), because this case
                 requires us to actually do something here.  */
              if (STMT_VINFO_LIVE_P (stmt_info)
                  && !vect_active_double_reduction_p (stmt_info))
                return opt_result::failure_at (phi,
                                               "Unsupported loop-closed phi"
                                               " in outer-loop.\n");

              /* If PHI is used in the outer loop, we check that its operand
                 is defined in the inner loop.  */
              if (STMT_VINFO_RELEVANT_P (stmt_info))
                {
                  tree phi_op;

                  if (gimple_phi_num_args (phi) != 1)
                    return opt_result::failure_at (phi, "unsupported phi");

                  phi_op = PHI_ARG_DEF (phi, 0);
                  stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
                  if (!op_def_info)
                    return opt_result::failure_at (phi, "unsupported phi\n");

                  if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
                      && (STMT_VINFO_RELEVANT (op_def_info)
                          != vect_used_in_outer_by_reduction))
                    return opt_result::failure_at (phi, "unsupported phi\n");

                  if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
                       || (STMT_VINFO_DEF_TYPE (stmt_info)
                           == vect_double_reduction_def))
                      && !vectorizable_lc_phi (loop_vinfo,
                                               stmt_info, NULL, NULL))
                    return opt_result::failure_at (phi, "unsupported phi\n");
                }

              continue;
            }

          gcc_assert (stmt_info);

          if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
               || STMT_VINFO_LIVE_P (stmt_info))
              && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def
              && STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
            /* A scalar-dependence cycle that we don't support.  */
            return opt_result::failure_at (phi,
                                           "not vectorized:"
                                           " scalar dependence cycle.\n");

          if (STMT_VINFO_RELEVANT_P (stmt_info))
            {
              need_to_vectorize = true;
              if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
                  && ! PURE_SLP_STMT (stmt_info))
                ok = vectorizable_induction (loop_vinfo,
                                             stmt_info, NULL, NULL,
                                             &cost_vec);
              else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
                        || (STMT_VINFO_DEF_TYPE (stmt_info)
                            == vect_double_reduction_def)
                        || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
                       && ! PURE_SLP_STMT (stmt_info))
                ok = vectorizable_reduction (loop_vinfo,
                                             stmt_info, NULL, NULL, &cost_vec);
              else if ((STMT_VINFO_DEF_TYPE (stmt_info)
                        == vect_first_order_recurrence)
                       && ! PURE_SLP_STMT (stmt_info))
                ok = vectorizable_recurr (loop_vinfo, stmt_info, NULL, NULL,
                                          &cost_vec);
            }

          /* SLP PHIs are tested by vect_slp_analyze_node_operations.  */
          if (ok
              && STMT_VINFO_LIVE_P (stmt_info)
              && !PURE_SLP_STMT (stmt_info))
            ok = vectorizable_live_operation (loop_vinfo, stmt_info, NULL, NULL,
                                              -1, false, &cost_vec);

          if (!ok)
            return opt_result::failure_at (phi,
                                           "not vectorized: relevant phi not "
                                           "supported: %G",
                                           static_cast <gimple *> (phi));
        }

      for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
           gsi_next (&si))
        {
          gimple *stmt = gsi_stmt (si);
          if (!gimple_clobber_p (stmt)
              && !is_gimple_debug (stmt))
            {
              opt_result res
                = vect_analyze_stmt (loop_vinfo,
                                     loop_vinfo->lookup_stmt (stmt),
                                     &need_to_vectorize,
                                     NULL, NULL, &cost_vec);
              if (!res)
                return res;
            }
        }
    } /* bbs */

  add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);

  /* All operations in the loop are either irrelevant (deal with loop
     control, or dead), or only used outside the loop and can be moved
     out of the loop (e.g. invariants, inductions).  The loop can be
     optimized away by scalar optimizations.  We're better off not
     touching this loop.  */
  if (!need_to_vectorize)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "All the computation can be taken out of the loop.\n");
      return opt_result::failure_at
        (vect_location,
         "not vectorized: redundant loop. no profit to vectorize.\n");
    }

  return opt_result::success ();
}

/* Return true if we know that the iteration count is smaller than the
   vectorization factor.  Return false if it isn't, or if we can't be sure
   either way.  */

static bool
vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
{
  unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);

  HOST_WIDE_INT max_niter;
  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
    max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
  else
    max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));

  if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
    return true;

  return false;
}

/* Analyze the cost of the loop described by LOOP_VINFO.  Decide if it
   is worthwhile to vectorize.  Return 1 if definitely yes, 0 if
   definitely no, or -1 if it's worth retrying.  */

static int
vect_analyze_loop_costing (loop_vec_info loop_vinfo,
                           unsigned *suggested_unroll_factor)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);

  /* Only loops that can handle partially-populated vectors can have iteration
     counts less than the vectorization factor.  */
  if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
      && vect_known_niters_smaller_than_vf (loop_vinfo))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: iteration count smaller than "
                         "vectorization factor.\n");
      return 0;
    }

  /* If we know the number of iterations we can do better, for the
     epilogue we can also decide whether the main loop leaves us
     with enough iterations, preferring a smaller vector epilogue that is
     then also possibly used for the case in which we skip the vector
     loop.  */
  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
    {
      widest_int scalar_niters
        = wi::to_widest (LOOP_VINFO_NITERSM1 (loop_vinfo)) + 1;
      if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
        {
          loop_vec_info orig_loop_vinfo
            = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
          unsigned lowest_vf
            = constant_lower_bound (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo));
          int prolog_peeling = 0;
          if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
            prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo);
          if (prolog_peeling >= 0
              && known_eq (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
                           lowest_vf))
            {
              unsigned gap
                = LOOP_VINFO_PEELING_FOR_GAPS (orig_loop_vinfo) ? 1 : 0;
              scalar_niters = ((scalar_niters - gap - prolog_peeling)
                               % lowest_vf + gap);
            }
        }
      /* Reject vectorizing for a single scalar iteration, even if
         we could in principle implement that using partial vectors.  */
      unsigned peeling_gap = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
      if (scalar_niters <= peeling_gap + 1)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: loop only has a single "
                             "scalar iteration.\n");
          return 0;
        }

      if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
        {
          /* Check that the loop processes at least one full vector.  */
          poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
          if (known_lt (scalar_niters, vf))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "loop does not have enough iterations "
                                 "to support vectorization.\n");
              return 0;
            }

          /* If we need to peel an extra epilogue iteration to handle data
             accesses with gaps, check that there are enough scalar iterations
             available.

             The check above is redundant with this one when peeling for gaps,
             but the distinction is useful for diagnostics.  */
          if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
              && known_le (scalar_niters, vf))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "loop does not have enough iterations "
                                 "to support peeling for gaps.\n");
              return 0;
            }
        }
    }

  /* If using the "very cheap" model, reject cases in which we'd keep
     a copy of the scalar code (even if we might be able to vectorize it).  */
  if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
      && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
          || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "some scalar iterations would need to be peeled\n");
      return 0;
    }

  int min_profitable_iters, min_profitable_estimate;
  vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
                                      &min_profitable_estimate,
                                      suggested_unroll_factor);

  if (min_profitable_iters < 0)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: vectorization not profitable.\n");
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: vector version will never be "
                         "profitable.\n");
      return -1;
    }

  int min_scalar_loop_bound = (param_min_vect_loop_bound
                               * assumed_vf);

  /* Use the cost model only if it is more conservative than the user
     specified threshold.  */
  unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
                                    min_profitable_iters);

  LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;

  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: vectorization not profitable.\n");
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "not vectorized: iteration count smaller than user "
                         "specified loop bound parameter or minimum profitable "
                         "iterations (whichever is more conservative).\n");
      return 0;
    }

  /* The static profitability threshold min_profitable_estimate includes
     the cost of having to check at runtime whether the scalar loop
     should be used instead.  If it turns out that we don't need or want
     such a check, the threshold we should use for the static estimate
     is simply the point at which the vector loop becomes more profitable
     than the scalar loop.  */
  if (min_profitable_estimate > min_profitable_iters
      && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
      && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
      && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
      && !vect_apply_runtime_profitability_check_p (loop_vinfo))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
                         " choice between the scalar and vector loops\n");
      min_profitable_estimate = min_profitable_iters;
    }

  /* If the vector loop needs multiple iterations to be beneficial then
     things are probably too close to call, and the conservative thing
     would be to stick with the scalar code.  */
  if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
      && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "one iteration of the vector loop would be"
                         " more expensive than the equivalent number of"
                         " iterations of the scalar loop\n");
      return 0;
    }

  HOST_WIDE_INT estimated_niter;

  /* If we are vectorizing an epilogue then we know the maximum number of
     scalar iterations it will cover is at least one lower than the
     vectorization factor of the main loop.  */
  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
    estimated_niter
      = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
  else
    {
      estimated_niter = estimated_stmt_executions_int (loop);
      if (estimated_niter == -1)
        estimated_niter = likely_max_stmt_executions_int (loop);
    }
  if (estimated_niter != -1
      && ((unsigned HOST_WIDE_INT) estimated_niter
          < MAX (th, (unsigned) min_profitable_estimate)))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: estimated iteration count too "
                         "small.\n");
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "not vectorized: estimated iteration count smaller "
                         "than specified loop bound parameter or minimum "
                         "profitable iterations (whichever is more "
                         "conservative).\n");
      return -1;
    }

  return 1;
}
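
/* As an illustrative example (numbers made up for this comment): if the
   cost model computes min_profitable_iters = 12 for an assumed VF of 4 and
   --param min-vect-loop-bound is left at 0, the threshold above becomes
   th = MAX (0 * 4, 12) = 12.  A loop known to run only 10 iterations is
   then rejected, while a loop with an unknown count keeps the threshold
   for a runtime check of roughly "niters >= 12" before entering the
   vector code.  */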
static opt_result
vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
                           vec<data_reference_p> *datarefs,
                           unsigned int *n_stmts)
{
  *n_stmts = 0;
  for (unsigned i = 0; i < loop->num_nodes; i++)
    for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
         !gsi_end_p (gsi); gsi_next (&gsi))
      {
        gimple *stmt = gsi_stmt (gsi);
        if (is_gimple_debug (stmt))
          continue;
        ++(*n_stmts);
        opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
                                                        NULL, 0);
        if (!res)
          {
            if (is_gimple_call (stmt) && loop->safelen)
              {
                tree fndecl = gimple_call_fndecl (stmt), op;
                if (fndecl == NULL_TREE
                    && gimple_call_internal_p (stmt, IFN_MASK_CALL))
                  {
                    fndecl = gimple_call_arg (stmt, 0);
                    gcc_checking_assert (TREE_CODE (fndecl) == ADDR_EXPR);
                    fndecl = TREE_OPERAND (fndecl, 0);
                    gcc_checking_assert (TREE_CODE (fndecl) == FUNCTION_DECL);
                  }
                if (fndecl != NULL_TREE)
                  {
                    cgraph_node *node = cgraph_node::get (fndecl);
                    if (node != NULL && node->simd_clones != NULL)
                      {
                        unsigned int j, n = gimple_call_num_args (stmt);
                        for (j = 0; j < n; j++)
                          {
                            op = gimple_call_arg (stmt, j);
                            if (DECL_P (op)
                                || (REFERENCE_CLASS_P (op)
                                    && get_base_address (op)))
                              break;
                          }
                        op = gimple_call_lhs (stmt);
                        /* Ignore #pragma omp declare simd functions
                           if they don't have data references in the
                           call stmt itself.  */
                        if (j == n
                            && !(op
                                 && (DECL_P (op)
                                     || (REFERENCE_CLASS_P (op)
                                         && get_base_address (op)))))
                          continue;
                      }
                  }
              }
            return res;
          }
        /* If dependence analysis will give up due to the limit on the
           number of datarefs stop here and fail fatally.  */
        if (datarefs->length ()
            > (unsigned) param_loop_max_datarefs_for_datadeps)
          return opt_result::failure_at (stmt, "exceeded param "
                                         "loop-max-datarefs-for-datadeps\n");
      }
  return opt_result::success ();
}
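
/* For instance (an illustrative sketch assuming a user-declared simd
   function f), in

       #pragma omp declare simd
       extern int f (int);
       ...
       #pragma omp simd
       for (i = 0; i < n; i++)
         {
           int t = f (i);
           b[i] = t;
         }

   the call to f carries no data reference of its own, so with
   loop->safelen set the failed data-reference analysis of the call is
   ignored above and the loop remains a candidate for using f's simd
   clone.  */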

/* Look for SLP-only access groups and turn each individual access into its own
   group.  */
static void
vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
{
  unsigned int i;
  struct data_reference *dr;

  DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");

  vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      gcc_assert (DR_REF (dr));
      stmt_vec_info stmt_info
        = vect_stmt_to_vectorize (loop_vinfo->lookup_stmt (DR_STMT (dr)));

      /* Check if the load is a part of an interleaving chain.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
        {
          stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
          dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_element);
          unsigned int group_size = DR_GROUP_SIZE (first_element);

          /* Check if SLP-only groups.  */
          if (!STMT_SLP_TYPE (stmt_info)
              && STMT_VINFO_SLP_VECT_ONLY (first_element))
            {
              /* Dissolve the group.  */
              STMT_VINFO_SLP_VECT_ONLY (first_element) = false;

              stmt_vec_info vinfo = first_element;
              while (vinfo)
                {
                  stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
                  DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
                  DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
                  DR_GROUP_SIZE (vinfo) = 1;
                  if (STMT_VINFO_STRIDED_P (first_element)
                      /* We cannot handle stores with gaps.  */
                      || DR_IS_WRITE (dr_info->dr))
                    {
                      STMT_VINFO_STRIDED_P (vinfo) = true;
                      DR_GROUP_GAP (vinfo) = 0;
                    }
                  else
                    DR_GROUP_GAP (vinfo) = group_size - 1;
                  /* Duplicate and adjust alignment info, it needs to
                     be present on each group leader, see dr_misalignment.  */
                  if (vinfo != first_element)
                    {
                      dr_vec_info *dr_info2 = STMT_VINFO_DR_INFO (vinfo);
                      dr_info2->target_alignment = dr_info->target_alignment;
                      int misalignment = dr_info->misalignment;
                      if (misalignment != DR_MISALIGNMENT_UNKNOWN)
                        {
                          HOST_WIDE_INT diff
                            = (TREE_INT_CST_LOW (DR_INIT (dr_info2->dr))
                               - TREE_INT_CST_LOW (DR_INIT (dr_info->dr)));
                          unsigned HOST_WIDE_INT align_c
                            = dr_info->target_alignment.to_constant ();
                          misalignment = (misalignment + diff) % align_c;
                        }
                      dr_info2->misalignment = misalignment;
                    }
                  vinfo = next;
                }
            }
        }
    }
}

/* Determine if operating on full vectors for LOOP_VINFO might leave
   some scalar iterations still to do.  If so, decide how we should
   handle those scalar iterations.  The possibilities are:

   (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
       In this case:

         LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
         LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
         LOOP_VINFO_PEELING_FOR_NITER == false

   (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
       to handle the remaining scalar iterations.  In this case:

         LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
         LOOP_VINFO_PEELING_FOR_NITER == true

       There are two choices:

       (2a) Consider vectorizing the epilogue loop at the same VF as the
            main loop, but using partial vectors instead of full vectors.
            In this case:

              LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true

       (2b) Consider vectorizing the epilogue loop at lower VFs only.
            In this case:

              LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false  */
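
/* As a worked example (illustrative numbers only): with VF = 4 and a loop
   of 10 scalar iterations, option (1) runs three partial-vector iterations
   covering 4 + 4 + 2 elements and needs no scalar epilogue, while option
   (2) runs two full-vector iterations covering 8 elements and leaves 2
   iterations for the epilogue loop, which can itself be vectorized as in
   (2a) or (2b) or left scalar.  */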

static opt_result
vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo)
{
  /* Determine whether there would be any scalar iterations left over.  */
  bool need_peeling_or_partial_vectors_p
    = vect_need_peeling_or_partial_vectors_p (loop_vinfo);

  /* Decide whether to vectorize the loop with partial vectors.  */
  LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
  LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
  if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
      && need_peeling_or_partial_vectors_p)
    {
      /* For partial-vector-usage=1, try to push the handling of partial
         vectors to the epilogue, with the main loop continuing to operate
         on full vectors.

         If we are unrolling we also do not want to use partial vectors.
         This is to avoid the overhead of generating multiple masks and also
         to avoid having to execute entire iterations of FALSE masked
         instructions when dealing with one or fewer full iterations.

         ??? We could then end up failing to use partial vectors if we
         decide to peel iterations into a prologue, and if the main loop
         then ends up processing fewer than VF iterations.  */
      if ((param_vect_partial_vector_usage == 1
           || loop_vinfo->suggested_unroll_factor > 1)
          && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
          && !vect_known_niters_smaller_than_vf (loop_vinfo))
        LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
      else
        LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "operating on %s vectors%s.\n",
                     LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
                     ? "partial" : "full",
                     LOOP_VINFO_EPILOGUE_P (loop_vinfo)
                     ? " for epilogue loop" : "");

  LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
    = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
       && need_peeling_or_partial_vectors_p);

  /* We set LOOP_VINFO_USING_SELECT_VL_P to true before loop vectorization
     analysis, when we do not yet know whether the loop will be vectorized
     with partial vectors (for more details see tree-vect-loop-manip.cc).

     However, the SELECT_VL vectorization style should only be applied with
     partial vectorization, since SELECT_VL is the GIMPLE IR that computes
     the number of elements to be processed in each iteration.

     After loop vectorization analysis, clear LOOP_VINFO_USING_SELECT_VL_P
     if the loop is not using partial vectors.  */
  if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
    LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = false;

  return opt_result::success ();
}

/* Function vect_analyze_loop_2.

   Apply a set of analyses on LOOP specified by LOOP_VINFO, the different
   analyses will record information in some members of LOOP_VINFO.  FATAL
   indicates if some analysis meets fatal error.  If a non-NULL pointer
   SUGGESTED_UNROLL_FACTOR is provided, it is intended to be filled with
   the worked-out suggested unroll factor, while a NULL pointer shows we
   are going to apply the suggested unroll factor.  SLP_DONE_FOR_SUGGESTED_UF
   is to hold the slp decision when the suggested unroll factor is worked
   out.  */
static opt_result
vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
                     unsigned *suggested_unroll_factor,
                     unsigned& slp_done_for_suggested_uf)
{
  opt_result ok = opt_result::success ();
  int res;
  unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
  poly_uint64 min_vf = 2;
  loop_vec_info orig_loop_vinfo = NULL;

  /* If we are dealing with an epilogue then orig_loop_vinfo points to the
     loop_vec_info of the first vectorized loop.  */
  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
    orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
  else
    orig_loop_vinfo = loop_vinfo;
  gcc_assert (orig_loop_vinfo);

  /* The first group of checks is independent of the vector size.  */
  fatal = true;

  if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
      && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
    return opt_result::failure_at (vect_location,
                                   "not vectorized: simd if(0)\n");

  /* Find all data references in the loop (which correspond to vdefs/vuses)
     and analyze their evolution in the loop.  */

  loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);

  /* Gather the data references and count stmts in the loop.  */
  if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
    {
      opt_result res
        = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
                                     &LOOP_VINFO_DATAREFS (loop_vinfo),
                                     &LOOP_VINFO_N_STMTS (loop_vinfo));
      if (!res)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: loop contains function "
                             "calls or data references that cannot "
                             "be analyzed\n");
          return res;
        }
      loop_vinfo->shared->save_datarefs ();
    }
  else
    loop_vinfo->shared->check_datarefs ();

  /* Analyze the data references and also adjust the minimal
     vectorization factor according to the loads and stores.  */

  ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
  if (!ok)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "bad data references.\n");
      return ok;
    }

  /* Check if we are applying the unroll factor now.  */
  bool applying_suggested_uf = loop_vinfo->suggested_unroll_factor > 1;
  gcc_assert (!applying_suggested_uf || !suggested_unroll_factor);

  /* If the slp decision is false when the suggested unroll factor is worked
     out, and we are applying the suggested unroll factor, we can simply skip
     all slp related analyses this time.  */
  unsigned slp = !applying_suggested_uf ? 2 : slp_done_for_suggested_uf;

  /* Classify all cross-iteration scalar data-flow cycles.
     Cross-iteration cycles caused by virtual phis are analyzed separately.  */
  vect_analyze_scalar_cycles (loop_vinfo, slp == 2);

  vect_pattern_recog (loop_vinfo);

  vect_fixup_scalar_cycles_with_patterns (loop_vinfo);

  /* Analyze the access patterns of the data-refs in the loop (consecutive,
     complex, etc.).  FORNOW: Only handle consecutive access pattern.  */

  ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
  if (!ok)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "bad data access.\n");
      return ok;
    }

  /* Data-flow analysis to detect stmts that do not need to be vectorized.  */

  ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
  if (!ok)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "unexpected pattern.\n");
      return ok;
    }

  /* While the rest of the analysis below depends on it in some way.  */
  fatal = false;

  /* Analyze data dependences between the data-refs in the loop
     and adjust the maximum vectorization factor according to
     the dependences.
     FORNOW: fail at the first data dependence that we encounter.  */

  ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
  if (!ok)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "bad data dependence.\n");
      return ok;
    }
  if (max_vf != MAX_VECTORIZATION_FACTOR
      && maybe_lt (max_vf, min_vf))
    return opt_result::failure_at (vect_location, "bad data dependence.\n");
  LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;

  ok = vect_determine_vectorization_factor (loop_vinfo);
  if (!ok)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "can't determine vectorization factor.\n");
      return ok;
    }

  /* Compute the scalar iteration cost.  */
  vect_compute_single_scalar_iteration_cost (loop_vinfo);

  poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  bool saved_can_use_partial_vectors_p
    = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);

  /* This is the point where we can re-start analysis with SLP forced off.  */
start_over:

  /* Check the SLP opportunities in the loop, analyze and build
     SLP trees.  */
  ok = vect_analyze_slp (loop_vinfo, LOOP_VINFO_N_STMTS (loop_vinfo),
                         slp == 1);
  if (!ok)
    return ok;

  /* If there are any SLP instances mark them as pure_slp.  */
  if (vect_make_slp_decision (loop_vinfo))
    {
      /* Find stmts that need to be both vectorized and SLPed.  */
      vect_detect_hybrid_slp (loop_vinfo);

      /* Update the vectorization factor based on the SLP decision.  */
      vect_update_vf_for_slp (loop_vinfo);

      /* Optimize the SLP graph with the vectorization factor fixed.  */
      vect_optimize_slp (loop_vinfo);

      /* Gather the loads reachable from the SLP graph entries.  */
      vect_gather_slp_loads (loop_vinfo);
    }

  /* We don't expect to have to roll back to anything other than an empty
     set of rgroups.  */
  gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());

  /* When we arrive here with SLP disabled and we are supposed
     to use SLP for everything fail vectorization.  */
  if (!slp && param_vect_force_slp)
    return opt_result::failure_at (vect_location,
                                   "may need non-SLP handling\n");

  /* Apply the suggested unrolling factor, this was determined by the backend
     during finish_cost the first time we ran the analysis for this
     vector mode.  */
  if (applying_suggested_uf)
    LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor;

  /* Now the vectorization factor is final.  */
  poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  gcc_assert (known_ne (vectorization_factor, 0U));

  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
                       "vectorization_factor = ");
      dump_dec (MSG_NOTE, vectorization_factor);
      dump_printf (MSG_NOTE, ", niters = %wd\n",
                   LOOP_VINFO_INT_NITERS (loop_vinfo));
    }

  if (max_vf != MAX_VECTORIZATION_FACTOR
      && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
    return opt_result::failure_at (vect_location, "bad data dependence.\n");

  loop_vinfo->vector_costs = init_cost (loop_vinfo, false);

  /* Analyze the alignment of the data-refs in the loop.
     Fail if a data reference is found that cannot be vectorized.  */

  ok = vect_analyze_data_refs_alignment (loop_vinfo);
  if (!ok)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "bad data alignment.\n");
      return ok;
    }

  /* Prune the list of ddrs to be tested at run-time by versioning for alias.
     It is important to call pruning after vect_analyze_data_ref_accesses,
     since we use grouping information gathered by interleaving analysis.  */
  ok = vect_prune_runtime_alias_test_list (loop_vinfo);
  if (!ok)
    return ok;

  /* Do not invoke vect_enhance_data_refs_alignment for epilogue
     vectorization, since we do not want to add extra peeling or
     add versioning for alignment.  */
  if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
    /* This pass will decide on using loop versioning and/or loop peeling in
       order to enhance the alignment of data references in the loop.  */
    ok = vect_enhance_data_refs_alignment (loop_vinfo);
  if (!ok)
    return ok;

  if (slp)
    {
      /* Analyze operations in the SLP instances.  We can't simply
         remove unsupported SLP instances as this makes the above
         SLP kind detection invalid and might also affect the VF.  */
      if (! vect_slp_analyze_operations (loop_vinfo))
        {
          ok = opt_result::failure_at (vect_location,
                                       "unsupported SLP instances\n");
          goto again;
        }
    }

  /* Dissolve SLP-only groups.  */
  vect_dissolve_slp_only_groups (loop_vinfo);

  /* Scan all the remaining operations in the loop that are not subject
     to SLP and make sure they are vectorizable.  */
  ok = vect_analyze_loop_operations (loop_vinfo);
  if (!ok)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "bad operation or unsupported loop bound.\n");
      return ok;
    }

  /* For now, we don't expect to mix both masking and length approaches for one
     loop, disable it if both are recorded.  */
  if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
      && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
      && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "can't vectorize a loop with partial vectors"
                         " because we don't expect to mix different"
                         " approaches with partial vectors for the"
                         " same loop.\n");
      LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
    }

  /* If we still have the option of using partial vectors,
     check whether we can generate the necessary loop controls.  */
  if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
    {
      if (!LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
        {
          if (!vect_verify_full_masking (loop_vinfo)
              && !vect_verify_full_masking_avx512 (loop_vinfo))
            LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
        }
      else /* !LOOP_VINFO_LENS (loop_vinfo).is_empty () */
        if (!vect_verify_loop_lens (loop_vinfo))
          LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
    }

  /* If we're vectorizing a loop that uses length "controls" and
     can iterate more than once, we apply decrementing IV approach
     to the loop.  */
  if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
      && LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) == vect_partial_vectors_len
      && LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) == 0
      && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
           && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo),
                        LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
    LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true;

  /* If a loop uses length controls and has a decrementing loop control IV,
     we will normally pass that IV through a MIN_EXPR to calculate the
     basis for the length controls.  E.g. in a loop that processes one
     element per scalar iteration, the number of elements would be
     MIN_EXPR <N, VF>, where N is the number of scalar iterations left.

     This MIN_EXPR approach allows us to use pointer IVs with an invariant
     step, since only the final iteration of the vector loop can have
     inactive lanes.

     However, some targets have a dedicated instruction for calculating the
     preferred length, given the total number of elements that still need to
     be processed.  This is encapsulated in the SELECT_VL internal function.

     If the target supports SELECT_VL, we can use it instead of MIN_EXPR
     to determine the basis for the length controls.  However, unlike the
     MIN_EXPR calculation, the SELECT_VL calculation can decide to make
     lanes inactive in any iteration of the vector loop, not just the last
     iteration.  This SELECT_VL approach therefore requires us to use pointer
     IVs with variable steps.

     Once we've decided how many elements should be processed by one
     iteration of the vector loop, we need to populate the rgroup controls.
     If a loop has multiple rgroups, we need to make sure that those rgroups
     "line up" (that is, they must be consistent about which elements are
     active and which aren't).  This is done by vect_adjust_loop_lens_control.

     In principle, it would be possible to use vect_adjust_loop_lens_control
     on either the result of a MIN_EXPR or the result of a SELECT_VL.
     However:

     (1) In practice, it only makes sense to use SELECT_VL when a vector
         operation will be controlled directly by the result.  It is not
         worth using SELECT_VL if it would only be the input to other
         calculations.

     (2) If we use SELECT_VL for an rgroup that has N controls, each associated
         pointer IV will need N updates by a variable amount (N-1 updates
         within the iteration and 1 update to move to the next iteration).

     Because of this, we prefer to use the MIN_EXPR approach whenever there
     is more than one length control.

     In addition, SELECT_VL always operates to a granularity of 1 unit.
     If we wanted to use it to control an SLP operation on N consecutive
     elements, we would need to make the SELECT_VL inputs measure scalar
     iterations (rather than elements) and then multiply the SELECT_VL
     result by N.  But using SELECT_VL this way is inefficient because
     of the extra multiplication it would require in every iteration.

     2. We don't apply SELECT_VL on single-rgroup when both (1) and (2) are
     satisfied:

      (1). LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) is true.
      (2). LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant () is true.

     Since SELECT_VL (variable step) will make SCEV analysis fail, and we
     would then miss the benefits of subsequent unroll optimizations, we
     prefer using the MIN_EXPR approach in this situation.  */
  if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
    {
      tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
      if (direct_internal_fn_supported_p (IFN_SELECT_VL, iv_type,
                                          OPTIMIZE_FOR_SPEED)
          && LOOP_VINFO_LENS (loop_vinfo).length () == 1
          && LOOP_VINFO_LENS (loop_vinfo)[0].factor == 1
          && (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
              || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()))
        LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = true;

      /* If any of the SLP instances cover more than a single lane
         we cannot use .SELECT_VL at the moment, even if the number
         of lanes is uniform throughout the SLP graph.  */
      if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
        for (slp_instance inst : LOOP_VINFO_SLP_INSTANCES (loop_vinfo))
          if (SLP_TREE_LANES (SLP_INSTANCE_TREE (inst)) != 1
              && !(SLP_INSTANCE_KIND (inst) == slp_inst_kind_store
                   && SLP_INSTANCE_TREE (inst)->ldst_lanes))
            {
              LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = false;
              break;
            }
    }

  /* Decide whether this loop_vinfo should use partial vectors or peeling,
     assuming that the loop will be used as a main loop.  We will redo
     this analysis later if we instead decide to use the loop as an
     epilogue loop.  */
  ok = vect_determine_partial_vectors_and_peeling (loop_vinfo);
  if (!ok)
    return ok;

  /* If we're vectorizing an epilogue loop, the vectorized loop either needs
     to be able to handle fewer than VF scalars, or needs to have a lower VF
     than the main loop.  */
  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
      && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
    {
      poly_uint64 unscaled_vf
        = exact_div (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
                     orig_loop_vinfo->suggested_unroll_factor);
      if (maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo), unscaled_vf))
        return opt_result::failure_at (vect_location,
                                       "Vectorization factor too high for"
                                       " epilogue loop.\n");
    }

  /* Check the costings of the loop make vectorizing worthwhile.  */
  res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
  if (res < 0)
    {
      ok = opt_result::failure_at (vect_location,
                                   "Loop costings may not be worthwhile.\n");
      goto again;
    }
  if (!res)
    return opt_result::failure_at (vect_location,
                                   "Loop costings not worthwhile.\n");

  /* If an epilogue loop is required make sure we can create one.  */
  if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
      || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
      || LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
      if (!vect_can_advance_ivs_p (loop_vinfo)
          || !slpeel_can_duplicate_loop_p (loop,
                                           LOOP_VINFO_IV_EXIT (loop_vinfo),
                                           LOOP_VINFO_IV_EXIT (loop_vinfo)))
        {
          ok = opt_result::failure_at (vect_location,
                                       "not vectorized: can't create required "
                                       "epilog loop\n");
          goto again;
        }
    }

  /* During peeling, we need to check if number of loop iterations is
     enough for both peeled prolog loop and vector loop.  This check
     can be merged along with threshold check of loop versioning, so
     increase threshold for this case if necessary.

     If we are analyzing an epilogue we still want to check what its
     versioning threshold would be.  If we decide to vectorize the epilogues we
     will want to use the lowest versioning threshold of all epilogues and main
     loop.  This will enable us to enter a vectorized epilogue even when
     versioning the loop.  We can't simply check whether the epilogue requires
     versioning though since we may have skipped some versioning checks when
     analyzing the epilogue.  For instance, checks for alias versioning will be
     skipped when dealing with epilogues as we assume we already checked them
     for the main loop.  So instead we always check the 'orig_loop_vinfo'.  */
  if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
    {
      poly_uint64 niters_th = 0;
      unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);

      if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
        {
          /* Niters for peeled prolog loop.  */
          if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
            {
              dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
              tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
              niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
            }
          else
            niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
        }

      /* Niters for at least one iteration of vectorized loop.  */
      if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
        niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
      /* One additional iteration because of peeling for gap.  */
      if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
        niters_th += 1;

      /* Use the same condition as vect_transform_loop to decide when to use
         the cost to determine a versioning threshold.  */
      if (vect_apply_runtime_profitability_check_p (loop_vinfo)
          && ordered_p (th, niters_th))
        niters_th = ordered_max (poly_uint64 (th), niters_th);

      LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
    }

  gcc_assert (known_eq (vectorization_factor,
                        LOOP_VINFO_VECT_FACTOR (loop_vinfo)));

  slp_done_for_suggested_uf = slp;

  /* Ok to vectorize!  */
  LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
  return opt_result::success ();

again:
  /* Ensure that "ok" is false (with an opt_problem if dumping is enabled).  */
  gcc_assert (!ok);

  /* Try again with SLP degraded, but if we didn't do any SLP there is
     no point in re-trying.  */
  if (!slp)
    return ok;

  /* If we are applying the suggested unroll factor, we don't need to
     re-try any more as we want to keep the SLP mode fixed.  */
  if (applying_suggested_uf)
    return ok;

  /* If there are reduction chains, re-trying will fail anyway.  */
  if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
    return ok;

  /* Likewise if the grouped loads or stores in the SLP cannot be handled
     via interleaving or lane instructions.  */
  slp_instance instance;
  slp_tree node;
  unsigned i, j;
  FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
    {
      if (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance)) != vect_internal_def)
        continue;

      stmt_vec_info vinfo;
      vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
      if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
        continue;
      vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
      unsigned int size = DR_GROUP_SIZE (vinfo);
      tree vectype = STMT_VINFO_VECTYPE (vinfo);
      if (vect_store_lanes_supported (vectype, size, false) == IFN_LAST
          && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
          && ! vect_grouped_store_supported (vectype, size))
        return opt_result::failure_at (vinfo->stmt,
                                       "unsupported grouped store\n");
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
        {
          vinfo = SLP_TREE_REPRESENTATIVE (node);
          if (STMT_VINFO_GROUPED_ACCESS (vinfo))
            {
              vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
              bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
              size = DR_GROUP_SIZE (vinfo);
              vectype = STMT_VINFO_VECTYPE (vinfo);
              if (vect_load_lanes_supported (vectype, size, false) == IFN_LAST
                  && ! vect_grouped_load_supported (vectype, single_element_p,
                                                    size))
                return opt_result::failure_at (vinfo->stmt,
                                               "unsupported grouped load\n");
            }
        }
    }

  /* Roll back state appropriately.  Degrade SLP this time.  From multi-
     to single-lane to disabled.  */
  --slp;
  if (dump_enabled_p ())
    {
      if (slp)
        dump_printf_loc (MSG_NOTE, vect_location,
                         "re-trying with single-lane SLP\n");
      else
        dump_printf_loc (MSG_NOTE, vect_location,
                         "re-trying with SLP disabled\n");
    }

  /* Restore vectorization factor as it were without SLP.  */
  LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
  /* Free the SLP instances.  */
  FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
    vect_free_slp_instance (instance);
  LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
  /* Reset SLP type to loop_vect on all stmts.  */
  for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
    {
      basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
      for (gimple_stmt_iterator si = gsi_start_phis (bb);
           !gsi_end_p (si); gsi_next (&si))
        {
          stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
          STMT_SLP_TYPE (stmt_info) = loop_vect;
          if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
              || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
            {
              /* vectorizable_reduction adjusts reduction stmt def-types,
                 restore them to that of the PHI.  */
              STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
                = STMT_VINFO_DEF_TYPE (stmt_info);
              STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
                                     (STMT_VINFO_REDUC_DEF (stmt_info)))
                = STMT_VINFO_DEF_TYPE (stmt_info);
            }
        }
      for (gimple_stmt_iterator si = gsi_start_bb (bb);
           !gsi_end_p (si); gsi_next (&si))
        {
          if (is_gimple_debug (gsi_stmt (si)))
            continue;
          stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
          STMT_SLP_TYPE (stmt_info) = loop_vect;
          if (STMT_VINFO_IN_PATTERN_P (stmt_info))
            {
              stmt_vec_info pattern_stmt_info
                = STMT_VINFO_RELATED_STMT (stmt_info);
              if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
                STMT_VINFO_IN_PATTERN_P (stmt_info) = false;

              gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
              STMT_SLP_TYPE (pattern_stmt_info) = loop_vect;
              for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
                   !gsi_end_p (pi); gsi_next (&pi))
                STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
                  = loop_vect;
            }
        }
    }
  /* Free optimized alias test DDRS.  */
  LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
  LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
  LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
  /* Reset target cost data.  */
  delete loop_vinfo->vector_costs;
  loop_vinfo->vector_costs = nullptr;
  /* Reset accumulated rgroup information.  */
  LOOP_VINFO_MASKS (loop_vinfo).mask_set.empty ();
  release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
  release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
  /* Reset assorted flags.  */
  LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
  LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
  LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
  LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
  LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
    = saved_can_use_partial_vectors_p;
  LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
  if (loop_vinfo->scan_map)
    loop_vinfo->scan_map->empty ();

  goto start_over;
}

/* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
   to be better than vectorizing it using OLD_LOOP_VINFO.  Assume that
   OLD_LOOP_VINFO is better unless something specifically indicates
   otherwise.

   Note that this deliberately isn't a partial order.  */

static bool
vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
                          loop_vec_info old_loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
  gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);

  poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
  poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);

  /* Always prefer a VF of loop->simdlen over any other VF.  */
  if (loop->simdlen)
    {
      bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
      bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
      if (new_simdlen_p != old_simdlen_p)
        return new_simdlen_p;
    }

  const auto *old_costs = old_loop_vinfo->vector_costs;
  const auto *new_costs = new_loop_vinfo->vector_costs;
  if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo))
    return new_costs->better_epilogue_loop_than_p (old_costs, main_loop);

  return new_costs->better_main_loop_than_p (old_costs);
}

/* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO.  Return
   true if we should.  */

static bool
vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
                        loop_vec_info old_loop_vinfo)
{
  if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
    return false;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "***** Preferring vector mode %s to vector mode %s\n",
                     GET_MODE_NAME (new_loop_vinfo->vector_mode),
                     GET_MODE_NAME (old_loop_vinfo->vector_mode));
  return true;
}

/* Analyze LOOP with VECTOR_MODES[MODE_I] and as epilogue if MAIN_LOOP_VINFO is
   not NULL.  Set AUTODETECTED_VECTOR_MODE if VOIDmode and advance
   MODE_I to the next mode useful to analyze.
   Return the loop_vinfo on success and wrapped null on failure.  */

static opt_loop_vec_info
vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
                     const vect_loop_form_info *loop_form_info,
                     loop_vec_info main_loop_vinfo,
                     const vector_modes &vector_modes, unsigned &mode_i,
                     machine_mode &autodetected_vector_mode,
                     bool &fatal)
{
  loop_vec_info loop_vinfo
    = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);

  machine_mode vector_mode = vector_modes[mode_i];
  loop_vinfo->vector_mode = vector_mode;
  unsigned int suggested_unroll_factor = 1;
  unsigned slp_done_for_suggested_uf = 0;

  /* Run the main analysis.  */
  opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal,
                                        &suggested_unroll_factor,
                                        slp_done_for_suggested_uf);
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "***** Analysis %s with vector mode %s\n",
                     res ? "succeeded" : "failed",
                     GET_MODE_NAME (loop_vinfo->vector_mode));

  if (res && !main_loop_vinfo && suggested_unroll_factor > 1)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "***** Re-trying analysis for unrolling"
                         " with unroll factor %d and slp %s.\n",
                         suggested_unroll_factor,
                         slp_done_for_suggested_uf ? "on" : "off");
      loop_vec_info unroll_vinfo
        = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
      unroll_vinfo->vector_mode = vector_mode;
      unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor;
      opt_result new_res = vect_analyze_loop_2 (unroll_vinfo, fatal, NULL,
                                                slp_done_for_suggested_uf);
      if (new_res)
        {
          delete loop_vinfo;
          loop_vinfo = unroll_vinfo;
        }
      else
        delete unroll_vinfo;
    }

  /* Remember the autodetected vector mode.  */
  if (vector_mode == VOIDmode)
    autodetected_vector_mode = loop_vinfo->vector_mode;

  /* Advance mode_i, first skipping modes that would result in the
     same analysis result.  */
  while (mode_i + 1 < vector_modes.length ()
         && vect_chooses_same_modes_p (loop_vinfo,
                                       vector_modes[mode_i + 1]))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "***** The result for vector mode %s would"
                         " be the same\n",
                         GET_MODE_NAME (vector_modes[mode_i + 1]));
      mode_i += 1;
    }
  if (mode_i + 1 < vector_modes.length ()
      && VECTOR_MODE_P (autodetected_vector_mode)
      && (related_vector_mode (vector_modes[mode_i + 1],
                               GET_MODE_INNER (autodetected_vector_mode))
          == autodetected_vector_mode)
      && (related_vector_mode (autodetected_vector_mode,
                               GET_MODE_INNER (vector_modes[mode_i + 1]))
          == vector_modes[mode_i + 1]))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "***** Skipping vector mode %s, which would"
                         " repeat the analysis for %s\n",
                         GET_MODE_NAME (vector_modes[mode_i + 1]),
                         GET_MODE_NAME (autodetected_vector_mode));
      mode_i += 1;
    }
  mode_i++;

  if (!res)
    {
      delete loop_vinfo;
      if (fatal)
        gcc_checking_assert (main_loop_vinfo == NULL);
      return opt_loop_vec_info::propagate_failure (res);
    }

  return opt_loop_vec_info::success (loop_vinfo);
}
3533 /* Function vect_analyze_loop.
3535 Apply a set of analyses on LOOP, and create a loop_vec_info struct
3536 for it. The different analyses will record information in the
3537 loop_vec_info struct. */
3539 vect_analyze_loop (class loop
*loop
, gimple
*loop_vectorized_call
,
3540 vec_info_shared
*shared
)
3542 DUMP_VECT_SCOPE ("analyze_loop_nest");
3544 if (loop_outer (loop
)
3545 && loop_vec_info_for_loop (loop_outer (loop
))
3546 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop
))))
3547 return opt_loop_vec_info::failure_at (vect_location
,
3548 "outer-loop already vectorized.\n");
3550 if (!find_loop_nest (loop
, &shared
->loop_nest
))
3551 return opt_loop_vec_info::failure_at
3553 "not vectorized: loop nest containing two or more consecutive inner"
3554 " loops cannot be vectorized\n");
3556 /* Analyze the loop form. */
3557 vect_loop_form_info loop_form_info
;
3558 opt_result res
= vect_analyze_loop_form (loop
, loop_vectorized_call
,
3562 if (dump_enabled_p ())
3563 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3564 "bad loop form.\n");
3565 return opt_loop_vec_info::propagate_failure (res
);
3567 if (!integer_onep (loop_form_info
.assumptions
))
3569 /* We consider to vectorize this loop by versioning it under
3570 some assumptions. In order to do this, we need to clear
3571 existing information computed by scev and niter analyzer. */
3573 free_numbers_of_iterations_estimates (loop
);
3574 /* Also set flag for this loop so that following scev and niter
3575 analysis are done under the assumptions. */
3576 loop_constraint_set (loop
, LOOP_C_FINITE
);
  /* Clear the existing niter information to make sure the nonwrapping flag
     will be calculated and set appropriately.  */
  free_numbers_of_iterations_estimates (loop);
3583 auto_vector_modes vector_modes
;
3584 /* Autodetect first vector size we try. */
3585 vector_modes
.safe_push (VOIDmode
);
3586 unsigned int autovec_flags
3587 = targetm
.vectorize
.autovectorize_vector_modes (&vector_modes
,
3588 loop
->simdlen
!= 0);
3589 bool pick_lowest_cost_p
= ((autovec_flags
& VECT_COMPARE_COSTS
)
3590 && !unlimited_cost_model (loop
));
3591 machine_mode autodetected_vector_mode
= VOIDmode
;
3592 opt_loop_vec_info first_loop_vinfo
= opt_loop_vec_info::success (NULL
);
3593 unsigned int mode_i
= 0;
3594 unsigned HOST_WIDE_INT simdlen
= loop
->simdlen
;
3596 /* Keep track of the VF for each mode. Initialize all to 0 which indicates
3597 a mode has not been analyzed. */
3598 auto_vec
<poly_uint64
, 8> cached_vf_per_mode
;
3599 for (unsigned i
= 0; i
< vector_modes
.length (); ++i
)
3600 cached_vf_per_mode
.safe_push (0);
  /* First determine the main loop vectorization mode, either the first
     one that works, starting with auto-detecting the vector mode and then
     following the target's order of preference, or the one with the
     lowest cost if pick_lowest_cost_p.  */
3609 unsigned int last_mode_i
= mode_i
;
3610 /* Set cached VF to -1 prior to analysis, which indicates a mode has
3612 cached_vf_per_mode
[last_mode_i
] = -1;
3613 opt_loop_vec_info loop_vinfo
3614 = vect_analyze_loop_1 (loop
, shared
, &loop_form_info
,
3615 NULL
, vector_modes
, mode_i
,
3616 autodetected_vector_mode
, fatal
);
	  /* Analysis has been successful so update the VF value.  The
	     VF should always be a multiple of unroll_factor and we want to
	     capture the original VF here.  */
3625 cached_vf_per_mode
[last_mode_i
]
3626 = exact_div (LOOP_VINFO_VECT_FACTOR (loop_vinfo
),
3627 loop_vinfo
->suggested_unroll_factor
);
3628 /* Once we hit the desired simdlen for the first time,
3629 discard any previous attempts. */
3631 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo
), simdlen
))
3633 delete first_loop_vinfo
;
3634 first_loop_vinfo
= opt_loop_vec_info::success (NULL
);
3637 else if (pick_lowest_cost_p
3639 && vect_joust_loop_vinfos (loop_vinfo
, first_loop_vinfo
))
3641 /* Pick loop_vinfo over first_loop_vinfo. */
3642 delete first_loop_vinfo
;
3643 first_loop_vinfo
= opt_loop_vec_info::success (NULL
);
3645 if (first_loop_vinfo
== NULL
)
3646 first_loop_vinfo
= loop_vinfo
;
3650 loop_vinfo
= opt_loop_vec_info::success (NULL
);
3653 /* Commit to first_loop_vinfo if we have no reason to try
3655 if (!simdlen
&& !pick_lowest_cost_p
)
3658 if (mode_i
== vector_modes
.length ()
3659 || autodetected_vector_mode
== VOIDmode
)
3662 /* Try the next biggest vector size. */
3663 if (dump_enabled_p ())
3664 dump_printf_loc (MSG_NOTE
, vect_location
,
3665 "***** Re-trying analysis with vector mode %s\n",
3666 GET_MODE_NAME (vector_modes
[mode_i
]));
3668 if (!first_loop_vinfo
)
3669 return opt_loop_vec_info::propagate_failure (res
);
3671 if (dump_enabled_p ())
3672 dump_printf_loc (MSG_NOTE
, vect_location
,
3673 "***** Choosing vector mode %s\n",
3674 GET_MODE_NAME (first_loop_vinfo
->vector_mode
));
  /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
     enabled, SIMDUID is not set, it is the innermost loop and we have
     either already found the loop's SIMDLEN or there was no SIMDLEN to
     begin with.
     TODO: Enable epilogue vectorization for loops with SIMDUID set.  */
3681 bool vect_epilogues
= (!simdlen
3682 && loop
->inner
== NULL
3683 && param_vect_epilogues_nomask
3684 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo
)
3685 /* No code motion support for multiple epilogues so for now
3686 not supported when multiple exits. */
3687 && !LOOP_VINFO_EARLY_BREAKS (first_loop_vinfo
)
3689 && loop_cost_model (loop
) > VECT_COST_MODEL_VERY_CHEAP
);
3690 if (!vect_epilogues
)
3691 return first_loop_vinfo
;
3693 /* Now analyze first_loop_vinfo for epilogue vectorization. */
3694 poly_uint64 lowest_th
= LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo
);
3696 /* For epilogues start the analysis from the first mode. The motivation
3697 behind starting from the beginning comes from cases where the VECTOR_MODES
3698 array may contain length-agnostic and length-specific modes. Their
3699 ordering is not guaranteed, so we could end up picking a mode for the main
3700 loop that is after the epilogue's optimal mode. */
3701 vector_modes
[0] = autodetected_vector_mode
;
3704 bool supports_partial_vectors
=
3705 partial_vectors_supported_p () && param_vect_partial_vector_usage
!= 0;
3706 poly_uint64 first_vinfo_vf
= LOOP_VINFO_VECT_FACTOR (first_loop_vinfo
);
      /* If the target does not support partial vectors we can shorten the
	 number of modes to analyze for the epilogue as we know we can't pick a
	 mode that would lead to a VF at least as big as the
	 FIRST_VINFO_VF.  */
3714 if (!supports_partial_vectors
3715 && maybe_ge (cached_vf_per_mode
[mode_i
], first_vinfo_vf
))
3718 if (mode_i
== vector_modes
.length ())
3723 if (dump_enabled_p ())
3724 dump_printf_loc (MSG_NOTE
, vect_location
,
3725 "***** Re-trying epilogue analysis with vector "
3726 "mode %s\n", GET_MODE_NAME (vector_modes
[mode_i
]));
3729 opt_loop_vec_info loop_vinfo
3730 = vect_analyze_loop_1 (loop
, shared
, &loop_form_info
,
3732 vector_modes
, mode_i
,
3733 autodetected_vector_mode
, fatal
);
3739 if (pick_lowest_cost_p
)
3741 /* Keep trying to roll back vectorization attempts while the
3742 loop_vec_infos they produced were worse than this one. */
3743 vec
<loop_vec_info
> &vinfos
= first_loop_vinfo
->epilogue_vinfos
;
3744 while (!vinfos
.is_empty ()
3745 && vect_joust_loop_vinfos (loop_vinfo
, vinfos
.last ()))
3747 gcc_assert (vect_epilogues
);
3748 delete vinfos
.pop ();
3751 /* For now only allow one epilogue loop. */
3752 if (first_loop_vinfo
->epilogue_vinfos
.is_empty ())
3754 first_loop_vinfo
->epilogue_vinfos
.safe_push (loop_vinfo
);
3755 poly_uint64 th
= LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo
);
3756 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo
)
3757 || maybe_ne (lowest_th
, 0U));
3758 /* Keep track of the known smallest versioning
3760 if (ordered_p (lowest_th
, th
))
3761 lowest_th
= ordered_min (lowest_th
, th
);
3766 loop_vinfo
= opt_loop_vec_info::success (NULL
);
3769 /* For now only allow one epilogue loop, but allow
3770 pick_lowest_cost_p to replace it, so commit to the
3771 first epilogue if we have no reason to try alternatives. */
3772 if (!pick_lowest_cost_p
)
3776 if (mode_i
== vector_modes
.length ())
3781 if (!first_loop_vinfo
->epilogue_vinfos
.is_empty ())
3783 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo
) = lowest_th
;
3784 if (dump_enabled_p ())
3785 dump_printf_loc (MSG_NOTE
, vect_location
,
3786 "***** Choosing epilogue vector mode %s\n",
3788 (first_loop_vinfo
->epilogue_vinfos
[0]->vector_mode
));
3791 return first_loop_vinfo
;
3794 /* Return true if there is an in-order reduction function for CODE, storing
3795 it in *REDUC_FN if so. */
3798 fold_left_reduction_fn (code_helper code
, internal_fn
*reduc_fn
)
  /* We support MINUS_EXPR by negating the operand.  This also preserves an
     initial -0.0 since -0.0 - 0.0 (neutral op for MINUS_EXPR) == -0.0 +
     (-0.0) == -0.0.  */
3803 if (code
== PLUS_EXPR
|| code
== MINUS_EXPR
)
3805 *reduc_fn
= IFN_FOLD_LEFT_PLUS
;
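
/* Illustrative sketch (not from the source, an assumption for exposition):
   an in-order (fold-left) reduction keeps the scalar evaluation order, e.g.
   for a hypothetical 4-element vector v and accumulator acc:

     acc = (((acc + v[0]) + v[1]) + v[2]) + v[3];

   whereas a tree reduction would first compute (v[0] + v[1]) and
   (v[2] + v[3]), which for floating point is only valid when
   -fassociative-math permits reassociation.  */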
/* Function reduction_fn_for_scalar_code

   CODE - tree_code of a reduction operation.

   REDUC_FN - the corresponding internal function to be used to reduce the
   vector of partial results into a single scalar result, or IFN_LAST
   if the operation is a supported reduction operation, but does not have
   such an internal function.

   Return FALSE if CODE currently cannot be vectorized as a reduction.  */
3825 reduction_fn_for_scalar_code (code_helper code
, internal_fn
*reduc_fn
)
3827 if (code
.is_tree_code ())
3828 switch (tree_code (code
))
3831 *reduc_fn
= IFN_REDUC_MAX
;
3835 *reduc_fn
= IFN_REDUC_MIN
;
3839 *reduc_fn
= IFN_REDUC_PLUS
;
3843 *reduc_fn
= IFN_REDUC_AND
;
3847 *reduc_fn
= IFN_REDUC_IOR
;
3851 *reduc_fn
= IFN_REDUC_XOR
;
3856 *reduc_fn
= IFN_LAST
;
3863 switch (combined_fn (code
))
3866 *reduc_fn
= IFN_REDUC_FMAX
;
3870 *reduc_fn
= IFN_REDUC_FMIN
;
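
/* Illustrative example (for exposition only, not exhaustive): for a scalar
   reduction such as

     int s = 0;
     for (int i = 0; i < n; i++)
       s += a[i];

   the PLUS_EXPR case above selects IFN_REDUC_PLUS, which reduces the vector
   of partial sums to a single scalar in the loop epilogue.  */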
/* If there is a neutral value X such that a reduction would not be affected
   by the introduction of additional X elements, return that X, otherwise
   return null.  CODE is the code of the reduction and SCALAR_TYPE is the
   type of the scalar elements.  If the reduction has just a single initial
   value then INITIAL_VALUE is that value, otherwise it is null.
   If AS_INITIAL is TRUE the value is supposed to be used as the initial
   value.  In that case no signed zero is returned.  */
3887 neutral_op_for_reduction (tree scalar_type
, code_helper code
,
3888 tree initial_value
, bool as_initial
)
3890 if (code
.is_tree_code ())
3891 switch (tree_code (code
))
3898 return build_zero_cst (scalar_type
);
3899 case WIDEN_SUM_EXPR
:
3901 if (!as_initial
&& HONOR_SIGNED_ZEROS (scalar_type
))
3902 return build_real (scalar_type
, dconstm0
);
3904 return build_zero_cst (scalar_type
);
3907 return build_one_cst (scalar_type
);
3910 return build_all_ones_cst (scalar_type
);
3914 return initial_value
;
3920 switch (combined_fn (code
))
3924 return initial_value
;
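
/* Worked examples of the neutral values chosen above (illustration only):
   PLUS_EXPR and BIT_XOR_EXPR use 0, MULT_EXPR uses 1, BIT_AND_EXPR uses
   all-ones, MIN_EXPR/MAX_EXPR fall back to the single initial value, and
   a floating-point PLUS_EXPR that honors signed zeros (and is not used as
   the initial value) uses -0.0, since x + (-0.0) == x even for x == -0.0.  */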
3931 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3932 STMT is printed with a message MSG. */
3935 report_vect_op (dump_flags_t msg_type
, gimple
*stmt
, const char *msg
)
3937 dump_printf_loc (msg_type
, vect_location
, "%s%G", msg
, stmt
);
3940 /* Return true if we need an in-order reduction for operation CODE
3941 on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
3942 overflow must wrap. */
3945 needs_fold_left_reduction_p (tree type
, code_helper code
)
3947 /* CHECKME: check for !flag_finite_math_only too? */
3948 if (SCALAR_FLOAT_TYPE_P (type
))
3950 if (code
.is_tree_code ())
3951 switch (tree_code (code
))
3958 return !flag_associative_math
;
3961 switch (combined_fn (code
))
3968 return !flag_associative_math
;
3972 if (INTEGRAL_TYPE_P (type
))
3973 return (!code
.is_tree_code ()
3974 || !operation_no_trapping_overflow (type
, tree_code (code
)));
3976 if (SAT_FIXED_POINT_TYPE_P (type
))
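
/* Example of the policy above (illustration only): a floating-point
   accumulation

     double s = 0.0;
     for (int i = 0; i < n; i++)
       s += a[i];

   requires a fold-left reduction unless -fassociative-math is in effect,
   while the same loop over a wrapping unsigned integer type can be
   reassociated freely because the addition cannot trap on overflow.  */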
/* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG
   has a handled computation expression.  Store the main reduction
   operation in *CODE.  */
3987 check_reduction_path (dump_user_location_t loc
, loop_p loop
, gphi
*phi
,
3988 tree loop_arg
, code_helper
*code
,
3989 vec
<std::pair
<ssa_op_iter
, use_operand_p
> > &path
)
3991 auto_bitmap visited
;
3992 tree lookfor
= PHI_RESULT (phi
);
3994 use_operand_p curr
= op_iter_init_phiuse (&curri
, phi
, SSA_OP_USE
);
3995 while (USE_FROM_PTR (curr
) != loop_arg
)
3996 curr
= op_iter_next_use (&curri
);
3997 curri
.i
= curri
.numops
;
4000 path
.safe_push (std::make_pair (curri
, curr
));
4001 tree use
= USE_FROM_PTR (curr
);
4004 gimple
*def
= SSA_NAME_DEF_STMT (use
);
4005 if (gimple_nop_p (def
)
4006 || ! flow_bb_inside_loop_p (loop
, gimple_bb (def
)))
4011 std::pair
<ssa_op_iter
, use_operand_p
> x
= path
.pop ();
4015 curr
= op_iter_next_use (&curri
);
	  /* Skip already visited or non-SSA operands (from iterating
	     over PHI args).  */
4018 while (curr
!= NULL_USE_OPERAND_P
4019 && (TREE_CODE (USE_FROM_PTR (curr
)) != SSA_NAME
4020 || ! bitmap_set_bit (visited
,
4022 (USE_FROM_PTR (curr
)))));
4024 while (curr
== NULL_USE_OPERAND_P
&& ! path
.is_empty ());
4025 if (curr
== NULL_USE_OPERAND_P
)
4030 if (gimple_code (def
) == GIMPLE_PHI
)
4031 curr
= op_iter_init_phiuse (&curri
, as_a
<gphi
*>(def
), SSA_OP_USE
);
4033 curr
= op_iter_init_use (&curri
, def
, SSA_OP_USE
);
4034 while (curr
!= NULL_USE_OPERAND_P
4035 && (TREE_CODE (USE_FROM_PTR (curr
)) != SSA_NAME
4036 || ! bitmap_set_bit (visited
,
4038 (USE_FROM_PTR (curr
)))))
4039 curr
= op_iter_next_use (&curri
);
4040 if (curr
== NULL_USE_OPERAND_P
)
4045 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
4047 dump_printf_loc (MSG_NOTE
, loc
, "reduction path: ");
4049 std::pair
<ssa_op_iter
, use_operand_p
> *x
;
4050 FOR_EACH_VEC_ELT (path
, i
, x
)
4051 dump_printf (MSG_NOTE
, "%T ", USE_FROM_PTR (x
->second
));
4052 dump_printf (MSG_NOTE
, "\n");
4055 /* Check whether the reduction path detected is valid. */
4056 bool fail
= path
.length () == 0;
4060 for (unsigned i
= 1; i
< path
.length (); ++i
)
4062 gimple
*use_stmt
= USE_STMT (path
[i
].second
);
4064 if (!gimple_extract_op (use_stmt
, &op
))
4069 unsigned int opi
= op
.num_ops
;
4070 if (gassign
*assign
= dyn_cast
<gassign
*> (use_stmt
))
	  /* The following makes sure we can compute the operand index
	     easily plus it mostly disallows chaining via COND_EXPR condition
	     operands.  */
4075 for (opi
= 0; opi
< op
.num_ops
; ++opi
)
4076 if (gimple_assign_rhs1_ptr (assign
) + opi
== path
[i
].second
->use
)
4079 else if (gcall
*call
= dyn_cast
<gcall
*> (use_stmt
))
4081 for (opi
= 0; opi
< op
.num_ops
; ++opi
)
4082 if (gimple_call_arg_ptr (call
, opi
) == path
[i
].second
->use
)
4085 if (opi
== op
.num_ops
)
4090 op
.code
= canonicalize_code (op
.code
, op
.type
);
4091 if (op
.code
== MINUS_EXPR
)
4093 op
.code
= PLUS_EXPR
;
4094 /* Track whether we negate the reduction value each iteration. */
4095 if (op
.ops
[1] == op
.ops
[opi
])
4098 else if (op
.code
== IFN_COND_SUB
)
4100 op
.code
= IFN_COND_ADD
;
4101 /* Track whether we negate the reduction value each iteration. */
4102 if (op
.ops
[2] == op
.ops
[opi
])
4105 if (CONVERT_EXPR_CODE_P (op
.code
)
4106 && tree_nop_conversion_p (op
.type
, TREE_TYPE (op
.ops
[0])))
4108 else if (*code
== ERROR_MARK
)
4111 sign
= TYPE_SIGN (op
.type
);
4113 else if (op
.code
!= *code
)
4118 else if ((op
.code
== MIN_EXPR
4119 || op
.code
== MAX_EXPR
)
4120 && sign
!= TYPE_SIGN (op
.type
))
      /* Check that the op is used in only a single stmt.  For the
	 non-value-changing tail and the last stmt allow out-of-loop uses.
	 ??? We could relax this and handle arbitrary live stmts by
	 forcing a scalar epilogue for example.  */
4129 imm_use_iterator imm_iter
;
4130 use_operand_p use_p
;
4131 gimple
*op_use_stmt
;
4133 bool cond_fn_p
= op
.code
.is_internal_fn ()
4134 && (conditional_internal_fn_code (internal_fn (op
.code
))
4137 FOR_EACH_IMM_USE_STMT (op_use_stmt
, imm_iter
, op
.ops
[opi
])
4139 /* In case of a COND_OP (mask, op1, op2, op1) reduction we should
4140 have op1 twice (once as definition, once as else) in the same
4141 operation. Enforce this. */
4142 if (cond_fn_p
&& op_use_stmt
== use_stmt
)
4144 gcall
*call
= as_a
<gcall
*> (use_stmt
);
4146 = internal_fn_else_index (internal_fn (op
.code
));
4147 if (gimple_call_arg (call
, else_pos
) != op
.ops
[opi
])
4152 for (unsigned int j
= 0; j
< gimple_call_num_args (call
); ++j
)
4156 if (gimple_call_arg (call
, j
) == op
.ops
[opi
])
4160 else if (!is_gimple_debug (op_use_stmt
)
4161 && (*code
!= ERROR_MARK
4162 || flow_bb_inside_loop_p (loop
,
4163 gimple_bb (op_use_stmt
))))
4164 FOR_EACH_IMM_USE_ON_STMT (use_p
, imm_iter
)
4174 return ! fail
&& ! neg
&& *code
!= ERROR_MARK
;
4178 check_reduction_path (dump_user_location_t loc
, loop_p loop
, gphi
*phi
,
4179 tree loop_arg
, enum tree_code code
)
4181 auto_vec
<std::pair
<ssa_op_iter
, use_operand_p
> > path
;
4183 return (check_reduction_path (loc
, loop
, phi
, loop_arg
, &code_
, path
)
/* Function vect_is_simple_reduction

   (1) Detect a cross-iteration def-use cycle that represents a simple
   reduction computation.  We look for the following pattern:

     a1 = phi < a0, a2 >
     a3 = ...
     a2 = operation (a3, a1)

   or

     a3 = ...
     a1 = phi < a0, a2 >
     a2 = operation (a3, a1)

   such that:

   1. operation is commutative and associative and it is safe to
      change the order of the computation
   2. no uses for a2 in the loop (a2 is used out of the loop)
   3. no uses of a1 in the loop besides the reduction operation
   4. no uses of a1 outside the loop.

   Conditions 1,4 are tested here.
   Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.

   (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
   nested cycles.

   (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
   reductions:

     a1 = phi < a0, a2 >
     inner loop (def of a3)
     a2 = phi < a3 >

   (4) Detect condition expressions, i.e.:

     for (int i = 0; i < N; i++)
       if (a[i] < val)
	 ret_val = a[i];  */
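
/* Source-level sketch of pattern (3) above (illustration only; the variable
   names are assumptions): a double reduction accumulates across both loops
   of a nest, e.g.

     int sum = 0;
     for (int i = 0; i < N; i++)
       for (int j = 0; j < M; j++)
	 sum += a[i][j];

   where the outer-loop PHI for sum cycles through the inner loop's
   reduction result.  */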
4233 static stmt_vec_info
4234 vect_is_simple_reduction (loop_vec_info loop_info
, stmt_vec_info phi_info
,
4235 bool *double_reduc
, bool *reduc_chain_p
, bool slp
)
4237 gphi
*phi
= as_a
<gphi
*> (phi_info
->stmt
);
4238 gimple
*phi_use_stmt
= NULL
;
4239 imm_use_iterator imm_iter
;
4240 use_operand_p use_p
;
4242 *double_reduc
= false;
4243 *reduc_chain_p
= false;
4244 STMT_VINFO_REDUC_TYPE (phi_info
) = TREE_CODE_REDUCTION
;
4246 tree phi_name
= PHI_RESULT (phi
);
4247 /* ??? If there are no uses of the PHI result the inner loop reduction
4248 won't be detected as possibly double-reduction by vectorizable_reduction
4249 because that tries to walk the PHI arg from the preheader edge which
4250 can be constant. See PR60382. */
4251 if (has_zero_uses (phi_name
))
4253 class loop
*loop
= (gimple_bb (phi
))->loop_father
;
4254 unsigned nphi_def_loop_uses
= 0;
4255 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, phi_name
)
4257 gimple
*use_stmt
= USE_STMT (use_p
);
4258 if (is_gimple_debug (use_stmt
))
4261 if (!flow_bb_inside_loop_p (loop
, gimple_bb (use_stmt
)))
4263 if (dump_enabled_p ())
4264 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4265 "intermediate value used outside loop.\n");
4270 /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
4271 op1 twice (once as definition, once as else) in the same operation.
4272 Only count it as one. */
4273 if (use_stmt
!= phi_use_stmt
)
4275 nphi_def_loop_uses
++;
4276 phi_use_stmt
= use_stmt
;
4280 tree latch_def
= PHI_ARG_DEF_FROM_EDGE (phi
, loop_latch_edge (loop
));
4281 if (TREE_CODE (latch_def
) != SSA_NAME
)
4283 if (dump_enabled_p ())
4284 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4285 "reduction: not ssa_name: %T\n", latch_def
);
4289 stmt_vec_info def_stmt_info
= loop_info
->lookup_def (latch_def
);
4291 || !flow_bb_inside_loop_p (loop
, gimple_bb (def_stmt_info
->stmt
)))
4294 bool nested_in_vect_loop
4295 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info
), loop
);
4296 unsigned nlatch_def_loop_uses
= 0;
4297 auto_vec
<gphi
*, 3> lcphis
;
4298 bool inner_loop_of_double_reduc
= false;
4299 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, latch_def
)
4301 gimple
*use_stmt
= USE_STMT (use_p
);
4302 if (is_gimple_debug (use_stmt
))
4304 if (flow_bb_inside_loop_p (loop
, gimple_bb (use_stmt
)))
4305 nlatch_def_loop_uses
++;
4308 /* We can have more than one loop-closed PHI. */
4309 lcphis
.safe_push (as_a
<gphi
*> (use_stmt
));
4310 if (nested_in_vect_loop
4311 && (STMT_VINFO_DEF_TYPE (loop_info
->lookup_stmt (use_stmt
))
4312 == vect_double_reduction_def
))
4313 inner_loop_of_double_reduc
= true;
4317 /* If we are vectorizing an inner reduction we are executing that
4318 in the original order only in case we are not dealing with a
4319 double reduction. */
4320 if (nested_in_vect_loop
&& !inner_loop_of_double_reduc
)
4322 if (dump_enabled_p ())
4323 report_vect_op (MSG_NOTE
, def_stmt_info
->stmt
,
4324 "detected nested cycle: ");
4325 return def_stmt_info
;
4328 /* When the inner loop of a double reduction ends up with more than
4329 one loop-closed PHI we have failed to classify alternate such
4330 PHIs as double reduction, leading to wrong code. See PR103237. */
4331 if (inner_loop_of_double_reduc
&& lcphis
.length () != 1)
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "unhandled double reduction\n");
  /* If this isn't a nested cycle or if the nested cycle reduction value
     is used outside of the inner loop we cannot handle uses of the reduction
     value.  */
  if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
4344 if (dump_enabled_p ())
4345 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4346 "reduction used in loop.\n");
4350 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
4351 defined in the inner loop. */
4352 if (gphi
*def_stmt
= dyn_cast
<gphi
*> (def_stmt_info
->stmt
))
4354 tree op1
= PHI_ARG_DEF (def_stmt
, 0);
4355 if (gimple_phi_num_args (def_stmt
) != 1
4356 || TREE_CODE (op1
) != SSA_NAME
)
4358 if (dump_enabled_p ())
4359 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4360 "unsupported phi node definition.\n");
4365 /* Verify there is an inner cycle composed of the PHI phi_use_stmt
4366 and the latch definition op1. */
4367 gimple
*def1
= SSA_NAME_DEF_STMT (op1
);
4368 if (gimple_bb (def1
)
4369 && flow_bb_inside_loop_p (loop
, gimple_bb (def_stmt
))
4371 && flow_bb_inside_loop_p (loop
->inner
, gimple_bb (def1
))
4372 && (is_gimple_assign (def1
) || is_gimple_call (def1
))
4373 && is_a
<gphi
*> (phi_use_stmt
)
4374 && flow_bb_inside_loop_p (loop
->inner
, gimple_bb (phi_use_stmt
))
4375 && (op1
== PHI_ARG_DEF_FROM_EDGE (phi_use_stmt
,
4376 loop_latch_edge (loop
->inner
)))
4377 && lcphis
.length () == 1)
4379 if (dump_enabled_p ())
4380 report_vect_op (MSG_NOTE
, def_stmt
,
4381 "detected double reduction: ");
4383 *double_reduc
= true;
4384 return def_stmt_info
;
  /* Look for the expression computing latch_def from the loop PHI result.  */
4391 auto_vec
<std::pair
<ssa_op_iter
, use_operand_p
> > path
;
4393 if (check_reduction_path (vect_location
, loop
, phi
, latch_def
, &code
,
4396 STMT_VINFO_REDUC_CODE (phi_info
) = code
;
4397 if (code
== COND_EXPR
&& !nested_in_vect_loop
)
4398 STMT_VINFO_REDUC_TYPE (phi_info
) = COND_REDUCTION
;
4400 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
4401 reduction chain for which the additional restriction is that
4402 all operations in the chain are the same. */
4403 auto_vec
<stmt_vec_info
, 8> reduc_chain
;
4405 bool is_slp_reduc
= !nested_in_vect_loop
&& code
!= COND_EXPR
;
4406 for (i
= path
.length () - 1; i
>= 1; --i
)
4408 gimple
*stmt
= USE_STMT (path
[i
].second
);
4409 stmt_vec_info stmt_info
= loop_info
->lookup_stmt (stmt
);
4411 if (!gimple_extract_op (stmt
, &op
))
4413 if (gassign
*assign
= dyn_cast
<gassign
*> (stmt
))
4414 STMT_VINFO_REDUC_IDX (stmt_info
)
4415 = path
[i
].second
->use
- gimple_assign_rhs1_ptr (assign
);
4418 gcall
*call
= as_a
<gcall
*> (stmt
);
4419 STMT_VINFO_REDUC_IDX (stmt_info
)
4420 = path
[i
].second
->use
- gimple_call_arg_ptr (call
, 0);
4422 bool leading_conversion
= (CONVERT_EXPR_CODE_P (op
.code
)
4423 && (i
== 1 || i
== path
.length () - 1));
4424 if ((op
.code
!= code
&& !leading_conversion
)
4425 /* We can only handle the final value in epilogue
4426 generation for reduction chains. */
4427 || (i
!= 1 && !has_single_use (gimple_get_lhs (stmt
))))
4428 is_slp_reduc
= false;
      /* For reduction chains we support trailing/leading
	 conversions.  We do not store those in the actual chain.  */
4431 if (leading_conversion
)
4433 reduc_chain
.safe_push (stmt_info
);
4435 if (slp
&& is_slp_reduc
&& reduc_chain
.length () > 1)
4437 for (unsigned i
= 0; i
< reduc_chain
.length () - 1; ++i
)
4439 REDUC_GROUP_FIRST_ELEMENT (reduc_chain
[i
]) = reduc_chain
[0];
4440 REDUC_GROUP_NEXT_ELEMENT (reduc_chain
[i
]) = reduc_chain
[i
+1];
4442 REDUC_GROUP_FIRST_ELEMENT (reduc_chain
.last ()) = reduc_chain
[0];
4443 REDUC_GROUP_NEXT_ELEMENT (reduc_chain
.last ()) = NULL
;
4445 /* Save the chain for further analysis in SLP detection. */
4446 LOOP_VINFO_REDUCTION_CHAINS (loop_info
).safe_push (reduc_chain
[0]);
4447 REDUC_GROUP_SIZE (reduc_chain
[0]) = reduc_chain
.length ();
4449 *reduc_chain_p
= true;
4450 if (dump_enabled_p ())
4451 dump_printf_loc (MSG_NOTE
, vect_location
,
4452 "reduction: detected reduction chain\n");
4454 else if (dump_enabled_p ())
4455 dump_printf_loc (MSG_NOTE
, vect_location
,
4456 "reduction: detected reduction\n");
4458 return def_stmt_info
;
4461 if (dump_enabled_p ())
4462 dump_printf_loc (MSG_NOTE
, vect_location
,
4463 "reduction: unknown pattern\n");
4468 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
4469 PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
4470 or -1 if not known. */
4473 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo
, int peel_iters_prologue
)
4475 int assumed_vf
= vect_vf_for_cost (loop_vinfo
);
4476 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
) || peel_iters_prologue
== -1)
4478 if (dump_enabled_p ())
4479 dump_printf_loc (MSG_NOTE
, vect_location
,
4480 "cost model: epilogue peel iters set to vf/2 "
4481 "because loop iterations are unknown .\n");
4482 return assumed_vf
/ 2;
4486 int niters
= LOOP_VINFO_INT_NITERS (loop_vinfo
);
4487 peel_iters_prologue
= MIN (niters
, peel_iters_prologue
);
4488 int peel_iters_epilogue
= (niters
- peel_iters_prologue
) % assumed_vf
;
4489 /* If we need to peel for gaps, but no peeling is required, we have to
4490 peel VF iterations. */
4491 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
) && !peel_iters_epilogue
)
4492 peel_iters_epilogue
= assumed_vf
;
4493 return peel_iters_epilogue
;
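
/* Worked example for the computation above (numbers are assumptions for
   illustration): with NITERS = 100, ASSUMED_VF = 8 and
   PEEL_ITERS_PROLOGUE = 3, the epilogue peels (100 - 3) % 8 = 1 iteration;
   if that remainder were 0 but PEELING_FOR_GAPS is set, a full VF of 8
   iterations would be peeled instead.  */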
4497 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
4499 vect_get_known_peeling_cost (loop_vec_info loop_vinfo
, int peel_iters_prologue
,
4500 int *peel_iters_epilogue
,
4501 stmt_vector_for_cost
*scalar_cost_vec
,
4502 stmt_vector_for_cost
*prologue_cost_vec
,
4503 stmt_vector_for_cost
*epilogue_cost_vec
)
4507 *peel_iters_epilogue
4508 = vect_get_peel_iters_epilogue (loop_vinfo
, peel_iters_prologue
);
4510 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
))
4512 /* If peeled iterations are known but number of scalar loop
4513 iterations are unknown, count a taken branch per peeled loop. */
4514 if (peel_iters_prologue
> 0)
4515 retval
= record_stmt_cost (prologue_cost_vec
, 1, cond_branch_taken
,
4517 if (*peel_iters_epilogue
> 0)
4518 retval
+= record_stmt_cost (epilogue_cost_vec
, 1, cond_branch_taken
,
4522 stmt_info_for_cost
*si
;
4524 if (peel_iters_prologue
)
4525 FOR_EACH_VEC_ELT (*scalar_cost_vec
, j
, si
)
4526 retval
+= record_stmt_cost (prologue_cost_vec
,
4527 si
->count
* peel_iters_prologue
,
4528 si
->kind
, si
->stmt_info
, si
->misalign
,
4530 if (*peel_iters_epilogue
)
4531 FOR_EACH_VEC_ELT (*scalar_cost_vec
, j
, si
)
4532 retval
+= record_stmt_cost (epilogue_cost_vec
,
4533 si
->count
* *peel_iters_epilogue
,
4534 si
->kind
, si
->stmt_info
, si
->misalign
,
4540 /* Function vect_estimate_min_profitable_iters
4542 Return the number of iterations required for the vector version of the
4543 loop to be profitable relative to the cost of the scalar version of the
4546 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
4547 of iterations for vectorization. -1 value means loop vectorization
4548 is not profitable. This returned value may be used for dynamic
4549 profitability check.
4551 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
4552 for static check against estimated number of iterations. */
4555 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo
,
4556 int *ret_min_profitable_niters
,
4557 int *ret_min_profitable_estimate
,
4558 unsigned *suggested_unroll_factor
)
4560 int min_profitable_iters
;
4561 int min_profitable_estimate
;
4562 int peel_iters_prologue
;
4563 int peel_iters_epilogue
;
4564 unsigned vec_inside_cost
= 0;
4565 int vec_outside_cost
= 0;
4566 unsigned vec_prologue_cost
= 0;
4567 unsigned vec_epilogue_cost
= 0;
4568 int scalar_single_iter_cost
= 0;
4569 int scalar_outside_cost
= 0;
4570 int assumed_vf
= vect_vf_for_cost (loop_vinfo
);
4571 int npeel
= LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
);
4572 vector_costs
*target_cost_data
= loop_vinfo
->vector_costs
;
4574 /* Cost model disabled. */
4575 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
4577 if (dump_enabled_p ())
4578 dump_printf_loc (MSG_NOTE
, vect_location
, "cost model disabled.\n");
4579 *ret_min_profitable_niters
= 0;
4580 *ret_min_profitable_estimate
= 0;
4584 /* Requires loop versioning tests to handle misalignment. */
4585 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo
))
4587 /* FIXME: Make cost depend on complexity of individual check. */
4588 unsigned len
= LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).length ();
4589 (void) add_stmt_cost (target_cost_data
, len
, scalar_stmt
, vect_prologue
);
4590 if (dump_enabled_p ())
4591 dump_printf (MSG_NOTE
,
4592 "cost model: Adding cost of checks for loop "
4593 "versioning to treat misalignment.\n");
4596 /* Requires loop versioning with alias checks. */
4597 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo
))
4599 /* FIXME: Make cost depend on complexity of individual check. */
4600 unsigned len
= LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo
).length ();
4601 (void) add_stmt_cost (target_cost_data
, len
, scalar_stmt
, vect_prologue
);
4602 len
= LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo
).length ();
4604 /* Count LEN - 1 ANDs and LEN comparisons. */
4605 (void) add_stmt_cost (target_cost_data
, len
* 2 - 1,
4606 scalar_stmt
, vect_prologue
);
4607 len
= LOOP_VINFO_LOWER_BOUNDS (loop_vinfo
).length ();
4610 /* Count LEN - 1 ANDs and LEN comparisons. */
4611 unsigned int nstmts
= len
* 2 - 1;
4612 /* +1 for each bias that needs adding. */
4613 for (unsigned int i
= 0; i
< len
; ++i
)
4614 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo
)[i
].unsigned_p
)
4616 (void) add_stmt_cost (target_cost_data
, nstmts
,
4617 scalar_stmt
, vect_prologue
);
4619 if (dump_enabled_p ())
4620 dump_printf (MSG_NOTE
,
4621 "cost model: Adding cost of checks for loop "
4622 "versioning aliasing.\n");
4625 /* Requires loop versioning with niter checks. */
4626 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo
))
4628 /* FIXME: Make cost depend on complexity of individual check. */
4629 (void) add_stmt_cost (target_cost_data
, 1, vector_stmt
,
4630 NULL
, NULL
, NULL_TREE
, 0, vect_prologue
);
4631 if (dump_enabled_p ())
4632 dump_printf (MSG_NOTE
,
4633 "cost model: Adding cost of checks for loop "
4634 "versioning niters.\n");
4637 if (LOOP_REQUIRES_VERSIONING (loop_vinfo
))
4638 (void) add_stmt_cost (target_cost_data
, 1, cond_branch_taken
,
  /* Count statements in scalar loop.  Using this as scalar cost for a single
     iteration for now.

     TODO: Add outer loop support.

     TODO: Consider assigning different costs to different scalar
     statements.  */

  scalar_single_iter_cost = loop_vinfo->scalar_costs->total_cost ();
4651 /* Add additional cost for the peeled instructions in prologue and epilogue
4652 loop. (For fully-masked loops there will be no peeling.)
4654 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
4655 at compile-time - we assume it's vf/2 (the worst would be vf-1).
4657 TODO: Build an expression that represents peel_iters for prologue and
4658 epilogue to be used in a run-time test. */
4660 bool prologue_need_br_taken_cost
= false;
4661 bool prologue_need_br_not_taken_cost
= false;
4663 /* Calculate peel_iters_prologue. */
4664 if (vect_use_loop_mask_for_alignment_p (loop_vinfo
))
4665 peel_iters_prologue
= 0;
4668 peel_iters_prologue
= assumed_vf
/ 2;
4669 if (dump_enabled_p ())
4670 dump_printf (MSG_NOTE
, "cost model: "
4671 "prologue peel iters set to vf/2.\n");
4673 /* If peeled iterations are unknown, count a taken branch and a not taken
4674 branch per peeled loop. Even if scalar loop iterations are known,
4675 vector iterations are not known since peeled prologue iterations are
4676 not known. Hence guards remain the same. */
4677 prologue_need_br_taken_cost
= true;
4678 prologue_need_br_not_taken_cost
= true;
4682 peel_iters_prologue
= npeel
;
4683 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
) && peel_iters_prologue
> 0)
4684 /* If peeled iterations are known but number of scalar loop
4685 iterations are unknown, count a taken branch per peeled loop. */
4686 prologue_need_br_taken_cost
= true;
4689 bool epilogue_need_br_taken_cost
= false;
4690 bool epilogue_need_br_not_taken_cost
= false;
4692 /* Calculate peel_iters_epilogue. */
4693 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
))
4694 /* We need to peel exactly one iteration for gaps. */
4695 peel_iters_epilogue
= LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
) ? 1 : 0;
      /* If peeling for alignment is unknown, loop bound of main loop
	 becomes unknown.  */
      peel_iters_epilogue = assumed_vf / 2;
4701 if (dump_enabled_p ())
4702 dump_printf (MSG_NOTE
, "cost model: "
4703 "epilogue peel iters set to vf/2 because "
4704 "peeling for alignment is unknown.\n");
4706 /* See the same reason above in peel_iters_prologue calculation. */
4707 epilogue_need_br_taken_cost
= true;
4708 epilogue_need_br_not_taken_cost
= true;
4712 peel_iters_epilogue
= vect_get_peel_iters_epilogue (loop_vinfo
, npeel
);
4713 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
) && peel_iters_epilogue
> 0)
4714 /* If peeled iterations are known but number of scalar loop
4715 iterations are unknown, count a taken branch per peeled loop. */
4716 epilogue_need_br_taken_cost
= true;
4719 stmt_info_for_cost
*si
;
4721 /* Add costs associated with peel_iters_prologue. */
4722 if (peel_iters_prologue
)
4723 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo
), j
, si
)
4725 (void) add_stmt_cost (target_cost_data
,
4726 si
->count
* peel_iters_prologue
, si
->kind
,
4727 si
->stmt_info
, si
->node
, si
->vectype
,
4728 si
->misalign
, vect_prologue
);
4731 /* Add costs associated with peel_iters_epilogue. */
4732 if (peel_iters_epilogue
)
4733 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo
), j
, si
)
4735 (void) add_stmt_cost (target_cost_data
,
4736 si
->count
* peel_iters_epilogue
, si
->kind
,
4737 si
->stmt_info
, si
->node
, si
->vectype
,
4738 si
->misalign
, vect_epilogue
);
4741 /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
4743 if (prologue_need_br_taken_cost
)
4744 (void) add_stmt_cost (target_cost_data
, 1, cond_branch_taken
,
4747 if (prologue_need_br_not_taken_cost
)
4748 (void) add_stmt_cost (target_cost_data
, 1,
4749 cond_branch_not_taken
, vect_prologue
);
4751 if (epilogue_need_br_taken_cost
)
4752 (void) add_stmt_cost (target_cost_data
, 1, cond_branch_taken
,
4755 if (epilogue_need_br_not_taken_cost
)
4756 (void) add_stmt_cost (target_cost_data
, 1,
4757 cond_branch_not_taken
, vect_epilogue
);
4759 /* Take care of special costs for rgroup controls of partial vectors. */
4760 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
)
4761 && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo
)
4762 == vect_partial_vectors_avx512
))
4764 /* Calculate how many masks we need to generate. */
4765 unsigned int num_masks
= 0;
4766 bool need_saturation
= false;
4767 for (auto rgm
: LOOP_VINFO_MASKS (loop_vinfo
).rgc_vec
)
4770 unsigned nvectors
= rgm
.factor
;
4771 num_masks
+= nvectors
;
4772 if (TYPE_PRECISION (TREE_TYPE (rgm
.compare_type
))
4773 < TYPE_PRECISION (LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo
)))
4774 need_saturation
= true;
      /* ??? The target isn't able to identify the costs below as
	 producing masks so it cannot penalize cases where we'd run
	 out of mask registers for example.  */

      /* ??? We are also failing to account for smaller vector masks
	 we generate by splitting larger masks in vect_get_loop_mask.  */
4784 /* In the worst case, we need to generate each mask in the prologue
4785 and in the loop body. We need one splat per group and one
4788 Sometimes the prologue mask will fold to a constant,
4789 so the actual prologue cost might be smaller. However, it's
4790 simpler and safer to use the worst-case cost; if this ends up
4791 being the tie-breaker between vectorizing or not, then it's
4792 probably better not to vectorize. */
4793 (void) add_stmt_cost (target_cost_data
,
4795 + LOOP_VINFO_MASKS (loop_vinfo
).rgc_vec
.length (),
4796 vector_stmt
, NULL
, NULL
, NULL_TREE
, 0,
4798 (void) add_stmt_cost (target_cost_data
,
4800 + LOOP_VINFO_MASKS (loop_vinfo
).rgc_vec
.length (),
4801 vector_stmt
, NULL
, NULL
, NULL_TREE
, 0, vect_body
);
      /* When we need saturation we need it both in the prologue and
	 the loop body.  */
      if (need_saturation)
4807 (void) add_stmt_cost (target_cost_data
, 1, scalar_stmt
,
4808 NULL
, NULL
, NULL_TREE
, 0, vect_prologue
);
4809 (void) add_stmt_cost (target_cost_data
, 1, scalar_stmt
,
4810 NULL
, NULL
, NULL_TREE
, 0, vect_body
);
4813 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
)
4814 && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo
)
4815 == vect_partial_vectors_while_ult
))
4817 /* Calculate how many masks we need to generate. */
4818 unsigned int num_masks
= 0;
4819 rgroup_controls
*rgm
;
4820 unsigned int num_vectors_m1
;
4821 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo
).rgc_vec
,
4822 num_vectors_m1
, rgm
)
4824 num_masks
+= num_vectors_m1
+ 1;
4825 gcc_assert (num_masks
> 0);
4827 /* In the worst case, we need to generate each mask in the prologue
4828 and in the loop body. One of the loop body mask instructions
4829 replaces the comparison in the scalar loop, and since we don't
4830 count the scalar comparison against the scalar body, we shouldn't
4831 count that vector instruction against the vector body either.
4833 Sometimes we can use unpacks instead of generating prologue
4834 masks and sometimes the prologue mask will fold to a constant,
4835 so the actual prologue cost might be smaller. However, it's
4836 simpler and safer to use the worst-case cost; if this ends up
4837 being the tie-breaker between vectorizing or not, then it's
4838 probably better not to vectorize. */
4839 (void) add_stmt_cost (target_cost_data
, num_masks
,
4840 vector_stmt
, NULL
, NULL
, NULL_TREE
, 0,
4842 (void) add_stmt_cost (target_cost_data
, num_masks
- 1,
4843 vector_stmt
, NULL
, NULL
, NULL_TREE
, 0,
4846 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo
))
      /* Referring to the functions vect_set_loop_condition_partial_vectors
	 and vect_set_loop_controls_directly, we need to generate each
	 length in the prologue and in the loop body if required.  Although
	 there are some possible optimizations, we consider the worst case
	 for now.  */

      bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4855 signed char partial_load_store_bias
4856 = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo
);
4858 = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo
)
4859 && !vect_known_niters_smaller_than_vf (loop_vinfo
));
4861 /* Calculate how many statements to be added. */
4862 unsigned int prologue_stmts
= 0;
4863 unsigned int body_stmts
= 0;
4865 rgroup_controls
*rgc
;
4866 unsigned int num_vectors_m1
;
4867 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo
), num_vectors_m1
, rgc
)
4870 /* May need one SHIFT for nitems_total computation. */
4871 unsigned nitems
= rgc
->max_nscalars_per_iter
* rgc
->factor
;
4872 if (nitems
!= 1 && !niters_known_p
)
4873 prologue_stmts
+= 1;
4875 /* May need one MAX and one MINUS for wrap around. */
4876 if (vect_rgroup_iv_might_wrap_p (loop_vinfo
, rgc
))
4877 prologue_stmts
+= 2;
	  /* Need one MAX and one MINUS for each batch limit excepting for
	     the 1st one.  */
	  prologue_stmts += num_vectors_m1 * 2;
4883 unsigned int num_vectors
= num_vectors_m1
+ 1;
4885 /* Need to set up lengths in prologue, only one MIN required
4886 for each since start index is zero. */
4887 prologue_stmts
+= num_vectors
;
4889 /* If we have a non-zero partial load bias, we need one PLUS
4890 to adjust the load length. */
4891 if (partial_load_store_bias
!= 0)
4894 unsigned int length_update_cost
= 0;
4895 if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo
))
	/* For the decrement IV style, each iteration needs only a single
	   SELECT_VL or MIN to calculate the number of elements to be
	   processed in the current iteration.  */
4899 length_update_cost
= 1;
	/* For the increment IV style, each iteration may need two MINs
	   and one MINUS to update the lengths in the body for the next
	   iteration.  */
4903 length_update_cost
= 3;
4906 body_stmts
+= length_update_cost
* num_vectors
;
4909 (void) add_stmt_cost (target_cost_data
, prologue_stmts
,
4910 scalar_stmt
, vect_prologue
);
4911 (void) add_stmt_cost (target_cost_data
, body_stmts
,
4912 scalar_stmt
, vect_body
);
4915 /* FORNOW: The scalar outside cost is incremented in one of the
4918 1. The vectorizer checks for alignment and aliasing and generates
4919 a condition that allows dynamic vectorization. A cost model
4920 check is ANDED with the versioning condition. Hence scalar code
4921 path now has the added cost of the versioning check.
4923 if (cost > th & versioning_check)
4926 Hence run-time scalar is incremented by not-taken branch cost.
4928 2. The vectorizer then checks if a prologue is required. If the
4929 cost model check was not done before during versioning, it has to
4930 be done before the prologue check.
4933 prologue = scalar_iters
4938 if (prologue == num_iters)
4941 Hence the run-time scalar cost is incremented by a taken branch,
4942 plus a not-taken branch, plus a taken branch cost.
4944 3. The vectorizer then checks if an epilogue is required. If the
4945 cost model check was not done before during prologue check, it
4946 has to be done with the epilogue check.
4952 if (prologue == num_iters)
4955 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4958 Hence the run-time scalar cost should be incremented by 2 taken
4961 TODO: The back end may reorder the BBS's differently and reverse
4962 conditions/branch directions. Change the estimates below to
4963 something more reasonable. */
  /* If the number of iterations is known and we do not do versioning, we can
     decide whether to vectorize at compile time.  Hence the scalar version
     does not carry cost model guard costs.  */
4968 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
4969 || LOOP_REQUIRES_VERSIONING (loop_vinfo
))
4971 /* Cost model check occurs at versioning. */
4972 if (LOOP_REQUIRES_VERSIONING (loop_vinfo
))
4973 scalar_outside_cost
+= vect_get_stmt_cost (cond_branch_not_taken
);
4976 /* Cost model check occurs at prologue generation. */
4977 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
) < 0)
4978 scalar_outside_cost
+= 2 * vect_get_stmt_cost (cond_branch_taken
)
4979 + vect_get_stmt_cost (cond_branch_not_taken
);
4980 /* Cost model check occurs at epilogue generation. */
4982 scalar_outside_cost
+= 2 * vect_get_stmt_cost (cond_branch_taken
);
4986 /* Complete the target-specific cost calculations. */
4987 finish_cost (loop_vinfo
->vector_costs
, loop_vinfo
->scalar_costs
,
4988 &vec_prologue_cost
, &vec_inside_cost
, &vec_epilogue_cost
,
4989 suggested_unroll_factor
);
4991 if (suggested_unroll_factor
&& *suggested_unroll_factor
> 1
4992 && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo
) != MAX_VECTORIZATION_FACTOR
4993 && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo
) *
4994 *suggested_unroll_factor
,
4995 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo
)))
4997 if (dump_enabled_p ())
4998 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4999 "can't unroll as unrolled vectorization factor larger"
5000 " than maximum vectorization factor: "
5001 HOST_WIDE_INT_PRINT_UNSIGNED
"\n",
5002 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo
));
5003 *suggested_unroll_factor
= 1;
5006 vec_outside_cost
= (int)(vec_prologue_cost
+ vec_epilogue_cost
);
5008 if (dump_enabled_p ())
5010 dump_printf_loc (MSG_NOTE
, vect_location
, "Cost model analysis: \n");
5011 dump_printf (MSG_NOTE
, " Vector inside of loop cost: %d\n",
5013 dump_printf (MSG_NOTE
, " Vector prologue cost: %d\n",
5015 dump_printf (MSG_NOTE
, " Vector epilogue cost: %d\n",
5017 dump_printf (MSG_NOTE
, " Scalar iteration cost: %d\n",
5018 scalar_single_iter_cost
);
5019 dump_printf (MSG_NOTE
, " Scalar outside cost: %d\n",
5020 scalar_outside_cost
);
5021 dump_printf (MSG_NOTE
, " Vector outside cost: %d\n",
5023 dump_printf (MSG_NOTE
, " prologue iterations: %d\n",
5024 peel_iters_prologue
);
5025 dump_printf (MSG_NOTE
, " epilogue iterations: %d\n",
5026 peel_iters_epilogue
);
  /* Calculate number of iterations required to make the vector version
     profitable, relative to the loop bodies only.  The following condition
     must hold true:

     SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC

     where
     SIC = scalar iteration cost, VIC = vector iteration cost,
     VOC = vector outside cost, VF = vectorization factor,
     NPEEL = prologue iterations + epilogue iterations,
     SOC = scalar outside cost for run time cost model check.  */
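
  /* Worked example (all numbers are assumptions for illustration): with
     SIC = 4, VIC = 6, VF = 4, NPEEL = 2, VOC = 30 and SOC = 10, the
     condition 4 * niters + 10 > 6 * ((niters - 2) / 4) + 30 first holds
     at niters = 7, so the runtime threshold would be around seven scalar
     iterations.  */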
5039 int saving_per_viter
= (scalar_single_iter_cost
* assumed_vf
5041 if (saving_per_viter
<= 0)
5043 if (LOOP_VINFO_LOOP (loop_vinfo
)->force_vectorize
)
5044 warning_at (vect_location
.get_location_t (), OPT_Wopenmp_simd
,
5045 "vectorization did not happen for a simd loop");
5047 if (dump_enabled_p ())
5048 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5049 "cost model: the vector iteration cost = %d "
5050 "divided by the scalar iteration cost = %d "
5051 "is greater or equal to the vectorization factor = %d"
5053 vec_inside_cost
, scalar_single_iter_cost
, assumed_vf
);
5054 *ret_min_profitable_niters
= -1;
5055 *ret_min_profitable_estimate
= -1;
5059 /* ??? The "if" arm is written to handle all cases; see below for what
5060 we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
5061 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
))
5063 /* Rewriting the condition above in terms of the number of
5064 vector iterations (vniters) rather than the number of
5065 scalar iterations (niters) gives:
5067 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
5069 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
5071 For integer N, X and Y when X > 0:
5073 N * X > Y <==> N >= (Y /[floor] X) + 1. */
5074 int outside_overhead
= (vec_outside_cost
5075 - scalar_single_iter_cost
* peel_iters_prologue
5076 - scalar_single_iter_cost
* peel_iters_epilogue
5077 - scalar_outside_cost
);
5078 /* We're only interested in cases that require at least one
5079 vector iteration. */
5080 int min_vec_niters
= 1;
5081 if (outside_overhead
> 0)
5082 min_vec_niters
= outside_overhead
/ saving_per_viter
+ 1;
5084 if (dump_enabled_p ())
5085 dump_printf (MSG_NOTE
, " Minimum number of vector iterations: %d\n",
5088 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
))
5090 /* Now that we know the minimum number of vector iterations,
5091 find the minimum niters for which the scalar cost is larger:
5093 SIC * niters > VIC * vniters + VOC - SOC
5095 We know that the minimum niters is no more than
5096 vniters * VF + NPEEL, but it might be (and often is) less
5097 than that if a partial vector iteration is cheaper than the
5098 equivalent scalar code. */
5099 int threshold
= (vec_inside_cost
* min_vec_niters
5101 - scalar_outside_cost
);
5103 min_profitable_iters
= 1;
5105 min_profitable_iters
= threshold
/ scalar_single_iter_cost
+ 1;
5108 /* Convert the number of vector iterations into a number of
5109 scalar iterations. */
5110 min_profitable_iters
= (min_vec_niters
* assumed_vf
5111 + peel_iters_prologue
5112 + peel_iters_epilogue
);
5116 min_profitable_iters
= ((vec_outside_cost
- scalar_outside_cost
)
5118 - vec_inside_cost
* peel_iters_prologue
5119 - vec_inside_cost
* peel_iters_epilogue
);
5120 if (min_profitable_iters
<= 0)
5121 min_profitable_iters
= 0;
5124 min_profitable_iters
/= saving_per_viter
;
5126 if ((scalar_single_iter_cost
* assumed_vf
* min_profitable_iters
)
5127 <= (((int) vec_inside_cost
* min_profitable_iters
)
5128 + (((int) vec_outside_cost
- scalar_outside_cost
)
5130 min_profitable_iters
++;
5134 if (dump_enabled_p ())
5135 dump_printf (MSG_NOTE
,
5136 " Calculated minimum iters for profitability: %d\n",
5137 min_profitable_iters
);
5139 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
)
5140 && min_profitable_iters
< (assumed_vf
+ peel_iters_prologue
))
5141 /* We want the vectorized loop to execute at least once. */
5142 min_profitable_iters
= assumed_vf
+ peel_iters_prologue
;
5143 else if (min_profitable_iters
< peel_iters_prologue
)
5144 /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
5145 vectorized loop executes at least once. */
5146 min_profitable_iters
= peel_iters_prologue
;
5148 if (dump_enabled_p ())
5149 dump_printf_loc (MSG_NOTE
, vect_location
,
5150 " Runtime profitability threshold = %d\n",
5151 min_profitable_iters
);
5153 *ret_min_profitable_niters
= min_profitable_iters
;
5155 /* Calculate number of iterations required to make the vector version
5156 profitable, relative to the loop bodies only.
5158 Non-vectorized variant is SIC * niters and it must win over vector
5159 variant on the expected loop trip count. The following condition must hold true:
5160 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
5162 if (vec_outside_cost
<= 0)
5163 min_profitable_estimate
= 0;
5164 /* ??? This "else if" arm is written to handle all cases; see below for
5165 what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
5166 else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
))
      /* This is a repeat of the code above, but with + SOC rather
	 than - SOC.  */
5170 int outside_overhead
= (vec_outside_cost
5171 - scalar_single_iter_cost
* peel_iters_prologue
5172 - scalar_single_iter_cost
* peel_iters_epilogue
5173 + scalar_outside_cost
);
5174 int min_vec_niters
= 1;
5175 if (outside_overhead
> 0)
5176 min_vec_niters
= outside_overhead
/ saving_per_viter
+ 1;
5178 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
))
5180 int threshold
= (vec_inside_cost
* min_vec_niters
5182 + scalar_outside_cost
);
5183 min_profitable_estimate
= threshold
/ scalar_single_iter_cost
+ 1;
5186 min_profitable_estimate
= (min_vec_niters
* assumed_vf
5187 + peel_iters_prologue
5188 + peel_iters_epilogue
);
5192 min_profitable_estimate
= ((vec_outside_cost
+ scalar_outside_cost
)
5194 - vec_inside_cost
* peel_iters_prologue
5195 - vec_inside_cost
* peel_iters_epilogue
)
5196 / ((scalar_single_iter_cost
* assumed_vf
)
5199 min_profitable_estimate
= MAX (min_profitable_estimate
, min_profitable_iters
);
5200 if (dump_enabled_p ())
5201 dump_printf_loc (MSG_NOTE
, vect_location
,
5202 " Static estimate profitability threshold = %d\n",
5203 min_profitable_estimate
);
5205 *ret_min_profitable_estimate
= min_profitable_estimate
;
5208 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
5209 vector elements (not bits) for a vector with NELT elements. */
5211 calc_vec_perm_mask_for_shift (unsigned int offset
, unsigned int nelt
,
5212 vec_perm_builder
*sel
)
5214 /* The encoding is a single stepped pattern. Any wrap-around is handled
5215 by vec_perm_indices. */
5216 sel
->new_vector (nelt
, 1, 3);
5217 for (unsigned int i
= 0; i
< 3; i
++)
5218 sel
->quick_push (i
+ offset
);
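
/* Illustrative example (numbers are assumptions): for NELT = 8 and
   OFFSET = 2 the three encoded elements are {2, 3, 4}; vec_perm_indices
   extends the stepped pattern to {2, 3, 4, 5, 6, 7, 8, 9}, i.e. the
   selection that shifts the vector down by two elements.  */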
5221 /* Checks whether the target supports whole-vector shifts for vectors of mode
5222 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
5223 it supports vec_perm_const with masks for all necessary shift amounts. */
5225 have_whole_vector_shift (machine_mode mode
)
5227 if (optab_handler (vec_shr_optab
, mode
) != CODE_FOR_nothing
)
5230 /* Variable-length vectors should be handled via the optab. */
5232 if (!GET_MODE_NUNITS (mode
).is_constant (&nelt
))
5235 vec_perm_builder sel
;
5236 vec_perm_indices indices
;
5237 for (unsigned int i
= nelt
/ 2; i
>= 1; i
/= 2)
5239 calc_vec_perm_mask_for_shift (i
, nelt
, &sel
);
5240 indices
.new_vector (sel
, 2, nelt
);
5241 if (!can_vec_perm_const_p (mode
, mode
, indices
, false))
5247 /* Return true if (a) STMT_INFO is a DOT_PROD_EXPR reduction whose
5248 multiplication operands have differing signs and (b) we intend
5249 to emulate the operation using a series of signed DOT_PROD_EXPRs.
5250 See vect_emulate_mixed_dot_prod for the actual sequence used. */
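
/* Illustrative case (an assumption for exposition): a dot product such as

     int acc = 0;
     for (int i = 0; i < n; i++)
       acc += (int) s8[i] * (int) u8[i];

   has mixed-sign multiplication operands; if the target only supports the
   signed form, the operation is emulated with a series of signed
   DOT_PROD_EXPRs as described above.  */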
5253 vect_is_emulated_mixed_dot_prod (stmt_vec_info stmt_info
)
5255 gassign
*assign
= dyn_cast
<gassign
*> (stmt_info
->stmt
);
5256 if (!assign
|| gimple_assign_rhs_code (assign
) != DOT_PROD_EXPR
)
5259 tree rhs1
= gimple_assign_rhs1 (assign
);
5260 tree rhs2
= gimple_assign_rhs2 (assign
);
5261 if (TYPE_SIGN (TREE_TYPE (rhs1
)) == TYPE_SIGN (TREE_TYPE (rhs2
)))
5264 gcc_assert (STMT_VINFO_REDUC_VECTYPE_IN (stmt_info
));
5265 return !directly_supported_p (DOT_PROD_EXPR
,
5266 STMT_VINFO_VECTYPE (stmt_info
),
5267 STMT_VINFO_REDUC_VECTYPE_IN (stmt_info
),
5268 optab_vector_mixed_sign
);
5271 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
5272 functions. Design better to avoid maintenance issues. */
5274 /* Function vect_model_reduction_cost.
5276 Models cost for a reduction operation, including the vector ops
5277 generated within the strip-mine loop in some cases, the initial
5278 definition before the loop, and the epilogue code that must be generated. */
5281 vect_model_reduction_cost (loop_vec_info loop_vinfo
,
5282 stmt_vec_info stmt_info
, internal_fn reduc_fn
,
5283 vect_reduction_type reduction_type
,
5284 int ncopies
, stmt_vector_for_cost
*cost_vec
)
5286 int prologue_cost
= 0, epilogue_cost
= 0, inside_cost
= 0;
5289 class loop
*loop
= NULL
;
5292 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
5294 /* Condition reductions generate two reductions in the loop. */
5295 if (reduction_type
== COND_REDUCTION
)
5298 vectype
= STMT_VINFO_VECTYPE (stmt_info
);
5299 mode
= TYPE_MODE (vectype
);
5300 stmt_vec_info orig_stmt_info
= vect_orig_stmt (stmt_info
);
5303 if (!gimple_extract_op (orig_stmt_info
->stmt
, &op
))
5306 if (reduction_type
== EXTRACT_LAST_REDUCTION
)
5307 /* No extra instructions are needed in the prologue. The loop body
5308 operations are costed in vectorizable_condition. */
5310 else if (reduction_type
== FOLD_LEFT_REDUCTION
)
5312 /* No extra instructions needed in the prologue. */
5315 if (reduc_fn
!= IFN_LAST
)
5316 /* Count one reduction-like operation per vector. */
5317 inside_cost
= record_stmt_cost (cost_vec
, ncopies
, vec_to_scalar
,
5318 stmt_info
, 0, vect_body
);
5321 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
5322 unsigned int nelements
= ncopies
* vect_nunits_for_cost (vectype
);
5323 inside_cost
= record_stmt_cost (cost_vec
, nelements
,
5324 vec_to_scalar
, stmt_info
, 0,
5326 inside_cost
+= record_stmt_cost (cost_vec
, nelements
,
5327 scalar_stmt
, stmt_info
, 0,
5333 /* Add in the cost of the initial definitions. */
5335 if (reduction_type
== COND_REDUCTION
)
5336 /* For cond reductions we have four vectors: initial index, step,
5337 initial result of the data reduction, initial value of the index
5341 /* We need the initial reduction value. */
5343 prologue_cost
+= record_stmt_cost (cost_vec
, prologue_stmts
,
5344 scalar_to_vec
, stmt_info
, 0,
5348 /* Determine cost of epilogue code.
5350 We have a reduction operator that will reduce the vector in one statement.
5351 Also requires scalar extract. */
5353 if (!loop
|| !nested_in_vect_loop_p (loop
, orig_stmt_info
))
5355 if (reduc_fn
!= IFN_LAST
)
5357 if (reduction_type
== COND_REDUCTION
)
	      /* An EQ stmt and a COND_EXPR stmt.  */
5360 epilogue_cost
+= record_stmt_cost (cost_vec
, 2,
5361 vector_stmt
, stmt_info
, 0,
	      /* Reduction of the max index and a reduction of the found
		 values.  */
5365 epilogue_cost
+= record_stmt_cost (cost_vec
, 2,
5366 vec_to_scalar
, stmt_info
, 0,
5368 /* A broadcast of the max value. */
5369 epilogue_cost
+= record_stmt_cost (cost_vec
, 1,
5370 scalar_to_vec
, stmt_info
, 0,
5375 epilogue_cost
+= record_stmt_cost (cost_vec
, 1, vector_stmt
,
5376 stmt_info
, 0, vect_epilogue
);
5377 epilogue_cost
+= record_stmt_cost (cost_vec
, 1,
5378 vec_to_scalar
, stmt_info
, 0,
5382 else if (reduction_type
== COND_REDUCTION
)
5384 unsigned estimated_nunits
= vect_nunits_for_cost (vectype
);
5385 /* Extraction of scalar elements. */
5386 epilogue_cost
+= record_stmt_cost (cost_vec
,
5387 2 * estimated_nunits
,
5388 vec_to_scalar
, stmt_info
, 0,
5390 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
5391 epilogue_cost
+= record_stmt_cost (cost_vec
,
5392 2 * estimated_nunits
- 3,
5393 scalar_stmt
, stmt_info
, 0,
5396 else if (reduction_type
== EXTRACT_LAST_REDUCTION
5397 || reduction_type
== FOLD_LEFT_REDUCTION
)
    /* No extra instructions are needed in the epilogue.  */
5402 int vec_size_in_bits
= tree_to_uhwi (TYPE_SIZE (vectype
));
5403 tree bitsize
= TYPE_SIZE (op
.type
);
5404 int element_bitsize
= tree_to_uhwi (bitsize
);
5405 int nelements
= vec_size_in_bits
/ element_bitsize
;
5407 if (op
.code
== COND_EXPR
)
5410 /* We have a whole vector shift available. */
5411 if (VECTOR_MODE_P (mode
)
5412 && directly_supported_p (op
.code
, vectype
)
5413 && have_whole_vector_shift (mode
))
5415 /* Final reduction via vector shifts and the reduction operator.
5416 Also requires scalar extract. */
5417 epilogue_cost
+= record_stmt_cost (cost_vec
,
5418 exact_log2 (nelements
) * 2,
5419 vector_stmt
, stmt_info
, 0,
5421 epilogue_cost
+= record_stmt_cost (cost_vec
, 1,
5422 vec_to_scalar
, stmt_info
, 0,
5426 /* Use extracts and reduction op for final reduction. For N
5427 elements, we have N extracts and N-1 reduction ops. */
5428 epilogue_cost
+= record_stmt_cost (cost_vec
,
5429 nelements
+ nelements
- 1,
5430 vector_stmt
, stmt_info
, 0,
5435 if (dump_enabled_p ())
5436 dump_printf (MSG_NOTE
,
5437 "vect_model_reduction_cost: inside_cost = %d, "
5438 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost
,
5439 prologue_cost
, epilogue_cost
);
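/* A worked example of the epilogue arithmetic above (illustration only):
   for a 4-element vector with no target reduction function, the
   whole-vector-shift scheme is charged exact_log2 (4) * 2 = 4 vector
   statements plus one extract, while the extract-based fallback is charged
   4 + 4 - 1 = 7 vector statements.  */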
/* SEQ is a sequence of instructions that initialize the reduction
   described by REDUC_INFO.  Emit them in the appropriate place.  */

static void
vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
                                stmt_vec_info reduc_info, gimple *seq)
{
  if (reduc_info->reused_accumulator)
    {
      /* When reusing an accumulator from the main loop, we only need
         initialization instructions if the main loop can be skipped.
         In that case, emit the initialization instructions at the end
         of the guard block that does the skip.  */
      edge skip_edge = loop_vinfo->skip_main_loop_edge;
      gcc_assert (skip_edge);
      gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
      gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
    }
  else
    {
      /* The normal case: emit the initialization instructions on the
         preheader edge.  */
      class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
      gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
    }
}
/* Function get_initial_def_for_reduction

   Input:
   REDUC_INFO - the info_for_reduction
   INIT_VAL - the initial value of the reduction variable
   NEUTRAL_OP - a value that has no effect on the reduction, as per
                neutral_op_for_reduction

   Output:
   Return a vector variable, initialized according to the operation that
   STMT_VINFO performs.  This vector will be used as the initial value
   of the vector of partial results.

   The value we need is a vector in which element 0 has value INIT_VAL
   and every other element has value NEUTRAL_OP.  */

static tree
get_initial_def_for_reduction (loop_vec_info loop_vinfo,
                               stmt_vec_info reduc_info,
                               tree init_val, tree neutral_op)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree scalar_type = TREE_TYPE (init_val);
  tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
  tree init_def;
  gimple_seq stmts = NULL;

  gcc_assert (vectype);

  gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
              || SCALAR_FLOAT_TYPE_P (scalar_type));

  gcc_assert (nested_in_vect_loop_p (loop, reduc_info)
              || loop == (gimple_bb (reduc_info->stmt))->loop_father);

  if (operand_equal_p (init_val, neutral_op))
    {
      /* If both elements are equal then the vector described above is
         just a splat.  */
      neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
      init_def = gimple_build_vector_from_val (&stmts, vectype, neutral_op);
    }
  else
    {
      neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
      init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
      if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
        {
          /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into
             element 0.  */
          init_def = gimple_build_vector_from_val (&stmts, vectype,
                                                   neutral_op);
          init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
                                   vectype, init_def, init_val);
        }
      else
        {
          /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}.  */
          tree_vector_builder elts (vectype, 1, 2);
          elts.quick_push (init_val);
          elts.quick_push (neutral_op);
          init_def = gimple_build_vector (&stmts, &elts);
        }
    }

  if (stmts)
    vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts);
  return init_def;
}
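/* For example (a worked case of the function above): for a PLUS reduction
   with INIT_VAL x the neutral value is 0, so for a 4-element vector the
   initial def built here is {x, 0, 0, 0}; for a MIN or MAX reduction the
   initial value is itself the neutral value, so the result is simply the
   splat {x, x, x, x}.  */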
/* Get at the initial defs for the reduction PHIs for REDUC_INFO,
   which performs a reduction involving GROUP_SIZE scalar statements.
   NUMBER_OF_VECTORS is the number of vector defs to create.  If NEUTRAL_OP
   is nonnull, introducing extra elements of that value will not change the
   result.  */

static void
get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
                                stmt_vec_info reduc_info,
                                vec<tree> *vec_oprnds,
                                unsigned int number_of_vectors,
                                unsigned int group_size, tree neutral_op)
{
  vec<tree> &initial_values = reduc_info->reduc_initial_values;
  unsigned HOST_WIDE_INT nunits;
  unsigned j, number_of_places_left_in_vector;
  tree vector_type = STMT_VINFO_VECTYPE (reduc_info);

  gcc_assert (group_size == initial_values.length () || neutral_op);

  /* NUMBER_OF_COPIES is the number of times we need to use the same values in
     created vectors.  It is greater than 1 if unrolling is performed.

     For example, we have two scalar operands, s1 and s2 (e.g., group of
     strided accesses of size two), while NUNITS is four (i.e., four scalars
     of this type can be packed in a vector).  The output vector will contain
     two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
     will be 2).

     If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
     vectors containing the operands.

     For example, NUNITS is four as before, and the group size is 8
     (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
     {s5, s6, s7, s8}.  */

  if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
    nunits = group_size;

  number_of_places_left_in_vector = nunits;
  bool constant_p = true;
  tree_vector_builder elts (vector_type, nunits, 1);
  elts.quick_grow (nunits);
  gimple_seq ctor_seq = NULL;
  if (neutral_op
      && !useless_type_conversion_p (TREE_TYPE (vector_type),
                                     TREE_TYPE (neutral_op)))
    neutral_op = gimple_convert (&ctor_seq,
                                 TREE_TYPE (vector_type),
                                 neutral_op);
  for (j = 0; j < nunits * number_of_vectors; ++j)
    {
      tree op;
      unsigned i = j % group_size;

      /* Get the def before the loop.  In reduction chain we have only
         one initial value.  Else we have as many as PHIs in the group.  */
      if (i >= initial_values.length () || (j > i && neutral_op))
        op = neutral_op;
      else
        {
          if (!useless_type_conversion_p (TREE_TYPE (vector_type),
                                          TREE_TYPE (initial_values[i])))
            initial_values[i] = gimple_convert (&ctor_seq,
                                                TREE_TYPE (vector_type),
                                                initial_values[i]);
          op = initial_values[i];
        }

      /* Create 'vect_ = {op0,op1,...,opn}'.  */
      number_of_places_left_in_vector--;
      elts[nunits - number_of_places_left_in_vector - 1] = op;
      if (!CONSTANT_CLASS_P (op))
        constant_p = false;

      if (number_of_places_left_in_vector == 0)
        {
          tree init;
          if (constant_p && !neutral_op
              ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
              : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
            /* Build the vector directly from ELTS.  */
            init = gimple_build_vector (&ctor_seq, &elts);
          else if (neutral_op)
            {
              /* Build a vector of the neutral value and shift the
                 other elements into place.  */
              init = gimple_build_vector_from_val (&ctor_seq, vector_type,
                                                   neutral_op);
              int k = nunits;
              while (k > 0 && operand_equal_p (elts[k - 1], neutral_op))
                k -= 1;
              while (k > 0)
                {
                  k -= 1;
                  init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
                                       vector_type, init, elts[k]);
                }
            }
          else
            {
              /* First time round, duplicate ELTS to fill the
                 required number of vectors.  */
              duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
                                        elts, number_of_vectors, *vec_oprnds);
              break;
            }
          vec_oprnds->quick_push (init);

          number_of_places_left_in_vector = nunits;
          elts.new_vector (vector_type, nunits, 1);
          elts.quick_grow (nunits);
          constant_p = true;
        }
    }
  if (ctor_seq != NULL)
    vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
}
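/* A small illustration of the CFN_VEC_SHL_INSERT path above (assuming a
   neutral value n and a filled ELTS of {a, b, n, n}): K first skips the
   trailing neutral elements, then the loop shift-inserts elts[1] = b and
   finally elts[0] = a into a splat of n, yielding {a, b, n, n, ...}
   regardless of the (possibly variable) number of vector elements.  */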
/* For a statement STMT_INFO taking part in a reduction operation return
   the stmt_vec_info the meta information is stored on.  */

stmt_vec_info
info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
{
  stmt_info = vect_orig_stmt (stmt_info);
  gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
  if (!is_a <gphi *> (stmt_info->stmt)
      || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
    stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
  gphi *phi = as_a <gphi *> (stmt_info->stmt);
  if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
    {
      if (gimple_phi_num_args (phi) == 1)
        stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
    }
  else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
    {
      stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi));
      if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
        stmt_info = info;
    }
  return stmt_info;
}
/* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
   REDUC_INFO can build on.  Adjust REDUC_INFO and return true if so, otherwise
   return false.  */

static bool
vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
                                stmt_vec_info reduc_info)
{
  loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
  if (!main_loop_vinfo)
    return false;

  if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
    return false;

  unsigned int num_phis = reduc_info->reduc_initial_values.length ();
  auto_vec<tree, 16> main_loop_results (num_phis);
  auto_vec<tree, 16> initial_values (num_phis);
  if (edge main_loop_edge = loop_vinfo->main_loop_edge)
    {
      /* The epilogue loop can be entered either from the main loop or
         from an earlier guard block.  */
      edge skip_edge = loop_vinfo->skip_main_loop_edge;
      for (tree incoming_value : reduc_info->reduc_initial_values)
        {
          /* Look for:

               INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
                                    INITIAL_VALUE(guard block)>.  */
          gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);

          gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
          gcc_assert (gimple_bb (phi) == main_loop_edge->dest);

          tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
          tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);

          main_loop_results.quick_push (from_main_loop);
          initial_values.quick_push (from_skip);
        }
    }
  else
    /* The main loop dominates the epilogue loop.  */
    main_loop_results.splice (reduc_info->reduc_initial_values);

  /* See if the main loop has the kind of accumulator we need.  */
  vect_reusable_accumulator *accumulator
    = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
  if (!accumulator
      || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
      || !std::equal (main_loop_results.begin (), main_loop_results.end (),
                      accumulator->reduc_info->reduc_scalar_results.begin ()))
    return false;

  /* Handle the case where we can reduce wider vectors to narrower ones.  */
  tree vectype = STMT_VINFO_VECTYPE (reduc_info);
  tree old_vectype = TREE_TYPE (accumulator->reduc_input);
  unsigned HOST_WIDE_INT m;
  if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
                            TYPE_VECTOR_SUBPARTS (vectype), &m))
    return false;
  /* Check the intermediate vector types and operations are available.  */
  tree prev_vectype = old_vectype;
  poly_uint64 intermediate_nunits = TYPE_VECTOR_SUBPARTS (old_vectype);
  while (known_gt (intermediate_nunits, TYPE_VECTOR_SUBPARTS (vectype)))
    {
      intermediate_nunits = exact_div (intermediate_nunits, 2);
      tree intermediate_vectype = get_related_vectype_for_scalar_type
        (TYPE_MODE (vectype), TREE_TYPE (vectype), intermediate_nunits);
      if (!intermediate_vectype
          || !directly_supported_p (STMT_VINFO_REDUC_CODE (reduc_info),
                                    intermediate_vectype)
          || !can_vec_extract (TYPE_MODE (prev_vectype),
                               TYPE_MODE (intermediate_vectype)))
        return false;
      prev_vectype = intermediate_vectype;
    }

  /* Non-SLP reductions might apply an adjustment after the reduction
     operation, in order to simplify the initialization of the accumulator.
     If the epilogue loop carries on from where the main loop left off,
     it should apply the same adjustment to the final reduction result.

     If the epilogue loop can also be entered directly (rather than via
     the main loop), we need to be able to handle that case in the same way,
     with the same adjustment.  (In principle we could add a PHI node
     to select the correct adjustment, but in practice that shouldn't be
     necessary.)  */
  tree main_adjustment
    = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
  if (loop_vinfo->main_loop_edge && main_adjustment)
    {
      gcc_assert (num_phis == 1);
      tree initial_value = initial_values[0];
      /* Check that we can use INITIAL_VALUE as the adjustment and
         initialize the accumulator with a neutral value instead.  */
      if (!operand_equal_p (initial_value, main_adjustment))
        return false;
      code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
      initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
                                                    code, initial_value);
    }
  STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
  reduc_info->reduc_initial_values.truncate (0);
  reduc_info->reduc_initial_values.splice (initial_values);
  reduc_info->reused_accumulator = accumulator;
  return true;
}
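/* For instance (an illustrative case): if the main loop accumulated in a
   V8SI vector and the epilogue uses V4SI, M is 2 and the loop above checks
   that a V4SI form of the reduction operation and a V8SI -> V4SI vector
   extract are both supported before the accumulator is reused.  */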
/* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
   CODE emitting stmts before GSI.  Returns a vector def of VECTYPE.  */

static tree
vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
                            gimple_seq *seq)
{
  unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
  unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
  tree stype = TREE_TYPE (vectype);
  tree new_temp = vec_def;
  while (nunits > nunits1)
    {
      nunits /= 2;
      tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
                                                           stype, nunits);
      unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));

      /* The target has to make sure we support lowpart/highpart
         extraction, either via direct vector extract or through
         an integer mode punning.  */
      tree dst1, dst2;
      gimple *epilog_stmt;
      if (convert_optab_handler (vec_extract_optab,
                                 TYPE_MODE (TREE_TYPE (new_temp)),
                                 TYPE_MODE (vectype1))
          != CODE_FOR_nothing)
        {
          /* Extract sub-vectors directly once vec_extract becomes
             a conversion optab.  */
          dst1 = make_ssa_name (vectype1);
          epilog_stmt
              = gimple_build_assign (dst1, BIT_FIELD_REF,
                                     build3 (BIT_FIELD_REF, vectype1,
                                             new_temp, TYPE_SIZE (vectype1),
                                             bitsize_int (0)));
          gimple_seq_add_stmt_without_update (seq, epilog_stmt);
          dst2 = make_ssa_name (vectype1);
          epilog_stmt
              = gimple_build_assign (dst2, BIT_FIELD_REF,
                                     build3 (BIT_FIELD_REF, vectype1,
                                             new_temp, TYPE_SIZE (vectype1),
                                             bitsize_int (bitsize)));
          gimple_seq_add_stmt_without_update (seq, epilog_stmt);
        }
      else
        {
          /* Extract via punning to appropriately sized integer mode
             vector.  */
          tree eltype = build_nonstandard_integer_type (bitsize, 1);
          tree etype = build_vector_type (eltype, 2);
          gcc_assert (convert_optab_handler (vec_extract_optab,
                                             TYPE_MODE (etype),
                                             TYPE_MODE (eltype))
                      != CODE_FOR_nothing);
          tree tem = make_ssa_name (etype);
          epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
                                             build1 (VIEW_CONVERT_EXPR,
                                                     etype, new_temp));
          gimple_seq_add_stmt_without_update (seq, epilog_stmt);
          new_temp = tem;
          tem = make_ssa_name (eltype);
          epilog_stmt
              = gimple_build_assign (tem, BIT_FIELD_REF,
                                     build3 (BIT_FIELD_REF, eltype,
                                             new_temp, TYPE_SIZE (eltype),
                                             bitsize_int (0)));
          gimple_seq_add_stmt_without_update (seq, epilog_stmt);
          dst1 = make_ssa_name (vectype1);
          epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
                                             build1 (VIEW_CONVERT_EXPR,
                                                     vectype1, tem));
          gimple_seq_add_stmt_without_update (seq, epilog_stmt);
          tem = make_ssa_name (eltype);
          epilog_stmt
              = gimple_build_assign (tem, BIT_FIELD_REF,
                                     build3 (BIT_FIELD_REF, eltype,
                                             new_temp, TYPE_SIZE (eltype),
                                             bitsize_int (bitsize)));
          gimple_seq_add_stmt_without_update (seq, epilog_stmt);
          dst2 = make_ssa_name (vectype1);
          epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
                                             build1 (VIEW_CONVERT_EXPR,
                                                     vectype1, tem));
          gimple_seq_add_stmt_without_update (seq, epilog_stmt);
        }

      new_temp = gimple_build (seq, code, vectype1, dst1, dst2);
    }

  return new_temp;
}
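/* For example (a sketch of the loop above): reducing a V8SI def to V4SI
   with PLUS extracts the low and high V4SI halves of the V8SI value
   (directly via vec_extract, or by punning through a two-element integer
   vector) and adds them; a V16SI input simply goes through the loop twice,
   halving the width each time.  */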
/* Function vect_create_epilog_for_reduction

   Create code at the loop-epilog to finalize the result of a reduction
   computation.

   STMT_INFO is the scalar reduction stmt that is being vectorized.
   SLP_NODE is an SLP node containing a group of reduction statements. The
     first one in this group is STMT_INFO.
   SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
   REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
     (counting from 0)
   LOOP_EXIT is the edge to update in the merge block.  In the case of a single
     exit this edge is always the main loop exit.

   This function:
   1. Completes the reduction def-use cycles.
   2. "Reduces" each vector of partial results VECT_DEFS into a single result,
      by calling the function specified by REDUC_FN if available, or by
      other means (whole-vector shifts or a scalar loop).
      The function also creates a new phi node at the loop exit to preserve
      loop-closed form, as illustrated below.

   The flow at the entry to this function:

        loop:
          vec_def = phi <vec_init, null>    # REDUCTION_PHI
          VECT_DEF = vector_stmt            # vectorized form of STMT_INFO
          s_loop = scalar_stmt              # (scalar) STMT_INFO
        loop_exit:
          s_out0 = phi <s_loop>             # (scalar) EXIT_PHI
          use <s_out0>
          use <s_out0>

   The above is transformed by this function into:

        loop:
          vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
          VECT_DEF = vector_stmt             # vectorized form of STMT_INFO
          s_loop = scalar_stmt               # (scalar) STMT_INFO
        loop_exit:
          s_out0 = phi <s_loop>              # (scalar) EXIT_PHI
          v_out1 = phi <VECT_DEF>            # NEW_EXIT_PHI
          v_out2 = reduce <v_out1>
          s_out3 = extract_field <v_out2, 0>
          s_out4 = adjust_result <s_out3>
          use <s_out4>
          use <s_out4>  */
static void
vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
                                  stmt_vec_info stmt_info,
                                  slp_tree slp_node,
                                  slp_instance slp_node_instance,
                                  edge loop_exit)
{
  stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
  gcc_assert (reduc_info->is_reduc_info);
  /* For double reductions we need to get at the inner loop reduction
     stmt which has the meta info attached.  Our stmt_info is that of the
     loop-closed PHI of the inner loop which we remember as
     def for the reduction PHI generation.  */
  bool double_reduc = false;
  stmt_vec_info rdef_info = stmt_info;
  if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
    {
      double_reduc = true;
      stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
                                            (stmt_info->stmt, 0));
      stmt_info = vect_stmt_to_vectorize (stmt_info);
    }
  code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
  internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
  tree vectype;
  machine_mode mode;
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
  basic_block exit_bb;
  tree scalar_dest;
  tree scalar_type;
  gimple *new_phi = NULL, *phi = NULL;
  gimple_stmt_iterator exit_gsi;
  tree new_temp = NULL_TREE, new_name, new_scalar_dest;
  gimple *epilog_stmt = NULL;
  gimple *exit_phi;
  tree bitsize;
  tree def;
  tree orig_name, scalar_result;
  imm_use_iterator imm_iter, phi_imm_iter;
  use_operand_p use_p, phi_use_p;
  gimple *use_stmt;
  auto_vec<tree> reduc_inputs;
  int j, i;
  vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
  unsigned int group_size = 1, k;
  /* SLP reduction without reduction chain, e.g.,
     # a1 = phi <a2, a0>
     # b1 = phi <b2, b0>
     a2 = operation (a1)
     b2 = operation (b1)  */
  bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
  bool direct_slp_reduc;
  tree induction_index = NULL_TREE;

  if (slp_node)
    group_size = SLP_TREE_LANES (slp_node);

  if (nested_in_vect_loop_p (loop, stmt_info))
    {
      outer_loop = loop;
      loop = loop->inner;
      gcc_assert (double_reduc);
    }

  vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
  gcc_assert (vectype);
  mode = TYPE_MODE (vectype);

  tree induc_val = NULL_TREE;
  tree adjustment_def = NULL;
  /* Optimize: for induction condition reduction, if we can't use zero
     for induc_val, use initial_def.  */
  if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
    induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
  else if (double_reduc)
    ;
  else
    adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);

  stmt_vec_info single_live_out_stmt[] = { stmt_info };
  array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
  if (slp_reduc)
    /* All statements produce live-out values.  */
    live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);

  unsigned vec_num;
  int ncopies;
  if (slp_node)
    {
      vec_num = SLP_TREE_VEC_DEFS (slp_node_instance->reduc_phis).length ();
      ncopies = 1;
    }
  else
    {
      vec_num = 1;
      ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
    }

  /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
     which is updated with the current index of the loop for every match of
     the original loop's cond_expr (VEC_STMT).  This results in a vector
     containing the last time the condition passed for that vector lane.
     The first match will be a 1 to allow 0 to be used for non-matching
     indexes.  If there are no matches at all then the vector will be all
     zeroes.

     PR92772: This algorithm is broken for architectures that support
     masked vectors, but do not provide fold_extract_last.  */
  if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
    {
      auto_vec<std::pair<tree, bool>, 2> ccompares;
      if (slp_node)
        {
          slp_tree cond_node = slp_node_instance->root;
          while (cond_node != slp_node_instance->reduc_phis)
            {
              stmt_vec_info cond_info = SLP_TREE_REPRESENTATIVE (cond_node);
              int slp_reduc_idx;
              if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
                {
                  gimple *vec_stmt
                    = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (cond_node)[0]);
                  gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
                  ccompares.safe_push
                    (std::make_pair (gimple_assign_rhs1 (vec_stmt),
                                     STMT_VINFO_REDUC_IDX (cond_info) == 2));
                  /* ??? We probably want to have REDUC_IDX on the SLP node?
                     We have both three and four children COND_EXPR nodes
                     dependent on whether the comparison is still embedded
                     as GENERIC.  So work backwards.  */
                  slp_reduc_idx = (SLP_TREE_CHILDREN (cond_node).length () - 3
                                   + STMT_VINFO_REDUC_IDX (cond_info));
                }
              else
                slp_reduc_idx = STMT_VINFO_REDUC_IDX (cond_info);
              cond_node = SLP_TREE_CHILDREN (cond_node)[slp_reduc_idx];
            }
        }
      else
        {
          stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
          cond_info = vect_stmt_to_vectorize (cond_info);
          while (cond_info != reduc_info)
            {
              if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
                {
                  gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
                  gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
                  ccompares.safe_push
                    (std::make_pair (gimple_assign_rhs1 (vec_stmt),
                                     STMT_VINFO_REDUC_IDX (cond_info) == 2));
                }
              cond_info
                = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
                                                     1 + STMT_VINFO_REDUC_IDX
                                                           (cond_info)));
              cond_info = vect_stmt_to_vectorize (cond_info);
            }
        }
      gcc_assert (ccompares.length () != 0);

      tree indx_before_incr, indx_after_incr;
      poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
      int scalar_precision
        = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
      tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
      tree cr_index_vector_type = get_related_vectype_for_scalar_type
        (TYPE_MODE (vectype), cr_index_scalar_type,
         TYPE_VECTOR_SUBPARTS (vectype));

      /* First we create a simple vector induction variable which starts
         with the values {1,2,3,...} (SERIES_VECT) and increments by the
         vector size (STEP).  */

      /* Create a {1,2,3,...} vector.  */
      tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);

      /* Create a vector of the step value.  */
      tree step = build_int_cst (cr_index_scalar_type, nunits_out);
      tree vec_step = build_vector_from_val (cr_index_vector_type, step);

      /* Create an induction variable.  */
      gimple_stmt_iterator incr_gsi;
      bool insert_after;
      vect_iv_increment_position (loop_exit, &incr_gsi, &insert_after);
      create_iv (series_vect, PLUS_EXPR, vec_step, NULL_TREE, loop, &incr_gsi,
                 insert_after, &indx_before_incr, &indx_after_incr);

      /* Next create a new phi node vector (NEW_PHI_TREE) which starts
         filled with zeros (VEC_ZERO).  */

      /* Create a vector of 0s.  */
      tree zero = build_zero_cst (cr_index_scalar_type);
      tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);

      /* Create a vector phi node.  */
      tree new_phi_tree = make_ssa_name (cr_index_vector_type);
      new_phi = create_phi_node (new_phi_tree, loop->header);
      add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
                   loop_preheader_edge (loop), UNKNOWN_LOCATION);

      /* Now take the condition from the loops original cond_exprs
         and produce a new cond_exprs (INDEX_COND_EXPR) which for
         every match uses values from the induction variable
         (INDEX_BEFORE_INCR) otherwise uses values from the phi node
         (NEW_PHI_TREE).
         Finally, we update the phi (NEW_PHI_TREE) to take the value of
         the new cond_expr (INDEX_COND_EXPR).  */
      gimple_seq stmts = NULL;
      for (int i = ccompares.length () - 1; i != -1; --i)
        {
          tree ccompare = ccompares[i].first;
          if (ccompares[i].second)
            new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
                                         cr_index_vector_type,
                                         ccompare,
                                         indx_before_incr, new_phi_tree);
          else
            new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
                                         cr_index_vector_type,
                                         ccompare,
                                         new_phi_tree, indx_before_incr);
        }
      gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);

      /* Update the phi with the vec cond.  */
      induction_index = new_phi_tree;
      add_phi_arg (as_a <gphi *> (new_phi), induction_index,
                   loop_latch_edge (loop), UNKNOWN_LOCATION);
    }
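  /* To illustrate the index vector built above (an example, assuming a
     4-lane vector and a single condition): with the IV producing
     {1, 2, 3, 4} in the first iteration, lanes whose condition matched
     take the IV value while the others keep the PHI value, so after an
     iteration in which only lane 2 matched the PHI carries {0, 0, 3, 0};
     the maximum over all lanes at the end is therefore the index of the
     last match, or 0 if there was none.  */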
  /* 2. Create epilog code.
        The reduction epilog code operates across the elements of the vector
        of partial results computed by the vectorized loop.
        The reduction epilog code consists of:

        step 1: compute the scalar result in a vector (v_out2)
        step 2: extract the scalar result (s_out3) from the vector (v_out2)
        step 3: adjust the scalar result (s_out3) if needed.

        Step 1 can be accomplished using one the following three schemes:
          (scheme 1) using reduc_fn, if available.
          (scheme 2) using whole-vector shifts, if available.
          (scheme 3) using a scalar loop. In this case steps 1+2 above are
                     combined.

        The overall epilog code looks like this:

          s_out0 = phi <s_loop>                 # original EXIT_PHI
          v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
          v_out2 = reduce <v_out1>              # step 1
          s_out3 = extract_field <v_out2, 0>    # step 2
          s_out4 = adjust_result <s_out3>       # step 3

          (step 3 is optional, and steps 1 and 2 may be combined).
          Lastly, the uses of s_out0 are replaced by s_out4.  */


  /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
         v_out1 = phi <VECT_DEF>
         Store them in NEW_PHIS.  */
  if (double_reduc)
    loop = outer_loop;
  /* We need to reduce values in all exits.  */
  exit_bb = loop_exit->dest;
  exit_gsi = gsi_after_labels (exit_bb);
  reduc_inputs.create (slp_node ? vec_num : ncopies);
  for (unsigned i = 0; i < vec_num; i++)
    {
      gimple_seq stmts = NULL;
      if (slp_node)
        def = vect_get_slp_vect_def (slp_node, i);
      else
        def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
      for (j = 0; j < ncopies; j++)
        {
          tree new_def = copy_ssa_name (def);
          phi = create_phi_node (new_def, exit_bb);
          if (j)
            def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
          if (LOOP_VINFO_IV_EXIT (loop_vinfo) == loop_exit)
            SET_PHI_ARG_DEF (phi, loop_exit->dest_idx, def);
          else
            for (unsigned k = 0; k < gimple_phi_num_args (phi); k++)
              SET_PHI_ARG_DEF (phi, k, def);
          new_def = gimple_convert (&stmts, vectype, new_def);
          reduc_inputs.quick_push (new_def);
        }
      gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
    }

  /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
         (i.e. when reduc_fn is not available) and in the final adjustment
         code (if needed).  Also get the original scalar reduction variable as
         defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
         represents a reduction pattern), the tree-code and scalar-def are
         taken from the original stmt that the pattern-stmt (STMT) replaces.
         Otherwise (it is a regular reduction) - the tree-code and scalar-def
         are taken from STMT.  */
  stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
  if (orig_stmt_info != stmt_info)
    {
      /* Reduction pattern  */
      gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
      gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
    }

  scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
  scalar_type = TREE_TYPE (scalar_dest);
  scalar_results.truncate (0);
  scalar_results.reserve_exact (group_size);
  new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
  bitsize = TYPE_SIZE (scalar_type);

  /* True if we should implement SLP_REDUC using native reduction operations
     instead of scalar operations.  */
  direct_slp_reduc = (reduc_fn != IFN_LAST
                      && slp_reduc
                      && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());

  /* In case of reduction chain, e.g.,
     # a1 = phi <a3, a0>
     a2 = operation (a1)
     a3 = operation (a2),

     we may end up with more than one vector result.  Here we reduce them
     to one vector.

     The same is true for a SLP reduction, e.g.,
     # a1 = phi <a2, a0>
     # b1 = phi <b2, b0>
     a2 = operation (a1)
     b2 = operation (a2),

     where we can end up with more than one vector as well.  We can
     easily accumulate vectors when the number of vector elements is
     a multiple of the SLP group size.

     The same is true if we couldn't use a single defuse cycle.  */
  if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
      || direct_slp_reduc
      || (slp_reduc
          && constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype), group_size))
      || ncopies > 1)
    {
      gimple_seq stmts = NULL;
      tree single_input = reduc_inputs[0];
      for (k = 1; k < reduc_inputs.length (); k++)
        single_input = gimple_build (&stmts, code, vectype,
                                     single_input, reduc_inputs[k]);
      gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);

      reduc_inputs.truncate (0);
      reduc_inputs.safe_push (single_input);
    }

  tree orig_reduc_input = reduc_inputs[0];

  /* If this loop is an epilogue loop that can be skipped after the
     main loop, we can only share a reduction operation between the
     main loop and the epilogue if we put it at the target of the
     skip edge.

     We can still reuse accumulators if this check fails.  Doing so has
     the minor(?) benefit of making the epilogue loop's scalar result
     independent of the main loop's scalar result.  */
  bool unify_with_main_loop_p = false;
  if (reduc_info->reused_accumulator
      && loop_vinfo->skip_this_loop_edge
      && single_succ_p (exit_bb)
      && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
    {
      unify_with_main_loop_p = true;

      basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
      reduc_inputs[0] = make_ssa_name (vectype);
      gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
      add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
                   UNKNOWN_LOCATION);
      add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
                   loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
      exit_gsi = gsi_after_labels (reduc_block);
    }

  /* Shouldn't be used beyond this point.  */
  exit_bb = nullptr;
  if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
      && reduc_fn != IFN_LAST)
    {
      /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
         various data values where the condition matched and another vector
         (INDUCTION_INDEX) containing all the indexes of those matches.  We
         need to extract the last matching index (which will be the index with
         highest value) and use this to index into the data vector.
         For the case where there were no matches, the data vector will contain
         all default values and the index vector will be all zeros.  */

      /* Get various versions of the type of the vector of indexes.  */
      tree index_vec_type = TREE_TYPE (induction_index);
      gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
      tree index_scalar_type = TREE_TYPE (index_vec_type);
      tree index_vec_cmp_type = truth_type_for (index_vec_type);

      /* Get an unsigned integer version of the type of the data vector.  */
      int scalar_precision
        = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
      tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
      tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
                                                      vectype);

      /* First we need to create a vector (ZERO_VEC) of zeros and another
         vector (MAX_INDEX_VEC) filled with the last matching index, which we
         can create using a MAX reduction and then expanding.
         In the case where the loop never made any matches, the max index will
         be zero.  */

      /* Vector of {0, 0, 0,...}.  */
      tree zero_vec = build_zero_cst (vectype);

      /* Find maximum value from the vector of found indexes.  */
      tree max_index = make_ssa_name (index_scalar_type);
      gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
                                                          1, induction_index);
      gimple_call_set_lhs (max_index_stmt, max_index);
      gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);

      /* Vector of {max_index, max_index, max_index,...}.  */
      tree max_index_vec = make_ssa_name (index_vec_type);
      tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
                                                      max_index);
      gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
                                                        max_index_vec_rhs);
      gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);

      /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
         with the vector (INDUCTION_INDEX) of found indexes, choosing values
         from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
         otherwise.  Only one value should match, resulting in a vector
         (VEC_COND) with one data value and the rest zeros.
         In the case where the loop never made any matches, every index will
         match, resulting in a vector with all data values (which will all be
         the default value).  */

      /* Compare the max index vector to the vector of found indexes to find
         the position of the max value.  */
      tree vec_compare = make_ssa_name (index_vec_cmp_type);
      gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
                                                      induction_index,
                                                      max_index_vec);
      gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);

      /* Use the compare to choose either values from the data vector or
         zero.  */
      tree vec_cond = make_ssa_name (vectype);
      gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
                                                   vec_compare,
                                                   reduc_inputs[0],
                                                   zero_vec);
      gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);

      /* Finally we need to extract the data value from the vector (VEC_COND)
         into a scalar (MATCHED_DATA_REDUC).  Logically we want to do a OR
         reduction, but because this doesn't exist, we can use a MAX reduction
         instead.  The data value might be signed or a float so we need to cast
         it first.
         In the case where the loop never made any matches, the data values are
         all identical, and so will reduce down correctly.  */

      /* Make the matched data values unsigned.  */
      tree vec_cond_cast = make_ssa_name (vectype_unsigned);
      tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
                                       vec_cond);
      gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
                                                        vec_cond_cast_rhs);
      gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);

      /* Reduce down to a scalar value.  */
      tree data_reduc = make_ssa_name (scalar_type_unsigned);
      gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
                                                           1, vec_cond_cast);
      gimple_call_set_lhs (data_reduc_stmt, data_reduc);
      gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);

      /* Convert the reduced value back to the result type and set as the
         result.  */
      gimple_seq stmts = NULL;
      new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
                               data_reduc);
      gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
      scalar_results.safe_push (new_temp);
    }
  else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
           && reduc_fn == IFN_LAST)
    {
      /* Condition reduction without supported IFN_REDUC_MAX.  Generate
         idx = 0;
         idx_val = induction_index[0];
         val = data_reduc[0];
         for (idx = 0, val = init, i = 0; i < nelts; ++i)
           if (induction_index[i] > idx_val)
             val = data_reduc[i], idx_val = induction_index[i];
         return val;  */

      tree data_eltype = TREE_TYPE (vectype);
      tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
      unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
      poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
      /* Enforced by vectorizable_reduction, which ensures we have target
         support before allowing a conditional reduction on variable-length
         vectors.  */
      unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
      tree idx_val = NULL_TREE, val = NULL_TREE;
      for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
        {
          tree old_idx_val = idx_val;
          tree old_val = val;
          idx_val = make_ssa_name (idx_eltype);
          epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
                                             build3 (BIT_FIELD_REF, idx_eltype,
                                                     induction_index,
                                                     bitsize_int (el_size),
                                                     bitsize_int (off)));
          gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
          val = make_ssa_name (data_eltype);
          epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
                                             build3 (BIT_FIELD_REF,
                                                     data_eltype,
                                                     reduc_inputs[0],
                                                     bitsize_int (el_size),
                                                     bitsize_int (off)));
          gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
          if (off != 0)
            {
              tree new_idx_val = idx_val;
              if (off != v_size - el_size)
                {
                  new_idx_val = make_ssa_name (idx_eltype);
                  epilog_stmt = gimple_build_assign (new_idx_val,
                                                     MAX_EXPR, idx_val,
                                                     old_idx_val);
                  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
                }
              tree cond = make_ssa_name (boolean_type_node);
              epilog_stmt = gimple_build_assign (cond, GT_EXPR,
                                                 idx_val, old_idx_val);
              gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
              tree new_val = make_ssa_name (data_eltype);
              epilog_stmt = gimple_build_assign (new_val, COND_EXPR,
                                                 cond, val, old_val);
              gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
              idx_val = new_idx_val;
              val = new_val;
            }
        }
      /* Convert the reduced value back to the result type and set as the
         result.  */
      gimple_seq stmts = NULL;
      val = gimple_convert (&stmts, scalar_type, val);
      gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
      scalar_results.safe_push (val);
    }
  /* 2.3 Create the reduction code, using one of the three schemes described
         above.  In SLP we simply need to extract all the elements from the
         vector (without reducing them), so we use scalar shifts.  */
  else if (reduc_fn != IFN_LAST && (!slp_reduc || group_size == 1))
    {
      tree tmp;
      tree vec_elem_type;

      /* Case 1:  Create:
         v_out2 = reduc_expr <v_out1>  */

      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "Reduce using direct vector reduction.\n");

      gimple_seq stmts = NULL;
      vec_elem_type = TREE_TYPE (vectype);
      new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
                               vec_elem_type, reduc_inputs[0]);
      new_temp = gimple_convert (&stmts, scalar_type, new_temp);
      gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);

      if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
          && induc_val)
        {
          /* Earlier we set the initial value to be a vector if induc_val
             values.  Check the result and if it is induc_val then replace
             with the original initial value, unless induc_val is
             the same as initial_def already.  */
          tree zcompare = make_ssa_name (boolean_type_node);
          epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
                                             new_temp, induc_val);
          gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
          tree initial_def = reduc_info->reduc_initial_values[0];
          tmp = make_ssa_name (new_scalar_dest);
          epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
                                             initial_def, new_temp);
          gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
          new_temp = tmp;
        }

      scalar_results.safe_push (new_temp);
    }
  else if (direct_slp_reduc)
    {
      /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
         with the elements for other SLP statements replaced with the
         neutral value.  We can then do a normal reduction on each vector.  */

      /* Enforced by vectorizable_reduction.  */
      gcc_assert (reduc_inputs.length () == 1);
      gcc_assert (pow2p_hwi (group_size));

      gimple_seq seq = NULL;

      /* Build a vector {0, 1, 2, ...}, with the same number of elements
         and the same element size as VECTYPE.  */
      tree index = build_index_vector (vectype, 0, 1);
      tree index_type = TREE_TYPE (index);
      tree index_elt_type = TREE_TYPE (index_type);
      tree mask_type = truth_type_for (index_type);

      /* Create a vector that, for each element, identifies which of
         the REDUC_GROUP_SIZE results should use it.  */
      tree index_mask = build_int_cst (index_elt_type, group_size - 1);
      index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
                            build_vector_from_val (index_type, index_mask));

      /* Get a neutral vector value.  This is simply a splat of the neutral
         scalar value if we have one, otherwise the initial scalar value
         is itself a neutral value.  */
      tree vector_identity = NULL_TREE;
      tree neutral_op = NULL_TREE;
      if (slp_node)
        {
          tree initial_value = NULL_TREE;
          if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
            initial_value = reduc_info->reduc_initial_values[0];
          neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
                                                 initial_value, false);
        }
      if (neutral_op)
        vector_identity = gimple_build_vector_from_val (&seq, vectype,
                                                        neutral_op);
      for (unsigned int i = 0; i < group_size; ++i)
        {
          /* If there's no univeral neutral value, we can use the
             initial scalar value from the original PHI.  This is used
             for MIN and MAX reduction, for example.  */
          if (!neutral_op)
            {
              tree scalar_value = reduc_info->reduc_initial_values[i];
              scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
                                             scalar_value);
              vector_identity = gimple_build_vector_from_val (&seq, vectype,
                                                              scalar_value);
            }

          /* Calculate the equivalent of:

             sel[j] = (index[j] == i);

             which selects the elements of REDUC_INPUTS[0] that should
             be included in the result.  */
          tree compare_val = build_int_cst (index_elt_type, i);
          compare_val = build_vector_from_val (index_type, compare_val);
          tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
                                   index, compare_val);

          /* Calculate the equivalent of:

             vec = seq ? reduc_inputs[0] : vector_identity;

             VEC is now suitable for a full vector reduction.  */
          tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
                                   sel, reduc_inputs[0], vector_identity);

          /* Do the reduction and convert it to the appropriate type.  */
          tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
                                      TREE_TYPE (vectype), vec);
          scalar = gimple_convert (&seq, scalar_type, scalar);
          scalar_results.safe_push (scalar);
        }
      gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
    }
  else
    {
      bool reduce_with_shift;
      tree vec_temp;

      gcc_assert (slp_reduc || reduc_inputs.length () == 1);

      /* See if the target wants to do the final (shift) reduction
         in a vector mode of smaller size and first reduce upper/lower
         halves against each other.  */
      enum machine_mode mode1 = mode;
      tree stype = TREE_TYPE (vectype);
      unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
      unsigned nunits1 = nunits;
      if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
          && reduc_inputs.length () == 1)
        {
          nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
          /* For SLP reductions we have to make sure lanes match up, but
             since we're doing individual element final reduction reducing
             vector width here is even more important.
             ??? We can also separate lanes with permutes, for the common
             case of power-of-two group-size odd/even extracts would work.  */
          if (slp_reduc && nunits != nunits1)
            {
              nunits1 = least_common_multiple (nunits1, group_size);
              gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
            }
        }
      if (!slp_reduc
          && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
        nunits1 = GET_MODE_NUNITS (mode1).to_constant ();

      tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
                                                           stype, nunits1);
      reduce_with_shift = have_whole_vector_shift (mode1);
      if (!VECTOR_MODE_P (mode1)
          || !directly_supported_p (code, vectype1))
        reduce_with_shift = false;

      /* First reduce the vector to the desired vector size we should
         do shift reduction on by combining upper and lower halves.  */
      gimple_seq stmts = NULL;
      new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
                                             code, &stmts);
      gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
      reduc_inputs[0] = new_temp;

      if (reduce_with_shift && (!slp_reduc || group_size == 1))
        {
          int element_bitsize = tree_to_uhwi (bitsize);
          /* Enforced by vectorizable_reduction, which disallows SLP reductions
             for variable-length vectors and also requires direct target support
             for loop reductions.  */
          int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
          int nelements = vec_size_in_bits / element_bitsize;
          vec_perm_builder sel;
          vec_perm_indices indices;
          int elt_offset;
          tree rhs;

          tree zero_vec = build_zero_cst (vectype1);
          /* Case 2: Create:
             for (offset = nelements/2; offset >= 1; offset/=2)
               {
                 Create:  va' = vec_shift <va, offset>
                 Create:  va = vop <va, va'>
               }  */

          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "Reduce using vector shifts\n");

          gimple_seq stmts = NULL;
          new_temp = gimple_convert (&stmts, vectype1, new_temp);
          for (elt_offset = nelements / 2;
               elt_offset >= 1;
               elt_offset /= 2)
            {
              calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
              indices.new_vector (sel, 2, nelements);
              tree mask = vect_gen_perm_mask_any (vectype1, indices);
              new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
                                       new_temp, zero_vec, mask);
              new_temp = gimple_build (&stmts, code,
                                       vectype1, new_name, new_temp);
            }
          gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);

          /* 2.4  Extract the final scalar result.  Create:
             s_out3 = extract_field <v_out2, bitpos>  */

          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "extract scalar result\n");

          rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
                        bitsize, bitsize_zero_node);
          epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
          new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
          gimple_assign_set_lhs (epilog_stmt, new_temp);
          gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
          scalar_results.safe_push (new_temp);
        }
      else
        {
          /* Case 3: Create:
             s = extract_field <v_out2, 0>
             for (offset = element_size;
                  offset < vector_size;
                  offset += element_size;)
               {
                 Create:  s' = extract_field <v_out2, offset>
                 Create:  s = op <s, s'>  // For non SLP cases
               }  */

          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "Reduce using scalar code.\n");

          int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
          int element_bitsize = tree_to_uhwi (bitsize);
          tree compute_type = TREE_TYPE (vectype);
          gimple_seq stmts = NULL;
          int bit_offset;
          FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
            {
              new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
                                       vec_temp, bitsize, bitsize_zero_node);

              /* In SLP we don't need to apply reduction operation, so we just
                 collect s' values in SCALAR_RESULTS.  */
              if (slp_reduc)
                scalar_results.safe_push (new_temp);

              for (bit_offset = element_bitsize;
                   bit_offset < vec_size_in_bits;
                   bit_offset += element_bitsize)
                {
                  tree bitpos = bitsize_int (bit_offset);
                  new_name = gimple_build (&stmts, BIT_FIELD_REF,
                                           compute_type, vec_temp,
                                           bitsize, bitpos);
                  if (slp_reduc)
                    {
                      /* In SLP we don't need to apply reduction operation, so
                         we just collect s' values in SCALAR_RESULTS.  */
                      new_temp = new_name;
                      scalar_results.safe_push (new_name);
                    }
                  else
                    new_temp = gimple_build (&stmts, code, compute_type,
                                             new_name, new_temp);
                }
            }

          /* The only case where we need to reduce scalar results in SLP, is
             unrolling.  If the size of SCALAR_RESULTS is greater than
             REDUC_GROUP_SIZE, we reduce them combining elements modulo
             REDUC_GROUP_SIZE.  */
          if (slp_reduc)
            {
              tree res, first_res, new_res;

              /* Reduce multiple scalar results in case of SLP unrolling.  */
              for (j = group_size; scalar_results.iterate (j, &res);
                   j++)
                {
                  first_res = scalar_results[j % group_size];
                  new_res = gimple_build (&stmts, code, compute_type,
                                          first_res, res);
                  scalar_results[j % group_size] = new_res;
                }
              scalar_results.truncate (group_size);
              for (k = 0; k < group_size; k++)
                scalar_results[k] = gimple_convert (&stmts, scalar_type,
                                                    scalar_results[k]);
            }
          else
            {
              /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
              new_temp = gimple_convert (&stmts, scalar_type, new_temp);
              scalar_results.safe_push (new_temp);
            }

          gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
        }

      if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
          && induc_val)
        {
          /* Earlier we set the initial value to be a vector if induc_val
             values.  Check the result and if it is induc_val then replace
             with the original initial value, unless induc_val is
             the same as initial_def already.  */
          tree zcompare = make_ssa_name (boolean_type_node);
          epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
                                             scalar_results[0], induc_val);
          gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
          tree initial_def = reduc_info->reduc_initial_values[0];
          tree tmp = make_ssa_name (new_scalar_dest);
          epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
                                             initial_def, scalar_results[0]);
          gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
          scalar_results[0] = tmp;
        }
    }
  /* 2.5 Adjust the final result by the initial value of the reduction
         variable. (When such adjustment is not needed, then
         'adjustment_def' is zero).  For example, if code is PLUS we create:
         new_temp = loop_exit_def + adjustment_def  */

  if (adjustment_def)
    {
      gcc_assert (!slp_reduc || group_size == 1);
      gimple_seq stmts = NULL;
      if (double_reduc)
        {
          gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
          adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
          new_temp = gimple_build (&stmts, code, vectype,
                                   reduc_inputs[0], adjustment_def);
        }
      else
        {
          new_temp = scalar_results[0];
          gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
          adjustment_def = gimple_convert (&stmts, TREE_TYPE (vectype),
                                           adjustment_def);
          new_temp = gimple_convert (&stmts, TREE_TYPE (vectype), new_temp);
          new_temp = gimple_build (&stmts, code, TREE_TYPE (vectype),
                                   new_temp, adjustment_def);
          new_temp = gimple_convert (&stmts, scalar_type, new_temp);
        }

      epilog_stmt = gimple_seq_last_stmt (stmts);
      gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
      scalar_results[0] = new_temp;
    }

  /* Record this operation if it could be reused by the epilogue loop.  */
  if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION
      && reduc_inputs.length () == 1)
    loop_vinfo->reusable_accumulators.put (scalar_results[0],
                                           { orig_reduc_input, reduc_info });
  /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
          phis with new adjusted scalar results, i.e., replace use <s_out0>
          with use <s_out4>.

     Transform:
        loop_exit:
          s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
          v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
          v_out2 = reduce <v_out1>
          s_out3 = extract_field <v_out2, 0>
          s_out4 = adjust_result <s_out3>
          use <s_out0>
          use <s_out0>

     into:

        loop_exit:
          s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
          v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
          v_out2 = reduce <v_out1>
          s_out3 = extract_field <v_out2, 0>
          s_out4 = adjust_result <s_out3>
          use <s_out4>
          use <s_out4>  */

  gcc_assert (live_out_stmts.size () == scalar_results.length ());
  auto_vec<gimple *> phis;
  for (k = 0; k < live_out_stmts.size (); k++)
    {
      stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
      scalar_dest = gimple_get_lhs (scalar_stmt_info->stmt);

      /* Find the loop-closed-use at the loop exit of the original scalar
         result.  (The reduction result is expected to have two immediate uses,
         one at the latch block, and one at the loop exit).  For double
         reductions we are looking for exit phis of the outer loop.  */
      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
        {
          if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
            {
              if (!is_gimple_debug (USE_STMT (use_p))
                  && gimple_bb (USE_STMT (use_p)) == loop_exit->dest)
                phis.safe_push (USE_STMT (use_p));
            }
          else
            {
              if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
                {
                  tree phi_res = PHI_RESULT (USE_STMT (use_p));

                  FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
                    {
                      if (!flow_bb_inside_loop_p (loop,
                                                  gimple_bb (USE_STMT (phi_use_p)))
                          && !is_gimple_debug (USE_STMT (phi_use_p)))
                        phis.safe_push (USE_STMT (phi_use_p));
                    }
                }
            }
        }

      FOR_EACH_VEC_ELT (phis, i, exit_phi)
        {
          /* Replace the uses:  */
          orig_name = PHI_RESULT (exit_phi);

          /* Look for a single use at the target of the skip edge.  */
          if (unify_with_main_loop_p)
            {
              use_operand_p use_p;
              gimple *user;
              if (!single_imm_use (orig_name, &use_p, &user))
                gcc_unreachable ();
              orig_name = gimple_get_lhs (user);
            }

          scalar_result = scalar_results[k];
          FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
            {
              FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
                SET_USE (use_p, scalar_result);
              update_stmt (use_stmt);
            }
        }

      phis.truncate (0);
    }
}
/* Return a vector of type VECTYPE that is equal to the vector select
   operation "MASK ? VEC : IDENTITY".  Insert the select statements
   before GSI.  */

static tree
merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
                     tree vec, tree identity)
{
  tree cond = make_temp_ssa_name (vectype, NULL, "cond");
  gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
                                          mask, vec, identity);
  gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
  return cond;
}
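/* For example (usage sketch): the in-order reduction code below uses this
   helper to turn a partially-masked vector into one that can be reduced
   unconditionally, by substituting the identity value (e.g. 0 for PLUS)
   into the inactive lanes.  */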
/* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
   order, starting with LHS.  Insert the extraction statements before GSI and
   associate the new scalar SSA names with variable SCALAR_DEST.
   If MASK is nonzero mask the input and then operate on it unconditionally.
   Return the SSA name for the result.  */

static tree
vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
                       tree_code code, tree lhs, tree vector_rhs,
                       tree mask)
{
  tree vectype = TREE_TYPE (vector_rhs);
  tree scalar_type = TREE_TYPE (vectype);
  tree bitsize = TYPE_SIZE (scalar_type);
  unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
  unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);

  /* Re-create a VEC_COND_EXPR to mask the input here in order to be able
     to perform an unconditional element-wise reduction of it.  */
  if (mask)
    {
      tree masked_vector_rhs = make_temp_ssa_name (vectype, NULL,
                                                   "masked_vector_rhs");
      tree neutral_op = neutral_op_for_reduction (scalar_type, code, NULL_TREE,
                                                  false);
      tree vector_identity = build_vector_from_val (vectype, neutral_op);
      gassign *select = gimple_build_assign (masked_vector_rhs, VEC_COND_EXPR,
                                             mask, vector_rhs, vector_identity);
      gsi_insert_before (gsi, select, GSI_SAME_STMT);
      vector_rhs = masked_vector_rhs;
    }

  for (unsigned HOST_WIDE_INT bit_offset = 0;
       bit_offset < vec_size_in_bits;
       bit_offset += element_bitsize)
    {
      tree bitpos = bitsize_int (bit_offset);
      tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
                         bitsize, bitpos);

      gassign *stmt = gimple_build_assign (scalar_dest, rhs);
      rhs = make_ssa_name (scalar_dest, stmt);
      gimple_assign_set_lhs (stmt, rhs);
      gsi_insert_before (gsi, stmt, GSI_SAME_STMT);

      stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
      tree new_name = make_ssa_name (scalar_dest, stmt);
      gimple_assign_set_lhs (stmt, new_name);
      gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
      lhs = new_name;
    }

  return lhs;
}
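/* As an illustration of the expansion above (assuming a 4-element vector
   v and operation OP): the emitted scalar sequence computes
   (((lhs OP v[0]) OP v[1]) OP v[2]) OP v[3], preserving the left-to-right
   association that an in-order (fold-left) reduction requires.  */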
/* Get a masked internal function equivalent to REDUC_FN.  VECTYPE_IN is the
   type of the vector input.  */

static internal_fn
get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
{
  internal_fn mask_reduc_fn;
  internal_fn mask_len_reduc_fn;

  switch (reduc_fn)
    {
    case IFN_FOLD_LEFT_PLUS:
      mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
      mask_len_reduc_fn = IFN_MASK_LEN_FOLD_LEFT_PLUS;
      break;

    default:
      return IFN_LAST;
    }

  if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
                                      OPTIMIZE_FOR_SPEED))
    return mask_reduc_fn;
  if (direct_internal_fn_supported_p (mask_len_reduc_fn, vectype_in,
                                      OPTIMIZE_FOR_SPEED))
    return mask_len_reduc_fn;
  return IFN_LAST;
}
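/* For example: for IFN_FOLD_LEFT_PLUS this returns IFN_MASK_FOLD_LEFT_PLUS
   when the target supports it directly for VECTYPE_IN, otherwise
   IFN_MASK_LEN_FOLD_LEFT_PLUS if that is supported, and IFN_LAST if
   neither is available.  */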
7067 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
7068 statement that sets the live-out value. REDUC_DEF_STMT is the phi
7069 statement. CODE is the operation performed by STMT_INFO and OPS are
7070 its scalar operands. REDUC_INDEX is the index of the operand in
7071 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
7072 implements in-order reduction, or IFN_LAST if we should open-code it.
7073 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
7074 that should be used to control the operation in a fully-masked loop. */
static bool
vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
			       stmt_vec_info stmt_info,
			       gimple_stmt_iterator *gsi,
			       gimple **vec_stmt, slp_tree slp_node,
			       gimple *reduc_def_stmt,
			       code_helper code, internal_fn reduc_fn,
			       tree *ops, int num_ops, tree vectype_in,
			       int reduc_index, vec_loop_masks *masks,
			       vec_loop_lens *lens)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
  internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);

  int i;
  int ncopies;
  if (slp_node)
    ncopies = 1;
  else
    ncopies = vect_get_num_copies (loop_vinfo, vectype_in);

  gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
  gcc_assert (ncopies == 1);

  bool is_cond_op = false;
  if (!code.is_tree_code ())
    {
      code = conditional_internal_fn_code (internal_fn (code));
      gcc_assert (code != ERROR_MARK);
      is_cond_op = true;
    }

  gcc_assert (TREE_CODE_LENGTH (tree_code (code)) == binary_op);

  if (slp_node)
    gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
			  TYPE_VECTOR_SUBPARTS (vectype_in)));

  /* The operands either come from a binary operation or an IFN_COND operation.
     The former is a gimple assign with binary rhs and the latter is a
     gimple call with four arguments.  */
  gcc_assert (num_ops == 2 || num_ops == 4);

  int group_size = 1;
  stmt_vec_info scalar_dest_def_info;
  auto_vec<tree> vec_oprnds0, vec_opmask;
  if (slp_node)
    {
      vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[(is_cond_op ? 2 : 0)
						      + (1 - reduc_index)],
			 &vec_oprnds0);
      group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
      scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
      /* For an IFN_COND_OP we also need the vector mask operand.  */
      if (is_cond_op)
	vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[0], &vec_opmask);
    }
  else
    {
      tree op0, opmask;
      if (!is_cond_op)
	op0 = ops[1 - reduc_index];
      else
	{
	  op0 = ops[2 + (1 - reduc_index)];
	  opmask = ops[0];
	}
      vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
				     op0, &vec_oprnds0);
      scalar_dest_def_info = stmt_info;

      /* For an IFN_COND_OP we also need the vector mask operand.  */
      if (is_cond_op)
	vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
				       opmask, &vec_opmask);
    }

  gimple *sdef = vect_orig_stmt (scalar_dest_def_info)->stmt;
  tree scalar_dest = gimple_get_lhs (sdef);
  tree scalar_type = TREE_TYPE (scalar_dest);
  tree reduc_var = gimple_phi_result (reduc_def_stmt);

  int vec_num = vec_oprnds0.length ();
  gcc_assert (vec_num == 1 || slp_node);
  tree vec_elem_type = TREE_TYPE (vectype_out);
  gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));

  tree vector_identity = NULL_TREE;
  if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
    {
      vector_identity = build_zero_cst (vectype_out);
      if (!HONOR_SIGNED_ZEROS (vectype_out))
	;
      else
	{
	  gcc_assert (!HONOR_SIGN_DEPENDENT_ROUNDING (vectype_out));
	  vector_identity = const_unop (NEGATE_EXPR, vectype_out,
					vector_identity);
	}
    }

  tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
  tree def0;
  FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
    {
      gimple *new_stmt;
      tree mask = NULL_TREE;
      tree len = NULL_TREE;
      tree bias = NULL_TREE;
      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
	{
	  tree loop_mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
					       vec_num, vectype_in, i);
	  if (is_cond_op)
	    mask = prepare_vec_mask (loop_vinfo, TREE_TYPE (loop_mask),
				     loop_mask, vec_opmask[i], gsi);
	  else
	    mask = loop_mask;
	}
      else if (is_cond_op)
	mask = vec_opmask[i];
      if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
	{
	  len = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num, vectype_in,
				   i, 1);
	  signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
	  bias = build_int_cst (intQI_type_node, biasval);
	  if (!is_cond_op)
	    mask = build_minus_one_cst (truth_type_for (vectype_in));
	}

      /* Handle MINUS by adding the negative.  */
      if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
	{
	  tree negated = make_ssa_name (vectype_out);
	  new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
	  gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
	  def0 = negated;
	}

      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
	  && mask && mask_reduc_fn == IFN_LAST)
	def0 = merge_with_identity (gsi, mask, vectype_out, def0,
				    vector_identity);

      /* On the first iteration the input is simply the scalar phi
	 result, and for subsequent iterations it is the output of
	 the preceding operation.  */
      if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
	{
	  if (mask && len && mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
	    new_stmt = gimple_build_call_internal (mask_reduc_fn, 5, reduc_var,
						   def0, mask, len, bias);
	  else if (mask && mask_reduc_fn == IFN_MASK_FOLD_LEFT_PLUS)
	    new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
						   def0, mask);
	  else
	    new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
						   def0);

	  /* For chained SLP reductions the output of the previous reduction
	     operation serves as the input of the next.  For the final
	     statement the output cannot be a temporary - we reuse the original
	     scalar destination of the last statement.  */
	  if (i != vec_num - 1)
	    {
	      gimple_set_lhs (new_stmt, scalar_dest_var);
	      reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
	      gimple_set_lhs (new_stmt, reduc_var);
	    }
	}
      else
	{
	  reduc_var = vect_expand_fold_left (gsi, scalar_dest_var,
					     tree_code (code), reduc_var, def0,
					     mask);
	  new_stmt = SSA_NAME_DEF_STMT (reduc_var);
	  /* Remove the statement, so that we can use the same code paths
	     as for statements that we've just created.  */
	  gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
	  gsi_remove (&tmp_gsi, true);
	}

      if (i == vec_num - 1)
	{
	  gimple_set_lhs (new_stmt, scalar_dest);
	  vect_finish_replace_stmt (loop_vinfo,
				    scalar_dest_def_info,
				    new_stmt);
	}
      else
	vect_finish_stmt_generation (loop_vinfo,
				     scalar_dest_def_info,
				     new_stmt, gsi);

      if (slp_node)
	slp_node->push_vec_def (new_stmt);
      else
	{
	  STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
	  *vec_stmt = new_stmt;
	}
    }

  return true;
}
/* Function is_nonwrapping_integer_induction.

   Check if STMT_VINFO (which is part of loop LOOP) both increments and
   does not cause overflow.  */
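/* E.g. an induction with base 0 and step 4 in a loop that executes at most
   1000 times reaches at most 4000, which needs far fewer bits than a 32-bit
   int, so such an induction is known not to wrap.  */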
static bool
is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
{
  gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
  tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
  tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
  tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
  widest_int ni, max_loop_value, lhs_max;
  wi::overflow_type overflow = wi::OVF_NONE;

  /* Make sure the loop is integer based.  */
  if (TREE_CODE (base) != INTEGER_CST
      || TREE_CODE (step) != INTEGER_CST)
    return false;

  /* Check that the max size of the loop will not wrap.  */

  if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
    return true;

  if (! max_stmt_executions (loop, &ni))
    return false;

  max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
			    &overflow);
  if (overflow)
    return false;

  max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
			    TYPE_SIGN (lhs_type), &overflow);
  if (overflow)
    return false;

  return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
	  <= TYPE_PRECISION (lhs_type));
}
/* Check if masking can be supported by inserting a conditional expression.
   CODE is the code for the operation.  COND_FN is the conditional internal
   function, if it exists.  VECTYPE_IN is the type of the vector input.  */
static bool
use_mask_by_cond_expr_p (code_helper code, internal_fn cond_fn,
			 tree vectype_in)
{
  if (cond_fn != IFN_LAST
      && direct_internal_fn_supported_p (cond_fn, vectype_in,
					 OPTIMIZE_FOR_SPEED))
    return false;

  if (code.is_tree_code ())
    switch (tree_code (code))
      {
      case DOT_PROD_EXPR:
      case SAD_EXPR:
	return true;

      default:
	break;
      }
  return false;
}
/* Insert a conditional expression to enable masked vectorization.  CODE is
   the code for the operation.  VOP is the array of operands.  MASK is the
   loop mask.  GSI is a statement iterator used to place the new conditional
   expression.  */

static void
build_vect_cond_expr (code_helper code, tree vop[3], tree mask,
		      gimple_stmt_iterator *gsi)
{
  switch (tree_code (code))
    {
    case DOT_PROD_EXPR:
      {
	tree vectype = TREE_TYPE (vop[1]);
	tree zero = build_zero_cst (vectype);
	tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
	gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
					       mask, vop[1], zero);
	gsi_insert_before (gsi, select, GSI_SAME_STMT);
	vop[1] = masked_op1;
	break;
      }

    case SAD_EXPR:
      {
	tree vectype = TREE_TYPE (vop[1]);
	tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
	gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
					       mask, vop[1], vop[0]);
	gsi_insert_before (gsi, select, GSI_SAME_STMT);
	vop[1] = masked_op1;
	break;
      }

    default:
      gcc_unreachable ();
    }
}
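/* For illustration: for a DOT_PROD_EXPR the selection built above has the
   form

     masked_op1 = VEC_COND_EXPR <mask, vop[1], { 0, ... }>;

   so inactive lanes add zero to the accumulator, while for a SAD_EXPR the
   inactive lanes of vop[1] are replaced by vop[0], making their absolute
   difference zero.  */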
/* Given an operation with CODE in a loop reduction path whose reduction PHI
   is specified by REDUC_INFO, the operation has TYPE of scalar result, and
   its input vectype is represented by VECTYPE_IN.  The vectype of the
   vectorized result may be different from VECTYPE_IN, either in base type or
   in number of lanes, as is the case for a lane-reducing operation.  This
   function checks whether it is possible, and how, to perform partial
   vectorization of the operation in the context of LOOP_VINFO.  */
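/* For instance, an in-order float add reduction for which the target only
   provides IFN_MASK_LEN_FOLD_LEFT_PLUS ends up recording a loop length,
   whereas a reduction that can be masked through a conditional internal
   function or a VEC_COND_EXPR records a loop mask instead.  */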
static void
vect_reduction_update_partial_vector_usage (loop_vec_info loop_vinfo,
					    stmt_vec_info reduc_info,
					    slp_tree slp_node,
					    code_helper code, tree type,
					    tree vectype_in)
{
  enum vect_reduction_type reduc_type = STMT_VINFO_REDUC_TYPE (reduc_info);
  internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
  internal_fn cond_fn = get_conditional_internal_fn (code, type);

  if (reduc_type != FOLD_LEFT_REDUCTION
      && !use_mask_by_cond_expr_p (code, cond_fn, vectype_in)
      && (cond_fn == IFN_LAST
	  || !direct_internal_fn_supported_p (cond_fn, vectype_in,
					      OPTIMIZE_FOR_SPEED)))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "can't operate on partial vectors because"
			 " no conditional operation is available.\n");
      LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
    }
  else if (reduc_type == FOLD_LEFT_REDUCTION
	   && reduc_fn == IFN_LAST
	   && !expand_vec_cond_expr_p (vectype_in, truth_type_for (vectype_in),
				       SSA_NAME))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "can't operate on partial vectors because"
			 " no conditional operation is available.\n");
      LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
    }
  else if (reduc_type == FOLD_LEFT_REDUCTION
	   && internal_fn_mask_index (reduc_fn) == -1
	   && FLOAT_TYPE_P (vectype_in)
	   && HONOR_SIGN_DEPENDENT_ROUNDING (vectype_in))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "can't operate on partial vectors because"
			 " signed zeros cannot be preserved.\n");
      LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
    }
  else
    {
      internal_fn mask_reduc_fn
	= get_masked_reduction_fn (reduc_fn, vectype_in);
      vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
      vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
      unsigned nvectors = vect_get_num_copies (loop_vinfo, slp_node,
					       vectype_in);

      if (mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
	vect_record_loop_len (loop_vinfo, lens, nvectors, vectype_in, 1);
      else
	vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype_in, NULL);
    }
}
/* Check if STMT_INFO is a lane-reducing operation that can be vectorized in
   the context of LOOP_VINFO; the vector cost will be recorded in COST_VEC,
   and the analysis is for slp if SLP_NODE is not NULL.

   For a lane-reducing operation, the loop reduction path that it lies in
   may contain a normal operation, or another lane-reducing operation with a
   different input type size, for example:

     int sum = 1;
     for (i)
       {
	 sum += d0[i] * d1[i];      // dot-prod <vector(16) char>
	 sum += w[i];               // widen-sum <vector(16) char>
	 sum += abs(s0[i] - s1[i]); // sad <vector(8) short>
	 sum += n[i];               // normal <vector(4) int>
       }

   The vectorization factor is essentially determined by the operation whose
   input vectype has the most lanes ("vector(16) char" in the example), while
   we need to choose the input vectype with the least lanes ("vector(4) int"
   in the example) to determine the effective number of vector reduction
   PHIs.  */
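/* In the example above, with 128-bit vectors the "vector(16) char" inputs
   fix the vectorization factor at 16, while the "vector(4) int" input means
   that 16 / 4 = 4 vector reduction PHIs are needed to hold the partial
   sums.  */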
bool
vectorizable_lane_reducing (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
			    slp_tree slp_node, stmt_vector_for_cost *cost_vec)
{
  gimple *stmt = stmt_info->stmt;

  if (!lane_reducing_stmt_p (stmt))
    return false;

  tree type = TREE_TYPE (gimple_assign_lhs (stmt));

  if (!INTEGRAL_TYPE_P (type))
    return false;

  /* Do not try to vectorize bit-precision reductions.  */
  if (!type_has_mode_precision_p (type))
    return false;

  stmt_vec_info reduc_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));

  /* TODO: Support lane-reducing operation that does not directly participate
     in loop reduction.  */
  if (!reduc_info || STMT_VINFO_REDUC_IDX (stmt_info) < 0)
    return false;

  /* Lane-reducing pattern inside any inner loop of LOOP_VINFO is not
     recognized.  */
  gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_reduction_def);
  gcc_assert (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION);

  for (int i = 0; i < (int) gimple_num_ops (stmt) - 1; i++)
    {
      stmt_vec_info def_stmt_info;
      slp_tree slp_op;
      tree op;
      tree vectype;
      enum vect_def_type dt;

      if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_node, i, &op,
			       &slp_op, &dt, &vectype, &def_stmt_info))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "use not simple.\n");
	  return false;
	}

      if (!vectype)
	{
	  vectype = get_vectype_for_scalar_type (loop_vinfo, TREE_TYPE (op),
						 slp_op);
	  if (!vectype)
	    return false;
	}

      if (slp_node && !vect_maybe_update_slp_op_vectype (slp_op, vectype))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "incompatible vector types for invariants\n");
	  return false;
	}

      if (i == STMT_VINFO_REDUC_IDX (stmt_info))
	continue;

      /* There should be at most one cycle def in the stmt.  */
      if (VECTORIZABLE_CYCLE_DEF (dt))
	return false;
    }

  tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (stmt_info);

  gcc_assert (vectype_in);

  /* Compute number of effective vector statements for costing.  */
  unsigned int ncopies_for_cost = vect_get_num_copies (loop_vinfo, slp_node,
						       vectype_in);
  gcc_assert (ncopies_for_cost >= 1);

  if (vect_is_emulated_mixed_dot_prod (stmt_info))
    {
      /* We need extra two invariants: one that contains the minimum signed
	 value and one that contains half of its negative.  */
      int prologue_stmts = 2;
      unsigned cost = record_stmt_cost (cost_vec, prologue_stmts,
					scalar_to_vec, stmt_info, 0,
					vect_prologue);
      if (dump_enabled_p ())
	dump_printf (MSG_NOTE, "vectorizable_lane_reducing: "
		     "extra prologue_cost = %d .\n", cost);

      /* Three dot-products and a subtraction.  */
      ncopies_for_cost *= 4;
    }

  record_stmt_cost (cost_vec, (int) ncopies_for_cost, vector_stmt, stmt_info,
		    0, vect_body);

  if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
    {
      enum tree_code code = gimple_assign_rhs_code (stmt);
      vect_reduction_update_partial_vector_usage (loop_vinfo, reduc_info,
						  slp_node, code, type,
						  vectype_in);
    }

  /* Transform via vect_transform_reduction.  */
  STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
  return true;
}
/* Function vectorizable_reduction.

   Check if STMT_INFO performs a reduction operation that can be vectorized.
   If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
   stmt to replace it, put it in VEC_STMT, and insert it at GSI.
   Return true if STMT_INFO is vectorizable in this way.

   This function also handles reduction idioms (patterns) that have been
   recognized in advance during vect_pattern_recog.  In this case, STMT_INFO
   may be of this form:
     X = pattern_expr (arg0, arg1, ..., X)
   and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
   sequence that had been detected and replaced by the pattern-stmt
   (STMT_INFO).

   This function also handles reduction of condition expressions, for example:
     for (int i = 0; i < N; i++)
       if (a[i] < value)
	 last = i;

   This is handled by vectorising the loop and creating an additional vector
   containing the loop indexes for which "a[i] < value" was true.  In the
   function epilogue this is reduced to a single max value and then used to
   index into the vector of results.

   In some cases of reduction patterns, the type of the reduction variable X
   is different than the type of the other arguments of STMT_INFO.
   In such cases, the vectype that is used when transforming STMT_INFO into
   a vector stmt is different than the vectype that is used to determine the
   vectorization factor, because it consists of a different number of elements
   than the actual number of elements that are being operated upon in parallel.

   For example, consider an accumulation of shorts into an int accumulator.
   On some targets it's possible to vectorize this pattern operating on 8
   shorts at a time (hence, the vectype for purposes of determining the
   vectorization factor should be V8HI); on the other hand, the vectype that
   is used to create the vector form is actually V4SI (the type of the result).

   Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
   indicates what is the actual level of parallelism (V8HI in the example), so
   that the right vectorization factor would be derived.  This vectype
   corresponds to the type of arguments to the reduction stmt, and should *NOT*
   be used to create the vectorized stmt.  The right vectype for the vectorized
   stmt is obtained from the type of the result X:
     get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))

   This means that, contrary to "regular" reductions (or "regular" stmts in
   general), the following equation:
     STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
   does *NOT* necessarily hold for reduction patterns.  */
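/* Concretely, for the short-into-int accumulation above with 128-bit
   vectors, STMT_VINFO_VECTYPE of the pattern statement is V8HI - eight
   shorts are consumed per vector iteration, which is what determines the
   vectorization factor - while the vectorized statement itself is created
   with the V4SI type obtained from the type of the int result X.  */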
bool
vectorizable_reduction (loop_vec_info loop_vinfo,
			stmt_vec_info stmt_info, slp_tree slp_node,
			slp_instance slp_node_instance,
			stmt_vector_for_cost *cost_vec)
{
  tree vectype_in = NULL_TREE;
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
  stmt_vec_info cond_stmt_vinfo = NULL;
  int i;
  int ncopies;
  bool single_defuse_cycle = false;
  bool nested_cycle = false;
  bool double_reduc = false;
  tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
  tree cond_reduc_val = NULL_TREE;

  /* Make sure it was already recognized as a reduction computation.  */
  if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
      && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
      && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
    return false;

  /* The stmt we store reduction analysis meta on.  */
  stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
  reduc_info->is_reduc_info = true;

  if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
    {
      if (is_a <gphi *> (stmt_info->stmt))
	{
	  if (slp_node)
	    {
	      /* We eventually need to set a vector type on invariant
		 arguments.  */
	      unsigned j;
	      slp_tree child;
	      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
		if (!vect_maybe_update_slp_op_vectype
		       (child, SLP_TREE_VECTYPE (slp_node)))
		  {
		    if (dump_enabled_p ())
		      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				       "incompatible vector types for "
				       "invariants\n");
		    return false;
		  }
	    }
	  /* Analysis for double-reduction is done on the outer
	     loop PHI, nested cycles have no further restrictions.  */
	  STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
	}
      else
	STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
      return true;
    }

  stmt_vec_info orig_stmt_of_analysis = stmt_info;
  stmt_vec_info phi_info = stmt_info;
  if (!is_a <gphi *> (stmt_info->stmt))
    {
      STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
      return true;
    }
  if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
    {
      if (gimple_bb (stmt_info->stmt) != loop->header)
	{
	  /* For SLP we arrive here for both the inner loop LC PHI and
	     the outer loop PHI.  The latter is what we want to analyze
	     the reduction with.  */
	  gcc_assert (slp_node);
	  return true;
	}
      use_operand_p use_p;
      gimple *use_stmt;
      bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
				 &use_p, &use_stmt);
      gcc_assert (res);
      phi_info = loop_vinfo->lookup_stmt (use_stmt);
    }
  if (slp_node)
    {
      slp_node_instance->reduc_phis = slp_node;
      /* ??? We're leaving slp_node to point to the PHIs, we only
	 need it to get at the number of vector stmts which wasn't
	 yet initialized for the instance root.  */
    }

  /* PHIs should not participate in patterns.  */
  gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
  gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
  /* Verify following REDUC_IDX from the latch def leads us back to the PHI
     and compute the reduction chain length.  Discover the real
     reduction operation stmt on the way (stmt_info and slp_for_stmt_info).  */
  tree reduc_def
    = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
			     loop_latch_edge
			       (gimple_bb (reduc_def_phi)->loop_father));
  unsigned reduc_chain_length = 0;
  bool only_slp_reduc_chain = true;
  stmt_info = NULL;
  slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
  /* For double-reductions we start SLP analysis at the inner loop LC PHI
     which is the def of the outer loop live stmt.  */
  if (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def
      && slp_node)
    slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
  while (reduc_def != PHI_RESULT (reduc_def_phi))
    {
      stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
      stmt_vec_info vdef = vect_stmt_to_vectorize (def);
      int reduc_idx = STMT_VINFO_REDUC_IDX (vdef);

      if (reduc_idx == -1)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "reduction chain broken by patterns.\n");
	  return false;
	}
      if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
	only_slp_reduc_chain = false;
      /* For epilogue generation live members of the chain need
	 to point back to the PHI via their original stmt for
	 info_for_reduction to work.  For SLP we need to look at
	 all lanes here - even though we only will vectorize from
	 the SLP node with live lane zero the other live lanes also
	 need to be identified as part of a reduction to be able
	 to skip code generation for them.  */
      if (slp_for_stmt_info)
	{
	  for (auto s : SLP_TREE_SCALAR_STMTS (slp_for_stmt_info))
	    if (STMT_VINFO_LIVE_P (s))
	      STMT_VINFO_REDUC_DEF (vect_orig_stmt (s)) = phi_info;
	}
      else if (STMT_VINFO_LIVE_P (vdef))
	STMT_VINFO_REDUC_DEF (def) = phi_info;
      gimple_match_op op;
      if (!gimple_extract_op (vdef->stmt, &op))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "reduction chain includes unsupported"
			     " statement type.\n");
	  return false;
	}
      if (CONVERT_EXPR_CODE_P (op.code))
	{
	  if (!tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "conversion in the reduction chain.\n");
	      return false;
	    }
	}
      else
	{
	  /* First non-conversion stmt.  */
	  if (!stmt_info)
	    stmt_info = vdef;

	  if (lane_reducing_op_p (op.code))
	    {
	      enum vect_def_type dt;
	      tree vectype_op;

	      /* The last operand of lane-reducing operation is for
		 reduction.  */
	      gcc_assert (reduc_idx > 0 && reduc_idx == (int) op.num_ops - 1);

	      if (!vect_is_simple_use (op.ops[0], loop_vinfo, &dt, &vectype_op))
		return false;

	      tree type_op = TREE_TYPE (op.ops[0]);
	      if (!vectype_op)
		{
		  vectype_op = get_vectype_for_scalar_type (loop_vinfo,
							    type_op);
		  if (!vectype_op)
		    return false;
		}

	      /* For lane-reducing operation vectorizable analysis needs the
		 reduction PHI information.  */
	      STMT_VINFO_REDUC_DEF (def) = phi_info;

	      /* Each lane-reducing operation has its own input vectype, while
		 reduction PHI will record the input vectype with the least
		 lanes.  */
	      STMT_VINFO_REDUC_VECTYPE_IN (vdef) = vectype_op;

	      /* To accommodate lane-reducing operations of mixed input
		 vectypes, choose input vectype with the least lanes for the
		 reduction PHI statement, which would result in the most
		 ncopies for vectorized reduction results.  */
	      if (!vectype_in
		  || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
		      < GET_MODE_SIZE (SCALAR_TYPE_MODE (type_op))))
		vectype_in = vectype_op;
	    }
	  else
	    vectype_in = STMT_VINFO_VECTYPE (phi_info);
	}

      reduc_def = op.ops[reduc_idx];
      reduc_chain_length++;
      if (!stmt_info && slp_node)
	slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
    }
  /* PHIs should not participate in patterns.  */
  gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));

  if (nested_in_vect_loop_p (loop, stmt_info))
    {
      loop = loop->inner;
      nested_cycle = true;
    }

  /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
     element.  */
  if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
    {
      gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
      stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
    }
  if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
    gcc_assert (slp_node
		&& REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);

  /* 1. Is vectorizable reduction?  */
  /* Not supportable if the reduction variable is used in the loop, unless
     it's a reduction chain.  */
  if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
      && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
    return false;

  /* Reductions that are not used even in an enclosing outer-loop,
     are expected to be "live" (used out of the loop).  */
  if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
      && !STMT_VINFO_LIVE_P (stmt_info))
    return false;

  /* 2. Has this been recognized as a reduction pattern?

     Check if STMT represents a pattern that has been recognized
     in earlier analysis stages.  For stmts that represent a pattern,
     the STMT_VINFO_RELATED_STMT field records the last stmt in
     the original sequence that constitutes the pattern.  */

  stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
  if (orig_stmt_info)
    {
      gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
      gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
    }

  /* 3. Check the operands of the operation.  The first operands are defined
	inside the loop body.  The last operand is the reduction variable,
	which is defined by the loop-header-phi.  */

  tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
  STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
  STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
  gimple_match_op op;
  if (!gimple_extract_op (stmt_info->stmt, &op))
    gcc_unreachable ();
  bool lane_reducing = lane_reducing_op_p (op.code);

  if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type)
      && !SCALAR_FLOAT_TYPE_P (op.type))
    return false;

  /* Do not try to vectorize bit-precision reductions.  */
  if (!type_has_mode_precision_p (op.type))
    return false;

  /* Lane-reducing ops also never can be used in a SLP reduction group
     since we'll mix lanes belonging to different reductions.  But it's
     OK to use them in a reduction chain or when the reduction group
     has just one element.  */
  if (lane_reducing
      && slp_node
      && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
      && SLP_TREE_LANES (slp_node) > 1)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "lane-reducing reduction in reduction group.\n");
      return false;
    }

  /* All uses but the last are expected to be defined in the loop.
     The last use is the reduction variable.  In case of nested cycle this
     assumption is not true: we use reduc_index to record the index of the
     reduction variable.  */
  slp_tree *slp_op = XALLOCAVEC (slp_tree, op.num_ops);
  tree *vectype_op = XALLOCAVEC (tree, op.num_ops);
  /* We need to skip an extra operand for COND_EXPRs with embedded
     comparison.  */
  unsigned opno_adjust = 0;
  if (op.code == COND_EXPR && COMPARISON_CLASS_P (op.ops[0]))
    opno_adjust = 1;
  for (i = 0; i < (int) op.num_ops; i++)
    {
      /* The condition of COND_EXPR is checked in vectorizable_condition().  */
      if (i == 0 && op.code == COND_EXPR)
	continue;

      stmt_vec_info def_stmt_info;
      enum vect_def_type dt;
      if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
			       i + opno_adjust, &op.ops[i], &slp_op[i], &dt,
			       &vectype_op[i], &def_stmt_info))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "use not simple.\n");
	  return false;
	}

      /* Skip reduction operands, and for an IFN_COND_OP we might hit the
	 reduction operand twice (once as definition, once as else).  */
      if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)])
	continue;

      /* There should be only one cycle def in the stmt, the one
	 leading to reduc_def.  */
      if (VECTORIZABLE_CYCLE_DEF (dt))
	return false;

      if (!vectype_op[i])
	vectype_op[i]
	  = get_vectype_for_scalar_type (loop_vinfo,
					 TREE_TYPE (op.ops[i]), slp_op[i]);

      /* Record how the non-reduction-def value of COND_EXPR is defined.
	 ??? For a chain of multiple CONDs we'd have to match them up all.  */
      if (op.code == COND_EXPR && reduc_chain_length == 1)
	{
	  if (dt == vect_constant_def)
	    {
	      cond_reduc_dt = dt;
	      cond_reduc_val = op.ops[i];
	    }
	  else if (dt == vect_induction_def
		   && def_stmt_info
		   && is_nonwrapping_integer_induction (def_stmt_info, loop))
	    {
	      cond_reduc_dt = dt;
	      cond_stmt_vinfo = def_stmt_info;
	    }
	}
    }

  enum vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (phi_info);
  STMT_VINFO_REDUC_TYPE (reduc_info) = reduction_type;
  /* If we have a condition reduction, see if we can simplify it further.  */
  if (reduction_type == COND_REDUCTION)
    {
      if (slp_node && SLP_TREE_LANES (slp_node) != 1)
	return false;

      /* When the condition uses the reduction value in the condition, fail.  */
      if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "condition depends on previous iteration\n");
	  return false;
	}

      if (reduc_chain_length == 1
	  && (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST, vectype_in,
					      OPTIMIZE_FOR_SPEED)
	      || direct_internal_fn_supported_p (IFN_LEN_FOLD_EXTRACT_LAST,
						 vectype_in,
						 OPTIMIZE_FOR_SPEED)))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "optimizing condition reduction with"
			     " FOLD_EXTRACT_LAST.\n");
	  STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
	}
      else if (cond_reduc_dt == vect_induction_def)
	{
	  tree base
	    = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
	  tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);

	  gcc_assert (TREE_CODE (base) == INTEGER_CST
		      && TREE_CODE (step) == INTEGER_CST);
	  cond_reduc_val = NULL_TREE;
	  enum tree_code cond_reduc_op_code = ERROR_MARK;
	  tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
	  if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
	    ;
	  /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
	     above base; punt if base is the minimum value of the type for
	     MAX_EXPR or maximum value of the type for MIN_EXPR for now.  */
	  else if (tree_int_cst_sgn (step) == -1)
	    {
	      cond_reduc_op_code = MIN_EXPR;
	      if (tree_int_cst_sgn (base) == -1)
		cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
	      else if (tree_int_cst_lt (base,
					TYPE_MAX_VALUE (TREE_TYPE (base))))
		cond_reduc_val
		  = int_const_binop (PLUS_EXPR, base, integer_one_node);
	    }
	  else
	    {
	      cond_reduc_op_code = MAX_EXPR;
	      if (tree_int_cst_sgn (base) == 1)
		cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
	      else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
					base))
		cond_reduc_val
		  = int_const_binop (MINUS_EXPR, base, integer_one_node);
	    }
	  if (cond_reduc_val)
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "condition expression based on "
				 "integer induction.\n");
	      STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
	      STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
		= cond_reduc_val;
	      STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
	    }
	}
      else if (cond_reduc_dt == vect_constant_def)
	{
	  enum vect_def_type cond_initial_dt;
	  tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
	  vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
	  if (cond_initial_dt == vect_constant_def
	      && types_compatible_p (TREE_TYPE (cond_initial_val),
				     TREE_TYPE (cond_reduc_val)))
	    {
	      tree e = fold_binary (LE_EXPR, boolean_type_node,
				    cond_initial_val, cond_reduc_val);
	      if (e && (integer_onep (e) || integer_zerop (e)))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_NOTE, vect_location,
				     "condition expression based on "
				     "compile time constant.\n");
		  /* Record reduction code at analysis stage.  */
		  STMT_VINFO_REDUC_CODE (reduc_info)
		    = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
		  STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
		}
	    }
	}
    }

  if (STMT_VINFO_LIVE_P (phi_info))
    return false;

  if (slp_node)
    ncopies = 1;
  else
    ncopies = vect_get_num_copies (loop_vinfo, vectype_in);

  gcc_assert (ncopies >= 1);

  poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);

  if (nested_cycle)
    {
      gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
		  == vect_double_reduction_def);
      double_reduc = true;
    }

  /* 4.2. Check support for the epilog operation.

	  If STMT represents a reduction pattern, then the type of the
	  reduction variable may be different than the type of the rest
	  of the arguments.  For example, consider the case of accumulation
	  of shorts into an int accumulator; The original code:
			S1: int_a = (int) short_a;
	  orig_stmt->	S2: int_acc = plus <int_a ,int_acc>;

	  was replaced with:
			STMT: int_acc = widen_sum <short_a, int_acc>

	  This means that:
	  1. The tree-code that is used to create the vector operation in the
	     epilog code (that reduces the partial results) is not the
	     tree-code of STMT, but is rather the tree-code of the original
	     stmt from the pattern that STMT is replacing.  I.e, in the example
	     above we want to use 'widen_sum' in the loop, but 'plus' in the
	     epilog.
	  2. The type (mode) we use to check available target support
	     for the vector operation to be created in the *epilog*, is
	     determined by the type of the reduction variable (in the example
	     above we'd check this: optab_handler (plus_optab, vect_int_mode])).
	     However the type (mode) we use to check available target support
	     for the vector operation to be created *inside the loop*, is
	     determined by the type of the other arguments to STMT (in the
	     example we'd check this: optab_handler (widen_sum_optab,
	     vect_short_mode)).

	  This is contrary to "regular" reductions, in which the types of all
	  the arguments are the same as the type of the reduction variable.
	  For "regular" reductions we can therefore use the same vector type
	  (and also the same tree-code) when generating the epilog code and
	  when generating the code inside the loop.  */

  code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info);

  /* A conversion might have created a conditional operation like
     IFN_COND_ADD already.  Use the internal code for the following checks.  */
  if (orig_code.is_internal_fn ())
    {
      tree_code new_code = conditional_internal_fn_code (internal_fn (orig_code));
      orig_code = new_code != ERROR_MARK ? new_code : orig_code;
    }

  STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;

  reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
  if (reduction_type == TREE_CODE_REDUCTION)
    {
      /* Check whether it's ok to change the order of the computation.
	 Generally, when vectorizing a reduction we change the order of the
	 computation.  This may change the behavior of the program in some
	 cases, so we need to check that this is ok.  One exception is when
	 vectorizing an outer-loop: the inner-loop is executed sequentially,
	 and therefore vectorizing reductions in the inner-loop during
	 outer-loop vectorization is safe.  Likewise when we are vectorizing
	 a series of reductions using SLP and the VF is one the reductions
	 are performed in scalar order.  */
      if (slp_node
	  && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
	  && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
	;
      else if (needs_fold_left_reduction_p (op.type, orig_code))
	{
	  /* When vectorizing a reduction chain w/o SLP the reduction PHI
	     is not directly used in stmt.  */
	  if (!only_slp_reduc_chain
	      && reduc_chain_length != 1)
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "in-order reduction chain without SLP.\n");
	      return false;
	    }
	  STMT_VINFO_REDUC_TYPE (reduc_info)
	    = reduction_type = FOLD_LEFT_REDUCTION;
	}
      else if (!commutative_binary_op_p (orig_code, op.type)
	       || !associative_binary_op_p (orig_code, op.type))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "reduction: not commutative/associative\n");
	  return false;
	}
    }

  if ((reduction_type == COND_REDUCTION
       || reduction_type == INTEGER_INDUC_COND_REDUCTION
       || reduction_type == CONST_COND_REDUCTION
       || reduction_type == EXTRACT_LAST_REDUCTION)
      && slp_node
      && SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node) > 1)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "multiple types in condition reduction.\n");
      return false;
    }

  if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
      && ncopies > 1)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "multiple types in double reduction or condition "
			 "reduction or fold-left reduction.\n");
      return false;
    }

  internal_fn reduc_fn = IFN_LAST;
  if (reduction_type == TREE_CODE_REDUCTION
      || reduction_type == FOLD_LEFT_REDUCTION
      || reduction_type == INTEGER_INDUC_COND_REDUCTION
      || reduction_type == CONST_COND_REDUCTION)
    {
      if (reduction_type == FOLD_LEFT_REDUCTION
	  ? fold_left_reduction_fn (orig_code, &reduc_fn)
	  : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
	{
	  if (reduc_fn != IFN_LAST
	      && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
						  OPTIMIZE_FOR_SPEED))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "reduc op not supported by target.\n");

	      reduc_fn = IFN_LAST;
	    }
	}
      else
	{
	  if (!nested_cycle || double_reduc)
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "no reduc code for scalar code.\n");

	      return false;
	    }
	}
    }
  else if (reduction_type == COND_REDUCTION)
    {
      int scalar_precision
	= GET_MODE_PRECISION (SCALAR_TYPE_MODE (op.type));
      cr_index_scalar_type = make_unsigned_type (scalar_precision);
      cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
						     vectype_out);

      if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
					  OPTIMIZE_FOR_SPEED))
	reduc_fn = IFN_REDUC_MAX;
    }
  STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
  if (reduction_type != EXTRACT_LAST_REDUCTION
      && (!nested_cycle || double_reduc)
      && reduc_fn == IFN_LAST
      && !nunits_out.is_constant ())
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "missing target support for reduction on"
			 " variable-length vectors.\n");
      return false;
    }

  /* For SLP reductions, see if there is a neutral value we can use.  */
  tree neutral_op = NULL_TREE;
  if (slp_node)
    {
      tree initial_value = NULL_TREE;
      if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL)
	initial_value = vect_phi_initial_value (reduc_def_phi);
      neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
					     orig_code, initial_value);
    }

  if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
    {
      /* We can't support in-order reductions of code such as this:

	   for (int i = 0; i < n1; ++i)
	     for (int j = 0; j < n2; ++j)
	       l += a[j];

	 since GCC effectively transforms the loop when vectorizing:

	   for (int i = 0; i < n1 / VF; ++i)
	     for (int j = 0; j < n2; ++j)
	       for (int k = 0; k < VF; ++k)
		 l += a[j];

	 which is a reassociation of the original operation.  */
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "in-order double reduction not supported.\n");

      return false;
    }

  if (reduction_type == FOLD_LEFT_REDUCTION
      && (slp_node && SLP_TREE_LANES (slp_node) > 1)
      && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
    {
      /* We cannot use in-order reductions in this case because there is
	 an implicit reassociation of the operations involved.  */
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "in-order unchained SLP reductions not supported.\n");
      return false;
    }

  /* For double reductions, and for SLP reductions with a neutral value,
     we construct a variable-length initial vector by loading a vector
     full of the neutral value and then shift-and-inserting the start
     values into the low-numbered elements.  */
  if ((double_reduc || neutral_op)
      && !nunits_out.is_constant ()
      && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
					  vectype_out, OPTIMIZE_FOR_SPEED))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "reduction on variable-length vectors requires"
			 " target support for a vector-shift-and-insert"
			 " operation.\n");
      return false;
    }

  /* Check extra constraints for variable-length unchained SLP reductions.  */
  if (slp_node
      && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
      && !nunits_out.is_constant ())
    {
      /* We checked above that we could build the initial vector when
	 there's a neutral element value.  Check here for the case in
	 which each SLP statement has its own initial value and in which
	 that value needs to be repeated for every instance of the
	 statement within the initial vector.  */
      unsigned int group_size = SLP_TREE_LANES (slp_node);
      if (!neutral_op
	  && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
					      TREE_TYPE (vectype_out)))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "unsupported form of SLP reduction for"
			     " variable-length vectors: cannot build"
			     " initial vector.\n");
	  return false;
	}
      /* The epilogue code relies on the number of elements being a multiple
	 of the group size.  The duplicate-and-interleave approach to setting
	 up the initial vector does too.  */
      if (!multiple_p (nunits_out, group_size))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "unsupported form of SLP reduction for"
			     " variable-length vectors: the vector size"
			     " is not a multiple of the number of results.\n");
	  return false;
	}
    }

  if (reduction_type == COND_REDUCTION)
    {
      widest_int ni;

      if (! max_loop_iterations (loop, &ni))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "loop count not known, cannot create cond "
			     "reduction.\n");
	  return false;
	}
      /* Convert backedges to iterations.  */
      ni += 1;

      /* The additional index will be the same type as the condition.  Check
	 that the loop can fit into this less one (because we'll use up the
	 zero slot for when there are no matches).  */
      tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
      if (wi::geu_p (ni, wi::to_widest (max_index)))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "loop size is greater than data size.\n");
	  return false;
	}
    }

  /* In case the vectorization factor (VF) is bigger than the number
     of elements that we can fit in a vectype (nunits), we have to generate
     more than one vector stmt - i.e - we need to "unroll" the
     vector stmt by a factor VF/nunits.  For more details see documentation
     in vectorizable_operation.  */

  /* If the reduction is used in an outer loop we need to generate
     VF intermediate results, like so (e.g. for ncopies=2):
	r0 = phi (init, r0)
	r1 = phi (init, r1)
	r0 = x0 + r0;
	r1 = x1 + r1;
    (i.e. we generate VF results in 2 registers).
    In this case we have a separate def-use cycle for each copy, and therefore
    for each copy we get the vector def for the reduction variable from the
    respective phi node created for this copy.

    Otherwise (the reduction is unused in the loop nest), we can combine
    together intermediate results, like so (e.g. for ncopies=2):
	r = phi (init, r)
	r = x0 + r;
	r = x1 + r;
   (i.e. we generate VF/2 results in a single register).
   In this case for each copy we get the vector def for the reduction variable
   from the vectorized reduction operation generated in the previous iteration.

   This only works when we see both the reduction PHI and its only consumer
   in vectorizable_reduction and there are no intermediate stmts
   participating.  When unrolling we want each unrolled iteration to have its
   own reduction accumulator since one of the main goals of unrolling a
   reduction is to reduce the aggregate loop-carried latency.  */
  if ((ncopies > 1
       || (slp_node
	   && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
	   && SLP_TREE_LANES (slp_node) == 1
	   && vect_get_num_copies (loop_vinfo, vectype_in) > 1))
      && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
      && reduc_chain_length == 1
      && loop_vinfo->suggested_unroll_factor == 1)
    single_defuse_cycle = true;
  if (single_defuse_cycle && !lane_reducing)
    {
      gcc_assert (op.code != COND_EXPR);

      /* 4. check support for the operation in the loop

	 This isn't necessary for the lane reduction codes, since they
	 can only be produced by pattern matching, and it's up to the
	 pattern matcher to test for support.  The main reason for
	 specifically skipping this step is to avoid rechecking whether
	 mixed-sign dot-products can be implemented using signed
	 dot-products.  */
      machine_mode vec_mode = TYPE_MODE (vectype_in);
      if (!directly_supported_p (op.code, vectype_in, optab_vector))
	{
	  if (dump_enabled_p ())
	    dump_printf (MSG_NOTE, "op not supported by target.\n");
	  if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
	      || !vect_can_vectorize_without_simd_p (op.code))
	    single_defuse_cycle = false;
	  else
	    if (dump_enabled_p ())
	      dump_printf (MSG_NOTE, "proceeding using word mode.\n");
	}

      if (vect_emulated_vector_p (vectype_in)
	  && !vect_can_vectorize_without_simd_p (op.code))
	{
	  if (dump_enabled_p ())
	    dump_printf (MSG_NOTE, "using word mode not possible.\n");
	  return false;
	}
    }

  if (dump_enabled_p () && single_defuse_cycle)
    dump_printf_loc (MSG_NOTE, vect_location,
		     "using single def-use cycle for reduction by reducing "
		     "multiple vectors to one in the loop body\n");
  STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;

  /* For lane-reducing operation, the below processing related to single
     defuse-cycle will be done in its own vectorizable function.  One more
     thing to note is that the operation must not be involved in fold-left
     reduction.  */
  single_defuse_cycle &= !lane_reducing;

  if (slp_node
      && (single_defuse_cycle || reduction_type == FOLD_LEFT_REDUCTION))
    for (i = 0; i < (int) op.num_ops; i++)
      if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i]))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "incompatible vector types for invariants\n");
	  return false;
	}

  vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
			     reduction_type, ncopies, cost_vec);
  /* Cost the reduction op inside the loop if transformed via
     vect_transform_reduction for non-lane-reducing operation.  Otherwise
     this is costed by the separate vectorizable_* routines.  */
  if (single_defuse_cycle)
    record_stmt_cost (cost_vec, ncopies, vector_stmt, stmt_info, 0, vect_body);

  if (dump_enabled_p ()
      && reduction_type == FOLD_LEFT_REDUCTION)
    dump_printf_loc (MSG_NOTE, vect_location,
		     "using an in-order (fold-left) reduction.\n");
  STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;

  /* All but single defuse-cycle optimized and fold-left reductions go
     through their own vectorizable_* routines.  */
  if (!single_defuse_cycle && reduction_type != FOLD_LEFT_REDUCTION)
    {
      stmt_vec_info tem
	= vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
      if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
	{
	  gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
	  tem = REDUC_GROUP_FIRST_ELEMENT (tem);
	}
      STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
      STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
    }
  else if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
    vect_reduction_update_partial_vector_usage (loop_vinfo, reduc_info,
						slp_node, op.code, op.type,
						vectype_in);
  return true;
}
/* STMT_INFO is a dot-product reduction whose multiplication operands
   have different signs.  Emit a sequence to emulate the operation
   using a series of signed DOT_PROD_EXPRs and return the last
   statement generated.  VEC_DEST is the result of the vector operation
   and VOP lists its inputs.  */
static gassign *
vect_emulate_mixed_dot_prod (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
			     gimple_stmt_iterator *gsi, tree vec_dest,
			     tree vop[3])
{
  tree wide_vectype = signed_type_for (TREE_TYPE (vec_dest));
  tree narrow_vectype = signed_type_for (TREE_TYPE (vop[0]));
  tree narrow_elttype = TREE_TYPE (narrow_vectype);
  gimple *new_stmt;

  /* Make VOP[0] the unsigned operand VOP[1] the signed operand.  */
  if (!TYPE_UNSIGNED (TREE_TYPE (vop[0])))
    std::swap (vop[0], vop[1]);

  /* Convert all inputs to signed types.  */
  for (int i = 0; i < 3; ++i)
    if (TYPE_UNSIGNED (TREE_TYPE (vop[i])))
      {
	tree tmp = make_ssa_name (signed_type_for (TREE_TYPE (vop[i])));
	new_stmt = gimple_build_assign (tmp, NOP_EXPR, vop[i]);
	vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
	vop[i] = tmp;
      }

  /* In the comments below we assume 8-bit inputs for simplicity,
     but the approach works for any full integer type.  */

  /* Create a vector of -128.  */
  tree min_narrow_elttype = TYPE_MIN_VALUE (narrow_elttype);
  tree min_narrow = build_vector_from_val (narrow_vectype,
					   min_narrow_elttype);

  /* Create a vector of 64.  */
  auto half_wi = wi::lrshift (wi::to_wide (min_narrow_elttype), 1);
  tree half_narrow = wide_int_to_tree (narrow_elttype, half_wi);
  half_narrow = build_vector_from_val (narrow_vectype, half_narrow);

  /* Emit: SUB_RES = VOP[0] - 128.  */
  tree sub_res = make_ssa_name (narrow_vectype);
  new_stmt = gimple_build_assign (sub_res, PLUS_EXPR, vop[0], min_narrow);
  vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);

  /* Emit:

       STAGE1 = DOT_PROD_EXPR <VOP[1], 64, VOP[2]>;
       STAGE2 = DOT_PROD_EXPR <VOP[1], 64, STAGE1>;
       STAGE3 = DOT_PROD_EXPR <SUB_RES, -128, STAGE2>;

     on the basis that x * y == (x - 128) * y + 64 * y + 64 * y.
     Doing the two 64 * y steps first allows more time to compute x.  */
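  /* A quick sanity check of the identity with 8-bit inputs: for x = 200 and
     y = -3, x * y = -600, while (200 - 128) * -3 + 64 * -3 + 64 * -3
     = -216 - 192 - 192 = -600 as well, and 200 - 128 = 72 now fits in a
     signed char.  */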
  tree stage1 = make_ssa_name (wide_vectype);
  new_stmt = gimple_build_assign (stage1, DOT_PROD_EXPR,
				  vop[1], half_narrow, vop[2]);
  vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);

  tree stage2 = make_ssa_name (wide_vectype);
  new_stmt = gimple_build_assign (stage2, DOT_PROD_EXPR,
				  vop[1], half_narrow, stage1);
  vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);

  tree stage3 = make_ssa_name (wide_vectype);
  new_stmt = gimple_build_assign (stage3, DOT_PROD_EXPR,
				  sub_res, vop[1], stage2);
  vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);

  /* Convert STAGE3 to the reduction type.  */
  return gimple_build_assign (vec_dest, CONVERT_EXPR, stage3);
}
/* Transform the definition stmt STMT_INFO of a reduction PHI backedge
   value.  */

bool
vect_transform_reduction (loop_vec_info loop_vinfo,
			  stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
			  gimple **vec_stmt, slp_tree slp_node)
{
  tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  unsigned ncopies;
  unsigned vec_num;

  stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
  gcc_assert (reduc_info->is_reduc_info);

  if (nested_in_vect_loop_p (loop, stmt_info))
    {
      loop = loop->inner;
      gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
    }

  gimple_match_op op;
  if (!gimple_extract_op (stmt_info->stmt, &op))
    gcc_unreachable ();

  /* All uses but the last are expected to be defined in the loop.
     The last use is the reduction variable.  In case of nested cycle this
     assumption is not true: we use reduc_index to record the index of the
     reduction variable.  */
  stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
  gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
  int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
  tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (stmt_info);

  if (!vectype_in)
    vectype_in = STMT_VINFO_VECTYPE (stmt_info);

  if (slp_node)
    {
      ncopies = 1;
      vec_num = vect_get_num_copies (loop_vinfo, slp_node, vectype_in);
    }
  else
    {
      ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
      vec_num = 1;
    }

  code_helper code = canonicalize_code (op.code, op.type);
  internal_fn cond_fn = get_conditional_internal_fn (code, op.type);

  vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
  vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
  bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);

  /* Transform.  */
  tree new_temp = NULL_TREE;
  auto_vec<tree> vec_oprnds[3];

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");

  /* FORNOW: Multiple types are not supported for condition.  */
  if (code == COND_EXPR)
    gcc_assert (ncopies == 1);

  /* A binary COND_OP reduction must have the same definition and else
     value.  */
  bool cond_fn_p = code.is_internal_fn ()
    && conditional_internal_fn_code (internal_fn (code)) != ERROR_MARK;
  if (cond_fn_p)
    {
      gcc_assert (code == IFN_COND_ADD || code == IFN_COND_SUB
		  || code == IFN_COND_MUL || code == IFN_COND_AND
		  || code == IFN_COND_IOR || code == IFN_COND_XOR
		  || code == IFN_COND_MIN || code == IFN_COND_MAX);
      gcc_assert (op.num_ops == 4
		  && (op.ops[reduc_index]
		      == op.ops[internal_fn_else_index ((internal_fn) code)]));
    }

  bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);

  vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
  if (reduction_type == FOLD_LEFT_REDUCTION)
    {
      internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
      gcc_assert (code.is_tree_code () || cond_fn_p);
      return vectorize_fold_left_reduction
	  (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
	   code, reduc_fn, op.ops, op.num_ops, vectype_in,
	   reduc_index, masks, lens);
    }
8726 bool single_defuse_cycle
= STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info
);
8727 bool lane_reducing
= lane_reducing_op_p (code
);
8728 gcc_assert (single_defuse_cycle
|| lane_reducing
);
8732 /* The last operand of lane-reducing op is for reduction. */
8733 gcc_assert (reduc_index
== (int) op
.num_ops
- 1);
  /* Create the destination vector.  */
  tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
  tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);

  if (lane_reducing && !slp_node && !single_defuse_cycle)
    {
      /* Note: there are still vectorizable cases that can not be handled by
	 single-lane slp.  Probably it would take some time to evolve the
	 feature to a mature state.  So we have to keep the below non-slp code
	 path as failsafe for lane-reducing support.  */
      gcc_assert (op.num_ops <= 3);
      for (unsigned i = 0; i < op.num_ops; i++)
	{
	  unsigned oprnd_ncopies = ncopies;

	  if ((int) i == reduc_index)
	    {
	      tree vectype = STMT_VINFO_VECTYPE (stmt_info);
	      oprnd_ncopies = vect_get_num_copies (loop_vinfo, vectype);
	    }

	  vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, oprnd_ncopies,
					 op.ops[i], &vec_oprnds[i]);
	}
    }
  /* Get NCOPIES vector definitions for all operands except the reduction
     definition.  */
  else if (!cond_fn_p)
    {
      gcc_assert (reduc_index >= 0 && reduc_index <= 2);
      vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
			 single_defuse_cycle && reduc_index == 0
			 ? NULL_TREE : op.ops[0], &vec_oprnds[0],
			 single_defuse_cycle && reduc_index == 1
			 ? NULL_TREE : op.ops[1], &vec_oprnds[1],
			 op.num_ops == 3
			 && !(single_defuse_cycle && reduc_index == 2)
			 ? op.ops[2] : NULL_TREE, &vec_oprnds[2]);
    }
  else
    {
      /* For a conditional operation pass the truth type as mask
	 vectype.  */
      gcc_assert (single_defuse_cycle
		  && (reduc_index == 1 || reduc_index == 2));
      vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies, op.ops[0],
			 truth_type_for (vectype_in), &vec_oprnds[0],
			 reduc_index == 1 ? NULL_TREE : op.ops[1],
			 NULL_TREE, &vec_oprnds[1],
			 reduc_index == 2 ? NULL_TREE : op.ops[2],
			 NULL_TREE, &vec_oprnds[2]);
    }

  /* For single def-use cycles get one copy of the vectorized reduction
     definition.  */
  if (single_defuse_cycle)
    {
      vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, 1,
			 reduc_index == 0 ? op.ops[0] : NULL_TREE,
			 &vec_oprnds[0],
			 reduc_index == 1 ? op.ops[1] : NULL_TREE,
			 &vec_oprnds[1],
			 reduc_index == 2 ? op.ops[2] : NULL_TREE,
			 &vec_oprnds[2]);
    }
  else if (lane_reducing)
    {
      /* For a normal reduction, consistency between vectorized def/use is
	 naturally ensured when mapping from the scalar statement.  But if a
	 lane-reducing op is involved in the reduction, things become more
	 complicated: the op's result and accumulation operand are limited to
	 fewer lanes than the other operands, which causes a def/use mismatch
	 on adjacent statements around the op if no specific adjustment is
	 made.  One approach is to refit the lane-reducing op by introducing
	 new trivial pass-through copies to fix the possible def/use gap, so
	 that it behaves like a normal op.  Vector reduction PHIs are always
	 generated to the full extent, no matter whether a lane-reducing op
	 exists or not.  If some copies or PHIs are actually superfluous, they
	 are cleaned up by passes after vectorization.  An example for
	 single-lane slp, lane-reducing ops with mixed input vectypes in a
	 reduction chain, is given below.  The same handling applies to
	 multiple-lane slp as well.

	   int sum = 1;
	   for (i)
	     {
	       sum += d0[i] * d1[i];      // dot-prod <vector(16) char>
	       sum += w[i];               // widen-sum <vector(16) char>
	       sum += abs(s0[i] - s1[i]); // sad <vector(8) short>
	       sum += n[i];               // normal <vector(4) int>
	     }

	 The vector size is 128-bit, the vectorization factor is 16.  The
	 reduction statements would be transformed as:

	   vector<4> int sum_v0 = { 0, 0, 0, 1 };
	   vector<4> int sum_v1 = { 0, 0, 0, 0 };
	   vector<4> int sum_v2 = { 0, 0, 0, 0 };
	   vector<4> int sum_v3 = { 0, 0, 0, 0 };

	   for (i / 16)
	     {
	       sum_v0 = DOT_PROD (d0_v0[i: 0 ~ 15], d1_v0[i: 0 ~ 15], sum_v0);
	       sum_v1 = sum_v1;  // copy
	       sum_v2 = sum_v2;  // copy
	       sum_v3 = sum_v3;  // copy

	       sum_v0 = sum_v0;  // copy
	       sum_v1 = WIDEN_SUM (w_v1[i: 0 ~ 15], sum_v1);
	       sum_v2 = sum_v2;  // copy
	       sum_v3 = sum_v3;  // copy

	       sum_v0 = sum_v0;  // copy
	       sum_v1 = SAD (s0_v1[i: 0 ~ 7 ], s1_v1[i: 0 ~ 7 ], sum_v1);
	       sum_v2 = SAD (s0_v2[i: 8 ~ 15], s1_v2[i: 8 ~ 15], sum_v2);
	       sum_v3 = sum_v3;  // copy

	       sum_v0 += n_v0[i: 0 ~ 3 ];
	       sum_v1 += n_v1[i: 4 ~ 7 ];
	       sum_v2 += n_v2[i: 8 ~ 11];
	       sum_v3 += n_v3[i: 12 ~ 15];
	     }

	 Moreover, to get higher instruction parallelism in the final
	 vectorized loop, the effective vector lane-reducing ops are
	 distributed evenly among all def-use cycles.  In the above example,
	 DOT_PROD, WIDEN_SUM and the SADs are generated into disparate cycles,
	 so instruction dependences among them can be eliminated.  */
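      /* Illustrative sketch only (the counts below are assumed, not taken
	 from the example above): with effec_ncopies == 1 and
	 effec_reduc_ncopies == 4, the first lane-reducing statement is
	 emitted into def-use cycle 0 and reduc_result_pos advances to 1, so
	 the next lane-reducing statement lands in cycle 1, the next in
	 cycle 2, and so on.  Cycles that receive no effective statement only
	 get the trivial pass-through copies shown above.  */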
      unsigned effec_ncopies = vec_oprnds[0].length ();
      unsigned total_ncopies = vec_oprnds[reduc_index].length ();

      gcc_assert (effec_ncopies <= total_ncopies);

      if (effec_ncopies < total_ncopies)
	{
	  for (unsigned i = 0; i < op.num_ops - 1; i++)
	    {
	      gcc_assert (vec_oprnds[i].length () == effec_ncopies);
	      vec_oprnds[i].safe_grow_cleared (total_ncopies);
	    }
	}

      tree reduc_vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
      gcc_assert (reduc_vectype_in);

      unsigned effec_reduc_ncopies
	= vect_get_num_copies (loop_vinfo, slp_node, reduc_vectype_in);

      gcc_assert (effec_ncopies <= effec_reduc_ncopies);

      if (effec_ncopies < effec_reduc_ncopies)
	{
	  /* Find suitable def-use cycles to generate vectorized statements
	     into, and reorder operands based on the selection.  */
	  unsigned curr_pos = reduc_info->reduc_result_pos;
	  unsigned next_pos = (curr_pos + effec_ncopies) % effec_reduc_ncopies;

	  gcc_assert (curr_pos < effec_reduc_ncopies);
	  reduc_info->reduc_result_pos = next_pos;

	  if (curr_pos)
	    {
	      unsigned count = effec_reduc_ncopies - effec_ncopies;
	      unsigned start = curr_pos - count;

	      if ((int) start < 0)
		{
		  count = curr_pos;
		  start = 0;
		}

	      for (unsigned i = 0; i < op.num_ops - 1; i++)
		{
		  for (unsigned j = effec_ncopies; j > start; j--)
		    {
		      unsigned k = j - 1;

		      std::swap (vec_oprnds[i][k], vec_oprnds[i][k + count]);
		      gcc_assert (!vec_oprnds[i][k]);
		    }
		}
	    }
	}
    }
  bool emulated_mixed_dot_prod = vect_is_emulated_mixed_dot_prod (stmt_info);
  unsigned num = vec_oprnds[reduc_index == 0 ? 1 : 0].length ();
  unsigned mask_index = 0;

  for (unsigned i = 0; i < num; ++i)
    {
      gimple *new_stmt;
      tree vop[3] = { vec_oprnds[0][i], vec_oprnds[1][i], NULL_TREE };
      if (!vop[0] || !vop[1])
	{
	  tree reduc_vop = vec_oprnds[reduc_index][i];

	  /* If we could not generate an effective vector statement for the
	     current portion of the reduction operand, insert a trivial copy
	     to simply hand the operand over to other dependent
	     statements.  */
	  gcc_assert (reduc_vop);

	  if (slp_node && TREE_CODE (reduc_vop) == SSA_NAME
	      && !SSA_NAME_IS_DEFAULT_DEF (reduc_vop))
	    new_stmt = SSA_NAME_DEF_STMT (reduc_vop);
	  else
	    {
	      new_temp = make_ssa_name (vec_dest);
	      new_stmt = gimple_build_assign (new_temp, reduc_vop);
	      vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt,
					   gsi);
	    }
	}
      else if (masked_loop_p && !mask_by_cond_expr)
	{
	  /* No conditional ifns have been defined for lane-reducing ops
	     yet.  */
	  gcc_assert (!lane_reducing);

	  /* Make sure that the reduction accumulator is vop[0].  */
	  if (reduc_index == 1)
	    {
	      gcc_assert (commutative_binary_op_p (code, op.type));
	      std::swap (vop[0], vop[1]);
	    }
	  tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
					  vec_num * ncopies, vectype_in,
					  mask_index++);
	  gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
						    vop[0], vop[1], vop[0]);
	  new_temp = make_ssa_name (vec_dest, call);
	  gimple_call_set_lhs (call, new_temp);
	  gimple_call_set_nothrow (call, true);
	  vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
	  new_stmt = call;
	}
      else
	{
	  if (op.num_ops >= 3)
	    vop[2] = vec_oprnds[2][i];

	  if (masked_loop_p && mask_by_cond_expr)
	    {
	      tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
					      vec_num * ncopies, vectype_in,
					      mask_index++);
	      build_vect_cond_expr (code, vop, mask, gsi);
	    }

	  if (emulated_mixed_dot_prod)
	    new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
						    vec_dest, vop);
	  else if (code.is_internal_fn () && !cond_fn_p)
	    new_stmt = gimple_build_call_internal (internal_fn (code),
						   op.num_ops,
						   vop[0], vop[1], vop[2]);
	  else if (code.is_internal_fn () && cond_fn_p)
	    new_stmt = gimple_build_call_internal (internal_fn (code),
						   op.num_ops,
						   vop[0], vop[1], vop[2],
						   vop[reduc_index]);
	  else
	    new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
					    vop[0], vop[1], vop[2]);
	  new_temp = make_ssa_name (vec_dest, new_stmt);
	  gimple_set_lhs (new_stmt, new_temp);
	  vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
	}

      if (single_defuse_cycle && i < num - 1)
	vec_oprnds[reduc_index].safe_push (gimple_get_lhs (new_stmt));
      else if (slp_node)
	slp_node->push_vec_def (new_stmt);
      else
	STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
    }

  if (!slp_node)
    *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];

  return true;
}
9019 /* Transform phase of a cycle PHI. */
9022 vect_transform_cycle_phi (loop_vec_info loop_vinfo
,
9023 stmt_vec_info stmt_info
, gimple
**vec_stmt
,
9024 slp_tree slp_node
, slp_instance slp_node_instance
)
9026 tree vectype_out
= STMT_VINFO_VECTYPE (stmt_info
);
9027 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
9031 bool nested_cycle
= false;
9034 if (nested_in_vect_loop_p (loop
, stmt_info
))
9037 nested_cycle
= true;
9040 stmt_vec_info reduc_stmt_info
= STMT_VINFO_REDUC_DEF (stmt_info
);
9041 reduc_stmt_info
= vect_stmt_to_vectorize (reduc_stmt_info
);
9042 stmt_vec_info reduc_info
= info_for_reduction (loop_vinfo
, stmt_info
);
9043 gcc_assert (reduc_info
->is_reduc_info
);
9045 if (STMT_VINFO_REDUC_TYPE (reduc_info
) == EXTRACT_LAST_REDUCTION
9046 || STMT_VINFO_REDUC_TYPE (reduc_info
) == FOLD_LEFT_REDUCTION
)
9047 /* Leave the scalar phi in place. */
9052 vec_num
= SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
);
9058 ncopies
= vect_get_num_copies (loop_vinfo
,
9059 STMT_VINFO_VECTYPE (stmt_info
));
9062 /* Check whether we should use a single PHI node and accumulate
9063 vectors to one before the backedge. */
9064 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info
))
9070 /* Create the destination vector */
9071 gphi
*phi
= as_a
<gphi
*> (stmt_info
->stmt
);
9072 tree vec_dest
= vect_create_destination_var (gimple_phi_result (phi
),
9075 /* Get the loop-entry arguments. */
9076 tree vec_initial_def
= NULL_TREE
;
9077 auto_vec
<tree
> vec_initial_defs
;
9080 vec_initial_defs
.reserve (vec_num
);
9081 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
9082 and we can't use zero for induc_val, use initial_def. Similarly
9083 for REDUC_MIN and initial_def larger than the base. */
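	  /* For illustration only (the numbers are assumed): for a REDUC_MAX
	     condition reduction with initial_def = 5 and induc_val = 10,
	     initial_def is smaller, so the code below switches induc_val to
	     the initial value 5 and clears
	     STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL so the epilogue knows the
	     initial_def was used; the mirrored case applies to REDUC_MIN
	     when initial_def is larger.  */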
9084 if (STMT_VINFO_REDUC_TYPE (reduc_info
) == INTEGER_INDUC_COND_REDUCTION
)
9086 gcc_assert (SLP_TREE_LANES (slp_node
) == 1);
9087 tree initial_def
= vect_phi_initial_value (phi
);
9088 reduc_info
->reduc_initial_values
.safe_push (initial_def
);
9089 tree induc_val
= STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info
);
9090 if (TREE_CODE (initial_def
) == INTEGER_CST
9091 && !integer_zerop (induc_val
)
9092 && ((STMT_VINFO_REDUC_CODE (reduc_info
) == MAX_EXPR
9093 && tree_int_cst_lt (initial_def
, induc_val
))
9094 || (STMT_VINFO_REDUC_CODE (reduc_info
) == MIN_EXPR
9095 && tree_int_cst_lt (induc_val
, initial_def
))))
9097 induc_val
= initial_def
;
	      /* Communicate we used the initial_def to epilogue
9100 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info
) = NULL_TREE
;
9102 vec_initial_defs
.quick_push
9103 (build_vector_from_val (vectype_out
, induc_val
));
9105 else if (nested_cycle
)
9107 unsigned phi_idx
= loop_preheader_edge (loop
)->dest_idx
;
9108 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node
)[phi_idx
],
9113 gcc_assert (slp_node
== slp_node_instance
->reduc_phis
);
9114 vec
<tree
> &initial_values
= reduc_info
->reduc_initial_values
;
9115 vec
<stmt_vec_info
> &stmts
= SLP_TREE_SCALAR_STMTS (slp_node
);
9117 unsigned int num_phis
= stmts
.length ();
9118 if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info
))
9120 initial_values
.reserve (num_phis
);
9121 for (unsigned int i
= 0; i
< num_phis
; ++i
)
9123 gphi
*this_phi
= as_a
<gphi
*> (stmts
[i
]->stmt
);
9124 initial_values
.quick_push (vect_phi_initial_value (this_phi
));
9127 vect_find_reusable_accumulator (loop_vinfo
, reduc_info
);
9128 if (!initial_values
.is_empty ())
9131 = (num_phis
== 1 ? initial_values
[0] : NULL_TREE
);
9132 code_helper code
= STMT_VINFO_REDUC_CODE (reduc_info
);
9134 = neutral_op_for_reduction (TREE_TYPE (vectype_out
),
9135 code
, initial_value
);
9136 get_initial_defs_for_reduction (loop_vinfo
, reduc_info
,
9137 &vec_initial_defs
, vec_num
,
9138 stmts
.length (), neutral_op
);
9144 /* Get at the scalar def before the loop, that defines the initial
9145 value of the reduction variable. */
9146 tree initial_def
= vect_phi_initial_value (phi
);
9147 reduc_info
->reduc_initial_values
.safe_push (initial_def
);
9148 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
9149 and we can't use zero for induc_val, use initial_def. Similarly
9150 for REDUC_MIN and initial_def larger than the base. */
9151 if (STMT_VINFO_REDUC_TYPE (reduc_info
) == INTEGER_INDUC_COND_REDUCTION
)
9153 tree induc_val
= STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info
);
9154 if (TREE_CODE (initial_def
) == INTEGER_CST
9155 && !integer_zerop (induc_val
)
9156 && ((STMT_VINFO_REDUC_CODE (reduc_info
) == MAX_EXPR
9157 && tree_int_cst_lt (initial_def
, induc_val
))
9158 || (STMT_VINFO_REDUC_CODE (reduc_info
) == MIN_EXPR
9159 && tree_int_cst_lt (induc_val
, initial_def
))))
9161 induc_val
= initial_def
;
	  /* Communicate we used the initial_def to epilogue
9164 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info
) = NULL_TREE
;
9166 vec_initial_def
= build_vector_from_val (vectype_out
, induc_val
);
9168 else if (nested_cycle
)
9170 /* Do not use an adjustment def as that case is not supported
9171 correctly if ncopies is not one. */
9172 vect_get_vec_defs_for_operand (loop_vinfo
, reduc_stmt_info
,
9173 ncopies
, initial_def
,
9176 else if (STMT_VINFO_REDUC_TYPE (reduc_info
) == CONST_COND_REDUCTION
9177 || STMT_VINFO_REDUC_TYPE (reduc_info
) == COND_REDUCTION
)
9178 /* Fill the initial vector with the initial scalar value. */
9180 = get_initial_def_for_reduction (loop_vinfo
, reduc_stmt_info
,
9181 initial_def
, initial_def
);
9185 vect_find_reusable_accumulator (loop_vinfo
, reduc_info
);
9186 if (!reduc_info
->reduc_initial_values
.is_empty ())
9188 initial_def
= reduc_info
->reduc_initial_values
[0];
9189 code_helper code
= STMT_VINFO_REDUC_CODE (reduc_info
);
9191 = neutral_op_for_reduction (TREE_TYPE (initial_def
),
9193 gcc_assert (neutral_op
);
9194 /* Try to simplify the vector initialization by applying an
9195 adjustment after the reduction has been performed. */
9196 if (!reduc_info
->reused_accumulator
9197 && STMT_VINFO_DEF_TYPE (stmt_info
) == vect_reduction_def
9198 && !operand_equal_p (neutral_op
, initial_def
))
9200 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info
)
9202 initial_def
= neutral_op
;
9205 = get_initial_def_for_reduction (loop_vinfo
, reduc_info
,
9206 initial_def
, neutral_op
);
9211 if (vec_initial_def
)
9213 vec_initial_defs
.create (ncopies
);
9214 for (i
= 0; i
< ncopies
; ++i
)
9215 vec_initial_defs
.quick_push (vec_initial_def
);
9218 if (auto *accumulator
= reduc_info
->reused_accumulator
)
9220 tree def
= accumulator
->reduc_input
;
9221 if (!useless_type_conversion_p (vectype_out
, TREE_TYPE (def
)))
9223 unsigned int nreduc
;
9224 bool res
= constant_multiple_p (TYPE_VECTOR_SUBPARTS
9226 TYPE_VECTOR_SUBPARTS (vectype_out
),
9229 gimple_seq stmts
= NULL
;
9230 /* Reduce the single vector to a smaller one. */
9233 /* Perform the reduction in the appropriate type. */
9234 tree rvectype
= vectype_out
;
9235 if (!useless_type_conversion_p (TREE_TYPE (vectype_out
),
9236 TREE_TYPE (TREE_TYPE (def
))))
9237 rvectype
= build_vector_type (TREE_TYPE (TREE_TYPE (def
)),
9238 TYPE_VECTOR_SUBPARTS
9240 def
= vect_create_partial_epilog (def
, rvectype
,
9241 STMT_VINFO_REDUC_CODE
9245 /* The epilogue loop might use a different vector mode, like
9247 if (TYPE_MODE (vectype_out
) != TYPE_MODE (TREE_TYPE (def
)))
9249 tree reduc_type
= build_vector_type_for_mode
9250 (TREE_TYPE (TREE_TYPE (def
)), TYPE_MODE (vectype_out
));
9251 def
= gimple_convert (&stmts
, reduc_type
, def
);
9253 /* Adjust the input so we pick up the partially reduced value
9254 for the skip edge in vect_create_epilog_for_reduction. */
9255 accumulator
->reduc_input
= def
;
9256 /* And the reduction could be carried out using a different sign. */
9257 if (!useless_type_conversion_p (vectype_out
, TREE_TYPE (def
)))
9258 def
= gimple_convert (&stmts
, vectype_out
, def
);
9260 if ((e
= loop_vinfo
->main_loop_edge
)
9261 || (e
= loop_vinfo
->skip_this_loop_edge
))
9263 /* While we'd like to insert on the edge this will split
9264 blocks and disturb bookkeeping, we also will eventually
9265 need this on the skip edge. Rely on sinking to
9266 fixup optimal placement and insert in the pred. */
9267 gimple_stmt_iterator gsi
= gsi_last_bb (e
->src
);
9268 /* Insert before a cond that eventually skips the
9270 if (!gsi_end_p (gsi
) && stmt_ends_bb_p (gsi_stmt (gsi
)))
9272 gsi_insert_seq_after (&gsi
, stmts
, GSI_CONTINUE_LINKING
);
9275 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop
),
9278 if (loop_vinfo
->main_loop_edge
)
9280 = vect_get_main_loop_result (loop_vinfo
, def
,
9281 vec_initial_defs
[0]);
9283 vec_initial_defs
.safe_push (def
);
9286 /* Generate the reduction PHIs upfront. */
9287 for (i
= 0; i
< vec_num
; i
++)
9289 tree vec_init_def
= vec_initial_defs
[i
];
9290 for (j
= 0; j
< ncopies
; j
++)
9292 /* Create the reduction-phi that defines the reduction
9294 gphi
*new_phi
= create_phi_node (vec_dest
, loop
->header
);
9296 /* Set the loop-entry arg of the reduction-phi. */
9297 if (j
!= 0 && nested_cycle
)
9298 vec_init_def
= vec_initial_defs
[j
];
9299 add_phi_arg (new_phi
, vec_init_def
, loop_preheader_edge (loop
),
9302 /* The loop-latch arg is set in epilogue processing. */
9305 slp_node
->push_vec_def (new_phi
);
9309 *vec_stmt
= new_phi
;
9310 STMT_VINFO_VEC_STMTS (stmt_info
).safe_push (new_phi
);
/* Vectorizes LC PHIs.  */

bool
vectorizable_lc_phi (loop_vec_info loop_vinfo,
		     stmt_vec_info stmt_info, gimple **vec_stmt,
		     slp_tree slp_node)
{
  if (!loop_vinfo
      || !is_a <gphi *> (stmt_info->stmt)
      || gimple_phi_num_args (stmt_info->stmt) != 1)
    return false;

  if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
      && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
    return false;

  if (!vec_stmt) /* transformation not required.  */
    {
      /* Deal with copies from externs or constants that disguise as
	 loop-closed PHI nodes (PR97886).  */
      if (slp_node
	  && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
						SLP_TREE_VECTYPE (slp_node)))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "incompatible vector types for invariants\n");
	  return false;
	}
      STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
      return true;
    }

  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  tree scalar_dest = gimple_phi_result (stmt_info->stmt);
  basic_block bb = gimple_bb (stmt_info->stmt);
  edge e = single_pred_edge (bb);
  tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
  auto_vec<tree> vec_oprnds;
  vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
		     !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
		     gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
  for (unsigned i = 0; i < vec_oprnds.length (); i++)
    {
      /* Create the vectorized LC PHI node.  */
      gphi *new_phi = create_phi_node (vec_dest, bb);
      add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
      if (slp_node)
	slp_node->push_vec_def (new_phi);
      else
	STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
    }
  if (!slp_node)
    *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];

  return true;
}
/* Vectorizes PHIs.  */

bool
vectorizable_phi (vec_info *,
		  stmt_vec_info stmt_info, gimple **vec_stmt,
		  slp_tree slp_node, stmt_vector_for_cost *cost_vec)
{
  if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
    return false;

  if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
    return false;

  tree vectype = SLP_TREE_VECTYPE (slp_node);

  if (!vec_stmt) /* transformation not required.  */
    {
      slp_tree child;
      unsigned i;
      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
	if (!child)
	  {
	    if (dump_enabled_p ())
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "PHI node with unvectorized backedge def\n");
	    return false;
	  }
	else if (!vect_maybe_update_slp_op_vectype (child, vectype))
	  {
	    if (dump_enabled_p ())
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "incompatible vector types for invariants\n");
	    return false;
	  }
	else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
		 && !useless_type_conversion_p (vectype,
						SLP_TREE_VECTYPE (child)))
	  {
	    /* With bools we can have mask and non-mask precision vectors
	       or different non-mask precisions.  While pattern recog is
	       supposed to guarantee consistency here, bugs in it can cause
	       mismatches (PR103489 and PR103800 for example).
	       Deal with them here instead of ICEing later.  */
	    if (dump_enabled_p ())
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "incompatible vector type setup from "
			       "bool pattern detection\n");
	    return false;
	  }

      /* For single-argument PHIs assume coalescing which means zero cost
	 for the scalar and the vector PHIs.  This avoids artificially
	 favoring the vector path (but may pessimize it in some cases).  */
      if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
	record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
			  vector_stmt, stmt_info, vectype, 0, vect_body);
      STMT_VINFO_TYPE (stmt_info) = phi_info_type;
      return true;
    }

  tree scalar_dest = gimple_phi_result (stmt_info->stmt);
  basic_block bb = gimple_bb (stmt_info->stmt);
  tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
  auto_vec<gphi *> new_phis;
  for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
    {
      slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];

      /* Skip not yet vectorized defs.  */
      if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
	  && SLP_TREE_VEC_DEFS (child).is_empty ())
	continue;

      auto_vec<tree> vec_oprnds;
      vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
      if (!new_phis.exists ())
	{
	  new_phis.create (vec_oprnds.length ());
	  for (unsigned j = 0; j < vec_oprnds.length (); j++)
	    {
	      /* Create the vectorized LC PHI node.  */
	      new_phis.quick_push (create_phi_node (vec_dest, bb));
	      slp_node->push_vec_def (new_phis[j]);
	    }
	}
      edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
      for (unsigned j = 0; j < vec_oprnds.length (); j++)
	add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
    }
  /* We should have at least one already vectorized child.  */
  gcc_assert (new_phis.exists ());

  return true;
}
/* Vectorizes first order recurrences.  An overview of the transformation
   is described below.  Suppose we have the following loop.

     int t = 0;
     for (int i = 0; i < n; ++i)
       {
	 b[i] = a[i] - t;
	 t = a[i];
       }

   There is a first-order recurrence on 'a'.  For this loop, the scalar IR
   looks (simplified) like:

    scalar.preheader:
      init = 0;

    scalar.body:
      i = PHI <0(scalar.preheader), i+1(scalar.body)>
      _2 = PHI <init(scalar.preheader), _1(scalar.body)>
      _1 = a[i]
      b[i] = _1 - _2
      if (i < n) goto scalar.body

   In this example, _2 is a recurrence because its value depends on the
   previous iteration.  We vectorize this as (VF = 4)

    vector.preheader:
      vect_init = vect_cst(..., ..., ..., 0)

    vector.body:
      i = PHI <0(vector.preheader), i+4(vector.body)>
      vect_1 = PHI <vect_init(vector.preheader), v2(vector.body)>
      vect_2 = a[i, i+1, i+2, i+3];
      vect_3 = vec_perm (vect_1, vect_2, { 3, 4, 5, 6 })
      b[i, i+1, i+2, i+3] = vect_2 - vect_3
      if (..) goto vector.body

   In this function, vectorizable_recurr, we code generate both the
   vector PHI node and the permute since those together compute the
   vectorized value of the scalar PHI.  We do not yet have the
   backedge value to fill in there nor into the vec_perm.  Those
   are filled in maybe_set_vectorized_backedge_value and
   vect_schedule_scc.

   TODO:  Since the scalar loop does not have a use of the recurrence
   outside of the loop the natural way to implement peeling via
   vectorizing the live value doesn't work.  For now peeling of loops
   with a recurrence is not implemented.  For SLP the supported cases
   are restricted to those requiring a single vector recurrence PHI.  */
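/* A small sketch of the permute selector used below (the concrete vector
   type is assumed, not taken from a particular target): for a four-lane
   recurrence with a single SLP lane (dist == 1) the builder pushes
   nunits - dist + i for i = 0..2, i.e. { 3, 4, 5 }, which vec_perm_indices
   extends to the series { 3, 4, 5, 6 } used in the example above: lane 3 of
   the previous vector followed by lanes 0..2 of the current one.  */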
bool
vectorizable_recurr (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
		     gimple **vec_stmt, slp_tree slp_node,
		     stmt_vector_for_cost *cost_vec)
{
  if (!loop_vinfo || !is_a<gphi *> (stmt_info->stmt))
    return false;

  gphi *phi = as_a<gphi *> (stmt_info->stmt);

  /* So far we only support first-order recurrence auto-vectorization.  */
  if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
    return false;

  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  unsigned ncopies;
  if (slp_node)
    ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
  else
    ncopies = vect_get_num_copies (loop_vinfo, vectype);
  poly_int64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
  unsigned dist = slp_node ? SLP_TREE_LANES (slp_node) : 1;
  /* We need to be able to make progress with a single vector.  */
  if (maybe_gt (dist * 2, nunits))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "first order recurrence exceeds half of "
			 "a vector\n");
      return false;
    }

  /* First-order recurrence autovectorization needs to handle permutation
     with indices = [nunits-1, nunits, nunits+1, ...].  */
  vec_perm_builder sel (nunits, 1, 3);
  for (int i = 0; i < 3; ++i)
    sel.quick_push (nunits - dist + i);
  vec_perm_indices indices (sel, 2, nunits);

  if (!vec_stmt) /* transformation not required.  */
    {
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
				 indices))
	return false;

      if (slp_node)
	{
	  /* We eventually need to set a vector type on invariant
	     arguments.  */
	  unsigned j;
	  slp_tree child;
	  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
	    if (!vect_maybe_update_slp_op_vectype
		  (child, SLP_TREE_VECTYPE (slp_node)))
	      {
		if (dump_enabled_p ())
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "incompatible vector types for "
				   "invariants\n");
		return false;
	      }
	}

      /* Verify we have set up compatible types.  */
      edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
      tree latch_vectype = NULL_TREE;
      if (slp_node)
	{
	  slp_tree latch_def = SLP_TREE_CHILDREN (slp_node)[le->dest_idx];
	  latch_vectype = SLP_TREE_VECTYPE (latch_def);
	}
      else
	{
	  tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, le);
	  if (TREE_CODE (latch_def) == SSA_NAME)
	    {
	      stmt_vec_info latch_def_info = loop_vinfo->lookup_def (latch_def);
	      latch_def_info = vect_stmt_to_vectorize (latch_def_info);
	      latch_vectype = STMT_VINFO_VECTYPE (latch_def_info);
	    }
	}
      if (!types_compatible_p (latch_vectype, vectype))
	return false;

      /* The recurrence costs the initialization vector and one permute
	 for each copy.  */
      unsigned prologue_cost = record_stmt_cost (cost_vec, 1, scalar_to_vec,
						 stmt_info, 0, vect_prologue);
      unsigned inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
					       stmt_info, 0, vect_body);
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "vectorizable_recurr: inside_cost = %d, "
			 "prologue_cost = %d .\n", inside_cost,
			 prologue_cost);

      STMT_VINFO_TYPE (stmt_info) = recurr_info_type;
      return true;
    }

  edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
  basic_block bb = gimple_bb (phi);
  tree preheader = PHI_ARG_DEF_FROM_EDGE (phi, pe);
  if (!useless_type_conversion_p (TREE_TYPE (vectype), TREE_TYPE (preheader)))
    {
      gimple_seq stmts = NULL;
      preheader = gimple_convert (&stmts, TREE_TYPE (vectype), preheader);
      gsi_insert_seq_on_edge_immediate (pe, stmts);
    }
  tree vec_init = build_vector_from_val (vectype, preheader);
  vec_init = vect_init_vector (loop_vinfo, stmt_info, vec_init, vectype, NULL);

  /* Create the vectorized first-order PHI node.  */
  tree vec_dest = vect_get_new_vect_var (vectype,
					 vect_simple_var, "vec_recur_");
  gphi *new_phi = create_phi_node (vec_dest, bb);
  add_phi_arg (new_phi, vec_init, pe, UNKNOWN_LOCATION);

  /* Insert shuffles for the first-order recurrence autovectorization.
       result = VEC_PERM <vec_recur, vect_1, index[nunits-1, nunits, ...]>.  */
  tree perm = vect_gen_perm_mask_checked (vectype, indices);

  /* Insert the required permute after the latch definition.  The
     second and later operands are tentative and will be updated when we have
     vectorized the latch definition.  */
  edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
  gimple *latch_def = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, le));
  gimple_stmt_iterator gsi2 = gsi_for_stmt (latch_def);
  gsi_next (&gsi2);

  for (unsigned i = 0; i < ncopies; ++i)
    {
      vec_dest = make_ssa_name (vectype);
      gassign *vperm
	  = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
				 i == 0 ? gimple_phi_result (new_phi) : NULL,
				 NULL, perm);
      vect_finish_stmt_generation (loop_vinfo, stmt_info, vperm, &gsi2);

      if (slp_node)
	slp_node->push_vec_def (vperm);
      else
	STMT_VINFO_VEC_STMTS (stmt_info).safe_push (vperm);
    }

  if (!slp_node)
    *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
  return true;
}
/* Return true if VECTYPE represents a vector that requires lowering
   by the vector lowering pass.  */

bool
vect_emulated_vector_p (tree vectype)
{
  return (!VECTOR_MODE_P (TYPE_MODE (vectype))
	  && (!VECTOR_BOOLEAN_TYPE_P (vectype)
	      || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
}

/* Return true if we can emulate CODE on an integer mode representation
   of the vectors.  */

bool
vect_can_vectorize_without_simd_p (tree_code code)
{
  switch (code)
    {
    case PLUS_EXPR:
    case MINUS_EXPR:
    case NEGATE_EXPR:
    case BIT_AND_EXPR:
    case BIT_IOR_EXPR:
    case BIT_XOR_EXPR:
    case BIT_NOT_EXPR:
      return true;

    default:
      return false;
    }
}

/* Likewise, but taking a code_helper.  */

bool
vect_can_vectorize_without_simd_p (code_helper code)
{
  return (code.is_tree_code ()
	  && vect_can_vectorize_without_simd_p (tree_code (code)));
}
9713 /* Create vector init for vectorized iv. */
9715 vect_create_nonlinear_iv_init (gimple_seq
* stmts
, tree init_expr
,
9716 tree step_expr
, poly_uint64 nunits
,
9718 enum vect_induction_op_type induction_type
)
9720 unsigned HOST_WIDE_INT const_nunits
;
9721 tree vec_shift
, vec_init
, new_name
;
9723 tree itype
= TREE_TYPE (vectype
);
9725 /* iv_loop is the loop to be vectorized. Create:
9726 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr). */
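  /* For illustration only (the values are assumed): for vect_step_op_shr
     with X = 64, S = 1 and four lanes this builds 64 >> { 0, 1, 2, 3 },
     i.e. { 64, 32, 16, 8 }; the other cases below build the analogous
     negated or multiplied series.  */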
9727 new_name
= gimple_convert (stmts
, itype
, init_expr
);
9728 switch (induction_type
)
9730 case vect_step_op_shr
:
9731 case vect_step_op_shl
:
9732 /* Build the Initial value from shift_expr. */
9733 vec_init
= gimple_build_vector_from_val (stmts
,
9736 vec_shift
= gimple_build (stmts
, VEC_SERIES_EXPR
, vectype
,
9737 build_zero_cst (itype
), step_expr
);
9738 vec_init
= gimple_build (stmts
,
9739 (induction_type
== vect_step_op_shr
9740 ? RSHIFT_EXPR
: LSHIFT_EXPR
),
9741 vectype
, vec_init
, vec_shift
);
9744 case vect_step_op_neg
:
9746 vec_init
= gimple_build_vector_from_val (stmts
,
9749 tree vec_neg
= gimple_build (stmts
, NEGATE_EXPR
,
9751 /* The encoding has 2 interleaved stepped patterns. */
9752 vec_perm_builder
sel (nunits
, 2, 3);
9754 for (i
= 0; i
< 3; i
++)
9757 sel
[2 * i
+ 1] = i
+ nunits
;
9759 vec_perm_indices
indices (sel
, 2, nunits
);
9760 /* Don't use vect_gen_perm_mask_checked since can_vec_perm_const_p may
9761 fail when vec_init is const vector. In that situation vec_perm is not
9764 = vect_gen_perm_mask_any (vectype
, indices
);
9765 vec_init
= gimple_build (stmts
, VEC_PERM_EXPR
,
9772 case vect_step_op_mul
:
9774 /* Use unsigned mult to avoid UD integer overflow. */
9775 gcc_assert (nunits
.is_constant (&const_nunits
));
9776 tree utype
= unsigned_type_for (itype
);
9777 tree uvectype
= build_vector_type (utype
,
9778 TYPE_VECTOR_SUBPARTS (vectype
));
9779 new_name
= gimple_convert (stmts
, utype
, new_name
);
9780 vec_init
= gimple_build_vector_from_val (stmts
,
9783 tree_vector_builder
elts (uvectype
, const_nunits
, 1);
9784 tree elt_step
= build_one_cst (utype
);
9786 elts
.quick_push (elt_step
);
9787 for (i
= 1; i
< const_nunits
; i
++)
9789 /* Create: new_name_i = new_name + step_expr. */
9790 elt_step
= gimple_build (stmts
, MULT_EXPR
,
9791 utype
, elt_step
, step_expr
);
9792 elts
.quick_push (elt_step
);
9794 /* Create a vector from [new_name_0, new_name_1, ...,
9795 new_name_nunits-1]. */
9796 tree vec_mul
= gimple_build_vector (stmts
, &elts
);
9797 vec_init
= gimple_build (stmts
, MULT_EXPR
, uvectype
,
9799 vec_init
= gimple_convert (stmts
, vectype
, vec_init
);
9810 /* Peel init_expr by skip_niter for induction_type. */
9812 vect_peel_nonlinear_iv_init (gimple_seq
* stmts
, tree init_expr
,
9813 tree skip_niters
, tree step_expr
,
9814 enum vect_induction_op_type induction_type
)
9816 gcc_assert (TREE_CODE (skip_niters
) == INTEGER_CST
);
9817 tree type
= TREE_TYPE (init_expr
);
9818 unsigned prec
= TYPE_PRECISION (type
);
9819 switch (induction_type
)
9821 case vect_step_op_neg
:
9822 if (TREE_INT_CST_LOW (skip_niters
) % 2)
9823 init_expr
= gimple_build (stmts
, NEGATE_EXPR
, type
, init_expr
);
9824 /* else no change. */
9827 case vect_step_op_shr
:
9828 case vect_step_op_shl
:
9829 skip_niters
= gimple_convert (stmts
, type
, skip_niters
);
9830 step_expr
= gimple_build (stmts
, MULT_EXPR
, type
, step_expr
, skip_niters
);
      /* When the shift amount >= precision, we need to avoid UD.
	 In the original loop there is no UD, and according to the semantics
	 init_expr should be 0 for lshr and ashl, and >>= (prec - 1) for
	 ashr.  */
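      /* For illustration only (the values are assumed): peeling
	 skip_niters = 3 iterations of a uint8_t >>= 3 induction gives a
	 total shift of 9, which is >= the precision of 8, so the peeled
	 initial value is simply 0 (or init >> 7 for a signed arithmetic
	 shift).  */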
9834 if (!tree_fits_uhwi_p (step_expr
)
9835 || tree_to_uhwi (step_expr
) >= prec
)
9837 if (induction_type
== vect_step_op_shl
9838 || TYPE_UNSIGNED (type
))
9839 init_expr
= build_zero_cst (type
);
9841 init_expr
= gimple_build (stmts
, RSHIFT_EXPR
, type
,
9843 wide_int_to_tree (type
, prec
- 1));
9846 init_expr
= gimple_build (stmts
, (induction_type
== vect_step_op_shr
9847 ? RSHIFT_EXPR
: LSHIFT_EXPR
),
9848 type
, init_expr
, step_expr
);
9851 case vect_step_op_mul
:
9853 tree utype
= unsigned_type_for (type
);
9854 init_expr
= gimple_convert (stmts
, utype
, init_expr
);
9855 wide_int skipn
= wi::to_wide (skip_niters
);
9856 wide_int begin
= wi::to_wide (step_expr
);
9857 auto_mpz base
, exp
, mod
, res
;
9858 wi::to_mpz (begin
, base
, TYPE_SIGN (type
));
9859 wi::to_mpz (skipn
, exp
, UNSIGNED
);
9860 mpz_ui_pow_ui (mod
, 2, TYPE_PRECISION (type
));
9861 mpz_powm (res
, base
, exp
, mod
);
9862 begin
= wi::from_mpz (utype
, res
, true);
9863 tree mult_expr
= wide_int_to_tree (utype
, begin
);
9864 init_expr
= gimple_build (stmts
, MULT_EXPR
, utype
,
9865 init_expr
, mult_expr
);
9866 init_expr
= gimple_convert (stmts
, type
, init_expr
);
9877 /* Create vector step for vectorized iv. */
9879 vect_create_nonlinear_iv_step (gimple_seq
* stmts
, tree step_expr
,
9881 enum vect_induction_op_type induction_type
)
9883 tree expr
= build_int_cst (TREE_TYPE (step_expr
), vf
);
9884 tree new_name
= NULL
;
9885 /* Step should be pow (step, vf) for mult induction. */
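  /* For illustration only (the values are assumed): with step_expr = 3 and
     VF = 4 the per-vector-iteration step below becomes 3 * 3 * 3 * 3 = 81,
     since each scalar iteration multiplies by 3 and one vector iteration
     covers four of them.  */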
9886 if (induction_type
== vect_step_op_mul
)
9888 gcc_assert (vf
.is_constant ());
9889 wide_int begin
= wi::to_wide (step_expr
);
9891 for (unsigned i
= 0; i
!= vf
.to_constant () - 1; i
++)
9892 begin
= wi::mul (begin
, wi::to_wide (step_expr
));
9894 new_name
= wide_int_to_tree (TREE_TYPE (step_expr
), begin
);
9896 else if (induction_type
== vect_step_op_neg
)
9900 new_name
= gimple_build (stmts
, MULT_EXPR
, TREE_TYPE (step_expr
),
9906 vect_create_nonlinear_iv_vec_step (loop_vec_info loop_vinfo
,
9907 stmt_vec_info stmt_info
,
9908 tree new_name
, tree vectype
,
9909 enum vect_induction_op_type induction_type
)
9911 /* No step is needed for neg induction. */
9912 if (induction_type
== vect_step_op_neg
)
9915 tree t
= unshare_expr (new_name
);
9916 gcc_assert (CONSTANT_CLASS_P (new_name
)
9917 || TREE_CODE (new_name
) == SSA_NAME
);
9918 tree new_vec
= build_vector_from_val (vectype
, t
);
9919 tree vec_step
= vect_init_vector (loop_vinfo
, stmt_info
,
9920 new_vec
, vectype
, NULL
);
9924 /* Update vectorized iv with vect_step, induc_def is init. */
9926 vect_update_nonlinear_iv (gimple_seq
* stmts
, tree vectype
,
9927 tree induc_def
, tree vec_step
,
9928 enum vect_induction_op_type induction_type
)
9930 tree vec_def
= induc_def
;
9931 switch (induction_type
)
9933 case vect_step_op_mul
:
9935 /* Use unsigned mult to avoid UD integer overflow. */
9937 = build_vector_type (unsigned_type_for (TREE_TYPE (vectype
)),
9938 TYPE_VECTOR_SUBPARTS (vectype
));
9939 vec_def
= gimple_convert (stmts
, uvectype
, vec_def
);
9940 vec_step
= gimple_convert (stmts
, uvectype
, vec_step
);
9941 vec_def
= gimple_build (stmts
, MULT_EXPR
, uvectype
,
9943 vec_def
= gimple_convert (stmts
, vectype
, vec_def
);
9947 case vect_step_op_shr
:
9948 vec_def
= gimple_build (stmts
, RSHIFT_EXPR
, vectype
,
9952 case vect_step_op_shl
:
9953 vec_def
= gimple_build (stmts
, LSHIFT_EXPR
, vectype
,
9956 case vect_step_op_neg
:
9957 vec_def
= induc_def
;
9968 /* Function vectorizable_induction
   Check if STMT_INFO performs a nonlinear induction computation that can be
9971 vectorized. If VEC_STMT is also passed, vectorize the induction PHI: create
9972 a vectorized phi to replace it, put it in VEC_STMT, and add it to the same
9974 Return true if STMT_INFO is vectorizable in this way. */
9977 vectorizable_nonlinear_induction (loop_vec_info loop_vinfo
,
9978 stmt_vec_info stmt_info
,
9979 gimple
**vec_stmt
, slp_tree slp_node
,
9980 stmt_vector_for_cost
*cost_vec
)
9982 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
9984 bool nested_in_vect_loop
= false;
9985 class loop
*iv_loop
;
9987 edge pe
= loop_preheader_edge (loop
);
9989 tree vec_init
, vec_step
;
9992 gphi
*induction_phi
;
9993 tree induc_def
, vec_dest
;
9994 tree init_expr
, step_expr
;
9996 poly_uint64 vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
9998 gimple_stmt_iterator si
;
10000 gphi
*phi
= dyn_cast
<gphi
*> (stmt_info
->stmt
);
10002 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
10003 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
10004 enum vect_induction_op_type induction_type
10005 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info
);
10007 gcc_assert (induction_type
> vect_step_op_add
);
10012 ncopies
= vect_get_num_copies (loop_vinfo
, vectype
);
10013 gcc_assert (ncopies
>= 1);
10015 /* FORNOW. Only handle nonlinear induction in the same loop. */
10016 if (nested_in_vect_loop_p (loop
, stmt_info
))
10018 if (dump_enabled_p ())
10019 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
10020 "nonlinear induction in nested loop.\n");
10025 gcc_assert (iv_loop
== (gimple_bb (phi
))->loop_father
);
10027 /* TODO: Support slp for nonlinear iv. There should be separate vector iv
10028 update for each iv and a permutation to generate wanted vector iv. */
10031 if (dump_enabled_p ())
10032 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
10033 "SLP induction not supported for nonlinear"
10038 if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype
)))
10040 if (dump_enabled_p ())
10041 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
10042 "floating point nonlinear induction vectorization"
10043 " not supported.\n");
10047 step_expr
= STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info
);
10048 init_expr
= vect_phi_initial_value (phi
);
10049 gcc_assert (step_expr
!= NULL_TREE
&& init_expr
!= NULL
10050 && TREE_CODE (step_expr
) == INTEGER_CST
);
  /* step_expr should be aligned with init_expr,
     i.e. uint64 a >> 1, step is int, but vector<uint64> shift is used.  */
10053 step_expr
= fold_convert (TREE_TYPE (vectype
), step_expr
);
10055 if (TREE_CODE (init_expr
) == INTEGER_CST
)
10056 init_expr
= fold_convert (TREE_TYPE (vectype
), init_expr
);
10057 else if (!tree_nop_conversion_p (TREE_TYPE (vectype
), TREE_TYPE (init_expr
)))
10059 /* INIT_EXPR could be a bit_field, bail out for such case. */
10060 if (dump_enabled_p ())
10061 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
10062 "nonlinear induction vectorization failed:"
10063 " component type of vectype is not a nop conversion"
10064 " from type of init_expr.\n");
10068 switch (induction_type
)
10070 case vect_step_op_neg
:
10071 if (maybe_eq (TYPE_VECTOR_SUBPARTS (vectype
), 1u))
10073 if (TREE_CODE (init_expr
) != INTEGER_CST
10074 && TREE_CODE (init_expr
) != REAL_CST
)
10076 /* Check for backend support of NEGATE_EXPR and vec_perm. */
10077 if (!directly_supported_p (NEGATE_EXPR
, vectype
))
10080 /* The encoding has 2 interleaved stepped patterns. */
10081 vec_perm_builder
sel (nunits
, 2, 3);
10082 machine_mode mode
= TYPE_MODE (vectype
);
10083 sel
.quick_grow (6);
10084 for (i
= 0; i
< 3; i
++)
10087 sel
[i
* 2 + 1] = i
+ nunits
;
10089 vec_perm_indices
indices (sel
, 2, nunits
);
10090 if (!can_vec_perm_const_p (mode
, mode
, indices
))
10095 case vect_step_op_mul
:
10097 /* Check for backend support of MULT_EXPR. */
10098 if (!directly_supported_p (MULT_EXPR
, vectype
))
10101 /* ?? How to construct vector step for variable number vector.
10102 [ 1, step, pow (step, 2), pow (step, 4), .. ]. */
10103 if (!vf
.is_constant ())
10108 case vect_step_op_shr
:
10109 /* Check for backend support of RSHIFT_EXPR. */
10110 if (!directly_supported_p (RSHIFT_EXPR
, vectype
, optab_vector
))
10113 /* Don't shift more than type precision to avoid UD. */
10114 if (!tree_fits_uhwi_p (step_expr
)
10115 || maybe_ge (nunits
* tree_to_uhwi (step_expr
),
10116 TYPE_PRECISION (TREE_TYPE (init_expr
))))
10120 case vect_step_op_shl
:
      /* Check for backend support of LSHIFT_EXPR.  */
10122 if (!directly_supported_p (LSHIFT_EXPR
, vectype
, optab_vector
))
10125 /* Don't shift more than type precision to avoid UD. */
10126 if (!tree_fits_uhwi_p (step_expr
)
10127 || maybe_ge (nunits
* tree_to_uhwi (step_expr
),
10128 TYPE_PRECISION (TREE_TYPE (init_expr
))))
10134 gcc_unreachable ();
10137 if (!vec_stmt
) /* transformation not required. */
10139 unsigned inside_cost
= 0, prologue_cost
= 0;
10140 /* loop cost for vec_loop. Neg induction doesn't have any
10142 inside_cost
= record_stmt_cost (cost_vec
, ncopies
, vector_stmt
,
10143 stmt_info
, 0, vect_body
);
10145 /* loop cost for vec_loop. Neg induction doesn't have any
10147 if (induction_type
== vect_step_op_neg
)
10150 /* prologue cost for vec_init and vec_step. */
10151 prologue_cost
= record_stmt_cost (cost_vec
, 2, scalar_to_vec
,
10152 stmt_info
, 0, vect_prologue
);
10154 if (dump_enabled_p ())
10155 dump_printf_loc (MSG_NOTE
, vect_location
,
10156 "vect_model_induction_cost: inside_cost = %d, "
10157 "prologue_cost = %d. \n", inside_cost
,
10160 STMT_VINFO_TYPE (stmt_info
) = induc_vec_info_type
;
10161 DUMP_VECT_SCOPE ("vectorizable_nonlinear_induction");
10167 /* Compute a vector variable, initialized with the first VF values of
10168 the induction variable. E.g., for an iv with IV_PHI='X' and
10169 evolution S, for a vector of 4 units, we want to compute:
10170 [X, X + S, X + 2*S, X + 3*S]. */
10172 if (dump_enabled_p ())
10173 dump_printf_loc (MSG_NOTE
, vect_location
, "transform induction phi.\n");
10175 pe
= loop_preheader_edge (iv_loop
);
10176 /* Find the first insertion point in the BB. */
10177 basic_block bb
= gimple_bb (phi
);
10178 si
= gsi_after_labels (bb
);
10180 gimple_seq stmts
= NULL
;
10182 niters_skip
= LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo
);
10183 /* If we are using the loop mask to "peel" for alignment then we need
10184 to adjust the start value here. */
10185 if (niters_skip
!= NULL_TREE
)
10186 init_expr
= vect_peel_nonlinear_iv_init (&stmts
, init_expr
, niters_skip
,
10187 step_expr
, induction_type
);
10189 vec_init
= vect_create_nonlinear_iv_init (&stmts
, init_expr
,
10190 step_expr
, nunits
, vectype
,
10194 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, stmts
);
10195 gcc_assert (!new_bb
);
10199 new_name
= vect_create_nonlinear_iv_step (&stmts
, step_expr
,
10200 vf
, induction_type
);
10203 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, stmts
);
10204 gcc_assert (!new_bb
);
10207 vec_step
= vect_create_nonlinear_iv_vec_step (loop_vinfo
, stmt_info
,
10210 /* Create the following def-use cycle:
10215 vec_iv = PHI <vec_init, vec_loop>
10219 vec_loop = vec_iv + vec_step; */
10221 /* Create the induction-phi that defines the induction-operand. */
10222 vec_dest
= vect_get_new_vect_var (vectype
, vect_simple_var
, "vec_iv_");
10223 induction_phi
= create_phi_node (vec_dest
, iv_loop
->header
);
10224 induc_def
= PHI_RESULT (induction_phi
);
10226 /* Create the iv update inside the loop. */
10228 vec_def
= vect_update_nonlinear_iv (&stmts
, vectype
,
10229 induc_def
, vec_step
,
10232 gsi_insert_seq_before (&si
, stmts
, GSI_SAME_STMT
);
10233 new_stmt
= SSA_NAME_DEF_STMT (vec_def
);
10235 /* Set the arguments of the phi node: */
10236 add_phi_arg (induction_phi
, vec_init
, pe
, UNKNOWN_LOCATION
);
10237 add_phi_arg (induction_phi
, vec_def
, loop_latch_edge (iv_loop
),
10240 STMT_VINFO_VEC_STMTS (stmt_info
).safe_push (induction_phi
);
10241 *vec_stmt
= induction_phi
;
10243 /* In case that vectorization factor (VF) is bigger than the number
10244 of elements that we can fit in a vectype (nunits), we have to generate
10245 more than one vector stmt - i.e - we need to "unroll" the
10246 vector stmt by a factor VF/nunits. For more details see documentation
10247 in vectorizable_operation. */
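  /* For illustration only (the counts are assumed): with VF = 8 and a
     four-lane vectype we have ncopies = 2, so besides the PHI result we
     emit one extra copy per iteration, each previous value updated by a
     step that itself covers nunits scalar iterations.  */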
10252 /* FORNOW. This restriction should be relaxed. */
10253 gcc_assert (!nested_in_vect_loop
);
10255 new_name
= vect_create_nonlinear_iv_step (&stmts
, step_expr
,
10256 nunits
, induction_type
);
10258 vec_step
= vect_create_nonlinear_iv_vec_step (loop_vinfo
, stmt_info
,
10261 vec_def
= induc_def
;
10262 for (i
= 1; i
< ncopies
; i
++)
10264 /* vec_i = vec_prev + vec_step. */
10266 vec_def
= vect_update_nonlinear_iv (&stmts
, vectype
,
10269 gsi_insert_seq_before (&si
, stmts
, GSI_SAME_STMT
);
10270 new_stmt
= SSA_NAME_DEF_STMT (vec_def
);
10271 STMT_VINFO_VEC_STMTS (stmt_info
).safe_push (new_stmt
);
10275 if (dump_enabled_p ())
10276 dump_printf_loc (MSG_NOTE
, vect_location
,
10277 "transform induction: created def-use cycle: %G%G",
10278 (gimple
*) induction_phi
, SSA_NAME_DEF_STMT (vec_def
));
10283 /* Function vectorizable_induction
10285 Check if STMT_INFO performs an induction computation that can be vectorized.
10286 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
10287 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
10288 Return true if STMT_INFO is vectorizable in this way. */
10291 vectorizable_induction (loop_vec_info loop_vinfo
,
10292 stmt_vec_info stmt_info
,
10293 gimple
**vec_stmt
, slp_tree slp_node
,
10294 stmt_vector_for_cost
*cost_vec
)
10296 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
10298 bool nested_in_vect_loop
= false;
10299 class loop
*iv_loop
;
10301 edge pe
= loop_preheader_edge (loop
);
10302 basic_block new_bb
;
10303 tree new_vec
, vec_init
= NULL_TREE
, vec_step
, t
;
10306 gphi
*induction_phi
;
10307 tree induc_def
, vec_dest
;
10308 poly_uint64 vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
10311 gimple_stmt_iterator si
;
10312 enum vect_induction_op_type induction_type
10313 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info
);
10315 gphi
*phi
= dyn_cast
<gphi
*> (stmt_info
->stmt
);
10319 if (!STMT_VINFO_RELEVANT_P (stmt_info
))
10322 /* Make sure it was recognized as induction computation. */
10323 if (STMT_VINFO_DEF_TYPE (stmt_info
) != vect_induction_def
)
10326 /* Handle nonlinear induction in a separate place. */
10327 if (induction_type
!= vect_step_op_add
)
10328 return vectorizable_nonlinear_induction (loop_vinfo
, stmt_info
,
10329 vec_stmt
, slp_node
, cost_vec
);
10331 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
10332 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
10337 ncopies
= vect_get_num_copies (loop_vinfo
, vectype
);
10338 gcc_assert (ncopies
>= 1);
10340 /* FORNOW. These restrictions should be relaxed. */
10341 if (nested_in_vect_loop_p (loop
, stmt_info
))
10343 imm_use_iterator imm_iter
;
10344 use_operand_p use_p
;
10351 if (dump_enabled_p ())
10352 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
10353 "multiple types in nested loop.\n");
10358 latch_e
= loop_latch_edge (loop
->inner
);
10359 loop_arg
= PHI_ARG_DEF_FROM_EDGE (phi
, latch_e
);
10360 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, loop_arg
)
10362 gimple
*use_stmt
= USE_STMT (use_p
);
10363 if (is_gimple_debug (use_stmt
))
10366 if (!flow_bb_inside_loop_p (loop
->inner
, gimple_bb (use_stmt
)))
10368 exit_phi
= use_stmt
;
10374 stmt_vec_info exit_phi_vinfo
= loop_vinfo
->lookup_stmt (exit_phi
);
10375 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo
)
10376 && !STMT_VINFO_LIVE_P (exit_phi_vinfo
)))
10378 if (dump_enabled_p ())
10379 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
10380 "inner-loop induction only used outside "
10381 "of the outer vectorized loop.\n");
10386 nested_in_vect_loop
= true;
10387 iv_loop
= loop
->inner
;
10391 gcc_assert (iv_loop
== (gimple_bb (phi
))->loop_father
);
10393 if (slp_node
&& (!nunits
.is_constant () && SLP_TREE_LANES (slp_node
) != 1))
10395 /* The current SLP code creates the step value element-by-element. */
10396 if (dump_enabled_p ())
10397 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
10398 "SLP induction not supported for variable-length"
10403 if (FLOAT_TYPE_P (vectype
) && !param_vect_induction_float
)
10405 if (dump_enabled_p ())
10406 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
10407 "floating point induction vectorization disabled\n");
10411 tree step_expr
= STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info
);
10412 gcc_assert (step_expr
!= NULL_TREE
);
10413 if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr
))
10414 && !type_has_mode_precision_p (TREE_TYPE (step_expr
)))
10416 if (dump_enabled_p ())
10417 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
10418 "bit-precision induction vectorization not "
10422 tree step_vectype
= get_same_sized_vectype (TREE_TYPE (step_expr
), vectype
);
10424 /* Check for backend support of PLUS/MINUS_EXPR. */
10425 if (!directly_supported_p (PLUS_EXPR
, step_vectype
)
10426 || !directly_supported_p (MINUS_EXPR
, step_vectype
))
10429 if (!vec_stmt
) /* transformation not required. */
10431 unsigned inside_cost
= 0, prologue_cost
= 0;
10434 /* We eventually need to set a vector type on invariant
10438 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node
), j
, child
)
10439 if (!vect_maybe_update_slp_op_vectype
10440 (child
, SLP_TREE_VECTYPE (slp_node
)))
10442 if (dump_enabled_p ())
10443 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
10444 "incompatible vector types for "
10448 /* loop cost for vec_loop. */
10450 = record_stmt_cost (cost_vec
,
10451 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
),
10452 vector_stmt
, stmt_info
, 0, vect_body
);
10453 /* prologue cost for vec_init (if not nested) and step. */
10454 prologue_cost
= record_stmt_cost (cost_vec
, 1 + !nested_in_vect_loop
,
10456 stmt_info
, 0, vect_prologue
);
10458 else /* if (!slp_node) */
10460 /* loop cost for vec_loop. */
10461 inside_cost
= record_stmt_cost (cost_vec
, ncopies
, vector_stmt
,
10462 stmt_info
, 0, vect_body
);
10463 /* prologue cost for vec_init and vec_step. */
10464 prologue_cost
= record_stmt_cost (cost_vec
, 2, scalar_to_vec
,
10465 stmt_info
, 0, vect_prologue
);
10467 if (dump_enabled_p ())
10468 dump_printf_loc (MSG_NOTE
, vect_location
,
10469 "vect_model_induction_cost: inside_cost = %d, "
10470 "prologue_cost = %d .\n", inside_cost
,
10473 STMT_VINFO_TYPE (stmt_info
) = induc_vec_info_type
;
10474 DUMP_VECT_SCOPE ("vectorizable_induction");
10480 /* Compute a vector variable, initialized with the first VF values of
10481 the induction variable. E.g., for an iv with IV_PHI='X' and
10482 evolution S, for a vector of 4 units, we want to compute:
10483 [X, X + S, X + 2*S, X + 3*S]. */
10485 if (dump_enabled_p ())
10486 dump_printf_loc (MSG_NOTE
, vect_location
, "transform induction phi.\n");
10488 pe
= loop_preheader_edge (iv_loop
);
10489 /* Find the first insertion point in the BB. */
10490 basic_block bb
= gimple_bb (phi
);
10491 si
= gsi_after_labels (bb
);
10493 /* For SLP induction we have to generate several IVs as for example
10494 with group size 3 we need
10495 [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
10496 [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */
10499 /* The initial values are vectorized, but any lanes > group_size
10500 need adjustment. */
10502 = SLP_TREE_CHILDREN (slp_node
)[pe
->dest_idx
];
10504 /* Gather steps. Since we do not vectorize inductions as
10505 cycles we have to reconstruct the step from SCEV data. */
10506 unsigned group_size
= SLP_TREE_LANES (slp_node
);
10507 tree
*steps
= XALLOCAVEC (tree
, group_size
);
10508 tree
*inits
= XALLOCAVEC (tree
, group_size
);
10509 stmt_vec_info phi_info
;
10510 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node
), i
, phi_info
)
10512 steps
[i
] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info
);
10514 inits
[i
] = gimple_phi_arg_def (as_a
<gphi
*> (phi_info
->stmt
),
10518 /* Now generate the IVs. */
10519 unsigned nvects
= SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
);
10520 gcc_assert (multiple_p (nunits
* nvects
, group_size
));
10522 unsigned HOST_WIDE_INT const_nunits
;
10523 if (nested_in_vect_loop
)
10525 else if (nunits
.is_constant (&const_nunits
))
10527 /* Compute the number of distinct IVs we need. First reduce
10528 group_size if it is a multiple of const_nunits so we get
10529 one IV for a group_size of 4 but const_nunits 2. */
10530 unsigned group_sizep
= group_size
;
10531 if (group_sizep
% const_nunits
== 0)
10532 group_sizep
= group_sizep
/ const_nunits
;
10533 nivs
= least_common_multiple (group_sizep
,
10534 const_nunits
) / const_nunits
;
10538 gcc_assert (SLP_TREE_LANES (slp_node
) == 1);
10541 gimple_seq init_stmts
= NULL
;
10542 tree stept
= TREE_TYPE (step_vectype
);
10543 tree lupdate_mul
= NULL_TREE
;
10544 if (!nested_in_vect_loop
)
10546 if (nunits
.is_constant (&const_nunits
))
10548 /* The number of iterations covered in one vector iteration. */
10549 unsigned lup_mul
= (nvects
* const_nunits
) / group_size
;
10551 = build_vector_from_val (step_vectype
,
10552 SCALAR_FLOAT_TYPE_P (stept
)
10553 ? build_real_from_wide (stept
, lup_mul
,
10555 : build_int_cstu (stept
, lup_mul
));
10559 if (SCALAR_FLOAT_TYPE_P (stept
))
10561 tree tem
= build_int_cst (integer_type_node
, vf
);
10562 lupdate_mul
= gimple_build (&init_stmts
, FLOAT_EXPR
,
10566 lupdate_mul
= build_int_cst (stept
, vf
);
10567 lupdate_mul
= gimple_build_vector_from_val (&init_stmts
,
10572 tree peel_mul
= NULL_TREE
;
10573 if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo
))
10575 if (SCALAR_FLOAT_TYPE_P (stept
))
10576 peel_mul
= gimple_build (&init_stmts
, FLOAT_EXPR
, stept
,
10577 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo
));
10579 peel_mul
= gimple_convert (&init_stmts
, stept
,
10580 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo
));
10581 peel_mul
= gimple_build_vector_from_val (&init_stmts
,
10582 step_vectype
, peel_mul
);
10584 tree step_mul
= NULL_TREE
;
10586 auto_vec
<tree
> vec_steps
;
10587 for (ivn
= 0; ivn
< nivs
; ++ivn
)
10589 gimple_seq stmts
= NULL
;
10590 bool invariant
= true;
10591 if (nunits
.is_constant (&const_nunits
))
10593 tree_vector_builder
step_elts (step_vectype
, const_nunits
, 1);
10594 tree_vector_builder
init_elts (vectype
, const_nunits
, 1);
10595 tree_vector_builder
mul_elts (step_vectype
, const_nunits
, 1);
10596 for (unsigned eltn
= 0; eltn
< const_nunits
; ++eltn
)
10598 /* The scalar steps of the IVs. */
10599 tree elt
= steps
[(ivn
*const_nunits
+ eltn
) % group_size
];
10600 elt
= gimple_convert (&init_stmts
,
10601 TREE_TYPE (step_vectype
), elt
);
10602 step_elts
.quick_push (elt
);
10605 /* The scalar inits of the IVs if not vectorized. */
10606 elt
= inits
[(ivn
*const_nunits
+ eltn
) % group_size
];
10607 if (!useless_type_conversion_p (TREE_TYPE (vectype
),
10609 elt
= gimple_build (&init_stmts
, VIEW_CONVERT_EXPR
,
10610 TREE_TYPE (vectype
), elt
);
10611 init_elts
.quick_push (elt
);
10613 /* The number of steps to add to the initial values. */
10614 unsigned mul_elt
= (ivn
*const_nunits
+ eltn
) / group_size
;
10615 mul_elts
.quick_push (SCALAR_FLOAT_TYPE_P (stept
)
10616 ? build_real_from_wide (stept
, mul_elt
,
10618 : build_int_cstu (stept
, mul_elt
));
10620 vec_step
= gimple_build_vector (&init_stmts
, &step_elts
);
10621 step_mul
= gimple_build_vector (&init_stmts
, &mul_elts
);
10623 vec_init
= gimple_build_vector (&init_stmts
, &init_elts
);
10629 else if (INTEGRAL_TYPE_P (TREE_TYPE (steps
[0])))
10631 new_name
= gimple_convert (&init_stmts
, stept
, inits
[0]);
10632 /* Build the initial value directly as a VEC_SERIES_EXPR. */
10633 vec_init
= gimple_build (&init_stmts
, VEC_SERIES_EXPR
,
10634 step_vectype
, new_name
, steps
[0]);
10635 if (!useless_type_conversion_p (vectype
, step_vectype
))
10636 vec_init
= gimple_build (&init_stmts
, VIEW_CONVERT_EXPR
,
10637 vectype
, vec_init
);
10642 [base, base, base, ...]
10643 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
10644 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (steps
[0])));
10645 gcc_assert (flag_associative_math
);
10646 tree index
= build_index_vector (step_vectype
, 0, 1);
10647 new_name
= gimple_convert (&init_stmts
, TREE_TYPE (steps
[0]),
10649 tree base_vec
= gimple_build_vector_from_val (&init_stmts
,
10652 tree step_vec
= gimple_build_vector_from_val (&init_stmts
,
10655 vec_init
= gimple_build (&init_stmts
, FLOAT_EXPR
,
10656 step_vectype
, index
);
10657 vec_init
= gimple_build (&init_stmts
, MULT_EXPR
,
10658 step_vectype
, vec_init
, step_vec
);
10659 vec_init
= gimple_build (&init_stmts
, PLUS_EXPR
,
10660 step_vectype
, vec_init
, base_vec
);
10661 if (!useless_type_conversion_p (vectype
, step_vectype
))
10662 vec_init
= gimple_build (&init_stmts
, VIEW_CONVERT_EXPR
,
10663 vectype
, vec_init
);
10665 /* iv_loop is nested in the loop to be vectorized. Generate:
10666 vec_step = [S, S, S, S] */
10667 t
= unshare_expr (steps
[0]);
10668 gcc_assert (CONSTANT_CLASS_P (t
)
10669 || TREE_CODE (t
) == SSA_NAME
);
10670 vec_step
= gimple_build_vector_from_val (&init_stmts
,
10673 vec_steps
.safe_push (vec_step
);
10677 step_mul
= peel_mul
;
10679 step_mul
= gimple_build (&init_stmts
,
10680 MINUS_EXPR
, step_vectype
,
10681 step_mul
, peel_mul
);
10684 /* Create the induction-phi that defines the induction-operand. */
10685 vec_dest
= vect_get_new_vect_var (vectype
, vect_simple_var
,
10687 induction_phi
= create_phi_node (vec_dest
, iv_loop
->header
);
10688 induc_def
= PHI_RESULT (induction_phi
);
10690 /* Create the iv update inside the loop */
10691 tree up
= vec_step
;
10694 if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo
))
          /* When we're using loop_len produced by SELECT_VL, the
             non-final iterations are not always processing VF
             elements.  So vectorize induction variable instead of

               _21 = vect_vec_iv_.6_22 + { VF, ... };

             We should generate:

               _35 = .SELECT_VL (ivtmp_33, VF);
               vect_cst__22 = [vec_duplicate_expr] _35;
               _21 = vect_vec_iv_.6_22 + vect_cst__22;  */
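          /* As an illustration (the length value below is invented, not
             taken from a real dump): with VF = 4 and scalar step S, a
             full iteration would add { 4*S, 4*S, 4*S, 4*S }, but if
             .SELECT_VL returns 3 for the final iteration the update
             built here is

               _35 = .SELECT_VL (ivtmp_33, 4);
               vect_cst__22 = [vec_duplicate_expr] _35;
               up = { S, S, S, S } * vect_cst__22;

             so the IV advances only by the number of elements actually
             processed.  */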
10707 vec_loop_lens
*lens
= &LOOP_VINFO_LENS (loop_vinfo
);
10708 tree len
= vect_get_loop_len (loop_vinfo
, NULL
, lens
, 1,
10710 if (SCALAR_FLOAT_TYPE_P (stept
))
10711 expr
= gimple_build (&stmts
, FLOAT_EXPR
, stept
, len
);
10713 expr
= gimple_convert (&stmts
, stept
, len
);
10714 lupdate_mul
= gimple_build_vector_from_val (&stmts
,
10717 up
= gimple_build (&stmts
, MULT_EXPR
,
10718 step_vectype
, vec_step
, lupdate_mul
);
10721 up
= gimple_build (&init_stmts
,
10722 MULT_EXPR
, step_vectype
,
10723 vec_step
, lupdate_mul
);
10725 vec_def
= gimple_convert (&stmts
, step_vectype
, induc_def
);
10726 vec_def
= gimple_build (&stmts
,
10727 PLUS_EXPR
, step_vectype
, vec_def
, up
);
10728 vec_def
= gimple_convert (&stmts
, vectype
, vec_def
);
10729 gsi_insert_seq_before (&si
, stmts
, GSI_SAME_STMT
);
10730 add_phi_arg (induction_phi
, vec_def
, loop_latch_edge (iv_loop
),
10734 vec_init
= vect_get_slp_vect_def (init_node
, ivn
);
10735 if (!nested_in_vect_loop
10737 && !integer_zerop (step_mul
))
10739 gcc_assert (invariant
);
10740 vec_def
= gimple_convert (&init_stmts
, step_vectype
, vec_init
);
10741 up
= gimple_build (&init_stmts
, MULT_EXPR
, step_vectype
,
10742 vec_step
, step_mul
);
10743 vec_def
= gimple_build (&init_stmts
, PLUS_EXPR
, step_vectype
,
10745 vec_init
= gimple_convert (&init_stmts
, vectype
, vec_def
);
10748 /* Set the arguments of the phi node: */
10749 add_phi_arg (induction_phi
, vec_init
, pe
, UNKNOWN_LOCATION
);
10751 slp_node
->push_vec_def (induction_phi
);
10753 if (!nested_in_vect_loop
)
10755 /* Fill up to the number of vectors we need for the whole group. */
10756 if (nunits
.is_constant (&const_nunits
))
10757 nivs
= least_common_multiple (group_size
,
10758 const_nunits
) / const_nunits
;
10761 vec_steps
.reserve (nivs
-ivn
);
10762 for (; ivn
< nivs
; ++ivn
)
10764 slp_node
->push_vec_def (SLP_TREE_VEC_DEFS (slp_node
)[0]);
10765 vec_steps
.quick_push (vec_steps
[0]);
10769 /* Re-use IVs when we can. We are generating further vector
10770 stmts by adding VF' * stride to the IVs generated above. */
10773 if (nunits
.is_constant (&const_nunits
))
10775 unsigned vfp
= (least_common_multiple (group_size
, const_nunits
)
10778 = build_vector_from_val (step_vectype
,
10779 SCALAR_FLOAT_TYPE_P (stept
)
10780 ? build_real_from_wide (stept
,
10782 : build_int_cstu (stept
, vfp
));
10786 if (SCALAR_FLOAT_TYPE_P (stept
))
10788 tree tem
= build_int_cst (integer_type_node
, nunits
);
10789 lupdate_mul
= gimple_build (&init_stmts
, FLOAT_EXPR
,
10793 lupdate_mul
= build_int_cst (stept
, nunits
);
10794 lupdate_mul
= gimple_build_vector_from_val (&init_stmts
,
10798 for (; ivn
< nvects
; ++ivn
)
10801 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (slp_node
)[ivn
- nivs
]);
10802 tree def
= gimple_get_lhs (iv
);
10804 vec_steps
[ivn
- nivs
]
10805 = gimple_build (&init_stmts
, MULT_EXPR
, step_vectype
,
10806 vec_steps
[ivn
- nivs
], lupdate_mul
);
10807 gimple_seq stmts
= NULL
;
10808 def
= gimple_convert (&stmts
, step_vectype
, def
);
10809 def
= gimple_build (&stmts
, PLUS_EXPR
, step_vectype
,
10810 def
, vec_steps
[ivn
% nivs
]);
10811 def
= gimple_convert (&stmts
, vectype
, def
);
10812 if (gimple_code (iv
) == GIMPLE_PHI
)
10813 gsi_insert_seq_before (&si
, stmts
, GSI_SAME_STMT
);
10816 gimple_stmt_iterator tgsi
= gsi_for_stmt (iv
);
10817 gsi_insert_seq_after (&tgsi
, stmts
, GSI_CONTINUE_LINKING
);
10819 slp_node
->push_vec_def (def
);
10823 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, init_stmts
);
10824 gcc_assert (!new_bb
);
10829 tree init_expr
= vect_phi_initial_value (phi
);
10831 gimple_seq stmts
= NULL
;
10832 if (!nested_in_vect_loop
)
10834 /* Convert the initial value to the IV update type. */
10835 tree new_type
= TREE_TYPE (step_expr
);
10836 init_expr
= gimple_convert (&stmts
, new_type
, init_expr
);
10838 /* If we are using the loop mask to "peel" for alignment then we need
10839 to adjust the start value here. */
10840 tree skip_niters
= LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo
);
10841 if (skip_niters
!= NULL_TREE
)
10843 if (FLOAT_TYPE_P (vectype
))
10844 skip_niters
= gimple_build (&stmts
, FLOAT_EXPR
, new_type
,
10847 skip_niters
= gimple_convert (&stmts
, new_type
, skip_niters
);
10848 tree skip_step
= gimple_build (&stmts
, MULT_EXPR
, new_type
,
10849 skip_niters
, step_expr
);
10850 init_expr
= gimple_build (&stmts
, MINUS_EXPR
, new_type
,
10851 init_expr
, skip_step
);
10857 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, stmts
);
10858 gcc_assert (!new_bb
);
10861 /* Create the vector that holds the initial_value of the induction. */
10862 if (nested_in_vect_loop
)
10864 /* iv_loop is nested in the loop to be vectorized. init_expr had already
10865 been created during vectorization of previous stmts. We obtain it
10866 from the STMT_VINFO_VEC_STMT of the defining stmt. */
10867 auto_vec
<tree
> vec_inits
;
10868 vect_get_vec_defs_for_operand (loop_vinfo
, stmt_info
, 1,
10869 init_expr
, &vec_inits
);
10870 vec_init
= vec_inits
[0];
10871 /* If the initial value is not of proper type, convert it. */
10872 if (!useless_type_conversion_p (vectype
, TREE_TYPE (vec_init
)))
10875 = gimple_build_assign (vect_get_new_ssa_name (vectype
,
10879 build1 (VIEW_CONVERT_EXPR
, vectype
,
10881 vec_init
= gimple_assign_lhs (new_stmt
);
10882 new_bb
= gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop
),
10884 gcc_assert (!new_bb
);
10889 /* iv_loop is the loop to be vectorized. Create:
10890 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
10892 new_name
= gimple_convert (&stmts
, TREE_TYPE (step_expr
), init_expr
);
10894 unsigned HOST_WIDE_INT const_nunits
;
10895 if (nunits
.is_constant (&const_nunits
))
10897 tree_vector_builder
elts (step_vectype
, const_nunits
, 1);
10898 elts
.quick_push (new_name
);
10899 for (i
= 1; i
< const_nunits
; i
++)
10901 /* Create: new_name_i = new_name + step_expr */
10902 new_name
= gimple_build (&stmts
, PLUS_EXPR
, TREE_TYPE (new_name
),
10903 new_name
, step_expr
);
10904 elts
.quick_push (new_name
);
10906 /* Create a vector from [new_name_0, new_name_1, ...,
10907 new_name_nunits-1] */
10908 vec_init
= gimple_build_vector (&stmts
, &elts
);
10910 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr
)))
10911 /* Build the initial value directly from a VEC_SERIES_EXPR. */
10912 vec_init
= gimple_build (&stmts
, VEC_SERIES_EXPR
, step_vectype
,
10913 new_name
, step_expr
);
10917 [base, base, base, ...]
10918 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
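      /* For example (values chosen only for illustration): with
         init_expr X = 1.0, step_expr S = 0.5 and four lanes this
         computes

           [1.0, 1.0, 1.0, 1.0]
             + (vectype) [0, 1, 2, 3] * [0.5, 0.5, 0.5, 0.5]
           = [1.0, 1.5, 2.0, 2.5],

         replacing the repeated additions of the scalar loop, which is
         why reassociation (flag_associative_math) is required here.  */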
10919 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr
)));
10920 gcc_assert (flag_associative_math
);
10921 tree index
= build_index_vector (step_vectype
, 0, 1);
10922 tree base_vec
= gimple_build_vector_from_val (&stmts
, step_vectype
,
10924 tree step_vec
= gimple_build_vector_from_val (&stmts
, step_vectype
,
10926 vec_init
= gimple_build (&stmts
, FLOAT_EXPR
, step_vectype
, index
);
10927 vec_init
= gimple_build (&stmts
, MULT_EXPR
, step_vectype
,
10928 vec_init
, step_vec
);
10929 vec_init
= gimple_build (&stmts
, PLUS_EXPR
, step_vectype
,
10930 vec_init
, base_vec
);
10932 vec_init
= gimple_convert (&stmts
, vectype
, vec_init
);
10936 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, stmts
);
10937 gcc_assert (!new_bb
);
10942 /* Create the vector that holds the step of the induction. */
10943 gimple_stmt_iterator
*step_iv_si
= NULL
;
10944 if (nested_in_vect_loop
)
10945 /* iv_loop is nested in the loop to be vectorized. Generate:
10946 vec_step = [S, S, S, S] */
10947 new_name
= step_expr
;
10948 else if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo
))
      /* When we're using loop_len produced by SELECT_VL, the non-final
         iterations are not always processing VF elements.  So vectorize
         induction variable instead of

           _21 = vect_vec_iv_.6_22 + { VF, ... };

         We should generate:

           _35 = .SELECT_VL (ivtmp_33, VF);
           vect_cst__22 = [vec_duplicate_expr] _35;
           _21 = vect_vec_iv_.6_22 + vect_cst__22;  */
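      /* For instance (length value invented): with scalar step S and
         .SELECT_VL returning 5 for this iteration, the scalar update
         amount becomes new_name = 5 * S, which is broadcast below in
         place of the compile-time VF * S.  */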
10961 gcc_assert (!slp_node
);
10962 gimple_seq seq
= NULL
;
10963 vec_loop_lens
*lens
= &LOOP_VINFO_LENS (loop_vinfo
);
10964 tree len
= vect_get_loop_len (loop_vinfo
, NULL
, lens
, 1, vectype
, 0, 0);
10965 expr
= force_gimple_operand (fold_convert (TREE_TYPE (step_expr
),
10966 unshare_expr (len
)),
10967 &seq
, true, NULL_TREE
);
10968 new_name
= gimple_build (&seq
, MULT_EXPR
, TREE_TYPE (step_expr
), expr
,
10970 gsi_insert_seq_before (&si
, seq
, GSI_SAME_STMT
);
10975 /* iv_loop is the loop to be vectorized. Generate:
10976 vec_step = [VF*S, VF*S, VF*S, VF*S] */
10977 gimple_seq seq
= NULL
;
10978 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr
)))
10980 expr
= build_int_cst (integer_type_node
, vf
);
10981 expr
= gimple_build (&seq
, FLOAT_EXPR
, TREE_TYPE (step_expr
), expr
);
10984 expr
= build_int_cst (TREE_TYPE (step_expr
), vf
);
10985 new_name
= gimple_build (&seq
, MULT_EXPR
, TREE_TYPE (step_expr
),
10989 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, seq
);
10990 gcc_assert (!new_bb
);
10994 t
= unshare_expr (new_name
);
10995 gcc_assert (CONSTANT_CLASS_P (new_name
)
10996 || TREE_CODE (new_name
) == SSA_NAME
);
10997 new_vec
= build_vector_from_val (step_vectype
, t
);
10998 vec_step
= vect_init_vector (loop_vinfo
, stmt_info
,
10999 new_vec
, step_vectype
, step_iv_si
);
11002 /* Create the following def-use cycle:
11007 vec_iv = PHI <vec_init, vec_loop>
11011 vec_loop = vec_iv + vec_step; */
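  /* A minimal illustration (numbers invented): for a scalar IV with
     init 0, step 1 and VF = 4 this materializes

       preheader:  vec_init = { 0, 1, 2, 3 }
                   vec_step = { 4, 4, 4, 4 }
       loop:       vec_iv   = PHI <vec_init (preheader), vec_loop (latch)>
                   ...
                   vec_loop = vec_iv + vec_step;

     so in vector iteration i, lane L of vec_iv holds the scalar IV value
     of scalar iteration 4*i + L.  */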
11013 /* Create the induction-phi that defines the induction-operand. */
11014 vec_dest
= vect_get_new_vect_var (vectype
, vect_simple_var
, "vec_iv_");
11015 induction_phi
= create_phi_node (vec_dest
, iv_loop
->header
);
11016 induc_def
= PHI_RESULT (induction_phi
);
11018 /* Create the iv update inside the loop */
11020 vec_def
= gimple_convert (&stmts
, step_vectype
, induc_def
);
11021 vec_def
= gimple_build (&stmts
, PLUS_EXPR
, step_vectype
, vec_def
, vec_step
);
11022 vec_def
= gimple_convert (&stmts
, vectype
, vec_def
);
11023 gsi_insert_seq_before (&si
, stmts
, GSI_SAME_STMT
);
11024 new_stmt
= SSA_NAME_DEF_STMT (vec_def
);
11026 /* Set the arguments of the phi node: */
11027 add_phi_arg (induction_phi
, vec_init
, pe
, UNKNOWN_LOCATION
);
11028 add_phi_arg (induction_phi
, vec_def
, loop_latch_edge (iv_loop
),
11031 STMT_VINFO_VEC_STMTS (stmt_info
).safe_push (induction_phi
);
11032 *vec_stmt
= induction_phi
;
11034 /* In case that vectorization factor (VF) is bigger than the number
11035 of elements that we can fit in a vectype (nunits), we have to generate
11036 more than one vector stmt - i.e - we need to "unroll" the
11037 vector stmt by a factor VF/nunits. For more details see documentation
11038 in vectorizable_operation. */
11042 gimple_seq seq
= NULL
;
11043 /* FORNOW. This restriction should be relaxed. */
11044 gcc_assert (!nested_in_vect_loop
);
11045 /* We expect LOOP_VINFO_USING_SELECT_VL_P to be false if ncopies > 1. */
11046 gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo
));
11048 /* Create the vector that holds the step of the induction. */
11049 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr
)))
11051 expr
= build_int_cst (integer_type_node
, nunits
);
11052 expr
= gimple_build (&seq
, FLOAT_EXPR
, TREE_TYPE (step_expr
), expr
);
11055 expr
= build_int_cst (TREE_TYPE (step_expr
), nunits
);
11056 new_name
= gimple_build (&seq
, MULT_EXPR
, TREE_TYPE (step_expr
),
11060 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, seq
);
11061 gcc_assert (!new_bb
);
11064 t
= unshare_expr (new_name
);
11065 gcc_assert (CONSTANT_CLASS_P (new_name
)
11066 || TREE_CODE (new_name
) == SSA_NAME
);
11067 new_vec
= build_vector_from_val (step_vectype
, t
);
11068 vec_step
= vect_init_vector (loop_vinfo
, stmt_info
,
11069 new_vec
, step_vectype
, NULL
);
11071 vec_def
= induc_def
;
11072 for (i
= 1; i
< ncopies
+ 1; i
++)
11074 /* vec_i = vec_prev + vec_step */
11075 gimple_seq stmts
= NULL
;
11076 vec_def
= gimple_convert (&stmts
, step_vectype
, vec_def
);
11077 vec_def
= gimple_build (&stmts
,
11078 PLUS_EXPR
, step_vectype
, vec_def
, vec_step
);
11079 vec_def
= gimple_convert (&stmts
, vectype
, vec_def
);
11081 gsi_insert_seq_before (&si
, stmts
, GSI_SAME_STMT
);
11084 new_stmt
= SSA_NAME_DEF_STMT (vec_def
);
11085 STMT_VINFO_VEC_STMTS (stmt_info
).safe_push (new_stmt
);
11089 /* vec_1 = vec_iv + (VF/n * S)
11090 vec_2 = vec_1 + (VF/n * S)
11092 vec_n = vec_prev + (VF/n * S) = vec_iv + VF * S = vec_loop
11094 vec_n is used as vec_loop to save the large step register and
11095 related operations. */
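      /* For example (made-up numbers): with VF = 8, nunits = 4 and hence
         ncopies = 2, scalar step S and vec_iv = { 0, 1, 2, 3 } * S, the
         copies are

           vec_1 = vec_iv + { 4*S, ... } = { 4, 5, 6, 7 } * S
           vec_2 = vec_1  + { 4*S, ... } = { 8, 9, 10, 11 } * S

         and vec_2 doubles as the latch value vec_loop of the PHI.  */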
11096 add_phi_arg (induction_phi
, vec_def
, loop_latch_edge (iv_loop
),
11102 if (dump_enabled_p ())
11103 dump_printf_loc (MSG_NOTE
, vect_location
,
11104 "transform induction: created def-use cycle: %G%G",
11105 (gimple
*) induction_phi
, SSA_NAME_DEF_STMT (vec_def
));
/* Function vectorizable_live_operation_1.

   Helper function for vectorizable_live_operation.  */
11115 vectorizable_live_operation_1 (loop_vec_info loop_vinfo
,
11116 stmt_vec_info stmt_info
, basic_block exit_bb
,
11117 tree vectype
, int ncopies
, slp_tree slp_node
,
11118 tree bitsize
, tree bitstart
, tree vec_lhs
,
11119 tree lhs_type
, gimple_stmt_iterator
*exit_gsi
)
11121 gcc_assert (single_pred_p (exit_bb
) || LOOP_VINFO_EARLY_BREAKS (loop_vinfo
));
11123 tree vec_lhs_phi
= copy_ssa_name (vec_lhs
);
11124 gimple
*phi
= create_phi_node (vec_lhs_phi
, exit_bb
);
11125 for (unsigned i
= 0; i
< gimple_phi_num_args (phi
); i
++)
11126 SET_PHI_ARG_DEF (phi
, i
, vec_lhs
);
11128 gimple_seq stmts
= NULL
;
11131 /* If bitstart is 0 then we can use a BIT_FIELD_REF */
11132 if (integer_zerop (bitstart
))
11134 tree scalar_res
= gimple_build (&stmts
, BIT_FIELD_REF
, TREE_TYPE (vectype
),
11135 vec_lhs_phi
, bitsize
, bitstart
);
11137 /* Convert the extracted vector element to the scalar type. */
11138 new_tree
= gimple_convert (&stmts
, lhs_type
, scalar_res
);
11140 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo
))
11144 SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>
11146 where VEC_LHS is the vectorized live-out result and MASK is
11147 the loop mask for the final iteration. */
11148 gcc_assert (ncopies
== 1
11149 && (!slp_node
|| SLP_TREE_LANES (slp_node
) == 1));
11150 gimple_seq tem
= NULL
;
11151 gimple_stmt_iterator gsi
= gsi_last (tem
);
11152 tree len
= vect_get_loop_len (loop_vinfo
, &gsi
,
11153 &LOOP_VINFO_LENS (loop_vinfo
),
11157 signed char biasval
= LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo
);
11158 tree bias_minus_one
11159 = int_const_binop (MINUS_EXPR
,
11160 build_int_cst (TREE_TYPE (len
), biasval
),
11161 build_one_cst (TREE_TYPE (len
)));
11163 /* LAST_INDEX = LEN + (BIAS - 1). */
11164 tree last_index
= gimple_build (&stmts
, PLUS_EXPR
, TREE_TYPE (len
),
11165 len
, bias_minus_one
);
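      /* Concretely (illustrative values, assuming the common case of a
         zero load/store bias): if the final iteration processes LEN = 7
         elements, then

           LAST_INDEX = 7 + (0 - 1) = 6

         selects the last lane that was actually written.  */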
11167 /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>. */
11169 = gimple_build (&stmts
, CFN_VEC_EXTRACT
, TREE_TYPE (vectype
),
11170 vec_lhs_phi
, last_index
);
11172 /* Convert the extracted vector element to the scalar type. */
11173 new_tree
= gimple_convert (&stmts
, lhs_type
, scalar_res
);
11175 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
))
11179 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
11181 where VEC_LHS is the vectorized live-out result and MASK is
11182 the loop mask for the final iteration. */
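      /* For instance (mask values invented): with MASK = { 1, 1, 1, 0 }
         in the final iteration, EXTRACT_LAST returns the element of
         VEC_LHS in the last active lane, here lane 2.  */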
11183 gcc_assert (!slp_node
|| SLP_TREE_LANES (slp_node
) == 1);
11184 tree scalar_type
= TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info
));
11185 gimple_seq tem
= NULL
;
11186 gimple_stmt_iterator gsi
= gsi_last (tem
);
11187 tree mask
= vect_get_loop_mask (loop_vinfo
, &gsi
,
11188 &LOOP_VINFO_MASKS (loop_vinfo
),
11191 gimple_seq_add_seq (&stmts
, tem
);
11193 scalar_res
= gimple_build (&stmts
, CFN_EXTRACT_LAST
, scalar_type
,
11194 mask
, vec_lhs_phi
);
11196 /* Convert the extracted vector element to the scalar type. */
11197 new_tree
= gimple_convert (&stmts
, lhs_type
, scalar_res
);
11201 tree bftype
= TREE_TYPE (vectype
);
11202 if (VECTOR_BOOLEAN_TYPE_P (vectype
))
11203 bftype
= build_nonstandard_integer_type (tree_to_uhwi (bitsize
), 1);
11204 new_tree
= build3 (BIT_FIELD_REF
, bftype
, vec_lhs_phi
, bitsize
, bitstart
);
11205 new_tree
= force_gimple_operand (fold_convert (lhs_type
, new_tree
),
11206 &stmts
, true, NULL_TREE
);
11209 *exit_gsi
= gsi_after_labels (exit_bb
);
11211 gsi_insert_seq_before (exit_gsi
, stmts
, GSI_SAME_STMT
);
11216 /* Function vectorizable_live_operation.
11218 STMT_INFO computes a value that is used outside the loop. Check if
11219 it can be supported. */
11222 vectorizable_live_operation (vec_info
*vinfo
, stmt_vec_info stmt_info
,
11223 slp_tree slp_node
, slp_instance slp_node_instance
,
11224 int slp_index
, bool vec_stmt_p
,
11225 stmt_vector_for_cost
*cost_vec
)
11227 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
11228 imm_use_iterator imm_iter
;
11229 tree lhs
, lhs_type
, bitsize
;
11230 tree vectype
= (slp_node
11231 ? SLP_TREE_VECTYPE (slp_node
)
11232 : STMT_VINFO_VECTYPE (stmt_info
));
11233 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
11236 use_operand_p use_p
;
11237 auto_vec
<tree
> vec_oprnds
;
11239 poly_uint64 vec_index
= 0;
11241 gcc_assert (STMT_VINFO_LIVE_P (stmt_info
)
11242 || LOOP_VINFO_EARLY_BREAKS (loop_vinfo
));
11244 /* If a stmt of a reduction is live, vectorize it via
11245 vect_create_epilog_for_reduction. vectorizable_reduction assessed
11246 validity so just trigger the transform here. */
11247 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info
)))
11251 /* For SLP reductions we vectorize the epilogue for all involved stmts
11253 if (slp_node
&& !REDUC_GROUP_FIRST_ELEMENT (stmt_info
) && slp_index
!= 0)
11255 stmt_vec_info reduc_info
= info_for_reduction (loop_vinfo
, stmt_info
);
11256 gcc_assert (reduc_info
->is_reduc_info
);
11257 if (STMT_VINFO_REDUC_TYPE (reduc_info
) == FOLD_LEFT_REDUCTION
11258 || STMT_VINFO_REDUC_TYPE (reduc_info
) == EXTRACT_LAST_REDUCTION
)
11261 if (!LOOP_VINFO_EARLY_BREAKS (loop_vinfo
)
11262 || !LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo
))
11263 vect_create_epilog_for_reduction (loop_vinfo
, stmt_info
, slp_node
,
11265 LOOP_VINFO_IV_EXIT (loop_vinfo
));
11267 /* If early break we only have to materialize the reduction on the merge
11268 block, but we have to find an alternate exit first. */
11269 if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo
))
11271 slp_tree phis_node
= slp_node
? slp_node_instance
->reduc_phis
: NULL
;
11272 for (auto exit
: get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo
)))
11273 if (exit
!= LOOP_VINFO_IV_EXIT (loop_vinfo
))
11275 vect_create_epilog_for_reduction (loop_vinfo
, reduc_info
,
11276 phis_node
, slp_node_instance
,
11280 if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo
))
11281 vect_create_epilog_for_reduction (loop_vinfo
, reduc_info
,
11282 phis_node
, slp_node_instance
,
11283 LOOP_VINFO_IV_EXIT (loop_vinfo
));
11289 /* If STMT is not relevant and it is a simple assignment and its inputs are
11290 invariant then it can remain in place, unvectorized. The original last
11291 scalar value that it computes will be used. */
11292 if (!STMT_VINFO_RELEVANT_P (stmt_info
))
11294 gcc_assert (is_simple_and_all_uses_invariant (stmt_info
, loop_vinfo
));
11295 if (dump_enabled_p ())
11296 dump_printf_loc (MSG_NOTE
, vect_location
,
11297 "statement is simple and uses invariant. Leaving in "
11305 ncopies
= vect_get_num_copies (loop_vinfo
, vectype
);
11309 gcc_assert (slp_index
>= 0);
11311 /* Get the last occurrence of the scalar index from the concatenation of
11312 all the slp vectors. Calculate which slp vector it is and the index
11314 int num_scalar
= SLP_TREE_LANES (slp_node
);
11315 int num_vec
= SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
);
11316 poly_uint64 pos
= (num_vec
* nunits
) - num_scalar
+ slp_index
;
11318 /* Calculate which vector contains the result, and which lane of
11319 that vector we need. */
11320 if (!can_div_trunc_p (pos
, nunits
, &vec_entry
, &vec_index
))
11322 if (dump_enabled_p ())
11323 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
11324 "Cannot determine which vector holds the"
11325 " final result.\n");
11332 /* No transformation required. */
11333 if (loop_vinfo
&& LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
))
11335 if (slp_node
&& SLP_TREE_LANES (slp_node
) != 1)
11337 if (dump_enabled_p ())
11338 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
11339 "can't operate on partial vectors "
11340 "because an SLP statement is live after "
11342 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
) = false;
11344 else if (ncopies
> 1
11345 || (slp_node
&& SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
) > 1))
11347 if (dump_enabled_p ())
11348 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
11349 "can't operate on partial vectors "
11350 "because ncopies is greater than 1.\n");
11351 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
) = false;
11355 gcc_assert (ncopies
== 1
11356 && (!slp_node
|| SLP_TREE_LANES (slp_node
) == 1));
11357 if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST
, vectype
,
11358 OPTIMIZE_FOR_SPEED
))
11359 vect_record_loop_mask (loop_vinfo
,
11360 &LOOP_VINFO_MASKS (loop_vinfo
),
11362 else if (can_vec_extract_var_idx_p (
11363 TYPE_MODE (vectype
), TYPE_MODE (TREE_TYPE (vectype
))))
11364 vect_record_loop_len (loop_vinfo
,
11365 &LOOP_VINFO_LENS (loop_vinfo
),
11369 if (dump_enabled_p ())
11371 MSG_MISSED_OPTIMIZATION
, vect_location
,
11372 "can't operate on partial vectors "
11373 "because the target doesn't support extract "
11374 "last reduction.\n");
11375 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
) = false;
11379 /* ??? Enable for loop costing as well. */
11381 record_stmt_cost (cost_vec
, 1, vec_to_scalar
, stmt_info
, NULL_TREE
,
11386 /* Use the lhs of the original scalar statement. */
11387 gimple
*stmt
= vect_orig_stmt (stmt_info
)->stmt
;
11388 if (dump_enabled_p ())
11389 dump_printf_loc (MSG_NOTE
, vect_location
, "extracting lane for live "
11392 lhs
= gimple_get_lhs (stmt
);
11393 lhs_type
= TREE_TYPE (lhs
);
11395 bitsize
= vector_element_bits_tree (vectype
);
11397 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
11398 tree vec_lhs
, vec_lhs0
, bitstart
;
11399 gimple
*vec_stmt
, *vec_stmt0
;
11402 gcc_assert (!loop_vinfo
11403 || ((!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
)
11404 && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo
))
11405 || SLP_TREE_LANES (slp_node
) == 1));
11407 /* Get the correct slp vectorized stmt. */
11408 vec_lhs
= SLP_TREE_VEC_DEFS (slp_node
)[vec_entry
];
11409 vec_stmt
= SSA_NAME_DEF_STMT (vec_lhs
);
11411 /* In case we need to early break vectorize also get the first stmt. */
11412 vec_lhs0
= SLP_TREE_VEC_DEFS (slp_node
)[0];
11413 vec_stmt0
= SSA_NAME_DEF_STMT (vec_lhs0
);
11415 /* Get entry to use. */
11416 bitstart
= bitsize_int (vec_index
);
11417 bitstart
= int_const_binop (MULT_EXPR
, bitsize
, bitstart
);
11421 /* For multiple copies, get the last copy. */
11422 vec_stmt
= STMT_VINFO_VEC_STMTS (stmt_info
).last ();
11423 vec_lhs
= gimple_get_lhs (vec_stmt
);
11425 /* In case we need to early break vectorize also get the first stmt. */
11426 vec_stmt0
= STMT_VINFO_VEC_STMTS (stmt_info
)[0];
11427 vec_lhs0
= gimple_get_lhs (vec_stmt0
);
11429 /* Get the last lane in the vector. */
11430 bitstart
= int_const_binop (MULT_EXPR
, bitsize
, bitsize_int (nunits
- 1));
11435 /* Ensure the VEC_LHS for lane extraction stmts satisfy loop-closed PHI
11436 requirement, insert one phi node for it. It looks like:
11443 # vec_lhs' = PHI <vec_lhs>
11444 new_tree = lane_extract <vec_lhs', ...>;
11445 lhs' = new_tree; */
11447 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
  /* Check if we have a loop where the chosen exit is not the main exit;
     in these cases, for an early break we restart the iteration the
     vector code did.  For the live values we want the value at the start
     of the iteration rather than at the end.  */
11452 edge main_e
= LOOP_VINFO_IV_EXIT (loop_vinfo
);
11453 bool all_exits_as_early_p
= LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo
);
11454 FOR_EACH_IMM_USE_STMT (use_stmt
, imm_iter
, lhs
)
11455 if (!is_gimple_debug (use_stmt
)
11456 && !flow_bb_inside_loop_p (loop
, gimple_bb (use_stmt
)))
11457 FOR_EACH_IMM_USE_ON_STMT (use_p
, imm_iter
)
11459 edge e
= gimple_phi_arg_edge (as_a
<gphi
*> (use_stmt
),
11460 phi_arg_index_from_use (use_p
));
11461 gcc_assert (loop_exit_edge_p (loop
, e
));
11462 bool main_exit_edge
= e
== main_e
;
11463 tree tmp_vec_lhs
= vec_lhs
;
11464 tree tmp_bitstart
= bitstart
;
11466 /* For early exit where the exit is not in the BB that leads
11467 to the latch then we're restarting the iteration in the
11468 scalar loop. So get the first live value. */
11469 if ((all_exits_as_early_p
|| !main_exit_edge
)
11470 && STMT_VINFO_DEF_TYPE (stmt_info
) == vect_induction_def
)
11472 tmp_vec_lhs
= vec_lhs0
;
11473 tmp_bitstart
= build_zero_cst (TREE_TYPE (bitstart
));
11476 gimple_stmt_iterator exit_gsi
;
11478 = vectorizable_live_operation_1 (loop_vinfo
, stmt_info
,
11479 e
->dest
, vectype
, ncopies
,
11481 tmp_bitstart
, tmp_vec_lhs
,
11482 lhs_type
, &exit_gsi
);
11484 auto gsi
= gsi_for_stmt (use_stmt
);
11485 tree lhs_phi
= gimple_phi_result (use_stmt
);
11486 remove_phi_node (&gsi
, false);
11487 gimple
*copy
= gimple_build_assign (lhs_phi
, new_tree
);
11488 gsi_insert_before (&exit_gsi
, copy
, GSI_SAME_STMT
);
11492 /* There are no further out-of-loop uses of lhs by LC-SSA construction. */
11493 FOR_EACH_IMM_USE_STMT (use_stmt
, imm_iter
, lhs
)
11494 gcc_assert (flow_bb_inside_loop_p (loop
, gimple_bb (use_stmt
)));
11498 /* For basic-block vectorization simply insert the lane-extraction. */
11499 tree bftype
= TREE_TYPE (vectype
);
11500 if (VECTOR_BOOLEAN_TYPE_P (vectype
))
11501 bftype
= build_nonstandard_integer_type (tree_to_uhwi (bitsize
), 1);
11502 tree new_tree
= build3 (BIT_FIELD_REF
, bftype
,
11503 vec_lhs
, bitsize
, bitstart
);
11504 gimple_seq stmts
= NULL
;
11505 new_tree
= force_gimple_operand (fold_convert (lhs_type
, new_tree
),
11506 &stmts
, true, NULL_TREE
);
11507 if (TREE_CODE (new_tree
) == SSA_NAME
11508 && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs
))
11509 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree
) = 1;
11510 if (is_a
<gphi
*> (vec_stmt
))
11512 gimple_stmt_iterator si
= gsi_after_labels (gimple_bb (vec_stmt
));
11513 gsi_insert_seq_before (&si
, stmts
, GSI_SAME_STMT
);
11517 gimple_stmt_iterator si
= gsi_for_stmt (vec_stmt
);
11518 gsi_insert_seq_after (&si
, stmts
, GSI_SAME_STMT
);
11521 /* Replace use of lhs with newly computed result. If the use stmt is a
11522 single arg PHI, just replace all uses of PHI result. It's necessary
11523 because lcssa PHI defining lhs may be before newly inserted stmt. */
11524 use_operand_p use_p
;
11525 stmt_vec_info use_stmt_info
;
11526 FOR_EACH_IMM_USE_STMT (use_stmt
, imm_iter
, lhs
)
11527 if (!is_gimple_debug (use_stmt
)
11528 && (!(use_stmt_info
= vinfo
->lookup_stmt (use_stmt
))
11529 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info
))))
11531 /* ??? This can happen when the live lane ends up being
11532 rooted in a vector construction code-generated by an
11533 external SLP node (and code-generation for that already
11534 happened). See gcc.dg/vect/bb-slp-47.c.
11535 Doing this is what would happen if that vector CTOR
11536 were not code-generated yet so it is not too bad.
11537 ??? In fact we'd likely want to avoid this situation
11538 in the first place. */
11539 if (TREE_CODE (new_tree
) == SSA_NAME
11540 && !SSA_NAME_IS_DEFAULT_DEF (new_tree
)
11541 && gimple_code (use_stmt
) != GIMPLE_PHI
11542 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree
),
11545 if (dump_enabled_p ())
11546 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
11547 "Using original scalar computation for "
11548 "live lane because use preceeds vector "
11552 /* ??? It can also happen that we end up pulling a def into
11553 a loop where replacing out-of-loop uses would require
11554 a new LC SSA PHI node. Retain the original scalar in
11555 those cases as well. PR98064. */
11556 if (TREE_CODE (new_tree
) == SSA_NAME
11557 && !SSA_NAME_IS_DEFAULT_DEF (new_tree
)
11558 && (gimple_bb (use_stmt
)->loop_father
11559 != gimple_bb (vec_stmt
)->loop_father
)
11560 && !flow_loop_nested_p (gimple_bb (vec_stmt
)->loop_father
,
11561 gimple_bb (use_stmt
)->loop_father
))
11563 if (dump_enabled_p ())
11564 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
11565 "Using original scalar computation for "
11566 "live lane because there is an out-of-loop "
11567 "definition for it\n");
11570 FOR_EACH_IMM_USE_ON_STMT (use_p
, imm_iter
)
11571 SET_USE (use_p
, new_tree
);
11572 update_stmt (use_stmt
);
11579 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
11582 vect_loop_kill_debug_uses (class loop
*loop
, stmt_vec_info stmt_info
)
11584 ssa_op_iter op_iter
;
11585 imm_use_iterator imm_iter
;
11586 def_operand_p def_p
;
11589 FOR_EACH_PHI_OR_STMT_DEF (def_p
, stmt_info
->stmt
, op_iter
, SSA_OP_DEF
)
11591 FOR_EACH_IMM_USE_STMT (ustmt
, imm_iter
, DEF_FROM_PTR (def_p
))
11595 if (!is_gimple_debug (ustmt
))
11598 bb
= gimple_bb (ustmt
);
11600 if (!flow_bb_inside_loop_p (loop
, bb
))
11602 if (gimple_debug_bind_p (ustmt
))
11604 if (dump_enabled_p ())
11605 dump_printf_loc (MSG_NOTE
, vect_location
,
11606 "killing debug use\n");
11608 gimple_debug_bind_reset_value (ustmt
);
11609 update_stmt (ustmt
);
11612 gcc_unreachable ();
/* Given loop represented by LOOP_VINFO, return true if computation of
   LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
   otherwise.  */

static bool
loop_niters_no_overflow (loop_vec_info loop_vinfo)
{
  /* Constant case.  */
  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
    {
      tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
      tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);

      gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
      gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
      if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
        return true;
    }

  widest_int max;
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  /* Check the upper bound of loop niters.  */
  if (get_max_loop_iterations (loop, &max))
    {
      tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
      signop sgn = TYPE_SIGN (type);
      widest_int type_max = widest_int::from (wi::max_value (type), sgn);
      if (max < type_max)
        return true;
    }
  return false;
}
/* Return a mask type with half the number of elements as OLD_TYPE,
   given that it should have mode NEW_MODE.  */

tree
vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
{
  poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
  return build_truth_vector_type_for_mode (nunits, new_mode);
}
/* Return a mask type with twice as many elements as OLD_TYPE,
   given that it should have mode NEW_MODE.  */

tree
vect_double_mask_nunits (tree old_type, machine_mode new_mode)
{
  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
  return build_truth_vector_type_for_mode (nunits, new_mode);
}
/* Record that a fully-masked version of LOOP_VINFO would need MASKS to
   contain a sequence of NVECTORS masks that each control a vector of type
   VECTYPE.  If SCALAR_MASK is nonnull, the fully-masked loop would AND
   these vector masks with the vector version of SCALAR_MASK.  */

void
vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
                       unsigned int nvectors, tree vectype, tree scalar_mask)
{
  gcc_assert (nvectors != 0);

  if (scalar_mask)
    {
      scalar_cond_masked_key cond (scalar_mask, nvectors);
      loop_vinfo->scalar_cond_masked_set.add (cond);
    }

  masks->mask_set.add (std::make_pair (vectype, nvectors));
}
11691 /* Given a complete set of masks MASKS, extract mask number INDEX
11692 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
11693 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
11695 See the comment above vec_loop_masks for more details about the mask
11699 vect_get_loop_mask (loop_vec_info loop_vinfo
,
11700 gimple_stmt_iterator
*gsi
, vec_loop_masks
*masks
,
11701 unsigned int nvectors
, tree vectype
, unsigned int index
)
11703 if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo
)
11704 == vect_partial_vectors_while_ult
)
11706 rgroup_controls
*rgm
= &(masks
->rgc_vec
)[nvectors
- 1];
11707 tree mask_type
= rgm
->type
;
11709 /* Populate the rgroup's mask array, if this is the first time we've
11711 if (rgm
->controls
.is_empty ())
11713 rgm
->controls
.safe_grow_cleared (nvectors
, true);
11714 for (unsigned int i
= 0; i
< nvectors
; ++i
)
11716 tree mask
= make_temp_ssa_name (mask_type
, NULL
, "loop_mask");
11717 /* Provide a dummy definition until the real one is available. */
11718 SSA_NAME_DEF_STMT (mask
) = gimple_build_nop ();
11719 rgm
->controls
[i
] = mask
;
11723 tree mask
= rgm
->controls
[index
];
11724 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type
),
11725 TYPE_VECTOR_SUBPARTS (vectype
)))
11727 /* A loop mask for data type X can be reused for data type Y
11728 if X has N times more elements than Y and if Y's elements
11729 are N times bigger than X's. In this case each sequence
11730 of N elements in the loop mask will be all-zero or all-one.
11731 We can then view-convert the mask so that each sequence of
11732 N elements is replaced by a single element. */
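          /* For example (types picked only for illustration): a mask with
             16 elements computed for chars can be reused for a vector of
             8 shorts, because each pair of adjacent mask elements is known
             to be all-zero or all-one; the VIEW_CONVERT_EXPR below simply
             reinterprets it as the 8-element mask type.  */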
11733 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type
),
11734 TYPE_VECTOR_SUBPARTS (vectype
)));
11735 gimple_seq seq
= NULL
;
11736 mask_type
= truth_type_for (vectype
);
11737 mask
= gimple_build (&seq
, VIEW_CONVERT_EXPR
, mask_type
, mask
);
11739 gsi_insert_seq_before (gsi
, seq
, GSI_SAME_STMT
);
11743 else if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo
)
11744 == vect_partial_vectors_avx512
)
11746 /* The number of scalars per iteration and the number of vectors are
11747 both compile-time constants. */
11748 unsigned int nscalars_per_iter
11749 = exact_div (nvectors
* TYPE_VECTOR_SUBPARTS (vectype
),
11750 LOOP_VINFO_VECT_FACTOR (loop_vinfo
)).to_constant ();
11752 rgroup_controls
*rgm
= &masks
->rgc_vec
[nscalars_per_iter
- 1];
11754 /* The stored nV is dependent on the mask type produced. */
11755 gcc_assert (exact_div (nvectors
* TYPE_VECTOR_SUBPARTS (vectype
),
11756 TYPE_VECTOR_SUBPARTS (rgm
->type
)).to_constant ()
11758 nvectors
= rgm
->factor
;
11760 /* Populate the rgroup's mask array, if this is the first time we've
11762 if (rgm
->controls
.is_empty ())
11764 rgm
->controls
.safe_grow_cleared (nvectors
, true);
11765 for (unsigned int i
= 0; i
< nvectors
; ++i
)
11767 tree mask
= make_temp_ssa_name (rgm
->type
, NULL
, "loop_mask");
11768 /* Provide a dummy definition until the real one is available. */
11769 SSA_NAME_DEF_STMT (mask
) = gimple_build_nop ();
11770 rgm
->controls
[i
] = mask
;
11773 if (known_eq (TYPE_VECTOR_SUBPARTS (rgm
->type
),
11774 TYPE_VECTOR_SUBPARTS (vectype
)))
11775 return rgm
->controls
[index
];
11777 /* Split the vector if needed. Since we are dealing with integer mode
11778 masks with AVX512 we can operate on the integer representation
11779 performing the whole vector shifting. */
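      /* A small worked example (numbers invented): if the rgroup mask
         type has 64 elements and VECTYPE has 16, then FACTOR is 4;
         asking for INDEX 6 selects stored control vi = 6 / 4 = 1 and
         sub-part vpart = 6 % 4 = 2, so the integer view of that control
         is shifted right by 16 * 2 = 32 bits before being converted back
         to a 16-element mask.  */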
11780 unsigned HOST_WIDE_INT factor
;
11781 bool ok
= constant_multiple_p (TYPE_VECTOR_SUBPARTS (rgm
->type
),
11782 TYPE_VECTOR_SUBPARTS (vectype
), &factor
);
11784 gcc_assert (GET_MODE_CLASS (TYPE_MODE (rgm
->type
)) == MODE_INT
);
11785 tree mask_type
= truth_type_for (vectype
);
11786 gcc_assert (GET_MODE_CLASS (TYPE_MODE (mask_type
)) == MODE_INT
);
11787 unsigned vi
= index
/ factor
;
11788 unsigned vpart
= index
% factor
;
11789 tree vec
= rgm
->controls
[vi
];
11790 gimple_seq seq
= NULL
;
11791 vec
= gimple_build (&seq
, VIEW_CONVERT_EXPR
,
11792 lang_hooks
.types
.type_for_mode
11793 (TYPE_MODE (rgm
->type
), 1), vec
);
11794 /* For integer mode masks simply shift the right bits into position. */
11796 vec
= gimple_build (&seq
, RSHIFT_EXPR
, TREE_TYPE (vec
), vec
,
11797 build_int_cst (integer_type_node
,
11798 (TYPE_VECTOR_SUBPARTS (vectype
)
11800 vec
= gimple_convert (&seq
, lang_hooks
.types
.type_for_mode
11801 (TYPE_MODE (mask_type
), 1), vec
);
11802 vec
= gimple_build (&seq
, VIEW_CONVERT_EXPR
, mask_type
, vec
);
11804 gsi_insert_seq_before (gsi
, seq
, GSI_SAME_STMT
);
11808 gcc_unreachable ();
/* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
   lengths for controlling an operation on VECTYPE.  The operation splits
   each element of VECTYPE into FACTOR separate subelements, measuring the
   length as a number of these subelements.  */

void
vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
                      unsigned int nvectors, tree vectype, unsigned int factor)
{
  gcc_assert (nvectors != 0);
  if (lens->length () < nvectors)
    lens->safe_grow_cleared (nvectors, true);
  rgroup_controls *rgl = &(*lens)[nvectors - 1];

  /* The number of scalars per iteration, the scalar occupied bytes and
     the number of vectors are all compile-time constants.  */
  unsigned int nscalars_per_iter
    = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
                 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();

  if (rgl->max_nscalars_per_iter < nscalars_per_iter)
    {
      /* For now, we only support cases in which all loads and stores fall
         back to VnQI or none do.  */
      gcc_assert (!rgl->max_nscalars_per_iter
                  || (rgl->factor == 1 && factor == 1)
                  || (rgl->max_nscalars_per_iter * rgl->factor
                      == nscalars_per_iter * factor));
      rgl->max_nscalars_per_iter = nscalars_per_iter;
      rgl->type = vectype;
      rgl->factor = factor;
    }
}
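/* As an illustration of the bookkeeping above (numbers invented): with
   VF = 16 and NVECTORS = 2 vectors of 8 elements each, the rgroup is
   recorded in (*LENS)[1] and its nscalars_per_iter works out to
   exact_div (2 * 8, 16) = 1, i.e. one scalar of this group per scalar
   iteration.  */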
/* Given a complete set of lengths LENS, extract length number INDEX
   for an rgroup that operates on NVECTORS vectors of type VECTYPE,
   where 0 <= INDEX < NVECTORS.  Return a value that contains FACTOR
   multiplied by the number of elements that should be processed.
   Insert any set-up statements before GSI.  */
11852 vect_get_loop_len (loop_vec_info loop_vinfo
, gimple_stmt_iterator
*gsi
,
11853 vec_loop_lens
*lens
, unsigned int nvectors
, tree vectype
,
11854 unsigned int index
, unsigned int factor
)
11856 rgroup_controls
*rgl
= &(*lens
)[nvectors
- 1];
11857 bool use_bias_adjusted_len
=
11858 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo
) != 0;
11860 /* Populate the rgroup's len array, if this is the first time we've
11862 if (rgl
->controls
.is_empty ())
11864 rgl
->controls
.safe_grow_cleared (nvectors
, true);
11865 for (unsigned int i
= 0; i
< nvectors
; ++i
)
11867 tree len_type
= LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo
);
11868 gcc_assert (len_type
!= NULL_TREE
);
11870 tree len
= make_temp_ssa_name (len_type
, NULL
, "loop_len");
11872 /* Provide a dummy definition until the real one is available. */
11873 SSA_NAME_DEF_STMT (len
) = gimple_build_nop ();
11874 rgl
->controls
[i
] = len
;
11876 if (use_bias_adjusted_len
)
11878 gcc_assert (i
== 0);
11879 tree adjusted_len
=
11880 make_temp_ssa_name (len_type
, NULL
, "adjusted_loop_len");
11881 SSA_NAME_DEF_STMT (adjusted_len
) = gimple_build_nop ();
11882 rgl
->bias_adjusted_ctrl
= adjusted_len
;
11887 if (use_bias_adjusted_len
)
11888 return rgl
->bias_adjusted_ctrl
;
11890 tree loop_len
= rgl
->controls
[index
];
11891 if (rgl
->factor
== 1 && factor
== 1)
11893 poly_int64 nunits1
= TYPE_VECTOR_SUBPARTS (rgl
->type
);
11894 poly_int64 nunits2
= TYPE_VECTOR_SUBPARTS (vectype
);
11895 if (maybe_ne (nunits1
, nunits2
))
11897 /* A loop len for data type X can be reused for data type Y
11898 if X has N times more elements than Y and if Y's elements
11899 are N times bigger than X's. */
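      /* For example (types picked only for illustration): a length
         recorded for a 16-element byte vector can serve an 8-element
         halfword vector of the same size; nunits1 / nunits2 gives
         factor = 2, so a length of 12 bytes is divided down to the
         6 halfword elements that should be processed.  */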
11900 gcc_assert (multiple_p (nunits1
, nunits2
));
11901 factor
= exact_div (nunits1
, nunits2
).to_constant ();
11902 tree iv_type
= LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo
);
11903 gimple_seq seq
= NULL
;
11904 loop_len
= gimple_build (&seq
, RDIV_EXPR
, iv_type
, loop_len
,
11905 build_int_cst (iv_type
, factor
));
11907 gsi_insert_seq_before (gsi
, seq
, GSI_SAME_STMT
);
/* Generate the tree for the loop len mask and return it.  Given LENS,
   NVECTORS, VECTYPE, INDEX and FACTOR, generate the len mask as below.

     tree len_mask = VCOND_MASK_LEN (compare_mask, ones, zero, len, bias)  */

tree
vect_gen_loop_len_mask (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
                        gimple_stmt_iterator *cond_gsi, vec_loop_lens *lens,
                        unsigned int nvectors, tree vectype, tree stmt,
                        unsigned int index, unsigned int factor)
{
  tree all_one_mask = build_all_ones_cst (vectype);
  tree all_zero_mask = build_zero_cst (vectype);
  tree len = vect_get_loop_len (loop_vinfo, gsi, lens, nvectors, vectype,
                                index, factor);
  tree bias = build_int_cst (intQI_type_node,
                             LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo));
  tree len_mask = make_temp_ssa_name (TREE_TYPE (stmt), NULL, "vec_len_mask");
  gcall *call = gimple_build_call_internal (IFN_VCOND_MASK_LEN, 5, stmt,
                                            all_one_mask, all_zero_mask, len,
                                            bias);
  gimple_call_set_lhs (call, len_mask);
  gsi_insert_before (cond_gsi, call, GSI_SAME_STMT);

  return len_mask;
}
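/* For example (SSA names invented), for a comparison mask mask__5 this emits
   something along the lines of

     vec_len_mask_7 = .VCOND_MASK_LEN (mask__5, { -1, ... }, { 0, ... },
                                       loop_len_9, 0);

   so lanes at or beyond the (bias-adjusted) length read as inactive.  */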
/* Scale profiling counters by estimation for LOOP which is vectorized
   by factor of VF.
   If FLAT is true, the loop we started with had unrealistically flat
   profile.  */
11946 scale_profile_for_vect_loop (class loop
*loop
, edge exit_e
, unsigned vf
, bool flat
)
11948 /* For flat profiles do not scale down proportionally by VF and only
11949 cap by known iteration count bounds. */
11952 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
11953 fprintf (dump_file
,
11954 "Vectorized loop profile seems flat; not scaling iteration "
11955 "count down by the vectorization factor %i\n", vf
);
11956 scale_loop_profile (loop
, profile_probability::always (),
11957 get_likely_max_loop_iterations_int (loop
));
11960 /* Loop body executes VF fewer times and exit increases VF times. */
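  /* For instance (counts invented): with a reliable profile, a header
     count of 4000 and VF = 4, the header count is scaled down to roughly
     1000 while the exit edge becomes about four times more likely; the
     preheader (entry) count is left unchanged.  */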
11961 profile_count entry_count
= loop_preheader_edge (loop
)->count ();
  /* If we have unreliable loop profile avoid dropping entry
     count below header count.  This can happen since loops
     have unrealistically low trip counts.  */
11967 && loop
->header
->count
> entry_count
11968 && loop
->header
->count
< entry_count
* vf
)
11970 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
11971 fprintf (dump_file
,
11972 "Vectorization factor %i seems too large for profile "
11973 "prevoiusly believed to be consistent; reducing.\n", vf
);
11977 if (entry_count
.nonzero_p ())
11978 set_edge_probability_and_rescale_others
11980 entry_count
.probability_in (loop
->header
->count
/ vf
));
11981 /* Avoid producing very large exit probability when we do not have
11982 sensible profile. */
11983 else if (exit_e
->probability
< profile_probability::always () / (vf
* 2))
11984 set_edge_probability_and_rescale_others (exit_e
, exit_e
->probability
* vf
);
11985 loop
->latch
->count
= single_pred_edge (loop
->latch
)->count ();
11987 scale_loop_profile (loop
, profile_probability::always () / vf
,
11988 get_likely_max_loop_iterations_int (loop
));
11991 /* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
11992 latch edge values originally defined by it. */
11995 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo
,
11996 stmt_vec_info def_stmt_info
)
11998 tree def
= gimple_get_lhs (vect_orig_stmt (def_stmt_info
)->stmt
);
11999 if (!def
|| TREE_CODE (def
) != SSA_NAME
)
12001 stmt_vec_info phi_info
;
12002 imm_use_iterator iter
;
12003 use_operand_p use_p
;
12004 FOR_EACH_IMM_USE_FAST (use_p
, iter
, def
)
12006 gphi
*phi
= dyn_cast
<gphi
*> (USE_STMT (use_p
));
12009 if (!(gimple_bb (phi
)->loop_father
->header
== gimple_bb (phi
)
12010 && (phi_info
= loop_vinfo
->lookup_stmt (phi
))
12011 && STMT_VINFO_RELEVANT_P (phi_info
)))
12013 loop_p loop
= gimple_bb (phi
)->loop_father
;
12014 edge e
= loop_latch_edge (loop
);
12015 if (PHI_ARG_DEF_FROM_EDGE (phi
, e
) != def
)
12018 if (VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info
))
12019 && STMT_VINFO_REDUC_TYPE (phi_info
) != FOLD_LEFT_REDUCTION
12020 && STMT_VINFO_REDUC_TYPE (phi_info
) != EXTRACT_LAST_REDUCTION
)
12022 vec
<gimple
*> &phi_defs
= STMT_VINFO_VEC_STMTS (phi_info
);
12023 vec
<gimple
*> &latch_defs
= STMT_VINFO_VEC_STMTS (def_stmt_info
);
12024 gcc_assert (phi_defs
.length () == latch_defs
.length ());
12025 for (unsigned i
= 0; i
< phi_defs
.length (); ++i
)
12026 add_phi_arg (as_a
<gphi
*> (phi_defs
[i
]),
12027 gimple_get_lhs (latch_defs
[i
]), e
,
12028 gimple_phi_arg_location (phi
, e
->dest_idx
));
12030 else if (STMT_VINFO_DEF_TYPE (phi_info
) == vect_first_order_recurrence
)
12032 /* For first order recurrences we have to update both uses of
12033 the latch definition, the one in the PHI node and the one
12034 in the generated VEC_PERM_EXPR. */
12035 vec
<gimple
*> &phi_defs
= STMT_VINFO_VEC_STMTS (phi_info
);
12036 vec
<gimple
*> &latch_defs
= STMT_VINFO_VEC_STMTS (def_stmt_info
);
12037 gcc_assert (phi_defs
.length () == latch_defs
.length ());
12038 tree phidef
= gimple_assign_rhs1 (phi_defs
[0]);
12039 gphi
*vphi
= as_a
<gphi
*> (SSA_NAME_DEF_STMT (phidef
));
12040 for (unsigned i
= 0; i
< phi_defs
.length (); ++i
)
12042 gassign
*perm
= as_a
<gassign
*> (phi_defs
[i
]);
12044 gimple_assign_set_rhs1 (perm
, gimple_get_lhs (latch_defs
[i
-1]));
12045 gimple_assign_set_rhs2 (perm
, gimple_get_lhs (latch_defs
[i
]));
12046 update_stmt (perm
);
12048 add_phi_arg (vphi
, gimple_get_lhs (latch_defs
.last ()), e
,
12049 gimple_phi_arg_location (phi
, e
->dest_idx
));
12054 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
12055 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
12059 vect_transform_loop_stmt (loop_vec_info loop_vinfo
, stmt_vec_info stmt_info
,
12060 gimple_stmt_iterator
*gsi
, stmt_vec_info
*seen_store
)
12062 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
12063 poly_uint64 vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
12065 if (dump_enabled_p ())
12066 dump_printf_loc (MSG_NOTE
, vect_location
,
12067 "------>vectorizing statement: %G", stmt_info
->stmt
);
12069 if (MAY_HAVE_DEBUG_BIND_STMTS
&& !STMT_VINFO_LIVE_P (stmt_info
))
12070 vect_loop_kill_debug_uses (loop
, stmt_info
);
12072 if (!STMT_VINFO_RELEVANT_P (stmt_info
)
12073 && !STMT_VINFO_LIVE_P (stmt_info
))
12075 if (is_gimple_call (stmt_info
->stmt
)
12076 && gimple_call_internal_p (stmt_info
->stmt
, IFN_MASK_CALL
))
12078 gcc_assert (!gimple_call_lhs (stmt_info
->stmt
));
12079 *seen_store
= stmt_info
;
12085 if (STMT_VINFO_VECTYPE (stmt_info
))
12088 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info
));
12089 if (!STMT_SLP_TYPE (stmt_info
)
12090 && maybe_ne (nunits
, vf
)
12091 && dump_enabled_p ())
12092 /* For SLP VF is set according to unrolling factor, and not
12093 to vector size, hence for SLP this print is not valid. */
12094 dump_printf_loc (MSG_NOTE
, vect_location
, "multiple-types.\n");
12097 /* Pure SLP statements have already been vectorized. We still need
12098 to apply loop vectorization to hybrid SLP statements. */
12099 if (PURE_SLP_STMT (stmt_info
))
12102 if (dump_enabled_p ())
12103 dump_printf_loc (MSG_NOTE
, vect_location
, "transform statement.\n");
12105 if (vect_transform_stmt (loop_vinfo
, stmt_info
, gsi
, NULL
, NULL
))
12106 *seen_store
= stmt_info
;
/* Helper function to pass to simplify_replace_tree to enable replacing trees
   in the hash_map with their corresponding values.  */

static tree
find_in_mapping (tree t, void *context)
{
  hash_map<tree, tree> *mapping = (hash_map<tree, tree> *) context;

  tree *value = mapping->get (t);
  return value ? *value : t;
}
12123 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
12124 original loop that has now been vectorized.
12126 The inits of the data_references need to be advanced with the number of
12127 iterations of the main loop. This has been computed in vect_do_peeling and
12128 is stored in parameter ADVANCE. We first restore the data_references
12129 initial offset with the values recored in ORIG_DRS_INIT.
12131 Since the loop_vec_info of this EPILOGUE was constructed for the original
12132 loop, its stmt_vec_infos all point to the original statements. These need
12133 to be updated to point to their corresponding copies as well as the SSA_NAMES
12134 in their PATTERN_DEF_SEQs and RELATED_STMTs.
12136 The data_reference's connections also need to be updated. Their
12137 corresponding dr_vec_info need to be reconnected to the EPILOGUE's
12138 stmt_vec_infos, their statements need to point to their corresponding copy,
12139 if they are gather loads or scatter stores then their reference needs to be
12140 updated to point to its corresponding copy. */
12143 update_epilogue_loop_vinfo (class loop
*epilogue
, tree advance
)
12145 loop_vec_info epilogue_vinfo
= loop_vec_info_for_loop (epilogue
);
12146 auto_vec
<gimple
*> stmt_worklist
;
12147 hash_map
<tree
,tree
> mapping
;
12148 gimple
*orig_stmt
, *new_stmt
;
12149 gimple_stmt_iterator epilogue_gsi
;
12150 gphi_iterator epilogue_phi_gsi
;
12151 stmt_vec_info stmt_vinfo
= NULL
, related_vinfo
;
12152 basic_block
*epilogue_bbs
= get_loop_body (epilogue
);
12155 free (LOOP_VINFO_BBS (epilogue_vinfo
));
12156 LOOP_VINFO_BBS (epilogue_vinfo
) = epilogue_bbs
;
12157 LOOP_VINFO_NBBS (epilogue_vinfo
) = epilogue
->num_nodes
;
12159 /* Advance data_reference's with the number of iterations of the previous
12160 loop and its prologue. */
12161 vect_update_inits_of_drs (epilogue_vinfo
, advance
, PLUS_EXPR
);
12164 /* The EPILOGUE loop is a copy of the original loop so they share the same
12165 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
12166 point to the copied statements. We also create a mapping of all LHS' in
12167 the original loop and all the LHS' in the EPILOGUE and create worklists to
12168 update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */
12169 for (unsigned i
= 0; i
< epilogue
->num_nodes
; ++i
)
12171 for (epilogue_phi_gsi
= gsi_start_phis (epilogue_bbs
[i
]);
12172 !gsi_end_p (epilogue_phi_gsi
); gsi_next (&epilogue_phi_gsi
))
12174 new_stmt
= epilogue_phi_gsi
.phi ();
12176 gcc_assert (gimple_uid (new_stmt
) > 0);
12178 = epilogue_vinfo
->stmt_vec_infos
[gimple_uid (new_stmt
) - 1];
12180 orig_stmt
= STMT_VINFO_STMT (stmt_vinfo
);
12181 STMT_VINFO_STMT (stmt_vinfo
) = new_stmt
;
12183 mapping
.put (gimple_phi_result (orig_stmt
),
12184 gimple_phi_result (new_stmt
));
12185 /* PHI nodes can not have patterns or related statements. */
12186 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo
) == NULL
12187 && STMT_VINFO_RELATED_STMT (stmt_vinfo
) == NULL
);
12190 for (epilogue_gsi
= gsi_start_bb (epilogue_bbs
[i
]);
12191 !gsi_end_p (epilogue_gsi
); gsi_next (&epilogue_gsi
))
12193 new_stmt
= gsi_stmt (epilogue_gsi
);
12194 if (is_gimple_debug (new_stmt
))
12197 gcc_assert (gimple_uid (new_stmt
) > 0);
12199 = epilogue_vinfo
->stmt_vec_infos
[gimple_uid (new_stmt
) - 1];
12201 orig_stmt
= STMT_VINFO_STMT (stmt_vinfo
);
12202 STMT_VINFO_STMT (stmt_vinfo
) = new_stmt
;
12204 if (tree old_lhs
= gimple_get_lhs (orig_stmt
))
12205 mapping
.put (old_lhs
, gimple_get_lhs (new_stmt
));
12207 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo
))
12209 gimple_seq seq
= STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo
);
12210 for (gimple_stmt_iterator gsi
= gsi_start (seq
);
12211 !gsi_end_p (gsi
); gsi_next (&gsi
))
12212 stmt_worklist
.safe_push (gsi_stmt (gsi
));
12215 related_vinfo
= STMT_VINFO_RELATED_STMT (stmt_vinfo
);
12216 if (related_vinfo
!= NULL
&& related_vinfo
!= stmt_vinfo
)
12218 gimple
*stmt
= STMT_VINFO_STMT (related_vinfo
);
12219 stmt_worklist
.safe_push (stmt
);
12220 /* Set BB such that the assert in
12221 'get_initial_def_for_reduction' is able to determine that
12222 the BB of the related stmt is inside this loop. */
12223 gimple_set_bb (stmt
,
12224 gimple_bb (new_stmt
));
12225 related_vinfo
= STMT_VINFO_RELATED_STMT (related_vinfo
);
12226 gcc_assert (related_vinfo
== NULL
12227 || related_vinfo
== stmt_vinfo
);
12232 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
12233 using the original main loop and thus need to be updated to refer to the
12234 cloned variables used in the epilogue. */
12235 for (unsigned i
= 0; i
< stmt_worklist
.length (); ++i
)
12237 gimple
*stmt
= stmt_worklist
[i
];
12240 for (unsigned j
= 1; j
< gimple_num_ops (stmt
); ++j
)
12242 tree op
= gimple_op (stmt
, j
);
12243 if ((new_op
= mapping
.get(op
)))
12244 gimple_set_op (stmt
, j
, *new_op
);
12247 /* PR92429: The last argument of simplify_replace_tree disables
12248 folding when replacing arguments. This is required as
12249 otherwise you might end up with different statements than the
12250 ones analyzed in vect_loop_analyze, leading to different
12252 op
= simplify_replace_tree (op
, NULL_TREE
, NULL_TREE
,
12253 &find_in_mapping
, &mapping
, false);
12254 gimple_set_op (stmt
, j
, op
);
12259 struct data_reference
*dr
;
12260 vec
<data_reference_p
> datarefs
= LOOP_VINFO_DATAREFS (epilogue_vinfo
);
12261 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
12263 orig_stmt
= DR_STMT (dr
);
12264 gcc_assert (gimple_uid (orig_stmt
) > 0);
12265 stmt_vinfo
= epilogue_vinfo
->stmt_vec_infos
[gimple_uid (orig_stmt
) - 1];
12266 /* Data references for gather loads and scatter stores do not use the
12267 updated offset we set using ADVANCE. Instead we have to make sure the
12268 reference in the data references point to the corresponding copy of
12269 the original in the epilogue. Make sure to update both
12270 gather/scatters recognized by dataref analysis and also other
12271 refs that get_load_store_type classified as VMAT_GATHER_SCATTER. */
12272 auto vstmt_vinfo
= vect_stmt_to_vectorize (stmt_vinfo
);
12273 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vstmt_vinfo
) == VMAT_GATHER_SCATTER
12274 || STMT_VINFO_GATHER_SCATTER_P (vstmt_vinfo
))
12277 = simplify_replace_tree (DR_REF (dr
), NULL_TREE
, NULL_TREE
,
12278 &find_in_mapping
, &mapping
);
12279 DR_BASE_ADDRESS (dr
)
12280 = simplify_replace_tree (DR_BASE_ADDRESS (dr
), NULL_TREE
, NULL_TREE
,
12281 &find_in_mapping
, &mapping
);
12283 DR_STMT (dr
) = STMT_VINFO_STMT (stmt_vinfo
);
12284 stmt_vinfo
->dr_aux
.stmt
= stmt_vinfo
;
12287 epilogue_vinfo
->shared
->datarefs_copy
.release ();
12288 epilogue_vinfo
->shared
->save_datarefs ();
/* When vectorizing early break statements, instructions that happen before
   the early break in the current BB need to be moved to after the early
   break.  This function deals with that and assumes that any validity
   checks have already been performed.

   While moving the instructions, if it encounters a VUSE or VDEF it then
   corrects the VUSES as it moves the statements along.  GDEST is the location
   in which to insert the new statements.  */
static void
move_early_exit_stmts (loop_vec_info loop_vinfo)
{
  DUMP_VECT_SCOPE ("move_early_exit_stmts");

  if (LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo).is_empty ())
    return;

  /* Move all stmts that need moving.  */
  basic_block dest_bb = LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo);
  gimple_stmt_iterator dest_gsi = gsi_after_labels (dest_bb);

  tree last_seen_vuse = NULL_TREE;
  for (gimple *stmt : LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo))
    {
      /* We have to update crossed degenerate virtual PHIs.  Simply
	 elide them.  */
      if (gphi *vphi = dyn_cast <gphi *> (stmt))
	{
	  tree vdef = gimple_phi_result (vphi);
	  tree vuse = gimple_phi_arg_def (vphi, 0);
	  imm_use_iterator iter;
	  use_operand_p use_p;
	  gimple *use_stmt;
	  FOR_EACH_IMM_USE_STMT (use_stmt, iter, vdef)
	    {
	      FOR_EACH_IMM_USE_ON_STMT (use_p, iter)
		SET_USE (use_p, vuse);
	    }
	  auto gsi = gsi_for_stmt (stmt);
	  remove_phi_node (&gsi, true);
	  last_seen_vuse = vuse;
	  continue;
	}

      /* Check to see if the statement is still required for vect or has been
	 elided.  */
      auto stmt_info = loop_vinfo->lookup_stmt (stmt);
      if (!stmt_info)
	continue;

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "moving stmt %G", stmt);

      gimple_stmt_iterator stmt_gsi = gsi_for_stmt (stmt);
      gsi_move_before (&stmt_gsi, &dest_gsi, GSI_NEW_STMT);
      last_seen_vuse = gimple_vuse (stmt);
    }

  /* Update all the stmts with their new reaching VUSES.  */
  for (auto p : LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "updating vuse to %T for load %G",
			 last_seen_vuse, p);
      gimple_set_vuse (p, last_seen_vuse);
      update_stmt (p);
    }

  /* And update the LC PHIs on exits.  */
  for (edge e : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
    if (!dominated_by_p (CDI_DOMINATORS, e->src, dest_bb))
      if (gphi *phi = get_virtual_phi (e->dest))
	SET_PHI_ARG_DEF_ON_EDGE (phi, e, last_seen_vuse);
}

/* Function vect_transform_loop.

   The analysis phase has determined that the loop is vectorizable.
   Vectorize the loop - create vectorized stmts to replace the scalar
   stmts in the loop, and update the loop exit condition.
   Returns the scalar epilogue loop if any.  */

class loop *
vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  class loop *epilogue = NULL;
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  int i;
  tree niters_vector = NULL_TREE;
  tree step_vector = NULL_TREE;
  tree niters_vector_mult_vf = NULL_TREE;
  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  unsigned int lowest_vf = constant_lower_bound (vf);
  gimple *stmt;
  bool check_profitability = false;
  unsigned int th;
  bool flat = maybe_flat_loop_profile (loop);

  DUMP_VECT_SCOPE ("vec_transform_loop");

  loop_vinfo->shared->check_datarefs ();

  /* Use the more conservative vectorization threshold.  If the number
     of iterations is constant assume the cost check has been performed
     by our caller.  If the threshold makes all loops profitable that
     run at least the (estimated) vectorization factor number of times
     checking is pointless, too.  */
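  /* Illustrative sketch of the effect (not literal generated code): when the
     runtime check is kept, the peeling/versioning code guards the vector
     loop roughly as

       if (niters >= th)
	 ... vectorized loop ...
       else
	 ... scalar loop ...

     whereas for a compile-time constant iteration count the caller has
     already decided profitability and no such check is emitted.  */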
  th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
  if (vect_apply_runtime_profitability_check_p (loop_vinfo))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Profitability threshold is %d loop iterations.\n",
			 th);
      check_profitability = true;
    }

  /* Make sure there exists a single-predecessor exit bb.  Do this before
     versioning.  */
  edge e = LOOP_VINFO_IV_EXIT (loop_vinfo);
  if (! single_pred_p (e->dest) && !LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
    {
      split_loop_exit_edge (e, true);
      if (dump_enabled_p ())
	dump_printf (MSG_NOTE, "split exit edge\n");
    }

  /* Version the loop first, if required, so the profitability check
     comes first.  */
  if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
    {
      class loop *sloop
	= vect_loop_versioning (loop_vinfo, loop_vectorized_call);
      sloop->force_vectorize = false;
      check_profitability = false;
    }

  /* Make sure there exists a single-predecessor exit bb also on the
     scalar loop copy.  Do this after versioning but before peeling
     so CFG structure is fine for both scalar and if-converted loop
     to make slpeel_duplicate_current_defs_from_edges face matched
     loop closed PHI nodes on the exit.  */
  if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
    {
      e = LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo);
      if (! single_pred_p (e->dest))
	{
	  split_loop_exit_edge (e, true);
	  if (dump_enabled_p ())
	    dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
	}
    }

  tree niters = vect_build_loop_niters (loop_vinfo);
  LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
  tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
  bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
  tree advance;
  drs_init_vec orig_drs_init;

  epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
			      &step_vector, &niters_vector_mult_vf, th,
			      check_profitability, niters_no_overflow,
			      &advance);
  if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
      && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
    {
      /* Ifcvt duplicates loop preheader, loop body and produces a basic
	 block after loop exit.  We need to scale all that.  */
      basic_block preheader
	= loop_preheader_edge (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))->src;
      preheader->count
	= preheader->count.apply_probability
	    (LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
      scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
			      LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
      LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo)->dest->count = preheader->count;
    }

  if (niters_vector == NULL_TREE)
    {
      if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
	  && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
	  && known_eq (lowest_vf, vf))
	{
	  niters_vector
	    = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
			     LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
	  step_vector = build_one_cst (TREE_TYPE (niters));
	}
      else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
	vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
				     &step_vector, niters_no_overflow);
      else
	/* vect_do_peeling subtracted the number of peeled prologue
	   iterations from LOOP_VINFO_NITERS.  */
	vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
				     &niters_vector, &step_vector,
				     niters_no_overflow);
    }
  /* 1) Make sure the loop header has exactly two entries
     2) Make sure we have a preheader basic block.  */

  gcc_assert (EDGE_COUNT (loop->header->preds) == 2);

  split_edge (loop_preheader_edge (loop));

  if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
    /* This will deal with any possible peeling.  */
    vect_prepare_for_masked_peels (loop_vinfo);

  /* Handle any code motion that we need to for early-break vectorization
     after we've done peeling but just before we start vectorizing.  */
  if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
    move_early_exit_stmts (loop_vinfo);

  /* Schedule the SLP instances first, then handle loop vectorization
     below.  */
  if (!loop_vinfo->slp_instances.is_empty ())
    {
      DUMP_VECT_SCOPE ("scheduling SLP instances");
      vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
    }

  /* Generate the loop invariant statements.  */
  if (!gimple_seq_empty_p (LOOP_VINFO_INV_PATTERN_DEF_SEQ (loop_vinfo)))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "------>generating loop invariant statements\n");
      gimple_stmt_iterator gsi;
      gsi = gsi_after_labels (loop_preheader_edge (loop)->src);
      gsi_insert_seq_before (&gsi, LOOP_VINFO_INV_PATTERN_DEF_SEQ (loop_vinfo),
			     GSI_CONTINUE_LINKING);
    }

  /* FORNOW: the vectorizer supports only loops whose body consists
     of one basic block (header + empty latch).  When the vectorizer
     supports more involved loop forms, the order in which the BBs are
     traversed needs to be reconsidered.  */
  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];
      stmt_vec_info stmt_info;

      for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
	   gsi_next (&si))
	{
	  gphi *phi = si.phi ();
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "------>vectorizing phi: %G", (gimple *) phi);
	  stmt_info = loop_vinfo->lookup_stmt (phi);
	  if (!stmt_info)
	    continue;

	  if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
	    vect_loop_kill_debug_uses (loop, stmt_info);

	  if (!STMT_VINFO_RELEVANT_P (stmt_info)
	      && !STMT_VINFO_LIVE_P (stmt_info))
	    continue;

	  if (STMT_VINFO_VECTYPE (stmt_info)
	      && (maybe_ne
		  (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
	      && dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");

	  if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence
	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
	      && ! PURE_SLP_STMT (stmt_info))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
	      vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
	    }
	}

      for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
	   gsi_next (&si))
	{
	  gphi *phi = si.phi ();
	  stmt_info = loop_vinfo->lookup_stmt (phi);
	  if (!stmt_info)
	    continue;

	  if (!STMT_VINFO_RELEVANT_P (stmt_info)
	      && !STMT_VINFO_LIVE_P (stmt_info))
	    continue;

	  if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence)
	      && ! PURE_SLP_STMT (stmt_info))
	    maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
	}

      for (gimple_stmt_iterator si = gsi_start_bb (bb);
	   !gsi_end_p (si);)
	{
	  stmt = gsi_stmt (si);
	  /* During vectorization remove existing clobber stmts and
	     prefetches.  */
	  if (gimple_clobber_p (stmt)
	      || gimple_call_builtin_p (stmt, BUILT_IN_PREFETCH))
	    {
	      unlink_stmt_vdef (stmt);
	      gsi_remove (&si, true);
	      release_defs (stmt);
	    }
	  else
	    {
	      /* Ignore vector stmts created in the outer loop.  */
	      stmt_info = loop_vinfo->lookup_stmt (stmt);

	      /* vector stmts created in the outer-loop during vectorization of
		 stmts in an inner-loop may not have a stmt_info, and do not
		 need to be vectorized.  */
	      stmt_vec_info seen_store = NULL;
	      if (stmt_info)
		{
		  if (STMT_VINFO_IN_PATTERN_P (stmt_info))
		    {
		      gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
		      for (gimple_stmt_iterator subsi = gsi_start (def_seq);
			   !gsi_end_p (subsi); gsi_next (&subsi))
			{
			  stmt_vec_info pat_stmt_info
			    = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
			  vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
						    &si, &seen_store);
			}
		      stmt_vec_info pat_stmt_info
			= STMT_VINFO_RELATED_STMT (stmt_info);
		      if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
						    &si, &seen_store))
			maybe_set_vectorized_backedge_value (loop_vinfo,
							     pat_stmt_info);
		    }
		  else
		    {
		      if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
						    &seen_store))
			maybe_set_vectorized_backedge_value (loop_vinfo,
							     stmt_info);
		    }
		}
	      gsi_next (&si);
	      if (seen_store)
		{
		  if (STMT_VINFO_GROUPED_ACCESS (seen_store))
		    /* Interleaving.  If IS_STORE is TRUE, the
		       vectorization of the interleaving chain was
		       completed - free all the stores in the chain.  */
		    vect_remove_stores (loop_vinfo,
					DR_GROUP_FIRST_ELEMENT (seen_store));
		  else
		    /* Free the attached stmt_vec_info and remove the stmt.  */
		    loop_vinfo->remove_stmt (stmt_info);
		}
	    }
	}

      /* Stub out scalar statements that must not survive vectorization.
	 Doing this here helps with grouped statements, or statements that
	 are involved in patterns.  */
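      /* Illustrative sketch of the stubbing below (SSA names invented): a
	 scalar call left over from pattern/group handling such as

	   _5 = .MASK_LOAD (ptr_1, 8B, mask_3);

	 becomes "_5 = 0;", and a scalar conditional internal function like

	   _7 = .COND_ADD (mask_3, _2, _4, else_6);

	 becomes "_7 = else_6;", since only the vector forms survive.  */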
      for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
	   !gsi_end_p (gsi); gsi_next (&gsi))
	{
	  gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
	  if (!call || !gimple_call_internal_p (call))
	    continue;
	  internal_fn ifn = gimple_call_internal_fn (call);
	  if (ifn == IFN_MASK_LOAD)
	    {
	      tree lhs = gimple_get_lhs (call);
	      if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
		{
		  tree zero = build_zero_cst (TREE_TYPE (lhs));
		  gimple *new_stmt = gimple_build_assign (lhs, zero);
		  gsi_replace (&gsi, new_stmt, true);
		}
	    }
	  else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
	    {
	      tree lhs = gimple_get_lhs (call);
	      if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
		{
		  tree else_arg
		    = gimple_call_arg (call, gimple_call_num_args (call) - 1);
		  gimple *new_stmt = gimple_build_assign (lhs, else_arg);
		  gsi_replace (&gsi, new_stmt, true);
		}
	    }
	}
    }				/* BBs in loop */

  /* The vectorization factor is always > 1, so if we use an IV increment
     of 1, a zero NITERS becomes a nonzero NITERS_VECTOR.  */
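  /* One way to read this (illustrative only): with a 32-bit IV and VF 4, a
     scalar trip count of 2^32 wraps NITERS to 0, but NITERS_VECTOR is 2^30,
     which is representable and nonzero, so the IV exit test cannot be
     confused by the overflow.  */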
  if (integer_onep (step_vector))
    niters_no_overflow = true;
  vect_set_loop_condition (loop, LOOP_VINFO_IV_EXIT (loop_vinfo), loop_vinfo,
			   niters_vector, step_vector, niters_vector_mult_vf,
			   !niters_no_overflow);

  unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);

  /* True if the final iteration might not handle a full vector's
     worth of scalar iterations.  */
  bool final_iter_may_be_partial
    = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
      || LOOP_VINFO_EARLY_BREAKS (loop_vinfo);

  /* +1 to convert latch counts to loop iteration counts.  */
  int bias_for_lowest = 1;

  /* When we are peeling for gaps then we take away one scalar iteration
     from the vector loop.  Thus we can adjust the upper bound by one
     scalar iteration.  But only when we know the bound applies to the
     IV exit test which might not be true when we have multiple exits.  */
  if (!LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
    bias_for_lowest -= LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;

  int bias_for_assumed = bias_for_lowest;
  int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
  if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
    {
      /* When the amount of peeling is known at compile time, the first
	 iteration will have exactly alignment_npeels active elements.
	 In the worst case it will have at least one.  */
      int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
      bias_for_lowest += lowest_vf - min_first_active;
      bias_for_assumed += assumed_vf - min_first_active;
    }
  /* In these calculations the "- 1" converts loop iteration counts
     back to latch counts.  */
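  /* Worked example (illustrative): with VF 4, no peeling for gaps and a
     known latch bound of 11 (at most 12 scalar iterations), bias_for_lowest
     is 1 and the new vector latch bound below is
     floor ((11 + 1) / 4) - 1 = 2, i.e. at most 3 vector iterations covering
     the 12 scalar ones.  */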
  if (loop->any_upper_bound)
    {
      loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
      loop->nb_iterations_upper_bound
	= (final_iter_may_be_partial
	   ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
			    lowest_vf) - 1
	   : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
			     lowest_vf) - 1);
      if (main_vinfo
	  /* Both peeling for alignment and peeling for gaps can end up
	     with the scalar epilogue running for more than VF-1 iterations.  */
	  && !main_vinfo->peeling_for_alignment
	  && !main_vinfo->peeling_for_gaps)
	{
	  unsigned int bound;
	  poly_uint64 main_iters
	    = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
			   LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
	  main_iters
	    = upper_bound (main_iters,
			   LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
	  if (can_div_away_from_zero_p (main_iters,
					LOOP_VINFO_VECT_FACTOR (loop_vinfo),
					&bound))
	    loop->nb_iterations_upper_bound
	      = wi::umin ((bound_wide_int) (bound - 1),
			  loop->nb_iterations_upper_bound);
	}
    }

  if (loop->any_likely_upper_bound)
    loop->nb_iterations_likely_upper_bound
      = (final_iter_may_be_partial
	 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
			  + bias_for_lowest, lowest_vf) - 1
	 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
			   + bias_for_lowest, lowest_vf) - 1);
  if (loop->any_estimate)
    loop->nb_iterations_estimate
      = (final_iter_may_be_partial
	 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
			  assumed_vf) - 1
	 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
			   assumed_vf) - 1);
  scale_profile_for_vect_loop (loop, LOOP_VINFO_IV_EXIT (loop_vinfo),
			       assumed_vf, flat);

  if (dump_enabled_p ())
    {
      if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
	{
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "LOOP VECTORIZED\n");
	  if (loop->inner)
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "OUTER LOOP VECTORIZED\n");
	  dump_printf (MSG_NOTE, "\n");
	}
      else
	dump_printf_loc (MSG_NOTE, vect_location,
			 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
			 GET_MODE_NAME (loop_vinfo->vector_mode));
    }

  /* Loops vectorized with a variable factor won't benefit from
     unrolling/peeling.  */
  if (!vf.is_constant ())
    {
      loop->unroll = 1;
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
			 " variable-length vectorization factor\n");
    }
  /* Free SLP instances here because otherwise stmt reference counting
     won't work.  */
  slp_instance instance;
  FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
    vect_free_slp_instance (instance);
  LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
  /* Clear the safelen field since its value is invalid after vectorization,
     as the vectorized loop can have loop-carried dependencies.  */
  loop->safelen = 0;

  if (epilogue)
    {
      update_epilogue_loop_vinfo (epilogue, advance);

      epilogue->simduid = loop->simduid;
      epilogue->force_vectorize = loop->force_vectorize;
      epilogue->dont_vectorize = false;
    }

  return epilogue;
}

/* The code below performs a simple optimization: revert if-conversion for
   masked stores, i.e. if the mask of a store is zero do not perform it,
   and likewise skip all stored-value producers where possible.
   For example,

     for (i=0; i<n; i++)
       if (c[i])
	{
	  p1[i] += 1;
	  p2[i] = p3[i] + 2;
	}

   this transformation will produce the following semi-hammock:

   if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
     {
       vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
       vect__12.22_172 = vect__11.19_170 + vect_cst__171;
       MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
       vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
       vect__19.28_184 = vect__18.25_182 + vect_cst__183;
       MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
     }
  */
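
/* For contrast (illustrative, not a literal dump): before this optimization
   the MASK_LOAD/MASK_STORE sequence above executes on every vector iteration
   even when mask__ifc__42.18_165 is all-zero; the guard introduced below
   simply branches around the whole block in that case.  */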
void
optimize_mask_stores (class loop *loop)
{
  basic_block *bbs = get_loop_body (loop);
  unsigned nbbs = loop->num_nodes;
  unsigned i;
  basic_block bb;
  class loop *bb_loop;
  gimple_stmt_iterator gsi;
  gimple *stmt;
  auto_vec<gimple *> worklist;
  auto_purge_vect_location sentinel;

  vect_location = find_loop_location (loop);
  /* Pick up all masked stores in loop if any.  */
  for (i = 0; i < nbbs; i++)
    {
      bb = bbs[i];
      for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
	   gsi_next (&gsi))
	{
	  stmt = gsi_stmt (gsi);
	  if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
	    worklist.safe_push (stmt);
	}
    }

  free (bbs);
  if (worklist.is_empty ())
    return;

  /* Loop has masked stores.  */
  while (!worklist.is_empty ())
    {
      gimple *last, *last_store;
      edge e, efalse;
      tree mask;
      basic_block store_bb, join_bb;
      gimple_stmt_iterator gsi_to;
      tree vdef, new_vdef;
      gphi *phi;
      tree vectype;
      tree zero;

      last = worklist.pop ();
      mask = gimple_call_arg (last, 2);
      bb = gimple_bb (last);
      /* Create then_bb and if-then structure in CFG, then_bb belongs to
	 the same loop as if_bb.  It could be different to LOOP when two
	 level loop-nest is vectorized and mask_store belongs to the inner
	 one.  */
      e = split_block (bb, last);
      bb_loop = bb->loop_father;
      gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
      join_bb = e->dest;
      store_bb = create_empty_bb (bb);
      add_bb_to_loop (store_bb, bb_loop);
      e->flags = EDGE_TRUE_VALUE;
      efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
      /* Put STORE_BB to likely part.  */
      efalse->probability = profile_probability::likely ();
      e->probability = efalse->probability.invert ();
      store_bb->count = efalse->count ();
      make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
      if (dom_info_available_p (CDI_DOMINATORS))
	set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Create new block %d to sink mask stores.",
			 store_bb->index);
      /* Create vector comparison with boolean result.  */
      vectype = TREE_TYPE (mask);
      zero = build_zero_cst (vectype);
      stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
      gsi = gsi_last_bb (bb);
      gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
      /* Create new PHI node for vdef of the last masked store:
	 .MEM_2 = VDEF <.MEM_1>
	 will be converted to
	 .MEM.3 = VDEF <.MEM_1>
	 and new PHI node will be created in join bb
	 .MEM_2 = PHI <.MEM_1, .MEM_3>
      */
      vdef = gimple_vdef (last);
      new_vdef = make_ssa_name (gimple_vop (cfun), last);
      gimple_set_vdef (last, new_vdef);
      phi = create_phi_node (vdef, join_bb);
      add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);

      /* Put all masked stores with the same mask to STORE_BB if possible.  */
      while (true)
	{
	  gimple_stmt_iterator gsi_from;
	  gimple *stmt1 = NULL;

	  /* Move masked store to STORE_BB.  */
	  last_store = last;
	  gsi = gsi_for_stmt (last);
	  gsi_from = gsi;
	  /* Shift GSI to the previous stmt for further traversal.  */
	  gsi_prev (&gsi);
	  gsi_to = gsi_start_bb (store_bb);
	  gsi_move_before (&gsi_from, &gsi_to);
	  /* Setup GSI_TO to the non-empty block start.  */
	  gsi_to = gsi_start_bb (store_bb);
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Move stmt to created bb\n%G", last);
	  /* Move all stored value producers if possible.  */
	  while (!gsi_end_p (gsi))
	    {
	      tree lhs;
	      imm_use_iterator imm_iter;
	      use_operand_p use_p;
	      bool res;

	      /* Skip debug statements.  */
	      if (is_gimple_debug (gsi_stmt (gsi)))
		{
		  gsi_prev (&gsi);
		  continue;
		}
	      stmt1 = gsi_stmt (gsi);
	      /* Do not consider statements writing to memory or having
		 volatile operand.  */
	      if (gimple_vdef (stmt1)
		  || gimple_has_volatile_ops (stmt1))
		break;
	      gsi_from = gsi;
	      gsi_prev (&gsi);
	      lhs = gimple_get_lhs (stmt1);
	      if (!lhs)
		break;

	      /* LHS of vectorized stmt must be SSA_NAME.  */
	      if (TREE_CODE (lhs) != SSA_NAME)
		break;

	      if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
		{
		  /* Remove dead scalar statement.  */
		  if (has_zero_uses (lhs))
		    {
		      gsi_remove (&gsi_from, true);
		      release_defs (stmt1);
		      continue;
		    }
		  break;
		}

	      /* Check that LHS does not have uses outside of STORE_BB.  */
	      res = true;
	      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
		{
		  gimple *use_stmt;
		  use_stmt = USE_STMT (use_p);
		  if (is_gimple_debug (use_stmt))
		    continue;
		  if (gimple_bb (use_stmt) != store_bb)
		    {
		      res = false;
		      break;
		    }
		}
	      if (!res)
		break;

	      if (gimple_vuse (stmt1)
		  && gimple_vuse (stmt1) != gimple_vuse (last_store))
		break;

	      /* Can move STMT1 to STORE_BB.  */
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "Move stmt to created bb\n%G", stmt1);
	      gsi_move_before (&gsi_from, &gsi_to);
	      /* Shift GSI_TO for further insertion.  */
	      gsi_prev (&gsi_to);
	    }
	  /* Put other masked stores with the same mask to STORE_BB.  */
	  if (worklist.is_empty ()
	      || gimple_call_arg (worklist.last (), 2) != mask
	      || worklist.last () != stmt1)
	    break;
	  last = worklist.pop ();
	}
      add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
    }
}

/* Decide whether it is possible to use a zero-based induction variable
   when vectorizing LOOP_VINFO with partial vectors.  If it is, return
   the value that the induction variable must be able to hold in order
   to ensure that the rgroups eventually have no active vector elements.
   Return -1 otherwise.  */

widest_int
vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
{
  tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);

  /* Calculate the value that the induction variable must be able
     to hit in order to ensure that we end the loop with an all-false mask.
     This involves adding the maximum number of inactive trailing scalar
     iterations.  */
  widest_int iv_limit = -1;
  if (max_loop_iterations (loop, &iv_limit))
    {
      if (niters_skip)
	{
	  /* Add the maximum number of skipped iterations to the
	     maximum iteration count.  */
	  if (TREE_CODE (niters_skip) == INTEGER_CST)
	    iv_limit += wi::to_widest (niters_skip);
	  else
	    iv_limit += max_vf - 1;
	}
      else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
	/* Make a conservatively-correct assumption.  */
	iv_limit += max_vf - 1;

      /* IV_LIMIT is the maximum number of latch iterations, which is also
	 the maximum in-range IV value.  Round this value down to the previous
	 vector alignment boundary and then add an extra full iteration.  */
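      /* Worked example (illustrative): if IV_LIMIT is 1001 at this point and
	 VF is 4 (so known_alignment (vf) == 4) with max_vf 4, the computation
	 below yields (1001 & -4) + 4 = 1000 + 4 = 1004.  */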
      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
      iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
    }
  return iv_limit;
}

/* For the given rgroup_controls RGC, check whether an induction variable
   would ever hit a value that produces a set of all-false masks or zero
   lengths before wrapping around.  Return true if it's possible to wrap
   around before hitting the desirable value, otherwise return false.  */
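/* Illustrative numbers: with an IV limit of 1004 and NITEMS of 4, the
   product 4016 needs 12 bits, so a 16-bit compare type is safe, while an
   8-bit one could wrap before all lanes become inactive.  */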
bool
vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
{
  widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);

  if (iv_limit == -1)
    return true;

  tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
  unsigned int compare_precision = TYPE_PRECISION (compare_type);
  unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;

  if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)