gcc/tree-vect-loop.cc
1 /* Loop Vectorization
2 Copyright (C) 2003-2024 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #define INCLUDE_ALGORITHM
23 #define INCLUDE_MEMORY
24 #include "config.h"
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "gimple.h"
32 #include "cfghooks.h"
33 #include "tree-pass.h"
34 #include "ssa.h"
35 #include "optabs-tree.h"
36 #include "memmodel.h"
37 #include "optabs.h"
38 #include "diagnostic-core.h"
39 #include "fold-const.h"
40 #include "stor-layout.h"
41 #include "cfganal.h"
42 #include "gimplify.h"
43 #include "gimple-iterator.h"
44 #include "gimplify-me.h"
45 #include "tree-ssa-loop-ivopts.h"
46 #include "tree-ssa-loop-manip.h"
47 #include "tree-ssa-loop-niter.h"
48 #include "tree-ssa-loop.h"
49 #include "cfgloop.h"
50 #include "tree-scalar-evolution.h"
51 #include "tree-vectorizer.h"
52 #include "gimple-fold.h"
53 #include "cgraph.h"
54 #include "tree-cfg.h"
55 #include "tree-if-conv.h"
56 #include "internal-fn.h"
57 #include "tree-vector-builder.h"
58 #include "vec-perm-indices.h"
59 #include "tree-eh.h"
60 #include "case-cfn-macros.h"
61 #include "langhooks.h"
63 /* Loop Vectorization Pass.
65 This pass tries to vectorize loops.
67 For example, the vectorizer transforms the following simple loop:
69 short a[N]; short b[N]; short c[N]; int i;
71 for (i=0; i<N; i++){
72 a[i] = b[i] + c[i];
75 as if it was manually vectorized by rewriting the source code into:
77 typedef int __attribute__((mode(V8HI))) v8hi;
78 short a[N]; short b[N]; short c[N]; int i;
79 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
80 v8hi va, vb, vc;
82 for (i=0; i<N/8; i++){
83 vb = pb[i];
84 vc = pc[i];
85 va = vb + vc;
86 pa[i] = va;
89 The main entry to this pass is vectorize_loops(), in which
90 the vectorizer applies a set of analyses on a given set of loops,
91 followed by the actual vectorization transformation for the loops that
92 had successfully passed the analysis phase.
93 Throughout this pass we make a distinction between two types of
94 data: scalars (which are represented by SSA_NAMES), and memory references
95 ("data-refs"). These two types of data require different handling both
96 during analysis and transformation. The types of data-refs that the
97 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
98 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
99 accesses are required to have a simple (consecutive) access pattern.
101 Analysis phase:
102 ===============
103 The driver for the analysis phase is vect_analyze_loop().
104 It applies a set of analyses, some of which rely on the scalar evolution
105 analyzer (scev) developed by Sebastian Pop.
107 During the analysis phase the vectorizer records some information
108 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
109 loop, as well as general information about the loop as a whole, which is
110 recorded in a "loop_vec_info" struct attached to each loop.
112 Transformation phase:
113 =====================
114 The loop transformation phase scans all the stmts in the loop, and
115 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
116 the loop that needs to be vectorized. It inserts the vector code sequence
117 just before the scalar stmt S, and records a pointer to the vector code
118 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
119 attached to S). This pointer will be used for the vectorization of following
120 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
121 otherwise, we rely on dead code elimination for removing it.
123 For example, say stmt S1 was vectorized into stmt VS1:
125 VS1: vb = px[i];
126 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
127 S2: a = b;
129 To vectorize stmt S2, the vectorizer first finds the stmt that defines
130 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
131 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
132 resulting sequence would be:
134 VS1: vb = px[i];
135 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
136 VS2: va = vb;
137 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
139 Operands that are not SSA_NAMEs are data-refs that appear in
140 load/store operations (like 'x[i]' in S1), and are handled differently.
142 Target modeling:
143 =================
144 Currently the only target specific information that is used is the
145 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
146 Targets that can support different sizes of vectors will, for now, need
147 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
148 flexibility will be added in the future.
150 Since we only vectorize operations whose vector form can be
151 expressed using existing tree codes, to verify that an operation is
152 supported, the vectorizer checks the relevant optab at the relevant
153 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
154 the value found is CODE_FOR_nothing, then there's no target support, and
155 we can't vectorize the stmt.
157 For additional information on this project see:
158 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
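/* Editorial illustration (a sketch, not part of the pass): the support
   check described above boils down to an optab query along the lines of

       if (optab_handler (add_optab, V8HImode) == CODE_FOR_nothing)
         return false;    -- no target support, the stmt can't be vectorized

   where add_optab and V8HImode are simply the illustrative operation and
   vector mode mentioned in the comment above.  */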
161 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
162 unsigned *);
163 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
164 bool *, bool *, bool);
166 /* Subroutine of vect_determine_vf_for_stmt that handles only one
167 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
168 may already be set for general statements (not just data refs). */
170 static opt_result
171 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
172 bool vectype_maybe_set_p,
173 poly_uint64 *vf)
175 gimple *stmt = stmt_info->stmt;
177 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
178 && !STMT_VINFO_LIVE_P (stmt_info))
179 || gimple_clobber_p (stmt))
181 if (dump_enabled_p ())
182 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
183 return opt_result::success ();
186 tree stmt_vectype, nunits_vectype;
187 opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
188 &stmt_vectype,
189 &nunits_vectype);
190 if (!res)
191 return res;
193 if (stmt_vectype)
195 if (STMT_VINFO_VECTYPE (stmt_info))
196 /* The only case when a vectype had been already set is for stmts
197 that contain a data ref, or for "pattern-stmts" (stmts generated
198 by the vectorizer to represent/replace a certain idiom). */
199 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
200 || vectype_maybe_set_p)
201 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
202 else
203 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
206 if (nunits_vectype)
207 vect_update_max_nunits (vf, nunits_vectype);
209 return opt_result::success ();
212 /* Subroutine of vect_determine_vectorization_factor. Set the vector
213 types of STMT_INFO and all attached pattern statements and update
214 the vectorization factor VF accordingly. Return true on success
215 or false if something prevented vectorization. */
217 static opt_result
218 vect_determine_vf_for_stmt (vec_info *vinfo,
219 stmt_vec_info stmt_info, poly_uint64 *vf)
221 if (dump_enabled_p ())
222 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
223 stmt_info->stmt);
224 opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
225 if (!res)
226 return res;
228 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
229 && STMT_VINFO_RELATED_STMT (stmt_info))
231 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
232 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
234 /* If a pattern statement has def stmts, analyze them too. */
235 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
236 !gsi_end_p (si); gsi_next (&si))
238 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
239 if (dump_enabled_p ())
240 dump_printf_loc (MSG_NOTE, vect_location,
241 "==> examining pattern def stmt: %G",
242 def_stmt_info->stmt);
243 res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
244 if (!res)
245 return res;
248 if (dump_enabled_p ())
249 dump_printf_loc (MSG_NOTE, vect_location,
250 "==> examining pattern statement: %G",
251 stmt_info->stmt);
252 res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
253 if (!res)
254 return res;
257 return opt_result::success ();
260 /* Function vect_determine_vectorization_factor
262 Determine the vectorization factor (VF). VF is the number of data elements
263 that are operated upon in parallel in a single iteration of the vectorized
264 loop. For example, when vectorizing a loop that operates on 4-byte elements,
265 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
266 elements can fit in a single vector register.
268 We currently support vectorization of loops in which all types operated upon
269 are of the same size. Therefore this function currently sets VF according to
270 the size of the types operated upon, and fails if there are multiple sizes
271 in the loop.
273 VF is also the factor by which the loop iterations are strip-mined, e.g.:
274 original loop:
275 for (i=0; i<N; i++){
276 a[i] = b[i] + c[i];
279 vectorized loop:
280 for (i=0; i<N; i+=VF){
281 a[i:VF] = b[i:VF] + c[i:VF];
285 static opt_result
286 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
288 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
289 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
290 unsigned nbbs = loop->num_nodes;
291 poly_uint64 vectorization_factor = 1;
292 tree scalar_type = NULL_TREE;
293 gphi *phi;
294 tree vectype;
295 stmt_vec_info stmt_info;
296 unsigned i;
298 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
300 for (i = 0; i < nbbs; i++)
302 basic_block bb = bbs[i];
304 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
305 gsi_next (&si))
307 phi = si.phi ();
308 stmt_info = loop_vinfo->lookup_stmt (phi);
309 if (dump_enabled_p ())
310 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
311 (gimple *) phi);
313 gcc_assert (stmt_info);
315 if (STMT_VINFO_RELEVANT_P (stmt_info)
316 || STMT_VINFO_LIVE_P (stmt_info))
318 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
319 scalar_type = TREE_TYPE (PHI_RESULT (phi));
321 if (dump_enabled_p ())
322 dump_printf_loc (MSG_NOTE, vect_location,
323 "get vectype for scalar type: %T\n",
324 scalar_type);
326 vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
327 if (!vectype)
328 return opt_result::failure_at (phi,
329 "not vectorized: unsupported "
330 "data-type %T\n",
331 scalar_type);
332 STMT_VINFO_VECTYPE (stmt_info) = vectype;
334 if (dump_enabled_p ())
335 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
336 vectype);
338 if (dump_enabled_p ())
340 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
341 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
342 dump_printf (MSG_NOTE, "\n");
345 vect_update_max_nunits (&vectorization_factor, vectype);
349 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
350 gsi_next (&si))
352 if (is_gimple_debug (gsi_stmt (si)))
353 continue;
354 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
355 opt_result res
356 = vect_determine_vf_for_stmt (loop_vinfo,
357 stmt_info, &vectorization_factor);
358 if (!res)
359 return res;
363 /* TODO: Analyze cost. Decide if worth while to vectorize. */
364 if (dump_enabled_p ())
366 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
367 dump_dec (MSG_NOTE, vectorization_factor);
368 dump_printf (MSG_NOTE, "\n");
371 if (known_le (vectorization_factor, 1U))
372 return opt_result::failure_at (vect_location,
373 "not vectorized: unsupported data-type\n");
374 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
375 return opt_result::success ();
379 /* Function vect_is_simple_iv_evolution.
381 FORNOW: A simple evolution of an induction variable in the loop is
382 considered a polynomial evolution. */
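/* Editorial illustration (informal, using the scev chrec notation): for a
   counter like

       for (i = 0; i < n; i++)

   the access function of i is the chrec {0, +, 1}_loop, so *INIT is 0 and
   *STEP is 1.  A degree-2 evolution such as {0, +, {0, +, 1}_loop}_loop,
   e.g. i += j with j itself incremented each iteration, is rejected as
   not "simple" by the checks below.  */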
384 static bool
385 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
386 tree * step)
388 tree init_expr;
389 tree step_expr;
390 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
391 basic_block bb;
393 /* When there is no evolution in this loop, the evolution function
394 is not "simple". */
395 if (evolution_part == NULL_TREE)
396 return false;
398 /* When the evolution is a polynomial of degree >= 2
399 the evolution function is not "simple". */
400 if (tree_is_chrec (evolution_part))
401 return false;
403 step_expr = evolution_part;
404 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
406 if (dump_enabled_p ())
407 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
408 step_expr, init_expr);
410 *init = init_expr;
411 *step = step_expr;
413 if (TREE_CODE (step_expr) != INTEGER_CST
414 && (TREE_CODE (step_expr) != SSA_NAME
415 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
416 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
417 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
418 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
419 || !flag_associative_math)))
420 && (TREE_CODE (step_expr) != REAL_CST
421 || !flag_associative_math))
423 if (dump_enabled_p ())
424 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
425 "step unknown.\n");
426 return false;
429 return true;
432 /* Function vect_is_nonlinear_iv_evolution
434 Only support nonlinear induction for integer types:
435 1. neg
436 2. mul by constant
437 3. lshift/rshift by constant.
439 For neg induction, return a fake step as integer -1. */
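/* Editorial illustration (source-level shapes, assumed for exposition):
   the supported nonlinear inductions roughly correspond to statements
   like these executed once per iteration on an integer x:

       x = -x;       -- vect_step_op_neg, fake step -1
       x = x * 3;    -- vect_step_op_mul
       x = x << 1;   -- vect_step_op_shl
       x = x >> 1;   -- vect_step_op_shr  */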
440 static bool
441 vect_is_nonlinear_iv_evolution (class loop* loop, stmt_vec_info stmt_info,
442 gphi* loop_phi_node, tree *init, tree *step)
444 tree init_expr, ev_expr, result, op1, op2;
445 gimple* def;
447 if (gimple_phi_num_args (loop_phi_node) != 2)
448 return false;
450 init_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_preheader_edge (loop));
451 ev_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_latch_edge (loop));
453 /* Support nonlinear induction only for integer type. */
454 if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr)))
455 return false;
457 *init = init_expr;
458 result = PHI_RESULT (loop_phi_node);
460 if (TREE_CODE (ev_expr) != SSA_NAME
461 || ((def = SSA_NAME_DEF_STMT (ev_expr)), false)
462 || !is_gimple_assign (def))
463 return false;
465 enum tree_code t_code = gimple_assign_rhs_code (def);
466 switch (t_code)
468 case NEGATE_EXPR:
469 if (gimple_assign_rhs1 (def) != result)
470 return false;
471 *step = build_int_cst (TREE_TYPE (init_expr), -1);
472 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_neg;
473 break;
475 case RSHIFT_EXPR:
476 case LSHIFT_EXPR:
477 case MULT_EXPR:
478 op1 = gimple_assign_rhs1 (def);
479 op2 = gimple_assign_rhs2 (def);
480 if (TREE_CODE (op2) != INTEGER_CST
481 || op1 != result)
482 return false;
483 *step = op2;
484 if (t_code == LSHIFT_EXPR)
485 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shl;
486 else if (t_code == RSHIFT_EXPR)
487 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shr;
488 /* NEGATE_EXPR and MULT_EXPR are both vect_step_op_mul. */
489 else
490 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_mul;
491 break;
493 default:
494 return false;
497 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = *init;
498 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = *step;
500 return true;
503 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
504 what we are assuming is a double reduction. For example, given
505 a structure like this:
507 outer1:
508 x_1 = PHI <x_4(outer2), ...>;
511 inner:
512 x_2 = PHI <x_1(outer1), ...>;
514 x_3 = ...;
517 outer2:
518 x_4 = PHI <x_3(inner)>;
521 outer loop analysis would treat x_1 as a double reduction phi and
522 this function would then return true for x_2. */
524 static bool
525 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
527 use_operand_p use_p;
528 ssa_op_iter op_iter;
529 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
530 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
531 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
532 return true;
533 return false;
536 /* Returns true if Phi is a first-order recurrence. A first-order
537 recurrence is a non-reduction recurrence relation in which the value of
538 the recurrence in the current loop iteration equals a value defined in
539 the previous iteration. */
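/* Editorial illustration: a loop such as

       for (i = 0; i < n; i++)
         {
           b[i] = prev + a[i];
           prev = a[i];
         }

   uses in iteration i the value of prev defined in iteration i - 1,
   which is the first-order recurrence described above (prev is just an
   illustrative name).  */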
541 static bool
542 vect_phi_first_order_recurrence_p (loop_vec_info loop_vinfo, class loop *loop,
543 gphi *phi)
545 /* A nested cycle isn't vectorizable as first order recurrence. */
546 if (LOOP_VINFO_LOOP (loop_vinfo) != loop)
547 return false;
549 /* Ensure the loop latch definition is from within the loop. */
550 edge latch = loop_latch_edge (loop);
551 tree ldef = PHI_ARG_DEF_FROM_EDGE (phi, latch);
552 if (TREE_CODE (ldef) != SSA_NAME
553 || SSA_NAME_IS_DEFAULT_DEF (ldef)
554 || is_a <gphi *> (SSA_NAME_DEF_STMT (ldef))
555 || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (ldef))))
556 return false;
558 tree def = gimple_phi_result (phi);
560 /* Ensure every use_stmt of the phi node is dominated by the latch
561 definition. */
562 imm_use_iterator imm_iter;
563 use_operand_p use_p;
564 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
565 if (!is_gimple_debug (USE_STMT (use_p))
566 && (SSA_NAME_DEF_STMT (ldef) == USE_STMT (use_p)
567 || !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (ldef),
568 USE_STMT (use_p))))
569 return false;
571 /* First-order recurrence autovectorization needs shuffle vector. */
572 tree scalar_type = TREE_TYPE (def);
573 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
574 if (!vectype)
575 return false;
577 return true;
580 /* Function vect_analyze_scalar_cycles_1.
582 Examine the cross iteration def-use cycles of scalar variables
583 in LOOP. LOOP_VINFO represents the loop that is now being
584 considered for vectorization (can be LOOP, or an outer-loop
585 enclosing LOOP). SLP indicates whether subsequent SLP
586 analyses will be performed. */
588 static void
589 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop,
590 bool slp)
592 basic_block bb = loop->header;
593 tree init, step;
594 auto_vec<stmt_vec_info, 64> worklist;
595 gphi_iterator gsi;
596 bool double_reduc, reduc_chain;
598 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
600 /* First - identify all inductions. Reduction detection assumes that all the
601 inductions have been identified, therefore, this order must not be
602 changed. */
603 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
605 gphi *phi = gsi.phi ();
606 tree access_fn = NULL;
607 tree def = PHI_RESULT (phi);
608 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
610 if (dump_enabled_p ())
611 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
612 (gimple *) phi);
614 /* Skip virtual phi's. The data dependences that are associated with
615 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
616 if (virtual_operand_p (def))
617 continue;
619 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
621 /* Analyze the evolution function. */
622 access_fn = analyze_scalar_evolution (loop, def);
623 if (access_fn)
625 STRIP_NOPS (access_fn);
626 if (dump_enabled_p ())
627 dump_printf_loc (MSG_NOTE, vect_location,
628 "Access function of PHI: %T\n", access_fn);
629 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
630 = initial_condition_in_loop_num (access_fn, loop->num);
631 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
632 = evolution_part_in_loop_num (access_fn, loop->num);
635 if ((!access_fn
636 || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
637 || !vect_is_simple_iv_evolution (loop->num, access_fn,
638 &init, &step)
639 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
640 && TREE_CODE (step) != INTEGER_CST))
641 /* Only handle nonlinear iv for same loop. */
642 && (LOOP_VINFO_LOOP (loop_vinfo) != loop
643 || !vect_is_nonlinear_iv_evolution (loop, stmt_vinfo,
644 phi, &init, &step)))
646 worklist.safe_push (stmt_vinfo);
647 continue;
650 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
651 != NULL_TREE);
652 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
654 if (dump_enabled_p ())
655 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
656 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
660 /* Second - identify all reductions and nested cycles. */
661 while (worklist.length () > 0)
663 stmt_vec_info stmt_vinfo = worklist.pop ();
664 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
665 tree def = PHI_RESULT (phi);
667 if (dump_enabled_p ())
668 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
669 (gimple *) phi);
671 gcc_assert (!virtual_operand_p (def)
672 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
674 stmt_vec_info reduc_stmt_info
675 = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
676 &reduc_chain, slp);
677 if (reduc_stmt_info)
679 STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
680 STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
681 if (double_reduc)
683 if (dump_enabled_p ())
684 dump_printf_loc (MSG_NOTE, vect_location,
685 "Detected double reduction.\n");
687 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
688 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
689 /* Make it accessible for SLP vectorization. */
690 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt_info);
692 else
694 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
696 if (dump_enabled_p ())
697 dump_printf_loc (MSG_NOTE, vect_location,
698 "Detected vectorizable nested cycle.\n");
700 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
702 else
704 if (dump_enabled_p ())
705 dump_printf_loc (MSG_NOTE, vect_location,
706 "Detected reduction.\n");
708 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
709 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
710 /* Store the reduction cycles for possible vectorization in
711 loop-aware SLP if it was not detected as reduction
712 chain. */
713 if (! reduc_chain)
714 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
715 (reduc_stmt_info);
719 else if (vect_phi_first_order_recurrence_p (loop_vinfo, loop, phi))
720 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_first_order_recurrence;
721 else
722 if (dump_enabled_p ())
723 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
724 "Unknown def-use cycle pattern.\n");
729 /* Function vect_analyze_scalar_cycles.
731 Examine the cross iteration def-use cycles of scalar variables, by
732 analyzing the loop-header PHIs of scalar variables. Classify each
733 cycle as one of the following: invariant, induction, reduction, unknown.
734 We do that for the loop represented by LOOP_VINFO, and also for its
735 inner-loop, if it exists.
736 Examples for scalar cycles:
738 Example1: reduction:
740 loop1:
741 for (i=0; i<N; i++)
742 sum += a[i];
744 Example2: induction:
746 loop2:
747 for (i=0; i<N; i++)
748 a[i] = i; */
750 static void
751 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo, bool slp)
753 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
755 vect_analyze_scalar_cycles_1 (loop_vinfo, loop, slp);
757 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
758 Reductions in such inner-loop therefore have different properties than
759 the reductions in the nest that gets vectorized:
760 1. When vectorized, they are executed in the same order as in the original
761 scalar loop, so we can't change the order of computation when
762 vectorizing them.
763 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
764 current checks are too strict. */
766 if (loop->inner)
767 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner, slp);
770 /* Transfer group and reduction information from STMT_INFO to its
771 pattern stmt. */
773 static void
774 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
776 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
777 stmt_vec_info stmtp;
778 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
779 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
780 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
783 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
784 gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
785 == STMT_VINFO_DEF_TYPE (stmt_info));
786 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
787 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
788 if (stmt_info)
789 REDUC_GROUP_NEXT_ELEMENT (stmtp)
790 = STMT_VINFO_RELATED_STMT (stmt_info);
792 while (stmt_info);
795 /* Fixup scalar cycles that now have their stmts detected as patterns. */
797 static void
798 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
800 stmt_vec_info first;
801 unsigned i;
803 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
805 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
806 while (next)
808 if ((STMT_VINFO_IN_PATTERN_P (next)
809 != STMT_VINFO_IN_PATTERN_P (first))
810 || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
811 break;
812 next = REDUC_GROUP_NEXT_ELEMENT (next);
814 /* If all reduction chain members are well-formed patterns adjust
815 the group to group the pattern stmts instead. */
816 if (! next
817 && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
819 if (STMT_VINFO_IN_PATTERN_P (first))
821 vect_fixup_reduc_chain (first);
822 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
823 = STMT_VINFO_RELATED_STMT (first);
826 /* If not all stmt in the chain are patterns or if we failed
827 to update STMT_VINFO_REDUC_IDX dissolve the chain and handle
828 it as regular reduction instead. */
829 else
831 stmt_vec_info vinfo = first;
832 stmt_vec_info last = NULL;
833 while (vinfo)
835 next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
836 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
837 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
838 last = vinfo;
839 vinfo = next;
841 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
842 = vect_internal_def;
843 loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
844 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
845 --i;
850 /* Function vect_get_loop_niters.
852 Determine how many iterations the loop executes and place it
853 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
854 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
855 niter information holds in ASSUMPTIONS.
857 Return the loop exit conditions. */
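/* Editorial illustration: for a counted loop whose body runs n times
   (n >= 1), the latch is executed n - 1 times, so NUMBER_OF_ITERATIONSM1
   is n - 1 and NUMBER_OF_ITERATIONS, the number of header executions, is
   n (see the PLUS_EXPR of niter and 1 below).  */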
860 static vec<gcond *>
861 vect_get_loop_niters (class loop *loop, const_edge main_exit, tree *assumptions,
862 tree *number_of_iterations, tree *number_of_iterationsm1)
864 auto_vec<edge> exits = get_loop_exit_edges (loop);
865 vec<gcond *> conds;
866 conds.create (exits.length ());
867 class tree_niter_desc niter_desc;
868 tree niter_assumptions, niter, may_be_zero;
870 *assumptions = boolean_true_node;
871 *number_of_iterationsm1 = chrec_dont_know;
872 *number_of_iterations = chrec_dont_know;
874 DUMP_VECT_SCOPE ("get_loop_niters");
876 if (exits.is_empty ())
877 return conds;
879 if (dump_enabled_p ())
880 dump_printf_loc (MSG_NOTE, vect_location, "Loop has %d exits.\n",
881 exits.length ());
883 edge exit;
884 unsigned int i;
885 FOR_EACH_VEC_ELT (exits, i, exit)
887 gcond *cond = get_loop_exit_condition (exit);
888 if (cond)
889 conds.safe_push (cond);
891 if (dump_enabled_p ())
892 dump_printf_loc (MSG_NOTE, vect_location, "Analyzing exit %d...\n", i);
894 if (exit != main_exit)
895 continue;
897 may_be_zero = NULL_TREE;
898 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
899 || chrec_contains_undetermined (niter_desc.niter))
900 continue;
902 niter_assumptions = niter_desc.assumptions;
903 may_be_zero = niter_desc.may_be_zero;
904 niter = niter_desc.niter;
906 if (may_be_zero && integer_zerop (may_be_zero))
907 may_be_zero = NULL_TREE;
909 if (may_be_zero)
911 if (COMPARISON_CLASS_P (may_be_zero))
913 /* Try to combine may_be_zero with assumptions, this can simplify
914 computation of niter expression. */
915 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
916 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
917 niter_assumptions,
918 fold_build1 (TRUTH_NOT_EXPR,
919 boolean_type_node,
920 may_be_zero));
921 else
922 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
923 build_int_cst (TREE_TYPE (niter), 0),
924 rewrite_to_non_trapping_overflow (niter));
926 may_be_zero = NULL_TREE;
928 else if (integer_nonzerop (may_be_zero))
930 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
931 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
932 continue;
934 else
935 continue;
938 /* Loop assumptions are based off the normal exit. */
939 *assumptions = niter_assumptions;
940 *number_of_iterationsm1 = niter;
942 /* We want the number of loop header executions which is the number
943 of latch executions plus one.
944 ??? For UINT_MAX latch executions this number overflows to zero
945 for loops like do { n++; } while (n != 0); */
946 if (niter && !chrec_contains_undetermined (niter))
948 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter),
949 unshare_expr (niter),
950 build_int_cst (TREE_TYPE (niter), 1));
951 if (TREE_CODE (niter) == INTEGER_CST
952 && TREE_CODE (*number_of_iterationsm1) != INTEGER_CST)
954 /* If we manage to fold niter + 1 into INTEGER_CST even when
955 niter is some complex expression, ensure back
956 *number_of_iterationsm1 is an INTEGER_CST as well. See
957 PR113210. */
958 *number_of_iterationsm1
959 = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), niter,
960 build_minus_one_cst (TREE_TYPE (niter)));
963 *number_of_iterations = niter;
966 if (dump_enabled_p ())
967 dump_printf_loc (MSG_NOTE, vect_location, "All loop exits successfully analyzed.\n");
969 return conds;
972 /* Determine the main loop exit for the vectorizer. */
974 edge
975 vec_init_loop_exit_info (class loop *loop)
977 /* Before we begin we must first determine which exit is the main one and
978 which are auxiliary exits. */
979 auto_vec<edge> exits = get_loop_exit_edges (loop);
980 if (exits.length () == 1)
981 return exits[0];
983 /* If we have multiple exits we only support counting IV at the moment.
984 Analyze all exits and return the last one we can analyze. */
985 class tree_niter_desc niter_desc;
986 edge candidate = NULL;
987 for (edge exit : exits)
989 if (!get_loop_exit_condition (exit))
990 continue;
992 if (number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
993 && !chrec_contains_undetermined (niter_desc.niter))
995 tree may_be_zero = niter_desc.may_be_zero;
996 if ((integer_zerop (may_be_zero)
997 /* As we are handling may_be_zero that's not false by
998 rewriting niter to may_be_zero ? 0 : niter we require
999 an empty latch. */
1000 || (single_pred_p (loop->latch)
1001 && exit->src == single_pred (loop->latch)
1002 && (integer_nonzerop (may_be_zero)
1003 || COMPARISON_CLASS_P (may_be_zero))))
1004 && (!candidate
1005 || dominated_by_p (CDI_DOMINATORS, exit->src,
1006 candidate->src)))
1007 candidate = exit;
1011 return candidate;
1014 /* Function bb_in_loop_p
1016 Used as predicate for dfs order traversal of the loop bbs. */
1018 static bool
1019 bb_in_loop_p (const_basic_block bb, const void *data)
1021 const class loop *const loop = (const class loop *)data;
1022 if (flow_bb_inside_loop_p (loop, bb))
1023 return true;
1024 return false;
1028 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
1029 stmt_vec_info structs for all the stmts in LOOP_IN. */
1031 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
1032 : vec_info (vec_info::loop, shared),
1033 loop (loop_in),
1034 num_itersm1 (NULL_TREE),
1035 num_iters (NULL_TREE),
1036 num_iters_unchanged (NULL_TREE),
1037 num_iters_assumptions (NULL_TREE),
1038 vector_costs (nullptr),
1039 scalar_costs (nullptr),
1040 th (0),
1041 versioning_threshold (0),
1042 vectorization_factor (0),
1043 main_loop_edge (nullptr),
1044 skip_main_loop_edge (nullptr),
1045 skip_this_loop_edge (nullptr),
1046 reusable_accumulators (),
1047 suggested_unroll_factor (1),
1048 max_vectorization_factor (0),
1049 mask_skip_niters (NULL_TREE),
1050 rgroup_compare_type (NULL_TREE),
1051 simd_if_cond (NULL_TREE),
1052 partial_vector_style (vect_partial_vectors_none),
1053 unaligned_dr (NULL),
1054 peeling_for_alignment (0),
1055 ptr_mask (0),
1056 ivexpr_map (NULL),
1057 scan_map (NULL),
1058 slp_unrolling_factor (1),
1059 inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
1060 vectorizable (false),
1061 can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
1062 using_partial_vectors_p (false),
1063 using_decrementing_iv_p (false),
1064 using_select_vl_p (false),
1065 epil_using_partial_vectors_p (false),
1066 partial_load_store_bias (0),
1067 peeling_for_gaps (false),
1068 peeling_for_niter (false),
1069 early_breaks (false),
1070 no_data_dependencies (false),
1071 has_mask_store (false),
1072 scalar_loop_scaling (profile_probability::uninitialized ()),
1073 scalar_loop (NULL),
1074 orig_loop_info (NULL),
1075 vec_loop_iv_exit (NULL),
1076 vec_epilogue_loop_iv_exit (NULL),
1077 scalar_loop_iv_exit (NULL)
1079 /* CHECKME: We want to visit all BBs before their successors (except for
1080 latch blocks, for which this assertion wouldn't hold). In the simple
1081 case of the loop forms we allow, a dfs order of the BBs would be the same
1082 as reversed postorder traversal, so we are safe. */
1084 bbs = XCNEWVEC (basic_block, loop->num_nodes);
1085 nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p, bbs,
1086 loop->num_nodes, loop);
1087 gcc_assert (nbbs == loop->num_nodes);
1089 for (unsigned int i = 0; i < nbbs; i++)
1091 basic_block bb = bbs[i];
1092 gimple_stmt_iterator si;
1094 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1096 gimple *phi = gsi_stmt (si);
1097 gimple_set_uid (phi, 0);
1098 add_stmt (phi);
1101 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1103 gimple *stmt = gsi_stmt (si);
1104 gimple_set_uid (stmt, 0);
1105 if (is_gimple_debug (stmt))
1106 continue;
1107 add_stmt (stmt);
1108 /* If .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
1109 third argument is the #pragma omp simd if (x) condition, when 0,
1110 loop shouldn't be vectorized, when non-zero constant, it should
1111 be vectorized normally, otherwise versioned with vectorized loop
1112 done if the condition is non-zero at runtime. */
1113 if (loop_in->simduid
1114 && is_gimple_call (stmt)
1115 && gimple_call_internal_p (stmt)
1116 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
1117 && gimple_call_num_args (stmt) >= 3
1118 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
1119 && (loop_in->simduid
1120 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
1122 tree arg = gimple_call_arg (stmt, 2);
1123 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
1124 simd_if_cond = arg;
1125 else
1126 gcc_assert (integer_nonzerop (arg));
1131 epilogue_vinfos.create (6);
1134 /* Free all levels of rgroup CONTROLS. */
1136 void
1137 release_vec_loop_controls (vec<rgroup_controls> *controls)
1139 rgroup_controls *rgc;
1140 unsigned int i;
1141 FOR_EACH_VEC_ELT (*controls, i, rgc)
1142 rgc->controls.release ();
1143 controls->release ();
1146 /* Free all memory used by the _loop_vec_info, as well as all the
1147 stmt_vec_info structs of all the stmts in the loop. */
1149 _loop_vec_info::~_loop_vec_info ()
1151 free (bbs);
1153 release_vec_loop_controls (&masks.rgc_vec);
1154 release_vec_loop_controls (&lens);
1155 delete ivexpr_map;
1156 delete scan_map;
1157 epilogue_vinfos.release ();
1158 delete scalar_costs;
1159 delete vector_costs;
1161 /* When we release an epilogue vinfo that we do not intend to use
1162 avoid clearing AUX of the main loop which should continue to
1163 point to the main loop vinfo since otherwise we'll leak that. */
1164 if (loop->aux == this)
1165 loop->aux = NULL;
1168 /* Return an invariant or register for EXPR and emit necessary
1169 computations in the LOOP_VINFO loop preheader. */
1171 tree
1172 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
1174 if (is_gimple_reg (expr)
1175 || is_gimple_min_invariant (expr))
1176 return expr;
1178 if (! loop_vinfo->ivexpr_map)
1179 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
1180 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
1181 if (! cached)
1183 gimple_seq stmts = NULL;
1184 cached = force_gimple_operand (unshare_expr (expr),
1185 &stmts, true, NULL_TREE);
1186 if (stmts)
1188 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1189 gsi_insert_seq_on_edge_immediate (e, stmts);
1192 return cached;
1195 /* Return true if we can use CMP_TYPE as the comparison type to produce
1196 all masks required to mask LOOP_VINFO. */
1198 static bool
1199 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
1201 rgroup_controls *rgm;
1202 unsigned int i;
1203 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1204 if (rgm->type != NULL_TREE
1205 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
1206 cmp_type, rgm->type,
1207 OPTIMIZE_FOR_SPEED))
1208 return false;
1209 return true;
1212 /* Calculate the maximum number of scalars per iteration for every
1213 rgroup in LOOP_VINFO. */
1215 static unsigned int
1216 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1218 unsigned int res = 1;
1219 unsigned int i;
1220 rgroup_controls *rgm;
1221 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1222 res = MAX (res, rgm->max_nscalars_per_iter);
1223 return res;
1226 /* Calculate the minimum precision necessary to represent:
1228 MAX_NITERS * FACTOR
1230 as an unsigned integer, where MAX_NITERS is the maximum number of
1231 loop header iterations for the original scalar form of LOOP_VINFO. */
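/* Editorial worked example: if the scalar loop runs at most 1000 header
   iterations and FACTOR is 4, the limit is 4000, and 12 bits are needed
   since 4000 <= 2^12 - 1 but 4000 > 2^11 - 1.  */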
1233 static unsigned
1234 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1236 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1238 /* Get the maximum number of iterations that is representable
1239 in the counter type. */
1240 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1241 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1243 /* Get a more refined estimate for the number of iterations. */
1244 widest_int max_back_edges;
1245 if (max_loop_iterations (loop, &max_back_edges))
1246 max_ni = wi::smin (max_ni, max_back_edges + 1);
1248 /* Work out how many bits we need to represent the limit. */
1249 return wi::min_precision (max_ni * factor, UNSIGNED);
1252 /* True if the loop needs peeling or partial vectors when vectorized. */
1254 static bool
1255 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
1257 unsigned HOST_WIDE_INT const_vf;
1258 HOST_WIDE_INT max_niter
1259 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1261 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1262 if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1263 th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1264 (loop_vinfo));
1266 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1267 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1269 /* Work out the (constant) number of iterations that need to be
1270 peeled for reasons other than niters. */
1271 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1272 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1273 peel_niter += 1;
1274 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1275 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1276 return true;
1278 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1279 /* ??? When peeling for gaps but not alignment, we could
1280 try to check whether the (variable) niters is known to be
1281 VF * N + 1. That's something of a niche case though. */
1282 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1283 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1284 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1285 < (unsigned) exact_log2 (const_vf))
1286 /* In case of versioning, check if the maximum number of
1287 iterations is greater than th. If they are identical,
1288 the epilogue is unnecessary. */
1289 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1290 || ((unsigned HOST_WIDE_INT) max_niter
1291 /* We'd like to use LOOP_VINFO_VERSIONING_THRESHOLD
1292 but that's only computed later based on our result.
1293 The following is the most conservative approximation. */
1294 > (std::max ((unsigned HOST_WIDE_INT) th,
1295 const_vf) / const_vf) * const_vf))))
1296 return true;
1298 return false;
1301 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1302 whether we can actually generate the masks required. Return true if so,
1303 storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
1305 static bool
1306 vect_verify_full_masking (loop_vec_info loop_vinfo)
1308 unsigned int min_ni_width;
1310 /* Use a normal loop if there are no statements that need masking.
1311 This only happens in rare degenerate cases: it means that the loop
1312 has no loads, no stores, and no live-out values. */
1313 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1314 return false;
1316 /* Produce the rgroup controls. */
1317 for (auto mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1319 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1320 tree vectype = mask.first;
1321 unsigned nvectors = mask.second;
1323 if (masks->rgc_vec.length () < nvectors)
1324 masks->rgc_vec.safe_grow_cleared (nvectors, true);
1325 rgroup_controls *rgm = &(*masks).rgc_vec[nvectors - 1];
1326 /* The number of scalars per iteration and the number of vectors are
1327 both compile-time constants. */
1328 unsigned int nscalars_per_iter
1329 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1330 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1332 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
1334 rgm->max_nscalars_per_iter = nscalars_per_iter;
1335 rgm->type = truth_type_for (vectype);
1336 rgm->factor = 1;
1340 unsigned int max_nscalars_per_iter
1341 = vect_get_max_nscalars_per_iter (loop_vinfo);
1343 /* Work out how many bits we need to represent the limit. */
1344 min_ni_width
1345 = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1347 /* Find a scalar mode for which WHILE_ULT is supported. */
1348 opt_scalar_int_mode cmp_mode_iter;
1349 tree cmp_type = NULL_TREE;
1350 tree iv_type = NULL_TREE;
1351 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1352 unsigned int iv_precision = UINT_MAX;
1354 if (iv_limit != -1)
1355 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1356 UNSIGNED);
1358 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1360 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1361 if (cmp_bits >= min_ni_width
1362 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1364 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1365 if (this_type
1366 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1368 /* Although we could stop as soon as we find a valid mode,
1369 there are at least two reasons why that's not always the
1370 best choice:
1372 - An IV that's Pmode or wider is more likely to be reusable
1373 in address calculations than an IV that's narrower than
1374 Pmode.
1376 - Doing the comparison in IV_PRECISION or wider allows
1377 a natural 0-based IV, whereas using a narrower comparison
1378 type requires mitigations against wrap-around.
1380 Conversely, if the IV limit is variable, doing the comparison
1381 in a wider type than the original type can introduce
1382 unnecessary extensions, so picking the widest valid mode
1383 is not always a good choice either.
1385 Here we prefer the first IV type that's Pmode or wider,
1386 and the first comparison type that's IV_PRECISION or wider.
1387 (The comparison type must be no wider than the IV type,
1388 to avoid extensions in the vector loop.)
1390 ??? We might want to try continuing beyond Pmode for ILP32
1391 targets if CMP_BITS < IV_PRECISION. */
1392 iv_type = this_type;
1393 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1394 cmp_type = this_type;
1395 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1396 break;
1401 if (!cmp_type)
1403 LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.release ();
1404 return false;
1407 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1408 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1409 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_while_ult;
1410 return true;
1413 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1414 whether we can actually generate AVX512 style masks. Return true if so,
1415 storing the type of the scalar IV in LOOP_VINFO_RGROUP_IV_TYPE. */
1417 static bool
1418 vect_verify_full_masking_avx512 (loop_vec_info loop_vinfo)
1420 /* Produce a differently organized rgc_vec and check differently whether
1421 we can produce masks. */
1423 /* Use a normal loop if there are no statements that need masking.
1424 This only happens in rare degenerate cases: it means that the loop
1425 has no loads, no stores, and no live-out values. */
1426 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1427 return false;
1429 /* For the decrementing IV we need to represent all values in
1430 [0, niter + niter_skip] where niter_skip is the elements we
1431 skip in the first iteration for prologue peeling. */
1432 tree iv_type = NULL_TREE;
1433 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1434 unsigned int iv_precision = UINT_MAX;
1435 if (iv_limit != -1)
1436 iv_precision = wi::min_precision (iv_limit, UNSIGNED);
1438 /* First compute the type for the IV we use to track the remaining
1439 scalar iterations. */
1440 opt_scalar_int_mode cmp_mode_iter;
1441 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1443 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1444 if (cmp_bits >= iv_precision
1445 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1447 iv_type = build_nonstandard_integer_type (cmp_bits, true);
1448 if (iv_type)
1449 break;
1452 if (!iv_type)
1453 return false;
1455 /* Produce the rgroup controls. */
1456 for (auto const &mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1458 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1459 tree vectype = mask.first;
1460 unsigned nvectors = mask.second;
1462 /* The number of scalars per iteration and the number of vectors are
1463 both compile-time constants. */
1464 unsigned int nscalars_per_iter
1465 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1466 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1468 /* We index the rgroup_controls vector with nscalars_per_iter
1469 which we keep constant and instead have a varying nvectors,
1470 remembering the vector mask with the fewest nV. */
1471 if (masks->rgc_vec.length () < nscalars_per_iter)
1472 masks->rgc_vec.safe_grow_cleared (nscalars_per_iter, true);
1473 rgroup_controls *rgm = &(*masks).rgc_vec[nscalars_per_iter - 1];
1475 if (!rgm->type || rgm->factor > nvectors)
1477 rgm->type = truth_type_for (vectype);
1478 rgm->compare_type = NULL_TREE;
1479 rgm->max_nscalars_per_iter = nscalars_per_iter;
1480 rgm->factor = nvectors;
1481 rgm->bias_adjusted_ctrl = NULL_TREE;
1485 /* There is no fixed compare type we are going to use but we have to
1486 be able to get at one for each mask group. */
1487 unsigned int min_ni_width
1488 = wi::min_precision (vect_max_vf (loop_vinfo), UNSIGNED);
1490 bool ok = true;
1491 for (auto &rgc : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
1493 tree mask_type = rgc.type;
1494 if (!mask_type)
1495 continue;
1497 /* For now vect_get_loop_mask only supports integer mode masks
1498 when we need to split it. */
1499 if (GET_MODE_CLASS (TYPE_MODE (mask_type)) != MODE_INT
1500 || TYPE_PRECISION (TREE_TYPE (mask_type)) != 1)
1502 ok = false;
1503 break;
1506 /* If iv_type is usable as compare type use that - we can elide the
1507 saturation in that case. */
1508 if (TYPE_PRECISION (iv_type) >= min_ni_width)
1510 tree cmp_vectype
1511 = build_vector_type (iv_type, TYPE_VECTOR_SUBPARTS (mask_type));
1512 if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1513 rgc.compare_type = cmp_vectype;
1515 if (!rgc.compare_type)
1516 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1518 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1519 if (cmp_bits >= min_ni_width
1520 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1522 tree cmp_type = build_nonstandard_integer_type (cmp_bits, true);
1523 if (!cmp_type)
1524 continue;
1526 /* Check whether we can produce the mask with cmp_type. */
1527 tree cmp_vectype
1528 = build_vector_type (cmp_type, TYPE_VECTOR_SUBPARTS (mask_type));
1529 if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1531 rgc.compare_type = cmp_vectype;
1532 break;
1536 if (!rgc.compare_type)
1538 ok = false;
1539 break;
1542 if (!ok)
1544 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
1545 return false;
1548 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = error_mark_node;
1549 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1550 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_avx512;
1551 return true;
1554 Check whether we can use vector access with length based on precision
1555 comparison. So far, to keep it simple, we only allow the case that the
1556 precision of the target supported length is larger than the precision
1557 required by loop niters. */
1559 static bool
1560 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1562 if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1563 return false;
1565 machine_mode len_load_mode, len_store_mode;
1566 if (!get_len_load_store_mode (loop_vinfo->vector_mode, true)
1567 .exists (&len_load_mode))
1568 return false;
1569 if (!get_len_load_store_mode (loop_vinfo->vector_mode, false)
1570 .exists (&len_store_mode))
1571 return false;
1573 signed char partial_load_bias = internal_len_load_store_bias
1574 (IFN_LEN_LOAD, len_load_mode);
1576 signed char partial_store_bias = internal_len_load_store_bias
1577 (IFN_LEN_STORE, len_store_mode);
1579 gcc_assert (partial_load_bias == partial_store_bias);
1581 if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
1582 return false;
1584 /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
1585 len_loads with a length of zero. In order to avoid that we prohibit
1586 more than one loop length here. */
1587 if (partial_load_bias == -1
1588 && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
1589 return false;
1591 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
1593 unsigned int max_nitems_per_iter = 1;
1594 unsigned int i;
1595 rgroup_controls *rgl;
1596 /* Find the maximum number of items per iteration for every rgroup. */
1597 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1599 unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1600 max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1603 /* Work out how many bits we need to represent the length limit. */
1604 unsigned int min_ni_prec
1605 = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1607 /* Now use the maximum of below precisions for one suitable IV type:
1608 - the IV's natural precision
1609 - the precision needed to hold: the maximum number of scalar
1610 iterations multiplied by the scale factor (min_ni_prec above)
1611 - the Pmode precision
1613 If min_ni_prec is less than the precision of the current niters,
1614 we prefer to still use the niters type. Prefer to use Pmode and
1615 wider IV to avoid narrow conversions. */
1617 unsigned int ni_prec
1618 = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1619 min_ni_prec = MAX (min_ni_prec, ni_prec);
1620 min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1622 tree iv_type = NULL_TREE;
1623 opt_scalar_int_mode tmode_iter;
1624 FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1626 scalar_mode tmode = tmode_iter.require ();
1627 unsigned int tbits = GET_MODE_BITSIZE (tmode);
1629 /* ??? Do we really want to construct one IV whose precision exceeds
1630 BITS_PER_WORD? */
1631 if (tbits > BITS_PER_WORD)
1632 break;
1634 /* Find the first available standard integral type. */
1635 if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1637 iv_type = build_nonstandard_integer_type (tbits, true);
1638 break;
1642 if (!iv_type)
1644 if (dump_enabled_p ())
1645 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1646 "can't vectorize with length-based partial vectors"
1647 " because there is no suitable iv type.\n");
1648 return false;
1651 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1652 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1653 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_len;
1655 return true;
1658 /* Calculate the cost of one scalar iteration of the loop. */
1659 static void
1660 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1662 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1663 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1664 int nbbs = loop->num_nodes, factor;
1665 int innerloop_iters, i;
1667 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1669 /* Gather costs for statements in the scalar loop. */
1671 /* FORNOW. */
1672 innerloop_iters = 1;
1673 if (loop->inner)
1674 innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);
1676 for (i = 0; i < nbbs; i++)
1678 gimple_stmt_iterator si;
1679 basic_block bb = bbs[i];
1681 if (bb->loop_father == loop->inner)
1682 factor = innerloop_iters;
1683 else
1684 factor = 1;
1686 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1688 gimple *stmt = gsi_stmt (si);
1689 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1691 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1692 continue;
1694 /* Skip stmts that are not vectorized inside the loop. */
1695 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1696 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1697 && (!STMT_VINFO_LIVE_P (vstmt_info)
1698 || !VECTORIZABLE_CYCLE_DEF
1699 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1700 continue;
1702 vect_cost_for_stmt kind;
1703 if (STMT_VINFO_DATA_REF (stmt_info))
1705 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1706 kind = scalar_load;
1707 else
1708 kind = scalar_store;
1710 else if (vect_nop_conversion_p (stmt_info))
1711 continue;
1712 else
1713 kind = scalar_stmt;
1715 /* We are using vect_prologue here to avoid scaling twice
1716 by the inner loop factor. */
1717 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1718 factor, kind, stmt_info, 0, vect_prologue);
1722 /* Now accumulate cost. */
1723 loop_vinfo->scalar_costs = init_cost (loop_vinfo, true);
1724 add_stmt_costs (loop_vinfo->scalar_costs,
1725 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo));
1726 loop_vinfo->scalar_costs->finish_cost (nullptr);
1729 /* Function vect_analyze_loop_form.
1731 Verify that certain CFG restrictions hold, including:
1732 - the loop has a pre-header
1733 - the loop has a single entry
1734 - nested loops can have only a single exit.
1735 - the loop exit condition is simple enough
1736 - the number of iterations can be analyzed, i.e., a countable loop. The
1737 niter could be analyzed under some assumptions. */
1739 opt_result
1740 vect_analyze_loop_form (class loop *loop, vect_loop_form_info *info)
1742 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1744 edge exit_e = vec_init_loop_exit_info (loop);
1745 if (!exit_e)
1746 return opt_result::failure_at (vect_location,
1747 "not vectorized:"
1748 " could not determine main exit from"
1749 " loop with multiple exits.\n");
1750 info->loop_exit = exit_e;
1751 if (dump_enabled_p ())
1752 dump_printf_loc (MSG_NOTE, vect_location,
1753 "using as main loop exit: %d -> %d [AUX: %p]\n",
1754 exit_e->src->index, exit_e->dest->index, exit_e->aux);
1756 /* Check if we have any control flow that doesn't leave the loop. */
1757 class loop *v_loop = loop->inner ? loop->inner : loop;
1758 basic_block *bbs = get_loop_body (v_loop);
1759 for (unsigned i = 0; i < v_loop->num_nodes; i++)
1760 if (EDGE_COUNT (bbs[i]->succs) != 1
1761 && (EDGE_COUNT (bbs[i]->succs) != 2
1762 || !loop_exits_from_bb_p (bbs[i]->loop_father, bbs[i])))
1764 free (bbs);
1765 return opt_result::failure_at (vect_location,
1766 "not vectorized:"
1767 " unsupported control flow in loop.\n");
1769 free (bbs);
1771 /* Different restrictions apply when we are considering an inner-most loop,
1772 vs. an outer (nested) loop.
1773 (FORNOW. May want to relax some of these restrictions in the future). */
1775 info->inner_loop_cond = NULL;
1776 if (!loop->inner)
1778 /* Inner-most loop. */
1780 if (empty_block_p (loop->header))
1781 return opt_result::failure_at (vect_location,
1782 "not vectorized: empty loop.\n");
1784 else
1786 class loop *innerloop = loop->inner;
1787 edge entryedge;
1789 /* Nested loop.  We currently require that the loop is doubly-nested:
1790 it contains a single inner loop whose single exit leads to the block
1791 holding the single exit condition of the outer loop.
1792 Vectorizable outer-loops look like this:
1794 (pre-header)
1796 header <---+
1798 inner-loop |
1800 tail ------+
1802 (exit-bb)
1804 The inner-loop also has the properties expected of inner-most loops
1805 as described above. */
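/* Illustrative example (an editorial addition, not from the original
   sources): a doubly-nested loop of the accepted shape, using hypothetical
   arrays a, b and bounds n, m:

     for (i = 0; i < n; i++)          <- outer loop, candidate for
       {                                 outer-loop vectorization
         s = 0;
         for (j = 0; j < m; j++)      <- single inner loop, single exit
           s += a[j][i];                 into the outer-loop tail
         b[i] = s;                    <- tail block with the outer exit test
       }

   A more deeply nested loop nest, sibling inner loops, or an inner loop
   whose exit does not lead to the block with the outer exit condition are
   rejected just below.  */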
1807 if ((loop->inner)->inner || (loop->inner)->next)
1808 return opt_result::failure_at (vect_location,
1809 "not vectorized:"
1810 " multiple nested loops.\n");
1812 entryedge = loop_preheader_edge (innerloop);
1813 if (entryedge->src != loop->header
1814 || !single_exit (innerloop)
1815 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1816 return opt_result::failure_at (vect_location,
1817 "not vectorized:"
1818 " unsupported outerloop form.\n");
1820 /* Analyze the inner-loop. */
1821 vect_loop_form_info inner;
1822 opt_result res = vect_analyze_loop_form (loop->inner, &inner);
1823 if (!res)
1825 if (dump_enabled_p ())
1826 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1827 "not vectorized: Bad inner loop.\n");
1828 return res;
1831 /* Don't support analyzing niter under assumptions for inner
1832 loop. */
1833 if (!integer_onep (inner.assumptions))
1834 return opt_result::failure_at (vect_location,
1835 "not vectorized: Bad inner loop.\n");
1837 if (!expr_invariant_in_loop_p (loop, inner.number_of_iterations))
1838 return opt_result::failure_at (vect_location,
1839 "not vectorized: inner-loop count not"
1840 " invariant.\n");
1842 if (dump_enabled_p ())
1843 dump_printf_loc (MSG_NOTE, vect_location,
1844 "Considering outer-loop vectorization.\n");
1845 info->inner_loop_cond = inner.conds[0];
1848 if (EDGE_COUNT (loop->header->preds) != 2)
1849 return opt_result::failure_at (vect_location,
1850 "not vectorized:"
1851 " too many incoming edges.\n");
1853 /* We assume that the latch is empty. */
1854 if (!empty_block_p (loop->latch)
1855 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1856 return opt_result::failure_at (vect_location,
1857 "not vectorized: latch block not empty.\n");
1859 /* Make sure there is no abnormal exit. */
1860 auto_vec<edge> exits = get_loop_exit_edges (loop);
1861 for (edge e : exits)
1863 if (e->flags & EDGE_ABNORMAL)
1864 return opt_result::failure_at (vect_location,
1865 "not vectorized:"
1866 " abnormal loop exit edge.\n");
1869 info->conds
1870 = vect_get_loop_niters (loop, exit_e, &info->assumptions,
1871 &info->number_of_iterations,
1872 &info->number_of_iterationsm1);
1873 if (info->conds.is_empty ())
1874 return opt_result::failure_at
1875 (vect_location,
1876 "not vectorized: complicated exit condition.\n");
1878 /* Determine what the primary and alternate exit conds are. */
1879 for (unsigned i = 0; i < info->conds.length (); i++)
1881 gcond *cond = info->conds[i];
1882 if (exit_e->src == gimple_bb (cond))
1883 std::swap (info->conds[0], info->conds[i]);
1886 if (integer_zerop (info->assumptions)
1887 || !info->number_of_iterations
1888 || chrec_contains_undetermined (info->number_of_iterations))
1889 return opt_result::failure_at
1890 (info->conds[0],
1891 "not vectorized: number of iterations cannot be computed.\n");
1893 if (integer_zerop (info->number_of_iterations))
1894 return opt_result::failure_at
1895 (info->conds[0],
1896 "not vectorized: number of iterations = 0.\n");
1898 if (!(tree_fits_shwi_p (info->number_of_iterations)
1899 && tree_to_shwi (info->number_of_iterations) > 0))
1901 if (dump_enabled_p ())
1903 dump_printf_loc (MSG_NOTE, vect_location,
1904 "Symbolic number of iterations is ");
1905 dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
1906 dump_printf (MSG_NOTE, "\n");
1910 return opt_result::success ();
1913 /* Create a loop_vec_info for LOOP with SHARED and the
1914 vect_analyze_loop_form result. */
1916 loop_vec_info
1917 vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
1918 const vect_loop_form_info *info,
1919 loop_vec_info main_loop_info)
1921 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1922 LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
1923 LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
1924 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
1925 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = main_loop_info;
1926 /* Also record the assumptions for versioning. */
1927 if (!integer_onep (info->assumptions) && !main_loop_info)
1928 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;
1930 for (gcond *cond : info->conds)
1932 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (cond);
1933 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1934 /* Mark the statement as a condition. */
1935 STMT_VINFO_DEF_TYPE (loop_cond_info) = vect_condition_def;
1938 for (unsigned i = 1; i < info->conds.length (); i ++)
1939 LOOP_VINFO_LOOP_CONDS (loop_vinfo).safe_push (info->conds[i]);
1940 LOOP_VINFO_LOOP_IV_COND (loop_vinfo) = info->conds[0];
1942 LOOP_VINFO_IV_EXIT (loop_vinfo) = info->loop_exit;
1944 /* Check to see if we're vectorizing multiple exits. */
1945 LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
1946 = !LOOP_VINFO_LOOP_CONDS (loop_vinfo).is_empty ();
1948 if (info->inner_loop_cond)
1950 stmt_vec_info inner_loop_cond_info
1951 = loop_vinfo->lookup_stmt (info->inner_loop_cond);
1952 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1953 /* If we have an estimate on the number of iterations of the inner
1954 loop, use that to limit the scale for costing; otherwise use
1955 --param vect-inner-loop-cost-factor literally. */
1956 widest_int nit;
1957 if (estimated_stmt_executions (loop->inner, &nit))
1958 LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
1959 = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
1962 return loop_vinfo;
1967 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1968 statements update the vectorization factor. */
1970 static void
1971 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1973 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1974 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1975 int nbbs = loop->num_nodes;
1976 poly_uint64 vectorization_factor;
1977 int i;
1979 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1981 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1982 gcc_assert (known_ne (vectorization_factor, 0U));
1984 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1985 the vectorization factor of the loop is the unrolling factor required by
1986 the SLP instances.  If that unrolling factor is 1, we say that we
1987 perform pure SLP on the loop - cross-iteration parallelism is not
1988 exploited. */
1989 bool only_slp_in_loop = true;
1990 for (i = 0; i < nbbs; i++)
1992 basic_block bb = bbs[i];
1993 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1994 gsi_next (&si))
1996 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1997 if (!stmt_info)
1998 continue;
1999 if ((STMT_VINFO_RELEVANT_P (stmt_info)
2000 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
2001 && !PURE_SLP_STMT (stmt_info))
2002 /* STMT needs both SLP and loop-based vectorization. */
2003 only_slp_in_loop = false;
2005 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
2006 gsi_next (&si))
2008 if (is_gimple_debug (gsi_stmt (si)))
2009 continue;
2010 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2011 stmt_info = vect_stmt_to_vectorize (stmt_info);
2012 if ((STMT_VINFO_RELEVANT_P (stmt_info)
2013 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
2014 && !PURE_SLP_STMT (stmt_info))
2015 /* STMT needs both SLP and loop-based vectorization. */
2016 only_slp_in_loop = false;
2020 if (only_slp_in_loop)
2022 if (dump_enabled_p ())
2023 dump_printf_loc (MSG_NOTE, vect_location,
2024 "Loop contains only SLP stmts\n");
2025 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
2027 else
2029 if (dump_enabled_p ())
2030 dump_printf_loc (MSG_NOTE, vect_location,
2031 "Loop contains SLP and non-SLP stmts\n");
2032 /* Both the vectorization factor and the unroll factor have the form
2033 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
2034 so they must have a common multiple. */
2035 vectorization_factor
2036 = force_common_multiple (vectorization_factor,
2037 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
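/* Worked illustration (an editorial addition, not from the original
   sources): for constant factors this amounts to taking the least common
   multiple, e.g. a loop VF of 4 combined with an SLP unrolling factor of 8
   gives 8, while 4 combined with 6 gives 12.  */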
2040 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
2041 if (dump_enabled_p ())
2043 dump_printf_loc (MSG_NOTE, vect_location,
2044 "Updating vectorization factor to ");
2045 dump_dec (MSG_NOTE, vectorization_factor);
2046 dump_printf (MSG_NOTE, ".\n");
2050 /* Return true if STMT_INFO describes a double reduction phi and if
2051 the other phi in the reduction is also relevant for vectorization.
2052 This rejects cases such as:
2054 outer1:
2055 x_1 = PHI <x_3(outer2), ...>;
2058 inner:
2059 x_2 = ...;
2062 outer2:
2063 x_3 = PHI <x_2(inner)>;
2065 if nothing in x_2 or elsewhere makes x_1 relevant. */
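/* Illustrative source-level example (an editorial addition, not from the
   original sources), using a hypothetical accumulator sum and array a:
   a double reduction such as

     sum = 0;
     for (i = 0; i < n; i++)
       for (j = 0; j < m; j++)
         sum += a[i][j];
     use (sum);

   gives rise to the outer1/inner/outer2 PHI pattern shown above.  The
   double reduction counts as "active" here only if the reduction value is
   still relevant for vectorization, e.g. because the final value of sum is
   used after the loop nest.  */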
2067 static bool
2068 vect_active_double_reduction_p (stmt_vec_info stmt_info)
2070 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2071 return false;
2073 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
2076 /* Function vect_analyze_loop_operations.
2078 Scan the loop stmts and make sure they are all vectorizable. */
2080 static opt_result
2081 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
2083 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2084 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2085 int nbbs = loop->num_nodes;
2086 int i;
2087 stmt_vec_info stmt_info;
2088 bool need_to_vectorize = false;
2089 bool ok;
2091 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
2093 auto_vec<stmt_info_for_cost> cost_vec;
2095 for (i = 0; i < nbbs; i++)
2097 basic_block bb = bbs[i];
2099 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
2100 gsi_next (&si))
2102 gphi *phi = si.phi ();
2103 ok = true;
2105 stmt_info = loop_vinfo->lookup_stmt (phi);
2106 if (dump_enabled_p ())
2107 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G",
2108 (gimple *) phi);
2109 if (virtual_operand_p (gimple_phi_result (phi)))
2110 continue;
2112 /* Inner-loop loop-closed exit phi in outer-loop vectorization
2113 (i.e., a phi in the tail of the outer-loop). */
2114 if (! is_loop_header_bb_p (bb))
2116 /* FORNOW: we currently don't support the case in which these phis
2117 are not used in the outer loop (unless it is a double reduction,
2118 i.e., this phi is vect_reduction_def), because this case
2119 requires us to actually do something here. */
2120 if (STMT_VINFO_LIVE_P (stmt_info)
2121 && !vect_active_double_reduction_p (stmt_info))
2122 return opt_result::failure_at (phi,
2123 "Unsupported loop-closed phi"
2124 " in outer-loop.\n");
2126 /* If PHI is used in the outer loop, we check that its operand
2127 is defined in the inner loop. */
2128 if (STMT_VINFO_RELEVANT_P (stmt_info))
2130 tree phi_op;
2132 if (gimple_phi_num_args (phi) != 1)
2133 return opt_result::failure_at (phi, "unsupported phi");
2135 phi_op = PHI_ARG_DEF (phi, 0);
2136 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
2137 if (!op_def_info)
2138 return opt_result::failure_at (phi, "unsupported phi\n");
2140 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
2141 && (STMT_VINFO_RELEVANT (op_def_info)
2142 != vect_used_in_outer_by_reduction))
2143 return opt_result::failure_at (phi, "unsupported phi\n");
2145 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
2146 || (STMT_VINFO_DEF_TYPE (stmt_info)
2147 == vect_double_reduction_def))
2148 && !vectorizable_lc_phi (loop_vinfo,
2149 stmt_info, NULL, NULL))
2150 return opt_result::failure_at (phi, "unsupported phi\n");
2153 continue;
2156 gcc_assert (stmt_info);
2158 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
2159 || STMT_VINFO_LIVE_P (stmt_info))
2160 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def
2161 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
2162 /* A scalar-dependence cycle that we don't support. */
2163 return opt_result::failure_at (phi,
2164 "not vectorized:"
2165 " scalar dependence cycle.\n");
2167 if (STMT_VINFO_RELEVANT_P (stmt_info))
2169 need_to_vectorize = true;
2170 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
2171 && ! PURE_SLP_STMT (stmt_info))
2172 ok = vectorizable_induction (loop_vinfo,
2173 stmt_info, NULL, NULL,
2174 &cost_vec);
2175 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2176 || (STMT_VINFO_DEF_TYPE (stmt_info)
2177 == vect_double_reduction_def)
2178 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
2179 && ! PURE_SLP_STMT (stmt_info))
2180 ok = vectorizable_reduction (loop_vinfo,
2181 stmt_info, NULL, NULL, &cost_vec);
2182 else if ((STMT_VINFO_DEF_TYPE (stmt_info)
2183 == vect_first_order_recurrence)
2184 && ! PURE_SLP_STMT (stmt_info))
2185 ok = vectorizable_recurr (loop_vinfo, stmt_info, NULL, NULL,
2186 &cost_vec);
2189 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
2190 if (ok
2191 && STMT_VINFO_LIVE_P (stmt_info)
2192 && !PURE_SLP_STMT (stmt_info))
2193 ok = vectorizable_live_operation (loop_vinfo, stmt_info, NULL, NULL,
2194 -1, false, &cost_vec);
2196 if (!ok)
2197 return opt_result::failure_at (phi,
2198 "not vectorized: relevant phi not "
2199 "supported: %G",
2200 static_cast <gimple *> (phi));
2203 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
2204 gsi_next (&si))
2206 gimple *stmt = gsi_stmt (si);
2207 if (!gimple_clobber_p (stmt)
2208 && !is_gimple_debug (stmt))
2210 opt_result res
2211 = vect_analyze_stmt (loop_vinfo,
2212 loop_vinfo->lookup_stmt (stmt),
2213 &need_to_vectorize,
2214 NULL, NULL, &cost_vec);
2215 if (!res)
2216 return res;
2219 } /* bbs */
2221 add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
2223 /* All operations in the loop are either irrelevant (they deal with loop
2224 control, or are dead), or only used outside the loop and can be moved
2225 out of the loop (e.g. invariants, inductions).  The loop can be
2226 optimized away by scalar optimizations.  We're better off not
2227 touching this loop. */
2228 if (!need_to_vectorize)
2230 if (dump_enabled_p ())
2231 dump_printf_loc (MSG_NOTE, vect_location,
2232 "All the computation can be taken out of the loop.\n");
2233 return opt_result::failure_at
2234 (vect_location,
2235 "not vectorized: redundant loop. no profit to vectorize.\n");
2238 return opt_result::success ();
2241 /* Return true if we know that the iteration count is smaller than the
2242 vectorization factor. Return false if it isn't, or if we can't be sure
2243 either way. */
2245 static bool
2246 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
2248 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2250 HOST_WIDE_INT max_niter;
2251 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2252 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
2253 else
2254 max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2256 if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
2257 return true;
2259 return false;
2262 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
2263 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
2264 definitely no, or -1 if it's worth retrying. */
2266 static int
2267 vect_analyze_loop_costing (loop_vec_info loop_vinfo,
2268 unsigned *suggested_unroll_factor)
2270 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2271 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2273 /* Only loops that can handle partially-populated vectors can have iteration
2274 counts less than the vectorization factor. */
2275 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2276 && vect_known_niters_smaller_than_vf (loop_vinfo))
2278 if (dump_enabled_p ())
2279 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2280 "not vectorized: iteration count smaller than "
2281 "vectorization factor.\n");
2282 return 0;
2285 /* If we know the number of iterations we can do better: for the
2286 epilogue we can also decide whether the main loop leaves us
2287 with enough iterations, preferring a smaller vector epilogue that
2288 can then also be used for the case in which we skip the vector loop. */
2289 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2291 widest_int scalar_niters
2292 = wi::to_widest (LOOP_VINFO_NITERSM1 (loop_vinfo)) + 1;
2293 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2295 loop_vec_info orig_loop_vinfo
2296 = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2297 unsigned lowest_vf
2298 = constant_lower_bound (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo));
2299 int prolog_peeling = 0;
2300 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2301 prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo);
2302 if (prolog_peeling >= 0
2303 && known_eq (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
2304 lowest_vf))
2306 unsigned gap
2307 = LOOP_VINFO_PEELING_FOR_GAPS (orig_loop_vinfo) ? 1 : 0;
2308 scalar_niters = ((scalar_niters - gap - prolog_peeling)
2309 % lowest_vf + gap);
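/* Worked illustration (an editorial addition, not from the original
   sources): with scalar_niters = 103, prolog_peeling = 3, lowest_vf = 16
   and no peeling for gaps, the main loop covers 96 iterations and the
   epilogue is left with (103 - 0 - 3) % 16 + 0 = 4 scalar iterations.  */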
2312 /* Reject vectorizing for a single scalar iteration, even if
2313 we could in principle implement that using partial vectors. */
2314 unsigned peeling_gap = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
2315 if (scalar_niters <= peeling_gap + 1)
2317 if (dump_enabled_p ())
2318 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2319 "not vectorized: loop only has a single "
2320 "scalar iteration.\n");
2321 return 0;
2324 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2326 /* Check that the loop processes at least one full vector. */
2327 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2328 if (known_lt (scalar_niters, vf))
2330 if (dump_enabled_p ())
2331 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2332 "loop does not have enough iterations "
2333 "to support vectorization.\n");
2334 return 0;
2337 /* If we need to peel an extra epilogue iteration to handle data
2338 accesses with gaps, check that there are enough scalar iterations
2339 available.
2341 The check above is redundant with this one when peeling for gaps,
2342 but the distinction is useful for diagnostics. */
2343 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2344 && known_le (scalar_niters, vf))
2346 if (dump_enabled_p ())
2347 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2348 "loop does not have enough iterations "
2349 "to support peeling for gaps.\n");
2350 return 0;
2355 /* If using the "very cheap" model, reject cases in which we'd keep
2356 a copy of the scalar code (even if we might be able to vectorize it). */
2357 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2358 && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2359 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2360 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
2362 if (dump_enabled_p ())
2363 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2364 "some scalar iterations would need to be peeled\n");
2365 return 0;
2368 int min_profitable_iters, min_profitable_estimate;
2369 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2370 &min_profitable_estimate,
2371 suggested_unroll_factor);
2373 if (min_profitable_iters < 0)
2375 if (dump_enabled_p ())
2376 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2377 "not vectorized: vectorization not profitable.\n");
2378 if (dump_enabled_p ())
2379 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2380 "not vectorized: vector version will never be "
2381 "profitable.\n");
2382 return -1;
2385 int min_scalar_loop_bound = (param_min_vect_loop_bound
2386 * assumed_vf);
2388 /* Use the cost model only if it is more conservative than the
2389 user-specified threshold. */
2390 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
2391 min_profitable_iters);
2393 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
2395 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2396 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
2398 if (dump_enabled_p ())
2399 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2400 "not vectorized: vectorization not profitable.\n");
2401 if (dump_enabled_p ())
2402 dump_printf_loc (MSG_NOTE, vect_location,
2403 "not vectorized: iteration count smaller than user "
2404 "specified loop bound parameter or minimum profitable "
2405 "iterations (whichever is more conservative).\n");
2406 return 0;
2409 /* The static profitability threshold min_profitable_estimate includes
2410 the cost of having to check at runtime whether the scalar loop
2411 should be used instead. If it turns out that we don't need or want
2412 such a check, the threshold we should use for the static estimate
2413 is simply the point at which the vector loop becomes more profitable
2414 than the scalar loop. */
2415 if (min_profitable_estimate > min_profitable_iters
2416 && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
2417 && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2418 && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2419 && !vect_apply_runtime_profitability_check_p (loop_vinfo))
2421 if (dump_enabled_p ())
2422 dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
2423 " choice between the scalar and vector loops\n");
2424 min_profitable_estimate = min_profitable_iters;
2427 /* If the vector loop needs multiple iterations to be beneficial then
2428 things are probably too close to call, and the conservative thing
2429 would be to stick with the scalar code. */
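/* Illustrative numbers (an editorial addition, not from the original
   sources): with a VF of 8 and a min_profitable_estimate of 20 scalar
   iterations, the vector loop would have to run at least three times
   before beating the scalar loop, which the very-cheap model treats as
   too close to call.  */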
2430 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2431 && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
2433 if (dump_enabled_p ())
2434 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2435 "one iteration of the vector loop would be"
2436 " more expensive than the equivalent number of"
2437 " iterations of the scalar loop\n");
2438 return 0;
2441 HOST_WIDE_INT estimated_niter;
2443 /* If we are vectorizing an epilogue then we know the maximum number of
2444 scalar iterations it will cover is at least one lower than the
2445 vectorization factor of the main loop. */
2446 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2447 estimated_niter
2448 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
2449 else
2451 estimated_niter = estimated_stmt_executions_int (loop);
2452 if (estimated_niter == -1)
2453 estimated_niter = likely_max_stmt_executions_int (loop);
2455 if (estimated_niter != -1
2456 && ((unsigned HOST_WIDE_INT) estimated_niter
2457 < MAX (th, (unsigned) min_profitable_estimate)))
2459 if (dump_enabled_p ())
2460 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2461 "not vectorized: estimated iteration count too "
2462 "small.\n");
2463 if (dump_enabled_p ())
2464 dump_printf_loc (MSG_NOTE, vect_location,
2465 "not vectorized: estimated iteration count smaller "
2466 "than specified loop bound parameter or minimum "
2467 "profitable iterations (whichever is more "
2468 "conservative).\n");
2469 return -1;
2472 return 1;
2475 static opt_result
2476 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
2477 vec<data_reference_p> *datarefs,
2478 unsigned int *n_stmts)
2480 *n_stmts = 0;
2481 for (unsigned i = 0; i < loop->num_nodes; i++)
2482 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
2483 !gsi_end_p (gsi); gsi_next (&gsi))
2485 gimple *stmt = gsi_stmt (gsi);
2486 if (is_gimple_debug (stmt))
2487 continue;
2488 ++(*n_stmts);
2489 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
2490 NULL, 0);
2491 if (!res)
2493 if (is_gimple_call (stmt) && loop->safelen)
2495 tree fndecl = gimple_call_fndecl (stmt), op;
2496 if (fndecl == NULL_TREE
2497 && gimple_call_internal_p (stmt, IFN_MASK_CALL))
2499 fndecl = gimple_call_arg (stmt, 0);
2500 gcc_checking_assert (TREE_CODE (fndecl) == ADDR_EXPR);
2501 fndecl = TREE_OPERAND (fndecl, 0);
2502 gcc_checking_assert (TREE_CODE (fndecl) == FUNCTION_DECL);
2504 if (fndecl != NULL_TREE)
2506 cgraph_node *node = cgraph_node::get (fndecl);
2507 if (node != NULL && node->simd_clones != NULL)
2509 unsigned int j, n = gimple_call_num_args (stmt);
2510 for (j = 0; j < n; j++)
2512 op = gimple_call_arg (stmt, j);
2513 if (DECL_P (op)
2514 || (REFERENCE_CLASS_P (op)
2515 && get_base_address (op)))
2516 break;
2518 op = gimple_call_lhs (stmt);
2519 /* Ignore #pragma omp declare simd functions
2520 if they don't have data references in the
2521 call stmt itself. */
2522 if (j == n
2523 && !(op
2524 && (DECL_P (op)
2525 || (REFERENCE_CLASS_P (op)
2526 && get_base_address (op)))))
2527 continue;
2531 return res;
2533 /* If dependence analysis will give up due to the limit on the
2534 number of datarefs, stop here and fail fatally. */
2535 if (datarefs->length ()
2536 > (unsigned)param_loop_max_datarefs_for_datadeps)
2537 return opt_result::failure_at (stmt, "exceeded param "
2538 "loop-max-datarefs-for-datadeps\n");
2540 return opt_result::success ();
2543 /* Look for SLP-only access groups and turn each individual access into its own
2544 group. */
2545 static void
2546 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
2548 unsigned int i;
2549 struct data_reference *dr;
2551 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
2553 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2554 FOR_EACH_VEC_ELT (datarefs, i, dr)
2556 gcc_assert (DR_REF (dr));
2557 stmt_vec_info stmt_info
2558 = vect_stmt_to_vectorize (loop_vinfo->lookup_stmt (DR_STMT (dr)));
2560 /* Check if the load is a part of an interleaving chain. */
2561 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2563 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
2564 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_element);
2565 unsigned int group_size = DR_GROUP_SIZE (first_element);
2567 /* Check if this is an SLP-only group. */
2568 if (!STMT_SLP_TYPE (stmt_info)
2569 && STMT_VINFO_SLP_VECT_ONLY (first_element))
2571 /* Dissolve the group. */
2572 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
2574 stmt_vec_info vinfo = first_element;
2575 while (vinfo)
2577 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
2578 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
2579 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2580 DR_GROUP_SIZE (vinfo) = 1;
2581 if (STMT_VINFO_STRIDED_P (first_element)
2582 /* We cannot handle stores with gaps. */
2583 || DR_IS_WRITE (dr_info->dr))
2585 STMT_VINFO_STRIDED_P (vinfo) = true;
2586 DR_GROUP_GAP (vinfo) = 0;
2588 else
2589 DR_GROUP_GAP (vinfo) = group_size - 1;
2590 /* Duplicate and adjust the alignment info; it needs to
2591 be present on each group leader, see dr_misalignment. */
2592 if (vinfo != first_element)
2594 dr_vec_info *dr_info2 = STMT_VINFO_DR_INFO (vinfo);
2595 dr_info2->target_alignment = dr_info->target_alignment;
2596 int misalignment = dr_info->misalignment;
2597 if (misalignment != DR_MISALIGNMENT_UNKNOWN)
2599 HOST_WIDE_INT diff
2600 = (TREE_INT_CST_LOW (DR_INIT (dr_info2->dr))
2601 - TREE_INT_CST_LOW (DR_INIT (dr_info->dr)));
2602 unsigned HOST_WIDE_INT align_c
2603 = dr_info->target_alignment.to_constant ();
2604 misalignment = (misalignment + diff) % align_c;
2606 dr_info2->misalignment = misalignment;
2608 vinfo = next;
2615 /* Determine if operating on full vectors for LOOP_VINFO might leave
2616 some scalar iterations still to do. If so, decide how we should
2617 handle those scalar iterations. The possibilities are:
2619 (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2620 In this case:
2622 LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2623 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2624 LOOP_VINFO_PEELING_FOR_NITER == false
2626 (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2627 to handle the remaining scalar iterations. In this case:
2629 LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2630 LOOP_VINFO_PEELING_FOR_NITER == true
2632 There are two choices:
2634 (2a) Consider vectorizing the epilogue loop at the same VF as the
2635 main loop, but using partial vectors instead of full vectors.
2636 In this case:
2638 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2640 (2b) Consider vectorizing the epilogue loop at lower VFs only.
2641 In this case:
2643 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
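/* Illustrative numbers (an editorial addition, not from the original
   sources): for a hypothetical loop with 100 scalar iterations and a VF of
   16, option (1) runs 7 partially-populated vector iterations; option (2)
   runs 6 full vector iterations and leaves 4 scalar iterations to an
   epilogue, which may itself be vectorized as in (2a) or (2b).  */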
2646 opt_result
2647 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo)
2649 /* Determine whether there would be any scalar iterations left over. */
2650 bool need_peeling_or_partial_vectors_p
2651 = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2653 /* Decide whether to vectorize the loop with partial vectors. */
2654 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2655 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2656 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2657 && need_peeling_or_partial_vectors_p)
2659 /* For partial-vector-usage=1, try to push the handling of partial
2660 vectors to the epilogue, with the main loop continuing to operate
2661 on full vectors.
2663 If we are unrolling, we also do not want to use partial vectors.  This
2664 is to avoid the overhead of generating multiple masks and also to
2665 avoid having to execute entire iterations of FALSE-masked instructions
2666 when dealing with one or fewer full iterations.
2668 ??? We could then end up failing to use partial vectors if we
2669 decide to peel iterations into a prologue, and if the main loop
2670 then ends up processing fewer than VF iterations. */
2671 if ((param_vect_partial_vector_usage == 1
2672 || loop_vinfo->suggested_unroll_factor > 1)
2673 && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2674 && !vect_known_niters_smaller_than_vf (loop_vinfo))
2675 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2676 else
2677 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2680 if (dump_enabled_p ())
2681 dump_printf_loc (MSG_NOTE, vect_location,
2682 "operating on %s vectors%s.\n",
2683 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2684 ? "partial" : "full",
2685 LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2686 ? " for epilogue loop" : "");
2688 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2689 = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2690 && need_peeling_or_partial_vectors_p);
2692 /* We set LOOP_VINFO_USING_SELECT_VL_P to true before loop vectorization
2693 analysis, when we do not yet know whether the loop will be vectorized
2694 with partial vectors (see tree-vect-loop-manip.cc for details).
2696 However, the SELECT_VL vectorization style should only be applied to
2697 partially-vectorized loops, since SELECT_VL is the GIMPLE IR that
2698 calculates the number of elements to be processed in each iteration.
2700 After loop vectorization analysis, clear LOOP_VINFO_USING_SELECT_VL_P
2701 if the loop is not vectorized with partial vectors. */
2702 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2703 LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = false;
2705 return opt_result::success ();
2708 /* Function vect_analyze_loop_2.
2710 Apply a set of analyses on the loop specified by LOOP_VINFO; the different
2711 analyses will record information in some members of LOOP_VINFO.  FATAL
2712 indicates whether some analysis hits a fatal error.  If the pointer
2713 SUGGESTED_UNROLL_FACTOR is non-NULL, it is intended to be filled with
2714 the suggested unroll factor that was worked out, while a NULL pointer
2715 indicates that the suggested unroll factor is being applied.
2716 SLP_DONE_FOR_SUGGESTED_UF holds the SLP decision made when the suggested
2717 unroll factor was worked out. */
2718 static opt_result
2719 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
2720 unsigned *suggested_unroll_factor,
2721 bool& slp_done_for_suggested_uf)
2723 opt_result ok = opt_result::success ();
2724 int res;
2725 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2726 poly_uint64 min_vf = 2;
2727 loop_vec_info orig_loop_vinfo = NULL;
2729 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2730 loop_vec_info of the first vectorized loop. */
2731 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2732 orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2733 else
2734 orig_loop_vinfo = loop_vinfo;
2735 gcc_assert (orig_loop_vinfo);
2737 /* The first group of checks is independent of the vector size. */
2738 fatal = true;
2740 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2741 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2742 return opt_result::failure_at (vect_location,
2743 "not vectorized: simd if(0)\n");
2745 /* Find all data references in the loop (which correspond to vdefs/vuses)
2746 and analyze their evolution in the loop. */
2748 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2750 /* Gather the data references and count stmts in the loop. */
2751 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2753 opt_result res
2754 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2755 &LOOP_VINFO_DATAREFS (loop_vinfo),
2756 &LOOP_VINFO_N_STMTS (loop_vinfo));
2757 if (!res)
2759 if (dump_enabled_p ())
2760 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2761 "not vectorized: loop contains function "
2762 "calls or data references that cannot "
2763 "be analyzed\n");
2764 return res;
2766 loop_vinfo->shared->save_datarefs ();
2768 else
2769 loop_vinfo->shared->check_datarefs ();
2771 /* Analyze the data references and also adjust the minimal
2772 vectorization factor according to the loads and stores. */
2774 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2775 if (!ok)
2777 if (dump_enabled_p ())
2778 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2779 "bad data references.\n");
2780 return ok;
2783 /* Check if we are applying unroll factor now. */
2784 bool applying_suggested_uf = loop_vinfo->suggested_unroll_factor > 1;
2785 gcc_assert (!applying_suggested_uf || !suggested_unroll_factor);
2787 /* If the SLP decision was false when the suggested unroll factor was
2788 worked out, and we are now applying that unroll factor, we can simply
2789 skip all SLP-related analyses this time. */
2790 bool slp = !applying_suggested_uf || slp_done_for_suggested_uf;
2792 /* Classify all cross-iteration scalar data-flow cycles.
2793 Cross-iteration cycles caused by virtual phis are analyzed separately. */
2794 vect_analyze_scalar_cycles (loop_vinfo, slp);
2796 vect_pattern_recog (loop_vinfo);
2798 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2800 /* Analyze the access patterns of the data-refs in the loop (consecutive,
2801 complex, etc.). FORNOW: Only handle consecutive access pattern. */
2803 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2804 if (!ok)
2806 if (dump_enabled_p ())
2807 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2808 "bad data access.\n");
2809 return ok;
2812 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2814 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2815 if (!ok)
2817 if (dump_enabled_p ())
2818 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2819 "unexpected pattern.\n");
2820 return ok;
2823 /* While the rest of the analysis below depends on it in some way. */
2824 fatal = false;
2826 /* Analyze data dependences between the data-refs in the loop
2827 and adjust the maximum vectorization factor according to
2828 the dependences.
2829 FORNOW: fail at the first data dependence that we encounter. */
2831 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2832 if (!ok)
2834 if (dump_enabled_p ())
2835 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2836 "bad data dependence.\n");
2837 return ok;
2839 if (max_vf != MAX_VECTORIZATION_FACTOR
2840 && maybe_lt (max_vf, min_vf))
2841 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2842 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2844 ok = vect_determine_vectorization_factor (loop_vinfo);
2845 if (!ok)
2847 if (dump_enabled_p ())
2848 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2849 "can't determine vectorization factor.\n");
2850 return ok;
2853 /* Compute the scalar iteration cost. */
2854 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2856 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2858 if (slp)
2860 /* Check the SLP opportunities in the loop, analyze and build
2861 SLP trees. */
2862 ok = vect_analyze_slp (loop_vinfo, LOOP_VINFO_N_STMTS (loop_vinfo));
2863 if (!ok)
2864 return ok;
2866 /* If there are any SLP instances mark them as pure_slp. */
2867 slp = vect_make_slp_decision (loop_vinfo);
2868 if (slp)
2870 /* Find stmts that need to be both vectorized and SLPed. */
2871 vect_detect_hybrid_slp (loop_vinfo);
2873 /* Update the vectorization factor based on the SLP decision. */
2874 vect_update_vf_for_slp (loop_vinfo);
2876 /* Optimize the SLP graph with the vectorization factor fixed. */
2877 vect_optimize_slp (loop_vinfo);
2879 /* Gather the loads reachable from the SLP graph entries. */
2880 vect_gather_slp_loads (loop_vinfo);
2884 bool saved_can_use_partial_vectors_p
2885 = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2887 /* We don't expect to have to roll back to anything other than an empty
2888 set of rgroups. */
2889 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2891 /* This is the point where we can re-start analysis with SLP forced off. */
2892 start_over:
2894 /* Apply the suggested unrolling factor; this was determined by the backend
2895 during finish_cost the first time we ran the analysis for this
2896 vector mode. */
2897 if (applying_suggested_uf)
2898 LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor;
2900 /* Now the vectorization factor is final. */
2901 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2902 gcc_assert (known_ne (vectorization_factor, 0U));
2904 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2906 dump_printf_loc (MSG_NOTE, vect_location,
2907 "vectorization_factor = ");
2908 dump_dec (MSG_NOTE, vectorization_factor);
2909 dump_printf (MSG_NOTE, ", niters = %wd\n",
2910 LOOP_VINFO_INT_NITERS (loop_vinfo));
2913 if (max_vf != MAX_VECTORIZATION_FACTOR
2914 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2915 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2917 loop_vinfo->vector_costs = init_cost (loop_vinfo, false);
2919 /* Analyze the alignment of the data-refs in the loop.
2920 Fail if a data reference is found that cannot be vectorized. */
2922 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2923 if (!ok)
2925 if (dump_enabled_p ())
2926 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2927 "bad data alignment.\n");
2928 return ok;
2931 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2932 It is important to call pruning after vect_analyze_data_ref_accesses,
2933 since we use grouping information gathered by interleaving analysis. */
2934 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2935 if (!ok)
2936 return ok;
2938 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2939 vectorization, since we do not want to add extra peeling or
2940 add versioning for alignment. */
2941 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2942 /* This pass will decide on using loop versioning and/or loop peeling in
2943 order to enhance the alignment of data references in the loop. */
2944 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2945 if (!ok)
2946 return ok;
2948 if (slp)
2950 /* Analyze operations in the SLP instances. We can't simply
2951 remove unsupported SLP instances as this makes the above
2952 SLP kind detection invalid and might also affect the VF. */
2953 if (! vect_slp_analyze_operations (loop_vinfo))
2955 ok = opt_result::failure_at (vect_location,
2956 "unsupported SLP instances\n");
2957 goto again;
2961 /* Dissolve SLP-only groups. */
2962 vect_dissolve_slp_only_groups (loop_vinfo);
2964 /* Scan all the remaining operations in the loop that are not subject
2965 to SLP and make sure they are vectorizable. */
2966 ok = vect_analyze_loop_operations (loop_vinfo);
2967 if (!ok)
2969 if (dump_enabled_p ())
2970 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2971 "bad operation or unsupported loop bound.\n");
2972 return ok;
2975 /* For now, we don't expect to mix both masking and length approaches for one
2976 loop; disable the use of partial vectors if both are recorded. */
2977 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2978 && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
2979 && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
2981 if (dump_enabled_p ())
2982 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2983 "can't vectorize a loop with partial vectors"
2984 " because we don't expect to mix different"
2985 " approaches with partial vectors for the"
2986 " same loop.\n");
2987 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2990 /* If we still have the option of using partial vectors,
2991 check whether we can generate the necessary loop controls. */
2992 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
2994 if (!LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
2996 if (!vect_verify_full_masking (loop_vinfo)
2997 && !vect_verify_full_masking_avx512 (loop_vinfo))
2998 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3000 else /* !LOOP_VINFO_LENS (loop_vinfo).is_empty () */
3001 if (!vect_verify_loop_lens (loop_vinfo))
3002 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3005 /* If we're vectorizing a loop that uses length "controls" and
3006 can iterate more than once, we apply the decrementing IV approach
3007 to the loop control. */
3008 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3009 && LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) == vect_partial_vectors_len
3010 && LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) == 0
3011 && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3012 && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo),
3013 LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
3014 LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true;
3016 /* If a loop uses length controls and has a decrementing loop control IV,
3017 we will normally pass that IV through a MIN_EXPR to calculate the
3018 basis for the length controls. E.g. in a loop that processes one
3019 element per scalar iteration, the number of elements would be
3020 MIN_EXPR <N, VF>, where N is the number of scalar iterations left.
3022 This MIN_EXPR approach allows us to use pointer IVs with an invariant
3023 step, since only the final iteration of the vector loop can have
3024 inactive lanes.
3026 However, some targets have a dedicated instruction for calculating the
3027 preferred length, given the total number of elements that still need to
3028 be processed. This is encapsulated in the SELECT_VL internal function.
3030 If the target supports SELECT_VL, we can use it instead of MIN_EXPR
3031 to determine the basis for the length controls. However, unlike the
3032 MIN_EXPR calculation, the SELECT_VL calculation can decide to make
3033 lanes inactive in any iteration of the vector loop, not just the last
3034 iteration. This SELECT_VL approach therefore requires us to use pointer
3035 IVs with variable steps.
3037 Once we've decided how many elements should be processed by one
3038 iteration of the vector loop, we need to populate the rgroup controls.
3039 If a loop has multiple rgroups, we need to make sure that those rgroups
3040 "line up" (that is, they must be consistent about which elements are
3041 active and which aren't). This is done by vect_adjust_loop_lens_control.
3043 In principle, it would be possible to use vect_adjust_loop_lens_control
3044 on either the result of a MIN_EXPR or the result of a SELECT_VL.
3045 However:
3047 (1) In practice, it only makes sense to use SELECT_VL when a vector
3048 operation will be controlled directly by the result. It is not
3049 worth using SELECT_VL if it would only be the input to other
3050 calculations.
3052 (2) If we use SELECT_VL for an rgroup that has N controls, each associated
3053 pointer IV will need N updates by a variable amount (N-1 updates
3054 within the iteration and 1 update to move to the next iteration).
3056 Because of this, we prefer to use the MIN_EXPR approach whenever there
3057 is more than one length control.
3059 In addition, SELECT_VL always operates to a granularity of 1 unit.
3060 If we wanted to use it to control an SLP operation on N consecutive
3061 elements, we would need to make the SELECT_VL inputs measure scalar
3062 iterations (rather than elements) and then multiply the SELECT_VL
3063 result by N. But using SELECT_VL this way is inefficient because
3064 of (1) above.
3066 Finally, we don't apply SELECT_VL on a single rgroup when both (1) and (2)
3067 are satisfied:
3069 (1) LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) is true.
3070 (2) LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant () is true.
3072 Because SELECT_VL (with its variable step) makes SCEV analysis fail, we
3073 would lose the benefit of subsequent unroll optimizations, so we prefer
3074 the MIN_EXPR approach in this situation. */
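/* Rough sketch (an editorial addition, not from the original sources) of
   the two schemes for a single length control, where remain counts the
   scalar iterations still to do:

     MIN_EXPR scheme:                 SELECT_VL scheme:
       len = MIN (remain, VF);          len = SELECT_VL (remain, VF);
       ... len-controlled stmts ...     ... len-controlled stmts ...
       ptr += VF * step;                ptr += len * step;
       remain -= len;                   remain -= len;

   Only the MIN_EXPR form keeps the pointer IV increment invariant, since
   every iteration except possibly the last processes a full vector.  */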
3075 if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
3077 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
3078 if (direct_internal_fn_supported_p (IFN_SELECT_VL, iv_type,
3079 OPTIMIZE_FOR_SPEED)
3080 && LOOP_VINFO_LENS (loop_vinfo).length () == 1
3081 && LOOP_VINFO_LENS (loop_vinfo)[0].factor == 1 && !slp
3082 && (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3083 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()))
3084 LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = true;
3087 /* Decide whether this loop_vinfo should use partial vectors or peeling,
3088 assuming that the loop will be used as a main loop. We will redo
3089 this analysis later if we instead decide to use the loop as an
3090 epilogue loop. */
3091 ok = vect_determine_partial_vectors_and_peeling (loop_vinfo);
3092 if (!ok)
3093 return ok;
3095 /* If we're vectorizing an epilogue loop, the vectorized loop either needs
3096 to be able to handle fewer than VF scalars, or needs to have a lower VF
3097 than the main loop. */
3098 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
3099 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3101 poly_uint64 unscaled_vf
3102 = exact_div (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
3103 orig_loop_vinfo->suggested_unroll_factor);
3104 if (maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo), unscaled_vf))
3105 return opt_result::failure_at (vect_location,
3106 "Vectorization factor too high for"
3107 " epilogue loop.\n");
3110 /* Check the costings of the loop make vectorizing worthwhile. */
3111 res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
3112 if (res < 0)
3114 ok = opt_result::failure_at (vect_location,
3115 "Loop costings may not be worthwhile.\n");
3116 goto again;
3118 if (!res)
3119 return opt_result::failure_at (vect_location,
3120 "Loop costings not worthwhile.\n");
3122 /* If an epilogue loop is required make sure we can create one. */
3123 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
3124 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
3125 || LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
3127 if (dump_enabled_p ())
3128 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
3129 if (!vect_can_advance_ivs_p (loop_vinfo)
3130 || !slpeel_can_duplicate_loop_p (loop,
3131 LOOP_VINFO_IV_EXIT (loop_vinfo),
3132 LOOP_VINFO_IV_EXIT (loop_vinfo)))
3134 ok = opt_result::failure_at (vect_location,
3135 "not vectorized: can't create required "
3136 "epilog loop\n");
3137 goto again;
3141 /* During peeling, we need to check whether the number of loop iterations
3142 is enough for both the peeled prolog loop and the vector loop.  This
3143 check can be merged with the threshold check of loop versioning, so
3144 increase the threshold for this case if necessary.
3146 If we are analyzing an epilogue we still want to check what its
3147 versioning threshold would be. If we decide to vectorize the epilogues we
3148 will want to use the lowest versioning threshold of all epilogues and main
3149 loop. This will enable us to enter a vectorized epilogue even when
3150 versioning the loop. We can't simply check whether the epilogue requires
3151 versioning though since we may have skipped some versioning checks when
3152 analyzing the epilogue. For instance, checks for alias versioning will be
3153 skipped when dealing with epilogues as we assume we already checked them
3154 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
3155 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
3157 poly_uint64 niters_th = 0;
3158 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
3160 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
3162 /* Niters for peeled prolog loop. */
3163 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3165 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
3166 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
3167 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
3169 else
3170 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3173 /* Niters for at least one iteration of vectorized loop. */
3174 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3175 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3176 /* One additional iteration because of peeling for gap. */
3177 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3178 niters_th += 1;
3180 /* Use the same condition as vect_transform_loop to decide when to use
3181 the cost to determine a versioning threshold. */
3182 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
3183 && ordered_p (th, niters_th))
3184 niters_th = ordered_max (poly_uint64 (th), niters_th);
3186 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
3189 gcc_assert (known_eq (vectorization_factor,
3190 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
3192 slp_done_for_suggested_uf = slp;
3194 /* Ok to vectorize! */
3195 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
3196 return opt_result::success ();
3198 again:
3199 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
3200 gcc_assert (!ok);
3202 /* Try again with SLP forced off, but if we didn't do any SLP there is
3203 no point in re-trying. */
3204 if (!slp)
3205 return ok;
3207 /* If the SLP decision was true when the suggested unroll factor was
3208 worked out, and we are applying that unroll factor, we don't need to
3209 re-try any more. */
3210 if (applying_suggested_uf && slp_done_for_suggested_uf)
3211 return ok;
3214 /* If there are reduction chains, re-trying will fail anyway. */
3214 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
3215 return ok;
3217 /* Likewise if the grouped loads or stores in the SLP cannot be handled
3218 via interleaving or lane instructions. */
3219 slp_instance instance;
3220 slp_tree node;
3221 unsigned i, j;
3222 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
3224 stmt_vec_info vinfo;
3225 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
3226 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
3227 continue;
3228 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3229 unsigned int size = DR_GROUP_SIZE (vinfo);
3230 tree vectype = STMT_VINFO_VECTYPE (vinfo);
3231 if (vect_store_lanes_supported (vectype, size, false) == IFN_LAST
3232 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
3233 && ! vect_grouped_store_supported (vectype, size))
3234 return opt_result::failure_at (vinfo->stmt,
3235 "unsupported grouped store\n");
3236 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
3238 vinfo = SLP_TREE_REPRESENTATIVE (node);
3239 if (STMT_VINFO_GROUPED_ACCESS (vinfo))
3241 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3242 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
3243 size = DR_GROUP_SIZE (vinfo);
3244 vectype = STMT_VINFO_VECTYPE (vinfo);
3245 if (vect_load_lanes_supported (vectype, size, false) == IFN_LAST
3246 && ! vect_grouped_load_supported (vectype, single_element_p,
3247 size))
3248 return opt_result::failure_at (vinfo->stmt,
3249 "unsupported grouped load\n");
3254 if (dump_enabled_p ())
3255 dump_printf_loc (MSG_NOTE, vect_location,
3256 "re-trying with SLP disabled\n");
3258 /* Roll back state appropriately. No SLP this time. */
3259 slp = false;
3260 /* Restore the vectorization factor as it was without SLP. */
3261 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
3262 /* Free the SLP instances. */
3263 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
3264 vect_free_slp_instance (instance);
3265 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
3266 /* Reset SLP type to loop_vect on all stmts. */
3267 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
3269 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
3270 for (gimple_stmt_iterator si = gsi_start_phis (bb);
3271 !gsi_end_p (si); gsi_next (&si))
3273 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3274 STMT_SLP_TYPE (stmt_info) = loop_vect;
3275 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
3276 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
3278 /* vectorizable_reduction adjusts reduction stmt def-types;
3279 restore them to that of the PHI. */
3280 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
3281 = STMT_VINFO_DEF_TYPE (stmt_info);
3282 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
3283 (STMT_VINFO_REDUC_DEF (stmt_info)))
3284 = STMT_VINFO_DEF_TYPE (stmt_info);
3287 for (gimple_stmt_iterator si = gsi_start_bb (bb);
3288 !gsi_end_p (si); gsi_next (&si))
3290 if (is_gimple_debug (gsi_stmt (si)))
3291 continue;
3292 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3293 STMT_SLP_TYPE (stmt_info) = loop_vect;
3294 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
3296 stmt_vec_info pattern_stmt_info
3297 = STMT_VINFO_RELATED_STMT (stmt_info);
3298 if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
3299 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
3301 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
3302 STMT_SLP_TYPE (pattern_stmt_info) = loop_vect;
3303 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
3304 !gsi_end_p (pi); gsi_next (&pi))
3305 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
3306 = loop_vect;
3310 /* Free optimized alias test DDRS. */
3311 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
3312 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
3313 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
3314 /* Reset target cost data. */
3315 delete loop_vinfo->vector_costs;
3316 loop_vinfo->vector_costs = nullptr;
3317 /* Reset accumulated rgroup information. */
3318 LOOP_VINFO_MASKS (loop_vinfo).mask_set.empty ();
3319 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
3320 release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
3321 /* Reset assorted flags. */
3322 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
3323 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
3324 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
3325 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
3326 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3327 = saved_can_use_partial_vectors_p;
3328 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
3330 goto start_over;
3333 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
3334 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
3335 OLD_LOOP_VINFO is better unless something specifically indicates
3336 otherwise.
3338 Note that this deliberately isn't a partial order. */
3340 static bool
3341 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
3342 loop_vec_info old_loop_vinfo)
3344 struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
3345 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
3347 poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
3348 poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
3350 /* Always prefer a VF of loop->simdlen over any other VF. */
3351 if (loop->simdlen)
3353 bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
3354 bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
3355 if (new_simdlen_p != old_simdlen_p)
3356 return new_simdlen_p;
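/* For instance (hypothetical values): with loop->simdlen == 8 (e.g. from an
   OpenMP simdlen clause), a candidate with VF 8 is preferred here over one
   with VF 16 regardless of the cost comparison below; if both or neither
   candidate matches the simdlen, we fall through to the costs. */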
3359 const auto *old_costs = old_loop_vinfo->vector_costs;
3360 const auto *new_costs = new_loop_vinfo->vector_costs;
3361 if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo))
3362 return new_costs->better_epilogue_loop_than_p (old_costs, main_loop);
3364 return new_costs->better_main_loop_than_p (old_costs);
3367 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
3368 true if we should. */
3370 static bool
3371 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
3372 loop_vec_info old_loop_vinfo)
3374 if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
3375 return false;
3377 if (dump_enabled_p ())
3378 dump_printf_loc (MSG_NOTE, vect_location,
3379 "***** Preferring vector mode %s to vector mode %s\n",
3380 GET_MODE_NAME (new_loop_vinfo->vector_mode),
3381 GET_MODE_NAME (old_loop_vinfo->vector_mode));
3382 return true;
3385 /* Analyze LOOP with VECTOR_MODES[MODE_I] and as epilogue if MAIN_LOOP_VINFO is
3386 not NULL. Set AUTODETECTED_VECTOR_MODE if VOIDmode and advance
3387 MODE_I to the next mode useful to analyze.
3388 Return the loop_vinfo on success and wrapped null on failure. */
3390 static opt_loop_vec_info
3391 vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
3392 const vect_loop_form_info *loop_form_info,
3393 loop_vec_info main_loop_vinfo,
3394 const vector_modes &vector_modes, unsigned &mode_i,
3395 machine_mode &autodetected_vector_mode,
3396 bool &fatal)
3398 loop_vec_info loop_vinfo
3399 = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3401 machine_mode vector_mode = vector_modes[mode_i];
3402 loop_vinfo->vector_mode = vector_mode;
3403 unsigned int suggested_unroll_factor = 1;
3404 bool slp_done_for_suggested_uf = false;
3406 /* Run the main analysis. */
3407 opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal,
3408 &suggested_unroll_factor,
3409 slp_done_for_suggested_uf);
3410 if (dump_enabled_p ())
3411 dump_printf_loc (MSG_NOTE, vect_location,
3412 "***** Analysis %s with vector mode %s\n",
3413 res ? "succeeded" : "failed",
3414 GET_MODE_NAME (loop_vinfo->vector_mode));
3416 if (res && !main_loop_vinfo && suggested_unroll_factor > 1)
3418 if (dump_enabled_p ())
3419 dump_printf_loc (MSG_NOTE, vect_location,
3420 "***** Re-trying analysis for unrolling"
3421 " with unroll factor %d and slp %s.\n",
3422 suggested_unroll_factor,
3423 slp_done_for_suggested_uf ? "on" : "off");
3424 loop_vec_info unroll_vinfo
3425 = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3426 unroll_vinfo->vector_mode = vector_mode;
3427 unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor;
3428 opt_result new_res = vect_analyze_loop_2 (unroll_vinfo, fatal, NULL,
3429 slp_done_for_suggested_uf);
3430 if (new_res)
3432 delete loop_vinfo;
3433 loop_vinfo = unroll_vinfo;
3435 else
3436 delete unroll_vinfo;
3439 /* Remember the autodetected vector mode. */
3440 if (vector_mode == VOIDmode)
3441 autodetected_vector_mode = loop_vinfo->vector_mode;
3443 /* Advance mode_i, first skipping modes that would yield the
3444 same analysis result. */
3445 while (mode_i + 1 < vector_modes.length ()
3446 && vect_chooses_same_modes_p (loop_vinfo,
3447 vector_modes[mode_i + 1]))
3449 if (dump_enabled_p ())
3450 dump_printf_loc (MSG_NOTE, vect_location,
3451 "***** The result for vector mode %s would"
3452 " be the same\n",
3453 GET_MODE_NAME (vector_modes[mode_i + 1]));
3454 mode_i += 1;
3456 if (mode_i + 1 < vector_modes.length ()
3457 && VECTOR_MODE_P (autodetected_vector_mode)
3458 && (related_vector_mode (vector_modes[mode_i + 1],
3459 GET_MODE_INNER (autodetected_vector_mode))
3460 == autodetected_vector_mode)
3461 && (related_vector_mode (autodetected_vector_mode,
3462 GET_MODE_INNER (vector_modes[mode_i + 1]))
3463 == vector_modes[mode_i + 1]))
3465 if (dump_enabled_p ())
3466 dump_printf_loc (MSG_NOTE, vect_location,
3467 "***** Skipping vector mode %s, which would"
3468 " repeat the analysis for %s\n",
3469 GET_MODE_NAME (vector_modes[mode_i + 1]),
3470 GET_MODE_NAME (autodetected_vector_mode));
3471 mode_i += 1;
3473 mode_i++;
3475 if (!res)
3477 delete loop_vinfo;
3478 if (fatal)
3479 gcc_checking_assert (main_loop_vinfo == NULL);
3480 return opt_loop_vec_info::propagate_failure (res);
3483 return opt_loop_vec_info::success (loop_vinfo);
3486 /* Function vect_analyze_loop.
3488 Apply a set of analyses on LOOP, and create a loop_vec_info struct
3489 for it. The different analyses will record information in the
3490 loop_vec_info struct. */
3491 opt_loop_vec_info
3492 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
3494 DUMP_VECT_SCOPE ("analyze_loop_nest");
3496 if (loop_outer (loop)
3497 && loop_vec_info_for_loop (loop_outer (loop))
3498 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
3499 return opt_loop_vec_info::failure_at (vect_location,
3500 "outer-loop already vectorized.\n");
3502 if (!find_loop_nest (loop, &shared->loop_nest))
3503 return opt_loop_vec_info::failure_at
3504 (vect_location,
3505 "not vectorized: loop nest containing two or more consecutive inner"
3506 " loops cannot be vectorized\n");
3508 /* Analyze the loop form. */
3509 vect_loop_form_info loop_form_info;
3510 opt_result res = vect_analyze_loop_form (loop, &loop_form_info);
3511 if (!res)
3513 if (dump_enabled_p ())
3514 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3515 "bad loop form.\n");
3516 return opt_loop_vec_info::propagate_failure (res);
3518 if (!integer_onep (loop_form_info.assumptions))
3520 /* We consider vectorizing this loop by versioning it under
3521 some assumptions. In order to do this, we need to clear
3522 existing information computed by scev and niter analyzer. */
3523 scev_reset_htab ();
3524 free_numbers_of_iterations_estimates (loop);
3525 /* Also set a flag for this loop so that the following scev and niter
3526 analyses are done under the assumptions. */
3527 loop_constraint_set (loop, LOOP_C_FINITE);
3529 else
3530 /* Clear the existing niter information to make sure the nonwrapping flag
3531 will be calculated and set properly. */
3532 free_numbers_of_iterations_estimates (loop);
3534 auto_vector_modes vector_modes;
3535 /* Autodetect first vector size we try. */
3536 vector_modes.safe_push (VOIDmode);
3537 unsigned int autovec_flags
3538 = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
3539 loop->simdlen != 0);
3540 bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
3541 && !unlimited_cost_model (loop));
3542 machine_mode autodetected_vector_mode = VOIDmode;
3543 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
3544 unsigned int mode_i = 0;
3545 unsigned HOST_WIDE_INT simdlen = loop->simdlen;
3547 /* Keep track of the VF for each mode. Initialize all to 0 which indicates
3548 a mode has not been analyzed. */
3549 auto_vec<poly_uint64, 8> cached_vf_per_mode;
3550 for (unsigned i = 0; i < vector_modes.length (); ++i)
3551 cached_vf_per_mode.safe_push (0);
3553 /* First determine the main loop vectorization mode, either the first
3554 one that works, starting with auto-detecting the vector mode and then
3555 following the targets order of preference, or the one with the
3556 lowest cost if pick_lowest_cost_p. */
3557 while (1)
3559 bool fatal;
3560 unsigned int last_mode_i = mode_i;
3561 /* Set cached VF to -1 prior to analysis, which indicates a mode has
3562 failed. */
3563 cached_vf_per_mode[last_mode_i] = -1;
3564 opt_loop_vec_info loop_vinfo
3565 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3566 NULL, vector_modes, mode_i,
3567 autodetected_vector_mode, fatal);
3568 if (fatal)
3569 break;
3571 if (loop_vinfo)
3573 /* Analysis has been successful so update the VF value. The
3574 VF should always be a multiple of unroll_factor and we want to
3575 capture the original VF here. */
3576 cached_vf_per_mode[last_mode_i]
3577 = exact_div (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
3578 loop_vinfo->suggested_unroll_factor);
3579 /* Once we hit the desired simdlen for the first time,
3580 discard any previous attempts. */
3581 if (simdlen
3582 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
3584 delete first_loop_vinfo;
3585 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3586 simdlen = 0;
3588 else if (pick_lowest_cost_p
3589 && first_loop_vinfo
3590 && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
3592 /* Pick loop_vinfo over first_loop_vinfo. */
3593 delete first_loop_vinfo;
3594 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3596 if (first_loop_vinfo == NULL)
3597 first_loop_vinfo = loop_vinfo;
3598 else
3600 delete loop_vinfo;
3601 loop_vinfo = opt_loop_vec_info::success (NULL);
3604 /* Commit to first_loop_vinfo if we have no reason to try
3605 alternatives. */
3606 if (!simdlen && !pick_lowest_cost_p)
3607 break;
3609 if (mode_i == vector_modes.length ()
3610 || autodetected_vector_mode == VOIDmode)
3611 break;
3613 /* Try the next biggest vector size. */
3614 if (dump_enabled_p ())
3615 dump_printf_loc (MSG_NOTE, vect_location,
3616 "***** Re-trying analysis with vector mode %s\n",
3617 GET_MODE_NAME (vector_modes[mode_i]));
3619 if (!first_loop_vinfo)
3620 return opt_loop_vec_info::propagate_failure (res);
3622 if (dump_enabled_p ())
3623 dump_printf_loc (MSG_NOTE, vect_location,
3624 "***** Choosing vector mode %s\n",
3625 GET_MODE_NAME (first_loop_vinfo->vector_mode));
3627 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3628 enabled, SIMDUID is not set, it is the innermost loop and we have
3629 either already found the loop's SIMDLEN or there was no SIMDLEN to
3630 begin with.
3631 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
3632 bool vect_epilogues = (!simdlen
3633 && loop->inner == NULL
3634 && param_vect_epilogues_nomask
3635 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3636 /* No code motion support for multiple epilogues, so for now
3637 this is not supported for loops with multiple exits. */
3638 && !LOOP_VINFO_EARLY_BREAKS (first_loop_vinfo)
3639 && !loop->simduid);
3640 if (!vect_epilogues)
3641 return first_loop_vinfo;
3643 /* Now analyze first_loop_vinfo for epilogue vectorization. */
3644 poly_uint64 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3646 /* For epilogues start the analysis from the first mode. The motivation
3647 behind starting from the beginning comes from cases where the VECTOR_MODES
3648 array may contain length-agnostic and length-specific modes. Their
3649 ordering is not guaranteed, so we could end up picking a mode for the main
3650 loop that is after the epilogue's optimal mode. */
3651 vector_modes[0] = autodetected_vector_mode;
3652 mode_i = 0;
3654 bool supports_partial_vectors =
3655 partial_vectors_supported_p () && param_vect_partial_vector_usage != 0;
3656 poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo);
3658 while (1)
3660 /* If the target does not support partial vectors we can shorten the
3661 number of modes to analyze for the epilogue as we know we can't pick a
3662 mode that would lead to a VF at least as big as the
3663 FIRST_VINFO_VF. */
3664 if (!supports_partial_vectors
3665 && maybe_ge (cached_vf_per_mode[mode_i], first_vinfo_vf))
3667 mode_i++;
3668 if (mode_i == vector_modes.length ())
3669 break;
3670 continue;
3673 if (dump_enabled_p ())
3674 dump_printf_loc (MSG_NOTE, vect_location,
3675 "***** Re-trying epilogue analysis with vector "
3676 "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
3678 bool fatal;
3679 opt_loop_vec_info loop_vinfo
3680 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3681 first_loop_vinfo,
3682 vector_modes, mode_i,
3683 autodetected_vector_mode, fatal);
3684 if (fatal)
3685 break;
3687 if (loop_vinfo)
3689 if (pick_lowest_cost_p)
3691 /* Keep trying to roll back vectorization attempts while the
3692 loop_vec_infos they produced were worse than this one. */
3693 vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
3694 while (!vinfos.is_empty ()
3695 && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
3697 gcc_assert (vect_epilogues);
3698 delete vinfos.pop ();
3701 /* For now only allow one epilogue loop. */
3702 if (first_loop_vinfo->epilogue_vinfos.is_empty ())
3704 first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
3705 poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3706 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
3707 || maybe_ne (lowest_th, 0U));
3708 /* Keep track of the known smallest versioning
3709 threshold. */
3710 if (ordered_p (lowest_th, th))
3711 lowest_th = ordered_min (lowest_th, th);
3713 else
3715 delete loop_vinfo;
3716 loop_vinfo = opt_loop_vec_info::success (NULL);
3719 /* For now only allow one epilogue loop, but allow
3720 pick_lowest_cost_p to replace it, so commit to the
3721 first epilogue if we have no reason to try alternatives. */
3722 if (!pick_lowest_cost_p)
3723 break;
3726 if (mode_i == vector_modes.length ())
3727 break;
3731 if (!first_loop_vinfo->epilogue_vinfos.is_empty ())
3733 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3734 if (dump_enabled_p ())
3735 dump_printf_loc (MSG_NOTE, vect_location,
3736 "***** Choosing epilogue vector mode %s\n",
3737 GET_MODE_NAME
3738 (first_loop_vinfo->epilogue_vinfos[0]->vector_mode));
3741 return first_loop_vinfo;
3744 /* Return true if there is an in-order reduction function for CODE, storing
3745 it in *REDUC_FN if so. */
3747 static bool
3748 fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
3750 /* We support MINUS_EXPR by negating the operand. This also preserves an
3751 initial -0.0 since -0.0 - 0.0 (neutral op for MINUS_EXPR) == -0.0 +
3752 (-0.0) = -0.0. */
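/* Illustrative example: a loop like
     for (i = 0; i < n; i++)
       res -= a[i];
   can therefore be handled as an in-order fold-left PLUS of the negated
   operand, i.e. res = res + (-a[i]) in each iteration. */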
3753 if (code == PLUS_EXPR || code == MINUS_EXPR)
3755 *reduc_fn = IFN_FOLD_LEFT_PLUS;
3756 return true;
3758 return false;
3761 /* Function reduction_fn_for_scalar_code
3763 Input:
3764 CODE - tree_code of a reduction operation.
3766 Output:
3767 REDUC_FN - the corresponding internal function to be used to reduce the
3768 vector of partial results into a single scalar result, or IFN_LAST
3769 if the operation is a supported reduction operation, but does not have
3770 such an internal function.
3772 Return FALSE if CODE currently cannot be vectorized as reduction. */
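/* For example (illustrative): PLUS_EXPR maps to IFN_REDUC_PLUS, which sums
   the elements of the vector of partial results into one scalar, while
   MULT_EXPR is a supported reduction without such an internal function and
   therefore yields IFN_LAST. */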
3774 bool
3775 reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
3777 if (code.is_tree_code ())
3778 switch (tree_code (code))
3780 case MAX_EXPR:
3781 *reduc_fn = IFN_REDUC_MAX;
3782 return true;
3784 case MIN_EXPR:
3785 *reduc_fn = IFN_REDUC_MIN;
3786 return true;
3788 case PLUS_EXPR:
3789 *reduc_fn = IFN_REDUC_PLUS;
3790 return true;
3792 case BIT_AND_EXPR:
3793 *reduc_fn = IFN_REDUC_AND;
3794 return true;
3796 case BIT_IOR_EXPR:
3797 *reduc_fn = IFN_REDUC_IOR;
3798 return true;
3800 case BIT_XOR_EXPR:
3801 *reduc_fn = IFN_REDUC_XOR;
3802 return true;
3804 case MULT_EXPR:
3805 case MINUS_EXPR:
3806 *reduc_fn = IFN_LAST;
3807 return true;
3809 default:
3810 return false;
3812 else
3813 switch (combined_fn (code))
3815 CASE_CFN_FMAX:
3816 *reduc_fn = IFN_REDUC_FMAX;
3817 return true;
3819 CASE_CFN_FMIN:
3820 *reduc_fn = IFN_REDUC_FMIN;
3821 return true;
3823 default:
3824 return false;
3828 /* If there is a neutral value X such that a reduction would not be affected
3829 by the introduction of additional X elements, return that X, otherwise
3830 return null. CODE is the code of the reduction and SCALAR_TYPE is the type
3831 of the scalar elements. If the reduction has just a single initial value
3832 then INITIAL_VALUE is that value, otherwise it is null.
3833 If AS_INITIAL is TRUE the value is supposed to be used as initial value.
3834 In that case no signed zero is returned. */
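/* Illustrative examples of the mapping implemented below: PLUS_EXPR uses
   0 (or -0.0 when not used as the initial value and the type honors signed
   zeros), MULT_EXPR uses 1, BIT_AND_EXPR uses all-ones, and MIN_EXPR /
   MAX_EXPR have no neutral element other than the single initial value,
   if there is one. */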
3836 tree
3837 neutral_op_for_reduction (tree scalar_type, code_helper code,
3838 tree initial_value, bool as_initial)
3840 if (code.is_tree_code ())
3841 switch (tree_code (code))
3843 case DOT_PROD_EXPR:
3844 case SAD_EXPR:
3845 case MINUS_EXPR:
3846 case BIT_IOR_EXPR:
3847 case BIT_XOR_EXPR:
3848 return build_zero_cst (scalar_type);
3849 case WIDEN_SUM_EXPR:
3850 case PLUS_EXPR:
3851 if (!as_initial && HONOR_SIGNED_ZEROS (scalar_type))
3852 return build_real (scalar_type, dconstm0);
3853 else
3854 return build_zero_cst (scalar_type);
3856 case MULT_EXPR:
3857 return build_one_cst (scalar_type);
3859 case BIT_AND_EXPR:
3860 return build_all_ones_cst (scalar_type);
3862 case MAX_EXPR:
3863 case MIN_EXPR:
3864 return initial_value;
3866 default:
3867 return NULL_TREE;
3869 else
3870 switch (combined_fn (code))
3872 CASE_CFN_FMIN:
3873 CASE_CFN_FMAX:
3874 return initial_value;
3876 default:
3877 return NULL_TREE;
3881 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3882 STMT is printed with a message MSG. */
3884 static void
3885 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3887 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3890 /* Return true if we need an in-order reduction for operation CODE
3891 on type TYPE. */
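/* For example (illustrative): a float "sum += a[i]" reduction without
   -fassociative-math must be carried out in the original order, whereas
   float MIN/MAX reductions need not be; an integral reduction whose
   operation may trap on overflow likewise requires an in-order
   reduction. */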
3894 bool
3895 needs_fold_left_reduction_p (tree type, code_helper code)
3897 /* CHECKME: check for !flag_finite_math_only too? */
3898 if (SCALAR_FLOAT_TYPE_P (type))
3900 if (code.is_tree_code ())
3901 switch (tree_code (code))
3903 case MIN_EXPR:
3904 case MAX_EXPR:
3905 return false;
3907 default:
3908 return !flag_associative_math;
3910 else
3911 switch (combined_fn (code))
3913 CASE_CFN_FMIN:
3914 CASE_CFN_FMAX:
3915 return false;
3917 default:
3918 return !flag_associative_math;
3922 if (INTEGRAL_TYPE_P (type))
3923 return (!code.is_tree_code ()
3924 || !operation_no_trapping_overflow (type, tree_code (code)));
3926 if (SAT_FIXED_POINT_TYPE_P (type))
3927 return true;
3929 return false;
3932 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG
3933 has a handled computation expression. Store the main reduction
3934 operation in *CODE. */
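/* A sketch of what is detected below (illustrative SSA names): for
     sum_1 = PHI <sum_0(preheader), sum_2(latch)>
     ...
     sum_2 = sum_1 + t_3;
   the walk from the latch value sum_2 back to the PHI result finds a
   single-operation path and records PLUS_EXPR in *CODE; a path mixing
   different operations, e.g. an add feeding a multiply, is rejected. */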
3936 static bool
3937 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3938 tree loop_arg, code_helper *code,
3939 vec<std::pair<ssa_op_iter, use_operand_p> > &path)
3941 auto_bitmap visited;
3942 tree lookfor = PHI_RESULT (phi);
3943 ssa_op_iter curri;
3944 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3945 while (USE_FROM_PTR (curr) != loop_arg)
3946 curr = op_iter_next_use (&curri);
3947 curri.i = curri.numops;
3950 path.safe_push (std::make_pair (curri, curr));
3951 tree use = USE_FROM_PTR (curr);
3952 if (use == lookfor)
3953 break;
3954 gimple *def = SSA_NAME_DEF_STMT (use);
3955 if (gimple_nop_p (def)
3956 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3958 pop:
3961 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3962 curri = x.first;
3963 curr = x.second;
3965 curr = op_iter_next_use (&curri);
3966 /* Skip already visited or non-SSA operands (from iterating
3967 over PHI args). */
3968 while (curr != NULL_USE_OPERAND_P
3969 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3970 || ! bitmap_set_bit (visited,
3971 SSA_NAME_VERSION
3972 (USE_FROM_PTR (curr)))));
3974 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3975 if (curr == NULL_USE_OPERAND_P)
3976 break;
3978 else
3980 if (gimple_code (def) == GIMPLE_PHI)
3981 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3982 else
3983 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3984 while (curr != NULL_USE_OPERAND_P
3985 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3986 || ! bitmap_set_bit (visited,
3987 SSA_NAME_VERSION
3988 (USE_FROM_PTR (curr)))))
3989 curr = op_iter_next_use (&curri);
3990 if (curr == NULL_USE_OPERAND_P)
3991 goto pop;
3994 while (1);
3995 if (dump_file && (dump_flags & TDF_DETAILS))
3997 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3998 unsigned i;
3999 std::pair<ssa_op_iter, use_operand_p> *x;
4000 FOR_EACH_VEC_ELT (path, i, x)
4001 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
4002 dump_printf (MSG_NOTE, "\n");
4005 /* Check whether the reduction path detected is valid. */
4006 bool fail = path.length () == 0;
4007 bool neg = false;
4008 int sign = -1;
4009 *code = ERROR_MARK;
4010 for (unsigned i = 1; i < path.length (); ++i)
4012 gimple *use_stmt = USE_STMT (path[i].second);
4013 gimple_match_op op;
4014 if (!gimple_extract_op (use_stmt, &op))
4016 fail = true;
4017 break;
4019 unsigned int opi = op.num_ops;
4020 if (gassign *assign = dyn_cast<gassign *> (use_stmt))
4022 /* The following makes sure we can compute the operand index
4023 easily; it also mostly disallows chaining via COND_EXPR condition
4024 operands. */
4025 for (opi = 0; opi < op.num_ops; ++opi)
4026 if (gimple_assign_rhs1_ptr (assign) + opi == path[i].second->use)
4027 break;
4029 else if (gcall *call = dyn_cast<gcall *> (use_stmt))
4031 for (opi = 0; opi < op.num_ops; ++opi)
4032 if (gimple_call_arg_ptr (call, opi) == path[i].second->use)
4033 break;
4035 if (opi == op.num_ops)
4037 fail = true;
4038 break;
4040 op.code = canonicalize_code (op.code, op.type);
4041 if (op.code == MINUS_EXPR)
4043 op.code = PLUS_EXPR;
4044 /* Track whether we negate the reduction value each iteration. */
4045 if (op.ops[1] == op.ops[opi])
4046 neg = ! neg;
4048 else if (op.code == IFN_COND_SUB)
4050 op.code = IFN_COND_ADD;
4051 /* Track whether we negate the reduction value each iteration. */
4052 if (op.ops[2] == op.ops[opi])
4053 neg = ! neg;
4055 if (CONVERT_EXPR_CODE_P (op.code)
4056 && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
4058 else if (*code == ERROR_MARK)
4060 *code = op.code;
4061 sign = TYPE_SIGN (op.type);
4063 else if (op.code != *code)
4065 fail = true;
4066 break;
4068 else if ((op.code == MIN_EXPR
4069 || op.code == MAX_EXPR)
4070 && sign != TYPE_SIGN (op.type))
4072 fail = true;
4073 break;
4075 /* Check that the op is used in only a single stmt. For the
4076 non-value-changing tail and the last stmt allow out-of-loop uses.
4077 ??? We could relax this and handle arbitrary live stmts by
4078 forcing a scalar epilogue for example. */
4079 imm_use_iterator imm_iter;
4080 use_operand_p use_p;
4081 gimple *op_use_stmt;
4082 unsigned cnt = 0;
4083 bool cond_fn_p = op.code.is_internal_fn ()
4084 && (conditional_internal_fn_code (internal_fn (op.code))
4085 != ERROR_MARK);
4087 FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi])
4089 /* In case of a COND_OP (mask, op1, op2, op1) reduction we should
4090 have op1 twice (once as definition, once as else) in the same
4091 operation. Enforce this. */
4092 if (cond_fn_p && op_use_stmt == use_stmt)
4094 gcall *call = as_a<gcall *> (use_stmt);
4095 unsigned else_pos
4096 = internal_fn_else_index (internal_fn (op.code));
4097 if (gimple_call_arg (call, else_pos) != op.ops[opi])
4099 fail = true;
4100 break;
4102 for (unsigned int j = 0; j < gimple_call_num_args (call); ++j)
4104 if (j == else_pos)
4105 continue;
4106 if (gimple_call_arg (call, j) == op.ops[opi])
4107 cnt++;
4110 else if (!is_gimple_debug (op_use_stmt)
4111 && (*code != ERROR_MARK
4112 || flow_bb_inside_loop_p (loop,
4113 gimple_bb (op_use_stmt))))
4114 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4115 cnt++;
4118 if (cnt != 1)
4120 fail = true;
4121 break;
4124 return ! fail && ! neg && *code != ERROR_MARK;
4127 bool
4128 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
4129 tree loop_arg, enum tree_code code)
4131 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4132 code_helper code_;
4133 return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
4134 && code_ == code);
4139 /* Function vect_is_simple_reduction
4141 (1) Detect a cross-iteration def-use cycle that represents a simple
4142 reduction computation. We look for the following pattern:
4144 loop_header:
4145 a1 = phi < a0, a2 >
4146 a3 = ...
4147 a2 = operation (a3, a1)
4151 a3 = ...
4152 loop_header:
4153 a1 = phi < a0, a2 >
4154 a2 = operation (a3, a1)
4156 such that:
4157 1. operation is commutative and associative and it is safe to
4158 change the order of the computation
4159 2. no uses for a2 in the loop (a2 is used out of the loop)
4160 3. no uses of a1 in the loop besides the reduction operation
4161 4. no uses of a1 outside the loop.
4163 Conditions 1,4 are tested here.
4164 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
4166 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
4167 nested cycles.
4169 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
4170 reductions:
4172 a1 = phi < a0, a2 >
4173 inner loop (def of a3)
4174 a2 = phi < a3 >
4176 (4) Detect condition expressions, ie:
4177 for (int i = 0; i < N; i++)
4178 if (a[i] < val)
4179 ret_val = a[i];
4183 static stmt_vec_info
4184 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
4185 bool *double_reduc, bool *reduc_chain_p, bool slp)
4187 gphi *phi = as_a <gphi *> (phi_info->stmt);
4188 gimple *phi_use_stmt = NULL;
4189 imm_use_iterator imm_iter;
4190 use_operand_p use_p;
4192 *double_reduc = false;
4193 *reduc_chain_p = false;
4194 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
4196 tree phi_name = PHI_RESULT (phi);
4197 /* ??? If there are no uses of the PHI result the inner loop reduction
4198 won't be detected as possibly double-reduction by vectorizable_reduction
4199 because that tries to walk the PHI arg from the preheader edge which
4200 can be constant. See PR60382. */
4201 if (has_zero_uses (phi_name))
4202 return NULL;
4203 class loop *loop = (gimple_bb (phi))->loop_father;
4204 unsigned nphi_def_loop_uses = 0;
4205 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
4207 gimple *use_stmt = USE_STMT (use_p);
4208 if (is_gimple_debug (use_stmt))
4209 continue;
4211 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4213 if (dump_enabled_p ())
4214 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4215 "intermediate value used outside loop.\n");
4217 return NULL;
4220 /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
4221 op1 twice (once as definition, once as else) in the same operation.
4222 Only count it as one. */
4223 if (use_stmt != phi_use_stmt)
4225 nphi_def_loop_uses++;
4226 phi_use_stmt = use_stmt;
4230 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
4231 if (TREE_CODE (latch_def) != SSA_NAME)
4233 if (dump_enabled_p ())
4234 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4235 "reduction: not ssa_name: %T\n", latch_def);
4236 return NULL;
4239 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
4240 if (!def_stmt_info
4241 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
4242 return NULL;
4244 bool nested_in_vect_loop
4245 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
4246 unsigned nlatch_def_loop_uses = 0;
4247 auto_vec<gphi *, 3> lcphis;
4248 bool inner_loop_of_double_reduc = false;
4249 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
4251 gimple *use_stmt = USE_STMT (use_p);
4252 if (is_gimple_debug (use_stmt))
4253 continue;
4254 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4255 nlatch_def_loop_uses++;
4256 else
4258 /* We can have more than one loop-closed PHI. */
4259 lcphis.safe_push (as_a <gphi *> (use_stmt));
4260 if (nested_in_vect_loop
4261 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
4262 == vect_double_reduction_def))
4263 inner_loop_of_double_reduc = true;
4267 /* If we are vectorizing an inner reduction, it is executed in the
4268 original order only when we are not dealing with a
4269 double reduction. */
4270 if (nested_in_vect_loop && !inner_loop_of_double_reduc)
4272 if (dump_enabled_p ())
4273 report_vect_op (MSG_NOTE, def_stmt_info->stmt,
4274 "detected nested cycle: ");
4275 return def_stmt_info;
4278 /* When the inner loop of a double reduction ends up with more than
4279 one loop-closed PHI we have failed to classify alternate such
4280 PHIs as double reduction, leading to wrong code. See PR103237. */
4281 if (inner_loop_of_double_reduc && lcphis.length () != 1)
4283 if (dump_enabled_p ())
4284 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4285 "unhandle double reduction\n");
4286 return NULL;
4289 /* If this isn't a nested cycle or if the nested cycle reduction value
4290 is used outside of the inner loop we cannot handle uses of the reduction
4291 value. */
4292 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
4294 if (dump_enabled_p ())
4295 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4296 "reduction used in loop.\n");
4297 return NULL;
4300 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
4301 defined in the inner loop. */
4302 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
4304 tree op1 = PHI_ARG_DEF (def_stmt, 0);
4305 if (gimple_phi_num_args (def_stmt) != 1
4306 || TREE_CODE (op1) != SSA_NAME)
4308 if (dump_enabled_p ())
4309 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4310 "unsupported phi node definition.\n");
4312 return NULL;
4315 /* Verify there is an inner cycle composed of the PHI phi_use_stmt
4316 and the latch definition op1. */
4317 gimple *def1 = SSA_NAME_DEF_STMT (op1);
4318 if (gimple_bb (def1)
4319 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4320 && loop->inner
4321 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
4322 && (is_gimple_assign (def1) || is_gimple_call (def1))
4323 && is_a <gphi *> (phi_use_stmt)
4324 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt))
4325 && (op1 == PHI_ARG_DEF_FROM_EDGE (phi_use_stmt,
4326 loop_latch_edge (loop->inner)))
4327 && lcphis.length () == 1)
4329 if (dump_enabled_p ())
4330 report_vect_op (MSG_NOTE, def_stmt,
4331 "detected double reduction: ");
4333 *double_reduc = true;
4334 return def_stmt_info;
4337 return NULL;
4340 /* Look for the expression computing latch_def from the loop PHI result. */
4341 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4342 code_helper code;
4343 if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
4344 path))
4346 STMT_VINFO_REDUC_CODE (phi_info) = code;
4347 if (code == COND_EXPR && !nested_in_vect_loop)
4348 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
4350 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
4351 reduction chain for which the additional restriction is that
4352 all operations in the chain are the same. */
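/* Illustrative source form of such a chain (hypothetical arrays a, b, c):
     for (i = 0; i < n; i++) {
       sum += a[i];
       sum += b[i];
       sum += c[i];
     }
   The three adds are linked below via REDUC_GROUP_FIRST_ELEMENT /
   REDUC_GROUP_NEXT_ELEMENT with a group size of three. */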
4353 auto_vec<stmt_vec_info, 8> reduc_chain;
4354 unsigned i;
4355 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
4356 for (i = path.length () - 1; i >= 1; --i)
4358 gimple *stmt = USE_STMT (path[i].second);
4359 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
4360 gimple_match_op op;
4361 if (!gimple_extract_op (stmt, &op))
4362 gcc_unreachable ();
4363 if (gassign *assign = dyn_cast<gassign *> (stmt))
4364 STMT_VINFO_REDUC_IDX (stmt_info)
4365 = path[i].second->use - gimple_assign_rhs1_ptr (assign);
4366 else
4368 gcall *call = as_a<gcall *> (stmt);
4369 STMT_VINFO_REDUC_IDX (stmt_info)
4370 = path[i].second->use - gimple_call_arg_ptr (call, 0);
4372 bool leading_conversion = (CONVERT_EXPR_CODE_P (op.code)
4373 && (i == 1 || i == path.length () - 1));
4374 if ((op.code != code && !leading_conversion)
4375 /* We can only handle the final value in epilogue
4376 generation for reduction chains. */
4377 || (i != 1 && !has_single_use (gimple_get_lhs (stmt))))
4378 is_slp_reduc = false;
4379 /* For reduction chains we support trailing/leading
4380 conversions. We do not store those in the actual chain. */
4381 if (leading_conversion)
4382 continue;
4383 reduc_chain.safe_push (stmt_info);
4385 if (slp && is_slp_reduc && reduc_chain.length () > 1)
4387 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
4389 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
4390 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
4392 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
4393 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
4395 /* Save the chain for further analysis in SLP detection. */
4396 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
4397 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
4399 *reduc_chain_p = true;
4400 if (dump_enabled_p ())
4401 dump_printf_loc (MSG_NOTE, vect_location,
4402 "reduction: detected reduction chain\n");
4404 else if (dump_enabled_p ())
4405 dump_printf_loc (MSG_NOTE, vect_location,
4406 "reduction: detected reduction\n");
4408 return def_stmt_info;
4411 if (dump_enabled_p ())
4412 dump_printf_loc (MSG_NOTE, vect_location,
4413 "reduction: unknown pattern\n");
4415 return NULL;
4418 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
4419 PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
4420 or -1 if not known. */
4422 static int
4423 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
4425 int assumed_vf = vect_vf_for_cost (loop_vinfo);
4426 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
4428 if (dump_enabled_p ())
4429 dump_printf_loc (MSG_NOTE, vect_location,
4430 "cost model: epilogue peel iters set to vf/2 "
4431 "because loop iterations are unknown .\n");
4432 return assumed_vf / 2;
4434 else
4436 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
4437 peel_iters_prologue = MIN (niters, peel_iters_prologue);
4438 int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
4439 /* If we need to peel for gaps but the epilogue would otherwise need no
4440 iterations, we have to peel VF iterations. */
4441 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
4442 peel_iters_epilogue = assumed_vf;
4443 return peel_iters_epilogue;
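/* Worked example (hypothetical values): with LOOP_VINFO_INT_NITERS == 100,
   an assumed VF of 8 and peel_iters_prologue == 3, the epilogue peels
   (100 - 3) % 8 == 1 iteration; if that remainder were 0 and
   LOOP_VINFO_PEELING_FOR_GAPS is set, a full VF of 8 iterations is
   peeled instead. */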
4447 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
4449 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
4450 int *peel_iters_epilogue,
4451 stmt_vector_for_cost *scalar_cost_vec,
4452 stmt_vector_for_cost *prologue_cost_vec,
4453 stmt_vector_for_cost *epilogue_cost_vec)
4455 int retval = 0;
4457 *peel_iters_epilogue
4458 = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
4460 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
4462 /* If peeled iterations are known but the number of scalar loop
4463 iterations is unknown, count a taken branch per peeled loop. */
4464 if (peel_iters_prologue > 0)
4465 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
4466 vect_prologue);
4467 if (*peel_iters_epilogue > 0)
4468 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
4469 vect_epilogue);
4472 stmt_info_for_cost *si;
4473 int j;
4474 if (peel_iters_prologue)
4475 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4476 retval += record_stmt_cost (prologue_cost_vec,
4477 si->count * peel_iters_prologue,
4478 si->kind, si->stmt_info, si->misalign,
4479 vect_prologue);
4480 if (*peel_iters_epilogue)
4481 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4482 retval += record_stmt_cost (epilogue_cost_vec,
4483 si->count * *peel_iters_epilogue,
4484 si->kind, si->stmt_info, si->misalign,
4485 vect_epilogue);
4487 return retval;
4490 /* Function vect_estimate_min_profitable_iters
4492 Return the number of iterations required for the vector version of the
4493 loop to be profitable relative to the cost of the scalar version of the
4494 loop.
4496 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
4497 of iterations for vectorization. -1 value means loop vectorization
4498 is not profitable. This returned value may be used for dynamic
4499 profitability check.
4501 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
4502 for static check against estimated number of iterations. */
4504 static void
4505 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
4506 int *ret_min_profitable_niters,
4507 int *ret_min_profitable_estimate,
4508 unsigned *suggested_unroll_factor)
4510 int min_profitable_iters;
4511 int min_profitable_estimate;
4512 int peel_iters_prologue;
4513 int peel_iters_epilogue;
4514 unsigned vec_inside_cost = 0;
4515 int vec_outside_cost = 0;
4516 unsigned vec_prologue_cost = 0;
4517 unsigned vec_epilogue_cost = 0;
4518 int scalar_single_iter_cost = 0;
4519 int scalar_outside_cost = 0;
4520 int assumed_vf = vect_vf_for_cost (loop_vinfo);
4521 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
4522 vector_costs *target_cost_data = loop_vinfo->vector_costs;
4524 /* Cost model disabled. */
4525 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
4527 if (dump_enabled_p ())
4528 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
4529 *ret_min_profitable_niters = 0;
4530 *ret_min_profitable_estimate = 0;
4531 return;
4534 /* Requires loop versioning tests to handle misalignment. */
4535 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
4537 /* FIXME: Make cost depend on complexity of individual check. */
4538 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
4539 (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4540 if (dump_enabled_p ())
4541 dump_printf (MSG_NOTE,
4542 "cost model: Adding cost of checks for loop "
4543 "versioning to treat misalignment.\n");
4546 /* Requires loop versioning with alias checks. */
4547 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
4549 /* FIXME: Make cost depend on complexity of individual check. */
4550 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
4551 (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4552 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
4553 if (len)
4554 /* Count LEN - 1 ANDs and LEN comparisons. */
4555 (void) add_stmt_cost (target_cost_data, len * 2 - 1,
4556 scalar_stmt, vect_prologue);
4557 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
4558 if (len)
4560 /* Count LEN - 1 ANDs and LEN comparisons. */
4561 unsigned int nstmts = len * 2 - 1;
4562 /* +1 for each bias that needs adding. */
4563 for (unsigned int i = 0; i < len; ++i)
4564 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
4565 nstmts += 1;
4566 (void) add_stmt_cost (target_cost_data, nstmts,
4567 scalar_stmt, vect_prologue);
4569 if (dump_enabled_p ())
4570 dump_printf (MSG_NOTE,
4571 "cost model: Adding cost of checks for loop "
4572 "versioning aliasing.\n");
4575 /* Requires loop versioning with niter checks. */
4576 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
4578 /* FIXME: Make cost depend on complexity of individual check. */
4579 (void) add_stmt_cost (target_cost_data, 1, vector_stmt,
4580 NULL, NULL, NULL_TREE, 0, vect_prologue);
4581 if (dump_enabled_p ())
4582 dump_printf (MSG_NOTE,
4583 "cost model: Adding cost of checks for loop "
4584 "versioning niters.\n");
4587 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4588 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4589 vect_prologue);
4591 /* Count statements in scalar loop. Using this as scalar cost for a single
4592 iteration for now.
4594 TODO: Add outer loop support.
4596 TODO: Consider assigning different costs to different scalar
4597 statements. */
4599 scalar_single_iter_cost = loop_vinfo->scalar_costs->total_cost ();
4601 /* Add additional cost for the peeled instructions in prologue and epilogue
4602 loop. (For fully-masked loops there will be no peeling.)
4604 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
4605 at compile-time - we assume it's vf/2 (the worst would be vf-1).
4607 TODO: Build an expression that represents peel_iters for prologue and
4608 epilogue to be used in a run-time test. */
4610 bool prologue_need_br_taken_cost = false;
4611 bool prologue_need_br_not_taken_cost = false;
4613 /* Calculate peel_iters_prologue. */
4614 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
4615 peel_iters_prologue = 0;
4616 else if (npeel < 0)
4618 peel_iters_prologue = assumed_vf / 2;
4619 if (dump_enabled_p ())
4620 dump_printf (MSG_NOTE, "cost model: "
4621 "prologue peel iters set to vf/2.\n");
4623 /* If peeled iterations are unknown, count a taken branch and a not taken
4624 branch per peeled loop. Even if scalar loop iterations are known,
4625 vector iterations are not known since peeled prologue iterations are
4626 not known. Hence guards remain the same. */
4627 prologue_need_br_taken_cost = true;
4628 prologue_need_br_not_taken_cost = true;
4630 else
4632 peel_iters_prologue = npeel;
4633 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
4634 /* If peeled iterations are known but the number of scalar loop
4635 iterations is unknown, count a taken branch per peeled loop. */
4636 prologue_need_br_taken_cost = true;
4639 bool epilogue_need_br_taken_cost = false;
4640 bool epilogue_need_br_not_taken_cost = false;
4642 /* Calculate peel_iters_epilogue. */
4643 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4644 /* We need to peel exactly one iteration for gaps. */
4645 peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
4646 else if (npeel < 0)
4648 /* If peeling for alignment is unknown, the loop bound of the main loop
4649 becomes unknown. */
4650 peel_iters_epilogue = assumed_vf / 2;
4651 if (dump_enabled_p ())
4652 dump_printf (MSG_NOTE, "cost model: "
4653 "epilogue peel iters set to vf/2 because "
4654 "peeling for alignment is unknown.\n");
4656 /* See the same reason above in peel_iters_prologue calculation. */
4657 epilogue_need_br_taken_cost = true;
4658 epilogue_need_br_not_taken_cost = true;
4660 else
4662 peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
4663 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
4664 /* If peeled iterations are known but the number of scalar loop
4665 iterations is unknown, count a taken branch per peeled loop. */
4666 epilogue_need_br_taken_cost = true;
4669 stmt_info_for_cost *si;
4670 int j;
4671 /* Add costs associated with peel_iters_prologue. */
4672 if (peel_iters_prologue)
4673 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4675 (void) add_stmt_cost (target_cost_data,
4676 si->count * peel_iters_prologue, si->kind,
4677 si->stmt_info, si->node, si->vectype,
4678 si->misalign, vect_prologue);
4681 /* Add costs associated with peel_iters_epilogue. */
4682 if (peel_iters_epilogue)
4683 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4685 (void) add_stmt_cost (target_cost_data,
4686 si->count * peel_iters_epilogue, si->kind,
4687 si->stmt_info, si->node, si->vectype,
4688 si->misalign, vect_epilogue);
4691 /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
4693 if (prologue_need_br_taken_cost)
4694 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4695 vect_prologue);
4697 if (prologue_need_br_not_taken_cost)
4698 (void) add_stmt_cost (target_cost_data, 1,
4699 cond_branch_not_taken, vect_prologue);
4701 if (epilogue_need_br_taken_cost)
4702 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4703 vect_epilogue);
4705 if (epilogue_need_br_not_taken_cost)
4706 (void) add_stmt_cost (target_cost_data, 1,
4707 cond_branch_not_taken, vect_epilogue);
4709 /* Take care of special costs for rgroup controls of partial vectors. */
4710 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4711 && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4712 == vect_partial_vectors_avx512))
4714 /* Calculate how many masks we need to generate. */
4715 unsigned int num_masks = 0;
4716 bool need_saturation = false;
4717 for (auto rgm : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
4718 if (rgm.type)
4720 unsigned nvectors = rgm.factor;
4721 num_masks += nvectors;
4722 if (TYPE_PRECISION (TREE_TYPE (rgm.compare_type))
4723 < TYPE_PRECISION (LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo)))
4724 need_saturation = true;
4727 /* ??? The target isn't able to identify the costs below as
4728 producing masks so it cannot penalize cases where we'd run
4729 out of mask registers for example. */
4731 /* ??? We are also failing to account for smaller vector masks
4732 we generate by splitting larger masks in vect_get_loop_mask. */
4734 /* In the worst case, we need to generate each mask in the prologue
4735 and in the loop body. We need one splat per group and one
4736 compare per mask.
4738 Sometimes the prologue mask will fold to a constant,
4739 so the actual prologue cost might be smaller. However, it's
4740 simpler and safer to use the worst-case cost; if this ends up
4741 being the tie-breaker between vectorizing or not, then it's
4742 probably better not to vectorize. */
4743 (void) add_stmt_cost (target_cost_data,
4744 num_masks
4745 + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4746 vector_stmt, NULL, NULL, NULL_TREE, 0,
4747 vect_prologue);
4748 (void) add_stmt_cost (target_cost_data,
4749 num_masks
4750 + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4751 vector_stmt, NULL, NULL, NULL_TREE, 0, vect_body);
4753 /* When we need saturation we need it both in the prologue and
4754 the epilogue. */
4755 if (need_saturation)
4757 (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4758 NULL, NULL, NULL_TREE, 0, vect_prologue);
4759 (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4760 NULL, NULL, NULL_TREE, 0, vect_body);
4763 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4764 && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4765 == vect_partial_vectors_while_ult))
4767 /* Calculate how many masks we need to generate. */
4768 unsigned int num_masks = 0;
4769 rgroup_controls *rgm;
4770 unsigned int num_vectors_m1;
4771 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec,
4772 num_vectors_m1, rgm)
4773 if (rgm->type)
4774 num_masks += num_vectors_m1 + 1;
4775 gcc_assert (num_masks > 0);
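/* Example count (hypothetical rgroups): as the index name num_vectors_m1
   suggests, the rgroup at index i controls i + 1 vectors, so rgroups in
   use at indices 0 and 1 give num_masks = 1 + 2 = 3; below this costs
   three mask-producing statements in the prologue and num_masks - 1 = 2
   in the body. */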
4777 /* In the worst case, we need to generate each mask in the prologue
4778 and in the loop body. One of the loop body mask instructions
4779 replaces the comparison in the scalar loop, and since we don't
4780 count the scalar comparison against the scalar body, we shouldn't
4781 count that vector instruction against the vector body either.
4783 Sometimes we can use unpacks instead of generating prologue
4784 masks and sometimes the prologue mask will fold to a constant,
4785 so the actual prologue cost might be smaller. However, it's
4786 simpler and safer to use the worst-case cost; if this ends up
4787 being the tie-breaker between vectorizing or not, then it's
4788 probably better not to vectorize. */
4789 (void) add_stmt_cost (target_cost_data, num_masks,
4790 vector_stmt, NULL, NULL, NULL_TREE, 0,
4791 vect_prologue);
4792 (void) add_stmt_cost (target_cost_data, num_masks - 1,
4793 vector_stmt, NULL, NULL, NULL_TREE, 0,
4794 vect_body);
4796 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4798 /* Referring to the functions vect_set_loop_condition_partial_vectors
4799 and vect_set_loop_controls_directly, we need to generate each
4800 length in the prologue and in the loop body if required. Although
4801 there are some possible optimizations, we consider the worst case
4802 here. */
4804 bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4805 signed char partial_load_store_bias
4806 = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
4807 bool need_iterate_p
4808 = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4809 && !vect_known_niters_smaller_than_vf (loop_vinfo));
4811 /* Calculate how many statements to be added. */
4812 unsigned int prologue_stmts = 0;
4813 unsigned int body_stmts = 0;
4815 rgroup_controls *rgc;
4816 unsigned int num_vectors_m1;
4817 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4818 if (rgc->type)
4820 /* May need one SHIFT for nitems_total computation. */
4821 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4822 if (nitems != 1 && !niters_known_p)
4823 prologue_stmts += 1;
4825 /* May need one MAX and one MINUS for wrap around. */
4826 if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4827 prologue_stmts += 2;
4829 /* Need one MAX and one MINUS for each batch limit except for
4830 the first one. */
4831 prologue_stmts += num_vectors_m1 * 2;
4833 unsigned int num_vectors = num_vectors_m1 + 1;
4835 /* Need to set up lengths in the prologue; only one MIN is required
4836 for each since the start index is zero. */
4837 prologue_stmts += num_vectors;
4839 /* If we have a non-zero partial load bias, we need one PLUS
4840 to adjust the load length. */
4841 if (partial_load_store_bias != 0)
4842 body_stmts += 1;
4844 unsigned int length_update_cost = 0;
4845 if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
4846 /* For the decrement IV style, each needs only a single SELECT_VL
4847 or MIN at the start to calculate the number of elements
4848 to be processed in the current iteration. */
4849 length_update_cost = 1;
4850 else
4851 /* For the increment IV style, each may need two MINs and one MINUS to
4852 update the lengths in the body for the next iteration. */
4853 length_update_cost = 3;
4855 if (need_iterate_p)
4856 body_stmts += length_update_cost * num_vectors;
4859 (void) add_stmt_cost (target_cost_data, prologue_stmts,
4860 scalar_stmt, vect_prologue);
4861 (void) add_stmt_cost (target_cost_data, body_stmts,
4862 scalar_stmt, vect_body);
4865 /* FORNOW: The scalar outside cost is incremented in one of the
4866 following ways:
4868 1. The vectorizer checks for alignment and aliasing and generates
4869 a condition that allows dynamic vectorization. A cost model
4870 check is ANDED with the versioning condition. Hence scalar code
4871 path now has the added cost of the versioning check.
4873 if (cost > th & versioning_check)
4874 jmp to vector code
4876 Hence run-time scalar is incremented by not-taken branch cost.
4878 2. The vectorizer then checks if a prologue is required. If the
4879 cost model check was not done before during versioning, it has to
4880 be done before the prologue check.
4882 if (cost <= th)
4883 prologue = scalar_iters
4884 if (prologue == 0)
4885 jmp to vector code
4886 else
4887 execute prologue
4888 if (prologue == num_iters)
4889 go to exit
4891 Hence the run-time scalar cost is incremented by a taken branch,
4892 plus a not-taken branch, plus a taken branch cost.
4894 3. The vectorizer then checks if an epilogue is required. If the
4895 cost model check was not done before during prologue check, it
4896 has to be done with the epilogue check.
4898 if (prologue == 0)
4899 jmp to vector code
4900 else
4901 execute prologue
4902 if (prologue == num_iters)
4903 go to exit
4904 vector code:
4905 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4906 jmp to epilogue
4908 Hence the run-time scalar cost should be incremented by 2 taken
4909 branches.
4911 TODO: The back end may reorder the BBS's differently and reverse
4912 conditions/branch directions. Change the estimates below to
4913 something more reasonable. */
4915 /* If the number of iterations is known and we do not do versioning, we can
4916 decide whether to vectorize at compile time. Hence the scalar version
4917 does not carry cost model guard costs. */
4918 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4919 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4921 /* Cost model check occurs at versioning. */
4922 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4923 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4924 else
4926 /* Cost model check occurs at prologue generation. */
4927 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4928 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4929 + vect_get_stmt_cost (cond_branch_not_taken);
4930 /* Cost model check occurs at epilogue generation. */
4931 else
4932 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
4936 /* Complete the target-specific cost calculations. */
4937 finish_cost (loop_vinfo->vector_costs, loop_vinfo->scalar_costs,
4938 &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost,
4939 suggested_unroll_factor);
4941 if (suggested_unroll_factor && *suggested_unroll_factor > 1
4942 && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR
4943 && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo) *
4944 *suggested_unroll_factor,
4945 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)))
4947 if (dump_enabled_p ())
4948 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4949 "can't unroll as unrolled vectorization factor larger"
4950 " than maximum vectorization factor: "
4951 HOST_WIDE_INT_PRINT_UNSIGNED "\n",
4952 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
4953 *suggested_unroll_factor = 1;
4956 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
4958 if (dump_enabled_p ())
4960 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4961 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
4962 vec_inside_cost);
4963 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
4964 vec_prologue_cost);
4965 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
4966 vec_epilogue_cost);
4967 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
4968 scalar_single_iter_cost);
4969 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
4970 scalar_outside_cost);
4971 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
4972 vec_outside_cost);
4973 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
4974 peel_iters_prologue);
4975 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
4976 peel_iters_epilogue);
4979 /* Calculate number of iterations required to make the vector version
4980 profitable, relative to the loop bodies only. The following condition
4981 must hold true:
4982 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
4983 where
4984 SIC = scalar iteration cost, VIC = vector iteration cost,
4985 VOC = vector outside cost, VF = vectorization factor,
4986 NPEEL = prologue iterations + epilogue iterations,
4987 SOC = scalar outside cost for run time cost model check. */
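/* Worked example (hypothetical costs, treating the division as exact):
   with SIC = 4, VIC = 6, VF = 4, NPEEL = 2, VOC = 20 and SOC = 6 the
   condition 4 * niters + 6 > 1.5 * (niters - 2) + 20 holds for
   niters > 4.4, i.e. roughly five scalar iterations are needed before
   the vector version wins. */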
4989 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
4990 - vec_inside_cost);
4991 if (saving_per_viter <= 0)
4993 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4994 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
4995 "vectorization did not happen for a simd loop");
4997 if (dump_enabled_p ())
4998 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4999 "cost model: the vector iteration cost = %d "
5000 "divided by the scalar iteration cost = %d "
5001 "is greater or equal to the vectorization factor = %d"
5002 ".\n",
5003 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
5004 *ret_min_profitable_niters = -1;
5005 *ret_min_profitable_estimate = -1;
5006 return;
5009 /* ??? The "if" arm is written to handle all cases; see below for what
5010 we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
5011 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5013 /* Rewriting the condition above in terms of the number of
5014 vector iterations (vniters) rather than the number of
5015 scalar iterations (niters) gives:
5017 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
5019 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
5021 For integer N, X and Y when X > 0:
5023 N * X > Y <==> N >= (Y /[floor] X) + 1. */
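/* E.g. (illustrative): with X = 10 (saving per vector iteration) and
   Y = 35 (outside overhead), N >= 35 / 10 + 1 = 4; indeed 4 * 10 > 35
   while 3 * 10 is not. */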
5024 int outside_overhead = (vec_outside_cost
5025 - scalar_single_iter_cost * peel_iters_prologue
5026 - scalar_single_iter_cost * peel_iters_epilogue
5027 - scalar_outside_cost);
5028 /* We're only interested in cases that require at least one
5029 vector iteration. */
5030 int min_vec_niters = 1;
5031 if (outside_overhead > 0)
5032 min_vec_niters = outside_overhead / saving_per_viter + 1;
5034 if (dump_enabled_p ())
5035 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
5036 min_vec_niters);
5038 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5040 /* Now that we know the minimum number of vector iterations,
5041 find the minimum niters for which the scalar cost is larger:
5043 SIC * niters > VIC * vniters + VOC - SOC
5045 We know that the minimum niters is no more than
5046 vniters * VF + NPEEL, but it might be (and often is) less
5047 than that if a partial vector iteration is cheaper than the
5048 equivalent scalar code. */
5049 int threshold = (vec_inside_cost * min_vec_niters
5050 + vec_outside_cost
5051 - scalar_outside_cost);
5052 if (threshold <= 0)
5053 min_profitable_iters = 1;
5054 else
5055 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
5057 else
5058 /* Convert the number of vector iterations into a number of
5059 scalar iterations. */
5060 min_profitable_iters = (min_vec_niters * assumed_vf
5061 + peel_iters_prologue
5062 + peel_iters_epilogue);
5064 else
5066 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
5067 * assumed_vf
5068 - vec_inside_cost * peel_iters_prologue
5069 - vec_inside_cost * peel_iters_epilogue);
5070 if (min_profitable_iters <= 0)
5071 min_profitable_iters = 0;
5072 else
5074 min_profitable_iters /= saving_per_viter;
5076 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
5077 <= (((int) vec_inside_cost * min_profitable_iters)
5078 + (((int) vec_outside_cost - scalar_outside_cost)
5079 * assumed_vf)))
5080 min_profitable_iters++;
5084 if (dump_enabled_p ())
5085 dump_printf (MSG_NOTE,
5086 " Calculated minimum iters for profitability: %d\n",
5087 min_profitable_iters);
5089 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
5090 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
5091 /* We want the vectorized loop to execute at least once. */
5092 min_profitable_iters = assumed_vf + peel_iters_prologue;
5093 else if (min_profitable_iters < peel_iters_prologue)
5094 /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
5095 vectorized loop executes at least once. */
5096 min_profitable_iters = peel_iters_prologue;
5098 if (dump_enabled_p ())
5099 dump_printf_loc (MSG_NOTE, vect_location,
5100 " Runtime profitability threshold = %d\n",
5101 min_profitable_iters);
5103 *ret_min_profitable_niters = min_profitable_iters;
5105 /* Calculate number of iterations required to make the vector version
5106 profitable, relative to the loop bodies only.
5108 The non-vectorized variant costs SIC * niters and it must beat the
5109 vector variant at the expected loop trip count. The following condition must hold true:
5110 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
5112 if (vec_outside_cost <= 0)
5113 min_profitable_estimate = 0;
5114 /* ??? This "else if" arm is written to handle all cases; see below for
5115 what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
5116 else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5118 /* This is a repeat of the code above, but with + SOC rather
5119 than - SOC. */
5120 int outside_overhead = (vec_outside_cost
5121 - scalar_single_iter_cost * peel_iters_prologue
5122 - scalar_single_iter_cost * peel_iters_epilogue
5123 + scalar_outside_cost);
5124 int min_vec_niters = 1;
5125 if (outside_overhead > 0)
5126 min_vec_niters = outside_overhead / saving_per_viter + 1;
5128 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5130 int threshold = (vec_inside_cost * min_vec_niters
5131 + vec_outside_cost
5132 + scalar_outside_cost);
5133 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
5135 else
5136 min_profitable_estimate = (min_vec_niters * assumed_vf
5137 + peel_iters_prologue
5138 + peel_iters_epilogue);
5140 else
5142 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
5143 * assumed_vf
5144 - vec_inside_cost * peel_iters_prologue
5145 - vec_inside_cost * peel_iters_epilogue)
5146 / ((scalar_single_iter_cost * assumed_vf)
5147 - vec_inside_cost);
5149 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
5150 if (dump_enabled_p ())
5151 dump_printf_loc (MSG_NOTE, vect_location,
5152 " Static estimate profitability threshold = %d\n",
5153 min_profitable_estimate);
5155 *ret_min_profitable_estimate = min_profitable_estimate;
5158 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
5159 vector elements (not bits) for a vector with NELT elements. */
5160 static void
5161 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
5162 vec_perm_builder *sel)
5164 /* The encoding is a single stepped pattern. Any wrap-around is handled
5165 by vec_perm_indices. */
5166 sel->new_vector (nelt, 1, 3);
5167 for (unsigned int i = 0; i < 3; i++)
5168 sel->quick_push (i + offset);
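/* Editorial illustration: for OFFSET = 2 and NELT = 8 the encoded
series is 2, 3, 4, which vec_perm_indices extends to 2, 3, ..., 9.
In the shift-reduction use below the second VEC_PERM_EXPR operand is
a zero vector, so the indices >= 8 select zeros and the permute acts
as a whole-vector shift by two elements.  */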
5171 /* Checks whether the target supports whole-vector shifts for vectors of mode
5172 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
5173 it supports vec_perm_const with masks for all necessary shift amounts. */
5174 static bool
5175 have_whole_vector_shift (machine_mode mode)
5177 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
5178 return true;
5180 /* Variable-length vectors should be handled via the optab. */
5181 unsigned int nelt;
5182 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5183 return false;
5185 vec_perm_builder sel;
5186 vec_perm_indices indices;
5187 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
5189 calc_vec_perm_mask_for_shift (i, nelt, &sel);
5190 indices.new_vector (sel, 2, nelt);
5191 if (!can_vec_perm_const_p (mode, mode, indices, false))
5192 return false;
5194 return true;
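/* Editorial note: e.g. for an 8-element vector without vec_shr support
this asks for constant permutes implementing shifts by 4, 2 and 1
elements, exactly the shift amounts the epilogue reduction uses.  */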
5197 /* Return true if (a) STMT_INFO is a DOT_PROD_EXPR reduction whose
5198 multiplication operands have differing signs and (b) we intend
5199 to emulate the operation using a series of signed DOT_PROD_EXPRs.
5200 See vect_emulate_mixed_dot_prod for the actual sequence used. */
5202 static bool
5203 vect_is_emulated_mixed_dot_prod (stmt_vec_info stmt_info)
5205 gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
5206 if (!assign || gimple_assign_rhs_code (assign) != DOT_PROD_EXPR)
5207 return false;
5209 tree rhs1 = gimple_assign_rhs1 (assign);
5210 tree rhs2 = gimple_assign_rhs2 (assign);
5211 if (TYPE_SIGN (TREE_TYPE (rhs1)) == TYPE_SIGN (TREE_TYPE (rhs2)))
5212 return false;
5214 gcc_assert (STMT_VINFO_REDUC_VECTYPE_IN (stmt_info));
5215 return !directly_supported_p (DOT_PROD_EXPR,
5216 STMT_VINFO_REDUC_VECTYPE_IN (stmt_info),
5217 optab_vector_mixed_sign);
5220 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
5221 functions. Design better to avoid maintenance issues. */
5223 /* Function vect_model_reduction_cost.
5225 Models cost for a reduction operation, including the vector ops
5226 generated within the strip-mine loop in some cases, the initial
5227 definition before the loop, and the epilogue code that must be generated. */
5229 static void
5230 vect_model_reduction_cost (loop_vec_info loop_vinfo,
5231 stmt_vec_info stmt_info, internal_fn reduc_fn,
5232 vect_reduction_type reduction_type,
5233 int ncopies, stmt_vector_for_cost *cost_vec)
5235 int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
5236 tree vectype;
5237 machine_mode mode;
5238 class loop *loop = NULL;
5240 if (loop_vinfo)
5241 loop = LOOP_VINFO_LOOP (loop_vinfo);
5243 /* Condition reductions generate two reductions in the loop. */
5244 if (reduction_type == COND_REDUCTION)
5245 ncopies *= 2;
5247 vectype = STMT_VINFO_VECTYPE (stmt_info);
5248 mode = TYPE_MODE (vectype);
5249 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5251 gimple_match_op op;
5252 if (!gimple_extract_op (orig_stmt_info->stmt, &op))
5253 gcc_unreachable ();
5255 if (reduction_type == EXTRACT_LAST_REDUCTION)
5256 /* No extra instructions are needed in the prologue. The loop body
5257 operations are costed in vectorizable_condition. */
5258 inside_cost = 0;
5259 else if (reduction_type == FOLD_LEFT_REDUCTION)
5261 /* No extra instructions needed in the prologue. */
5262 prologue_cost = 0;
5264 if (reduc_fn != IFN_LAST)
5265 /* Count one reduction-like operation per vector. */
5266 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
5267 stmt_info, 0, vect_body);
5268 else
5270 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
5271 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
5272 inside_cost = record_stmt_cost (cost_vec, nelements,
5273 vec_to_scalar, stmt_info, 0,
5274 vect_body);
5275 inside_cost += record_stmt_cost (cost_vec, nelements,
5276 scalar_stmt, stmt_info, 0,
5277 vect_body);
5280 else
5282 /* Add in the cost of the initial definitions. */
5283 int prologue_stmts;
5284 if (reduction_type == COND_REDUCTION)
5285 /* For cond reductions we have four vectors: initial index, step,
5286 initial result of the data reduction, initial value of the index
5287 reduction. */
5288 prologue_stmts = 4;
5289 else
5290 /* We need the initial reduction value. */
5291 prologue_stmts = 1;
5292 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
5293 scalar_to_vec, stmt_info, 0,
5294 vect_prologue);
5297 /* Determine cost of epilogue code.
5299 We have a reduction operator that will reduce the vector in one statement.
5300 Also requires scalar extract. */
5302 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
5304 if (reduc_fn != IFN_LAST)
5306 if (reduction_type == COND_REDUCTION)
5308 /* An EQ stmt and a COND_EXPR stmt. */
5309 epilogue_cost += record_stmt_cost (cost_vec, 2,
5310 vector_stmt, stmt_info, 0,
5311 vect_epilogue);
5312 /* Reduction of the max index and a reduction of the found
5313 values. */
5314 epilogue_cost += record_stmt_cost (cost_vec, 2,
5315 vec_to_scalar, stmt_info, 0,
5316 vect_epilogue);
5317 /* A broadcast of the max value. */
5318 epilogue_cost += record_stmt_cost (cost_vec, 1,
5319 scalar_to_vec, stmt_info, 0,
5320 vect_epilogue);
5322 else
5324 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
5325 stmt_info, 0, vect_epilogue);
5326 epilogue_cost += record_stmt_cost (cost_vec, 1,
5327 vec_to_scalar, stmt_info, 0,
5328 vect_epilogue);
5331 else if (reduction_type == COND_REDUCTION)
5333 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
5334 /* Extraction of scalar elements. */
5335 epilogue_cost += record_stmt_cost (cost_vec,
5336 2 * estimated_nunits,
5337 vec_to_scalar, stmt_info, 0,
5338 vect_epilogue);
5339 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
5340 epilogue_cost += record_stmt_cost (cost_vec,
5341 2 * estimated_nunits - 3,
5342 scalar_stmt, stmt_info, 0,
5343 vect_epilogue);
5345 else if (reduction_type == EXTRACT_LAST_REDUCTION
5346 || reduction_type == FOLD_LEFT_REDUCTION)
5347 /* No extra instructions needed in the epilogue. */
5349 else
5351 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5352 tree bitsize = TYPE_SIZE (op.type);
5353 int element_bitsize = tree_to_uhwi (bitsize);
5354 int nelements = vec_size_in_bits / element_bitsize;
5356 if (op.code == COND_EXPR)
5357 op.code = MAX_EXPR;
5359 /* We have a whole vector shift available. */
5360 if (VECTOR_MODE_P (mode)
5361 && directly_supported_p (op.code, vectype)
5362 && have_whole_vector_shift (mode))
5364 /* Final reduction via vector shifts and the reduction operator.
5365 Also requires scalar extract. */
5366 epilogue_cost += record_stmt_cost (cost_vec,
5367 exact_log2 (nelements) * 2,
5368 vector_stmt, stmt_info, 0,
5369 vect_epilogue);
5370 epilogue_cost += record_stmt_cost (cost_vec, 1,
5371 vec_to_scalar, stmt_info, 0,
5372 vect_epilogue);
5374 else
5375 /* Use extracts and reduction op for final reduction. For N
5376 elements, we have N extracts and N-1 reduction ops. */
5377 epilogue_cost += record_stmt_cost (cost_vec,
5378 nelements + nelements - 1,
5379 vector_stmt, stmt_info, 0,
5380 vect_epilogue);
5384 if (dump_enabled_p ())
5385 dump_printf (MSG_NOTE,
5386 "vect_model_reduction_cost: inside_cost = %d, "
5387 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
5388 prologue_cost, epilogue_cost);
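/* Editorial example with hypothetical counts: for a plain sum reduction
of a V8HI vector with NCOPIES = 1 and a target-provided reduc_fn, the
above records 1 scalar_to_vec in the prologue plus 1 vector_stmt and
1 vec_to_scalar in the epilogue; without a reduc_fn but with
whole-vector shifts available, the epilogue instead costs
exact_log2 (8) * 2 = 6 vector_stmts plus 1 vec_to_scalar.  */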
5391 /* SEQ is a sequence of instructions that initialize the reduction
5392 described by REDUC_INFO. Emit them in the appropriate place. */
5394 static void
5395 vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
5396 stmt_vec_info reduc_info, gimple *seq)
5398 if (reduc_info->reused_accumulator)
5400 /* When reusing an accumulator from the main loop, we only need
5401 initialization instructions if the main loop can be skipped.
5402 In that case, emit the initialization instructions at the end
5403 of the guard block that does the skip. */
5404 edge skip_edge = loop_vinfo->skip_main_loop_edge;
5405 gcc_assert (skip_edge);
5406 gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
5407 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
5409 else
5411 /* The normal case: emit the initialization instructions on the
5412 preheader edge. */
5413 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5414 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
5418 /* Function get_initial_def_for_reduction
5420 Input:
5421 REDUC_INFO - the info_for_reduction
5422 INIT_VAL - the initial value of the reduction variable
5423 NEUTRAL_OP - a value that has no effect on the reduction, as per
5424 neutral_op_for_reduction
5426 Output:
5427 Return a vector variable, initialized according to the reduction
5428 operation described by REDUC_INFO. This vector will be used as the
5429 initial value of the vector of partial results.
5431 The value we need is a vector in which element 0 has value INIT_VAL
5432 and every other element has value NEUTRAL_OP. */
5434 static tree
5435 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
5436 stmt_vec_info reduc_info,
5437 tree init_val, tree neutral_op)
5439 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5440 tree scalar_type = TREE_TYPE (init_val);
5441 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
5442 tree init_def;
5443 gimple_seq stmts = NULL;
5445 gcc_assert (vectype);
5447 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
5448 || SCALAR_FLOAT_TYPE_P (scalar_type));
5450 gcc_assert (nested_in_vect_loop_p (loop, reduc_info)
5451 || loop == (gimple_bb (reduc_info->stmt))->loop_father);
5453 if (operand_equal_p (init_val, neutral_op))
5455 /* If both elements are equal then the vector described above is
5456 just a splat. */
5457 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5458 init_def = gimple_build_vector_from_val (&stmts, vectype, neutral_op);
5460 else
5462 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5463 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
5464 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
5466 /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into
5467 element 0. */
5468 init_def = gimple_build_vector_from_val (&stmts, vectype,
5469 neutral_op);
5470 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
5471 vectype, init_def, init_val);
5473 else
5475 /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}. */
5476 tree_vector_builder elts (vectype, 1, 2);
5477 elts.quick_push (init_val);
5478 elts.quick_push (neutral_op);
5479 init_def = gimple_build_vector (&stmts, &elts);
5483 if (stmts)
5484 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts);
5485 return init_def;
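/* For instance, for a sum reduction with INIT_VAL s, NEUTRAL_OP 0 and a
V4SI vector type this builds {s, 0, 0, 0}; for a MIN/MAX reduction the
neutral value is INIT_VAL itself, so the result is the splat
{s, s, s, s}.  */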
5488 /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
5489 which performs a reduction involving GROUP_SIZE scalar statements.
5490 NUMBER_OF_VECTORS is the number of vector defs to create. If NEUTRAL_OP
5491 is nonnull, introducing extra elements of that value will not change the
5492 result. */
5494 static void
5495 get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
5496 stmt_vec_info reduc_info,
5497 vec<tree> *vec_oprnds,
5498 unsigned int number_of_vectors,
5499 unsigned int group_size, tree neutral_op)
5501 vec<tree> &initial_values = reduc_info->reduc_initial_values;
5502 unsigned HOST_WIDE_INT nunits;
5503 unsigned j, number_of_places_left_in_vector;
5504 tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
5505 unsigned int i;
5507 gcc_assert (group_size == initial_values.length () || neutral_op);
5509 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
5510 created vectors. It is greater than 1 if unrolling is performed.
5512 For example, we have two scalar operands, s1 and s2 (e.g., group of
5513 strided accesses of size two), while NUNITS is four (i.e., four scalars
5514 of this type can be packed in a vector). The output vector will contain
5515 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
5516 will be 2).
5518 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
5519 vectors containing the operands.
5521 For example, NUNITS is four as before, and the group size is 8
5522 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
5523 {s5, s6, s7, s8}. */
5525 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
5526 nunits = group_size;
5528 number_of_places_left_in_vector = nunits;
5529 bool constant_p = true;
5530 tree_vector_builder elts (vector_type, nunits, 1);
5531 elts.quick_grow (nunits);
5532 gimple_seq ctor_seq = NULL;
5533 if (neutral_op
5534 && !useless_type_conversion_p (TREE_TYPE (vector_type),
5535 TREE_TYPE (neutral_op)))
5536 neutral_op = gimple_convert (&ctor_seq,
5537 TREE_TYPE (vector_type),
5538 neutral_op);
5539 for (j = 0; j < nunits * number_of_vectors; ++j)
5541 tree op;
5542 i = j % group_size;
5544 /* Get the def before the loop. In a reduction chain we have only
5545 one initial value; otherwise we have as many as there are PHIs in the group. */
5546 if (i >= initial_values.length () || (j > i && neutral_op))
5547 op = neutral_op;
5548 else
5550 if (!useless_type_conversion_p (TREE_TYPE (vector_type),
5551 TREE_TYPE (initial_values[i])))
5552 initial_values[i] = gimple_convert (&ctor_seq,
5553 TREE_TYPE (vector_type),
5554 initial_values[i]);
5555 op = initial_values[i];
5558 /* Create 'vect_ = {op0,op1,...,opn}'. */
5559 number_of_places_left_in_vector--;
5560 elts[nunits - number_of_places_left_in_vector - 1] = op;
5561 if (!CONSTANT_CLASS_P (op))
5562 constant_p = false;
5564 if (number_of_places_left_in_vector == 0)
5566 tree init;
5567 if (constant_p && !neutral_op
5568 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
5569 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
5570 /* Build the vector directly from ELTS. */
5571 init = gimple_build_vector (&ctor_seq, &elts);
5572 else if (neutral_op)
5574 /* Build a vector of the neutral value and shift the
5575 other elements into place. */
5576 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
5577 neutral_op);
5578 int k = nunits;
5579 while (k > 0 && operand_equal_p (elts[k - 1], neutral_op))
5580 k -= 1;
5581 while (k > 0)
5583 k -= 1;
5584 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
5585 vector_type, init, elts[k]);
5588 else
5590 /* First time round, duplicate ELTS to fill the
5591 required number of vectors. */
5592 duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
5593 elts, number_of_vectors, *vec_oprnds);
5594 break;
5596 vec_oprnds->quick_push (init);
5598 number_of_places_left_in_vector = nunits;
5599 elts.new_vector (vector_type, nunits, 1);
5600 elts.quick_grow (nunits);
5601 constant_p = true;
5604 if (ctor_seq != NULL)
5605 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
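/* As an illustration of the loop above: for GROUP_SIZE 2 with initial
values {a0, b0}, a neutral value of 0, NUNITS 4 and a single vector to
create, the vector built is {a0, b0, 0, 0}; lanes beyond the first copy
of the group are padded with the neutral value.  */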
5608 /* For a statement STMT_INFO taking part in a reduction operation return
5609 the stmt_vec_info the meta information is stored on. */
5611 stmt_vec_info
5612 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
5614 stmt_info = vect_orig_stmt (stmt_info);
5615 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
5616 if (!is_a <gphi *> (stmt_info->stmt)
5617 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
5618 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5619 gphi *phi = as_a <gphi *> (stmt_info->stmt);
5620 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5622 if (gimple_phi_num_args (phi) == 1)
5623 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5625 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
5627 stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi));
5628 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
5629 stmt_info = info;
5631 return stmt_info;
5634 /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
5635 REDUC_INFO can build on. Adjust REDUC_INFO and return true if so, otherwise
5636 return false. */
5638 static bool
5639 vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
5640 stmt_vec_info reduc_info)
5642 loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
5643 if (!main_loop_vinfo)
5644 return false;
5646 if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
5647 return false;
5649 unsigned int num_phis = reduc_info->reduc_initial_values.length ();
5650 auto_vec<tree, 16> main_loop_results (num_phis);
5651 auto_vec<tree, 16> initial_values (num_phis);
5652 if (edge main_loop_edge = loop_vinfo->main_loop_edge)
5654 /* The epilogue loop can be entered either from the main loop or
5655 from an earlier guard block. */
5656 edge skip_edge = loop_vinfo->skip_main_loop_edge;
5657 for (tree incoming_value : reduc_info->reduc_initial_values)
5659 /* Look for:
5661 INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
5662 INITIAL_VALUE(guard block)>. */
5663 gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
5665 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
5666 gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
5668 tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
5669 tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
5671 main_loop_results.quick_push (from_main_loop);
5672 initial_values.quick_push (from_skip);
5675 else
5676 /* The main loop dominates the epilogue loop. */
5677 main_loop_results.splice (reduc_info->reduc_initial_values);
5679 /* See if the main loop has the kind of accumulator we need. */
5680 vect_reusable_accumulator *accumulator
5681 = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
5682 if (!accumulator
5683 || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
5684 || !std::equal (main_loop_results.begin (), main_loop_results.end (),
5685 accumulator->reduc_info->reduc_scalar_results.begin ()))
5686 return false;
5688 /* Handle the case where we can reduce wider vectors to narrower ones. */
5689 tree vectype = STMT_VINFO_VECTYPE (reduc_info);
5690 tree old_vectype = TREE_TYPE (accumulator->reduc_input);
5691 unsigned HOST_WIDE_INT m;
5692 if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
5693 TYPE_VECTOR_SUBPARTS (vectype), &m))
5694 return false;
5695 /* Check the intermediate vector types and operations are available. */
5696 tree prev_vectype = old_vectype;
5697 poly_uint64 intermediate_nunits = TYPE_VECTOR_SUBPARTS (old_vectype);
5698 while (known_gt (intermediate_nunits, TYPE_VECTOR_SUBPARTS (vectype)))
5700 intermediate_nunits = exact_div (intermediate_nunits, 2);
5701 tree intermediate_vectype = get_related_vectype_for_scalar_type
5702 (TYPE_MODE (vectype), TREE_TYPE (vectype), intermediate_nunits);
5703 if (!intermediate_vectype
5704 || !directly_supported_p (STMT_VINFO_REDUC_CODE (reduc_info),
5705 intermediate_vectype)
5706 || !can_vec_extract (TYPE_MODE (prev_vectype),
5707 TYPE_MODE (intermediate_vectype)))
5708 return false;
5709 prev_vectype = intermediate_vectype;
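/* E.g. if the main loop accumulated in V8SI and this epilogue loop uses
V4SI, M is 2 and a single halving step checks that the reduction code
is supported on V4SI and that a V4SI half can be extracted from a V8SI
vector.  */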
5712 /* Non-SLP reductions might apply an adjustment after the reduction
5713 operation, in order to simplify the initialization of the accumulator.
5714 If the epilogue loop carries on from where the main loop left off,
5715 it should apply the same adjustment to the final reduction result.
5717 If the epilogue loop can also be entered directly (rather than via
5718 the main loop), we need to be able to handle that case in the same way,
5719 with the same adjustment. (In principle we could add a PHI node
5720 to select the correct adjustment, but in practice that shouldn't be
5721 necessary.) */
5722 tree main_adjustment
5723 = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
5724 if (loop_vinfo->main_loop_edge && main_adjustment)
5726 gcc_assert (num_phis == 1);
5727 tree initial_value = initial_values[0];
5728 /* Check that we can use INITIAL_VALUE as the adjustment and
5729 initialize the accumulator with a neutral value instead. */
5730 if (!operand_equal_p (initial_value, main_adjustment))
5731 return false;
5732 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5733 initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
5734 code, initial_value);
5736 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
5737 reduc_info->reduc_initial_values.truncate (0);
5738 reduc_info->reduc_initial_values.splice (initial_values);
5739 reduc_info->reused_accumulator = accumulator;
5740 return true;
5743 /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
5744 CODE, appending any new stmts to SEQ. Returns a vector def of VECTYPE. */
5746 static tree
5747 vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
5748 gimple_seq *seq)
5750 unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
5751 unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5752 tree stype = TREE_TYPE (vectype);
5753 tree new_temp = vec_def;
5754 while (nunits > nunits1)
5756 nunits /= 2;
5757 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5758 stype, nunits);
5759 unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5761 /* The target has to make sure we support lowpart/highpart
5762 extraction, either via direct vector extract or through
5763 an integer mode punning. */
5764 tree dst1, dst2;
5765 gimple *epilog_stmt;
5766 if (convert_optab_handler (vec_extract_optab,
5767 TYPE_MODE (TREE_TYPE (new_temp)),
5768 TYPE_MODE (vectype1))
5769 != CODE_FOR_nothing)
5771 /* Extract sub-vectors directly once vec_extract becomes
5772 a conversion optab. */
5773 dst1 = make_ssa_name (vectype1);
5774 epilog_stmt
5775 = gimple_build_assign (dst1, BIT_FIELD_REF,
5776 build3 (BIT_FIELD_REF, vectype1,
5777 new_temp, TYPE_SIZE (vectype1),
5778 bitsize_int (0)));
5779 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5780 dst2 = make_ssa_name (vectype1);
5781 epilog_stmt
5782 = gimple_build_assign (dst2, BIT_FIELD_REF,
5783 build3 (BIT_FIELD_REF, vectype1,
5784 new_temp, TYPE_SIZE (vectype1),
5785 bitsize_int (bitsize)));
5786 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5788 else
5790 /* Extract via punning to appropriately sized integer mode
5791 vector. */
5792 tree eltype = build_nonstandard_integer_type (bitsize, 1);
5793 tree etype = build_vector_type (eltype, 2);
5794 gcc_assert (convert_optab_handler (vec_extract_optab,
5795 TYPE_MODE (etype),
5796 TYPE_MODE (eltype))
5797 != CODE_FOR_nothing);
5798 tree tem = make_ssa_name (etype);
5799 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5800 build1 (VIEW_CONVERT_EXPR,
5801 etype, new_temp));
5802 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5803 new_temp = tem;
5804 tem = make_ssa_name (eltype);
5805 epilog_stmt
5806 = gimple_build_assign (tem, BIT_FIELD_REF,
5807 build3 (BIT_FIELD_REF, eltype,
5808 new_temp, TYPE_SIZE (eltype),
5809 bitsize_int (0)));
5810 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5811 dst1 = make_ssa_name (vectype1);
5812 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5813 build1 (VIEW_CONVERT_EXPR,
5814 vectype1, tem));
5815 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5816 tem = make_ssa_name (eltype);
5817 epilog_stmt
5818 = gimple_build_assign (tem, BIT_FIELD_REF,
5819 build3 (BIT_FIELD_REF, eltype,
5820 new_temp, TYPE_SIZE (eltype),
5821 bitsize_int (bitsize)));
5822 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5823 dst2 = make_ssa_name (vectype1);
5824 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5825 build1 (VIEW_CONVERT_EXPR,
5826 vectype1, tem));
5827 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5830 new_temp = gimple_build (seq, code, vectype1, dst1, dst2);
5833 return new_temp;
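/* Sketch of the effect, assuming a PLUS reduction from V8SI down to
V4SI: each halving step extracts the low and high V4SI halves of the
current value (directly via vec_extract, or by punning through a
two-element integer-mode vector) and adds them, so lane i of the result
holds elt[i] + elt[i + 4].  */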
5836 /* Function vect_create_epilog_for_reduction
5838 Create code at the loop-epilog to finalize the result of a reduction
5839 computation.
5841 STMT_INFO is the scalar reduction stmt that is being vectorized.
5842 SLP_NODE is an SLP node containing a group of reduction statements. The
5843 first one in this group is STMT_INFO.
5844 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
5845 REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
5846 (counting from 0)
5847 LOOP_EXIT is the edge to update in the merge block. In the case of a single
5848 exit this edge is always the main loop exit.
5850 This function:
5851 1. Completes the reduction def-use cycles.
5852 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5853 by calling the function specified by REDUC_FN if available, or by
5854 other means (whole-vector shifts or a scalar loop).
5855 The function also creates a new phi node at the loop exit to preserve
5856 loop-closed form, as illustrated below.
5858 The flow at the entry to this function:
5860 loop:
5861 vec_def = phi <vec_init, null> # REDUCTION_PHI
5862 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5863 s_loop = scalar_stmt # (scalar) STMT_INFO
5864 loop_exit:
5865 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5866 use <s_out0>
5867 use <s_out0>
5869 The above is transformed by this function into:
5871 loop:
5872 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
5873 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5874 s_loop = scalar_stmt # (scalar) STMT_INFO
5875 loop_exit:
5876 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5877 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5878 v_out2 = reduce <v_out1>
5879 s_out3 = extract_field <v_out2, 0>
5880 s_out4 = adjust_result <s_out3>
5881 use <s_out4>
5882 use <s_out4>
5885 static void
5886 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5887 stmt_vec_info stmt_info,
5888 slp_tree slp_node,
5889 slp_instance slp_node_instance,
5890 edge loop_exit)
5892 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5893 gcc_assert (reduc_info->is_reduc_info);
5894 /* For double reductions we need to get at the inner loop reduction
5895 stmt which has the meta info attached. Our stmt_info is that of the
5896 loop-closed PHI of the inner loop which we remember as
5897 def for the reduction PHI generation. */
5898 bool double_reduc = false;
5899 stmt_vec_info rdef_info = stmt_info;
5900 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5902 double_reduc = true;
5903 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
5904 (stmt_info->stmt, 0));
5905 stmt_info = vect_stmt_to_vectorize (stmt_info);
5907 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5908 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
5909 tree vectype;
5910 machine_mode mode;
5911 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
5912 basic_block exit_bb;
5913 tree scalar_dest;
5914 tree scalar_type;
5915 gimple *new_phi = NULL, *phi = NULL;
5916 gimple_stmt_iterator exit_gsi;
5917 tree new_temp = NULL_TREE, new_name, new_scalar_dest;
5918 gimple *epilog_stmt = NULL;
5919 gimple *exit_phi;
5920 tree bitsize;
5921 tree def;
5922 tree orig_name, scalar_result;
5923 imm_use_iterator imm_iter, phi_imm_iter;
5924 use_operand_p use_p, phi_use_p;
5925 gimple *use_stmt;
5926 auto_vec<tree> reduc_inputs;
5927 int j, i;
5928 vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
5929 unsigned int group_size = 1, k;
5930 /* SLP reduction without reduction chain, e.g.,
5931 # a1 = phi <a2, a0>
5932 # b1 = phi <b2, b0>
5933 a2 = operation (a1)
5934 b2 = operation (b1) */
5935 bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
5936 bool direct_slp_reduc;
5937 tree induction_index = NULL_TREE;
5939 if (slp_node)
5940 group_size = SLP_TREE_LANES (slp_node);
5942 if (nested_in_vect_loop_p (loop, stmt_info))
5944 outer_loop = loop;
5945 loop = loop->inner;
5946 gcc_assert (double_reduc);
5949 vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
5950 gcc_assert (vectype);
5951 mode = TYPE_MODE (vectype);
5953 tree induc_val = NULL_TREE;
5954 tree adjustment_def = NULL;
5955 /* Optimize: for induction condition reduction, if we can't use zero
5956 for induc_val, use initial_def. */
5957 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5958 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
5959 else if (double_reduc)
5961 else
5962 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
5964 stmt_vec_info single_live_out_stmt[] = { stmt_info };
5965 array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
5966 if (slp_reduc)
5967 /* All statements produce live-out values. */
5968 live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
5970 unsigned vec_num;
5971 int ncopies;
5972 if (slp_node)
5974 vec_num = SLP_TREE_VEC_DEFS (slp_node_instance->reduc_phis).length ();
5975 ncopies = 1;
5977 else
5979 vec_num = 1;
5980 ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
5983 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
5984 which is updated with the current index of the loop for every match of
5985 the original loop's cond_expr (VEC_STMT). This results in a vector
5986 containing the last time the condition passed for that vector lane.
5987 The first match will be a 1 to allow 0 to be used for non-matching
5988 indexes. If there are no matches at all then the vector will be all
5989 zeroes.
5991 PR92772: This algorithm is broken for architectures that support
5992 masked vectors, but do not provide fold_extract_last. */
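/* Editorial sketch for VF = 4: the induction variable below produces
{1, 2, 3, 4}, {5, 6, 7, 8}, ... per vector iteration; lanes whose
condition matched take the current IV value while the other lanes keep
the previous phi value, so after the loop each lane holds the 1-based
index of its last match, or 0 if it never matched.  */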
5993 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
5995 auto_vec<std::pair<tree, bool>, 2> ccompares;
5996 if (slp_node)
5998 slp_tree cond_node = slp_node_instance->root;
5999 while (cond_node != slp_node_instance->reduc_phis)
6001 stmt_vec_info cond_info = SLP_TREE_REPRESENTATIVE (cond_node);
6002 int slp_reduc_idx;
6003 if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
6005 gimple *vec_stmt
6006 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (cond_node)[0]);
6007 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
6008 ccompares.safe_push
6009 (std::make_pair (gimple_assign_rhs1 (vec_stmt),
6010 STMT_VINFO_REDUC_IDX (cond_info) == 2));
6011 /* ??? We probably want to have REDUC_IDX on the SLP node?
6012 COND_EXPR nodes have either three or four children,
6013 depending on whether the comparison is still embedded
6014 as GENERIC. So work backwards. */
6015 slp_reduc_idx = (SLP_TREE_CHILDREN (cond_node).length () - 3
6016 + STMT_VINFO_REDUC_IDX (cond_info));
6018 else
6019 slp_reduc_idx = STMT_VINFO_REDUC_IDX (cond_info);
6020 cond_node = SLP_TREE_CHILDREN (cond_node)[slp_reduc_idx];
6023 else
6025 stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
6026 cond_info = vect_stmt_to_vectorize (cond_info);
6027 while (cond_info != reduc_info)
6029 if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
6031 gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
6032 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
6033 ccompares.safe_push
6034 (std::make_pair (gimple_assign_rhs1 (vec_stmt),
6035 STMT_VINFO_REDUC_IDX (cond_info) == 2));
6037 cond_info
6038 = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
6039 1 + STMT_VINFO_REDUC_IDX
6040 (cond_info)));
6041 cond_info = vect_stmt_to_vectorize (cond_info);
6044 gcc_assert (ccompares.length () != 0);
6046 tree indx_before_incr, indx_after_incr;
6047 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
6048 int scalar_precision
6049 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
6050 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
6051 tree cr_index_vector_type = get_related_vectype_for_scalar_type
6052 (TYPE_MODE (vectype), cr_index_scalar_type,
6053 TYPE_VECTOR_SUBPARTS (vectype));
6055 /* First we create a simple vector induction variable which starts
6056 with the values {1,2,3,...} (SERIES_VECT) and increments by the
6057 vector size (STEP). */
6059 /* Create a {1,2,3,...} vector. */
6060 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
6062 /* Create a vector of the step value. */
6063 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
6064 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
6066 /* Create an induction variable. */
6067 gimple_stmt_iterator incr_gsi;
6068 bool insert_after;
6069 vect_iv_increment_position (loop_exit, &incr_gsi, &insert_after);
6070 create_iv (series_vect, PLUS_EXPR, vec_step, NULL_TREE, loop, &incr_gsi,
6071 insert_after, &indx_before_incr, &indx_after_incr);
6073 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
6074 filled with zeros (VEC_ZERO). */
6076 /* Create a vector of 0s. */
6077 tree zero = build_zero_cst (cr_index_scalar_type);
6078 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
6080 /* Create a vector phi node. */
6081 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
6082 new_phi = create_phi_node (new_phi_tree, loop->header);
6083 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
6084 loop_preheader_edge (loop), UNKNOWN_LOCATION);
6086 /* Now take the condition from the loop's original cond_exprs
6087 and produce a new cond_expr (INDEX_COND_EXPR) which for
6088 every match uses values from the induction variable
6089 (INDEX_BEFORE_INCR) and otherwise uses values from the phi node
6090 (NEW_PHI_TREE).
6091 Finally, we update the phi (NEW_PHI_TREE) to take the value of
6092 the new cond_expr (INDEX_COND_EXPR). */
6093 gimple_seq stmts = NULL;
6094 for (int i = ccompares.length () - 1; i != -1; --i)
6096 tree ccompare = ccompares[i].first;
6097 if (ccompares[i].second)
6098 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
6099 cr_index_vector_type,
6100 ccompare,
6101 indx_before_incr, new_phi_tree);
6102 else
6103 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
6104 cr_index_vector_type,
6105 ccompare,
6106 new_phi_tree, indx_before_incr);
6108 gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
6110 /* Update the phi with the vec cond. */
6111 induction_index = new_phi_tree;
6112 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
6113 loop_latch_edge (loop), UNKNOWN_LOCATION);
6116 /* 2. Create epilog code.
6117 The reduction epilog code operates across the elements of the vector
6118 of partial results computed by the vectorized loop.
6119 The reduction epilog code consists of:
6121 step 1: compute the scalar result in a vector (v_out2)
6122 step 2: extract the scalar result (s_out3) from the vector (v_out2)
6123 step 3: adjust the scalar result (s_out3) if needed.
6125 Step 1 can be accomplished using one of the following three schemes:
6126 (scheme 1) using reduc_fn, if available.
6127 (scheme 2) using whole-vector shifts, if available.
6128 (scheme 3) using a scalar loop. In this case steps 1+2 above are
6129 combined.
6131 The overall epilog code looks like this:
6133 s_out0 = phi <s_loop> # original EXIT_PHI
6134 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6135 v_out2 = reduce <v_out1> # step 1
6136 s_out3 = extract_field <v_out2, 0> # step 2
6137 s_out4 = adjust_result <s_out3> # step 3
6139 (step 3 is optional, and steps 1 and 2 may be combined).
6140 Lastly, the uses of s_out0 are replaced by s_out4. */
6143 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
6144 v_out1 = phi <VECT_DEF>
6145 Store them in NEW_PHIS. */
6146 if (double_reduc)
6147 loop = outer_loop;
6148 /* We need to reduce values in all exits. */
6149 exit_bb = loop_exit->dest;
6150 exit_gsi = gsi_after_labels (exit_bb);
6151 reduc_inputs.create (slp_node ? vec_num : ncopies);
6152 for (unsigned i = 0; i < vec_num; i++)
6154 gimple_seq stmts = NULL;
6155 if (slp_node)
6156 def = vect_get_slp_vect_def (slp_node, i);
6157 else
6158 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
6159 for (j = 0; j < ncopies; j++)
6161 tree new_def = copy_ssa_name (def);
6162 phi = create_phi_node (new_def, exit_bb);
6163 if (j)
6164 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
6165 if (LOOP_VINFO_IV_EXIT (loop_vinfo) == loop_exit)
6166 SET_PHI_ARG_DEF (phi, loop_exit->dest_idx, def);
6167 else
6169 for (unsigned k = 0; k < gimple_phi_num_args (phi); k++)
6170 SET_PHI_ARG_DEF (phi, k, def);
6172 new_def = gimple_convert (&stmts, vectype, new_def);
6173 reduc_inputs.quick_push (new_def);
6175 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6178 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
6179 (i.e. when reduc_fn is not available) and in the final adjustment
6180 code (if needed). Also get the original scalar reduction variable as
6181 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
6182 represents a reduction pattern), the tree-code and scalar-def are
6183 taken from the original stmt that the pattern-stmt (STMT) replaces.
6184 Otherwise (it is a regular reduction) - the tree-code and scalar-def
6185 are taken from STMT. */
6187 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6188 if (orig_stmt_info != stmt_info)
6190 /* Reduction pattern */
6191 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6192 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
6195 scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
6196 scalar_type = TREE_TYPE (scalar_dest);
6197 scalar_results.truncate (0);
6198 scalar_results.reserve_exact (group_size);
6199 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
6200 bitsize = TYPE_SIZE (scalar_type);
6202 /* True if we should implement SLP_REDUC using native reduction operations
6203 instead of scalar operations. */
6204 direct_slp_reduc = (reduc_fn != IFN_LAST
6205 && slp_reduc
6206 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
6208 /* In case of reduction chain, e.g.,
6209 # a1 = phi <a3, a0>
6210 a2 = operation (a1)
6211 a3 = operation (a2),
6213 we may end up with more than one vector result. Here we reduce them
6214 to one vector.
6216 The same is true for a SLP reduction, e.g.,
6217 # a1 = phi <a2, a0>
6218 # b1 = phi <b2, b0>
6219 a2 = operation (a1)
6220 b2 = operation (b1),
6222 where we can end up with more than one vector as well. We can
6223 easily accumulate vectors when the number of vector elements is
6224 a multiple of the SLP group size.
6226 The same is true if we couldn't use a single defuse cycle. */
6227 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6228 || direct_slp_reduc
6229 || (slp_reduc
6230 && constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype), group_size))
6231 || ncopies > 1)
6233 gimple_seq stmts = NULL;
6234 tree single_input = reduc_inputs[0];
6235 for (k = 1; k < reduc_inputs.length (); k++)
6236 single_input = gimple_build (&stmts, code, vectype,
6237 single_input, reduc_inputs[k]);
6238 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6240 reduc_inputs.truncate (0);
6241 reduc_inputs.safe_push (single_input);
6244 tree orig_reduc_input = reduc_inputs[0];
6246 /* If this loop is an epilogue loop that can be skipped after the
6247 main loop, we can only share a reduction operation between the
6248 main loop and the epilogue if we put it at the target of the
6249 skip edge.
6251 We can still reuse accumulators if this check fails. Doing so has
6252 the minor(?) benefit of making the epilogue loop's scalar result
6253 independent of the main loop's scalar result. */
6254 bool unify_with_main_loop_p = false;
6255 if (reduc_info->reused_accumulator
6256 && loop_vinfo->skip_this_loop_edge
6257 && single_succ_p (exit_bb)
6258 && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
6260 unify_with_main_loop_p = true;
6262 basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
6263 reduc_inputs[0] = make_ssa_name (vectype);
6264 gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
6265 add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
6266 UNKNOWN_LOCATION);
6267 add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
6268 loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
6269 exit_gsi = gsi_after_labels (reduc_block);
6272 /* Shouldn't be used beyond this point. */
6273 exit_bb = nullptr;
6275 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6276 && reduc_fn != IFN_LAST)
6278 /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
6279 various data values where the condition matched and another vector
6280 (INDUCTION_INDEX) containing all the indexes of those matches. We
6281 need to extract the last matching index (which will be the index with
6282 highest value) and use this to index into the data vector.
6283 For the case where there were no matches, the data vector will contain
6284 all default values and the index vector will be all zeros. */
6286 /* Get various versions of the type of the vector of indexes. */
6287 tree index_vec_type = TREE_TYPE (induction_index);
6288 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
6289 tree index_scalar_type = TREE_TYPE (index_vec_type);
6290 tree index_vec_cmp_type = truth_type_for (index_vec_type);
6292 /* Get an unsigned integer version of the type of the data vector. */
6293 int scalar_precision
6294 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6295 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
6296 tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
6297 vectype);
6299 /* First we need to create a vector (ZERO_VEC) of zeros and another
6300 vector (MAX_INDEX_VEC) filled with the last matching index, which we
6301 can create using a MAX reduction and then expanding.
6302 In the case where the loop never made any matches, the max index will
6303 be zero. */
6305 /* Vector of {0, 0, 0,...}. */
6306 tree zero_vec = build_zero_cst (vectype);
6308 /* Find maximum value from the vector of found indexes. */
6309 tree max_index = make_ssa_name (index_scalar_type);
6310 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6311 1, induction_index);
6312 gimple_call_set_lhs (max_index_stmt, max_index);
6313 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
6315 /* Vector of {max_index, max_index, max_index,...}. */
6316 tree max_index_vec = make_ssa_name (index_vec_type);
6317 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
6318 max_index);
6319 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
6320 max_index_vec_rhs);
6321 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
6323 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
6324 with the vector (INDUCTION_INDEX) of found indexes, choosing values
6325 from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
6326 otherwise. Only one value should match, resulting in a vector
6327 (VEC_COND) with one data value and the rest zeros.
6328 In the case where the loop never made any matches, every index will
6329 match, resulting in a vector with all data values (which will all be
6330 the default value). */
6332 /* Compare the max index vector to the vector of found indexes to find
6333 the position of the max value. */
6334 tree vec_compare = make_ssa_name (index_vec_cmp_type);
6335 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
6336 induction_index,
6337 max_index_vec);
6338 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
6340 /* Use the compare to choose either values from the data vector or
6341 zero. */
6342 tree vec_cond = make_ssa_name (vectype);
6343 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
6344 vec_compare,
6345 reduc_inputs[0],
6346 zero_vec);
6347 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
6349 /* Finally we need to extract the data value from the vector (VEC_COND)
6350 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
6351 reduction, but because this doesn't exist, we can use a MAX reduction
6352 instead. The data value might be signed or a float so we need to cast
6353 it first.
6354 In the case where the loop never made any matches, the data values are
6355 all identical, and so will reduce down correctly. */
6357 /* Make the matched data values unsigned. */
6358 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
6359 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
6360 vec_cond);
6361 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
6362 VIEW_CONVERT_EXPR,
6363 vec_cond_cast_rhs);
6364 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
6366 /* Reduce down to a scalar value. */
6367 tree data_reduc = make_ssa_name (scalar_type_unsigned);
6368 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6369 1, vec_cond_cast);
6370 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
6371 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
6373 /* Convert the reduced value back to the result type and set as the
6374 result. */
6375 gimple_seq stmts = NULL;
6376 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
6377 data_reduc);
6378 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6379 scalar_results.safe_push (new_temp);
6381 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6382 && reduc_fn == IFN_LAST)
6384 /* Condition reduction without supported IFN_REDUC_MAX. Generate
6385 idx = 0;
6386 idx_val = induction_index[0];
6387 val = data_reduc[0];
6388 for (idx = 0, val = init, i = 0; i < nelts; ++i)
6389 if (induction_index[i] > idx_val)
6390 val = data_reduc[i], idx_val = induction_index[i];
6391 return val; */
6393 tree data_eltype = TREE_TYPE (vectype);
6394 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
6395 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
6396 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
6397 /* Enforced by vectorizable_reduction, which ensures we have target
6398 support before allowing a conditional reduction on variable-length
6399 vectors. */
6400 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
6401 tree idx_val = NULL_TREE, val = NULL_TREE;
6402 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
6404 tree old_idx_val = idx_val;
6405 tree old_val = val;
6406 idx_val = make_ssa_name (idx_eltype);
6407 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
6408 build3 (BIT_FIELD_REF, idx_eltype,
6409 induction_index,
6410 bitsize_int (el_size),
6411 bitsize_int (off)));
6412 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6413 val = make_ssa_name (data_eltype);
6414 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
6415 build3 (BIT_FIELD_REF,
6416 data_eltype,
6417 reduc_inputs[0],
6418 bitsize_int (el_size),
6419 bitsize_int (off)));
6420 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6421 if (off != 0)
6423 tree new_idx_val = idx_val;
6424 if (off != v_size - el_size)
6426 new_idx_val = make_ssa_name (idx_eltype);
6427 epilog_stmt = gimple_build_assign (new_idx_val,
6428 MAX_EXPR, idx_val,
6429 old_idx_val);
6430 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6432 tree cond = make_ssa_name (boolean_type_node);
6433 epilog_stmt = gimple_build_assign (cond, GT_EXPR,
6434 idx_val, old_idx_val);
6435 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6436 tree new_val = make_ssa_name (data_eltype);
6437 epilog_stmt = gimple_build_assign (new_val, COND_EXPR,
6438 cond, val, old_val);
6439 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6440 idx_val = new_idx_val;
6441 val = new_val;
6444 /* Convert the reduced value back to the result type and set as the
6445 result. */
6446 gimple_seq stmts = NULL;
6447 val = gimple_convert (&stmts, scalar_type, val);
6448 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6449 scalar_results.safe_push (val);
6452 /* 2.3 Create the reduction code, using one of the three schemes described
6453 above. In SLP we simply need to extract all the elements from the
6454 vector (without reducing them), so we use scalar shifts. */
6455 else if (reduc_fn != IFN_LAST && (!slp_reduc || group_size == 1))
6457 tree tmp;
6458 tree vec_elem_type;
6460 /* Case 1: Create:
6461 v_out2 = reduc_expr <v_out1> */
6463 if (dump_enabled_p ())
6464 dump_printf_loc (MSG_NOTE, vect_location,
6465 "Reduce using direct vector reduction.\n");
6467 gimple_seq stmts = NULL;
6468 vec_elem_type = TREE_TYPE (vectype);
6469 new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
6470 vec_elem_type, reduc_inputs[0]);
6471 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6472 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6474 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6475 && induc_val)
6477 /* Earlier we set the initial value to be a vector of induc_val
6478 values. Check the result; if it is induc_val then replace it
6479 with the original initial value, unless induc_val is
6480 the same as initial_def already. */
6481 tree zcompare = make_ssa_name (boolean_type_node);
6482 epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
6483 new_temp, induc_val);
6484 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6485 tree initial_def = reduc_info->reduc_initial_values[0];
6486 tmp = make_ssa_name (new_scalar_dest);
6487 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6488 initial_def, new_temp);
6489 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6490 new_temp = tmp;
6493 scalar_results.safe_push (new_temp);
6495 else if (direct_slp_reduc)
6497 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
6498 with the elements for other SLP statements replaced with the
6499 neutral value. We can then do a normal reduction on each vector. */
6501 /* Enforced by vectorizable_reduction. */
6502 gcc_assert (reduc_inputs.length () == 1);
6503 gcc_assert (pow2p_hwi (group_size));
6505 gimple_seq seq = NULL;
6507 /* Build a vector {0, 1, 2, ...}, with the same number of elements
6508 and the same element size as VECTYPE. */
6509 tree index = build_index_vector (vectype, 0, 1);
6510 tree index_type = TREE_TYPE (index);
6511 tree index_elt_type = TREE_TYPE (index_type);
6512 tree mask_type = truth_type_for (index_type);
6514 /* Create a vector that, for each element, identifies which of
6515 the REDUC_GROUP_SIZE results should use it. */
6516 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
6517 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
6518 build_vector_from_val (index_type, index_mask));
6520 /* Get a neutral vector value. This is simply a splat of the neutral
6521 scalar value if we have one, otherwise the initial scalar value
6522 is itself a neutral value. */
6523 tree vector_identity = NULL_TREE;
6524 tree neutral_op = NULL_TREE;
6525 if (slp_node)
6527 tree initial_value = NULL_TREE;
6528 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6529 initial_value = reduc_info->reduc_initial_values[0];
6530 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
6531 initial_value, false);
6533 if (neutral_op)
6534 vector_identity = gimple_build_vector_from_val (&seq, vectype,
6535 neutral_op);
6536 for (unsigned int i = 0; i < group_size; ++i)
6538 /* If there's no universal neutral value, we can use the
6539 initial scalar value from the original PHI. This is used
6540 for MIN and MAX reduction, for example. */
6541 if (!neutral_op)
6543 tree scalar_value = reduc_info->reduc_initial_values[i];
6544 scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
6545 scalar_value);
6546 vector_identity = gimple_build_vector_from_val (&seq, vectype,
6547 scalar_value);
6550 /* Calculate the equivalent of:
6552 sel[j] = (index[j] == i);
6554 which selects the elements of REDUC_INPUTS[0] that should
6555 be included in the result. */
6556 tree compare_val = build_int_cst (index_elt_type, i);
6557 compare_val = build_vector_from_val (index_type, compare_val);
6558 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
6559 index, compare_val);
6561 /* Calculate the equivalent of:
6563 vec = sel ? reduc_inputs[0] : vector_identity;
6565 VEC is now suitable for a full vector reduction. */
6566 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
6567 sel, reduc_inputs[0], vector_identity);
6569 /* Do the reduction and convert it to the appropriate type. */
6570 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
6571 TREE_TYPE (vectype), vec);
6572 scalar = gimple_convert (&seq, scalar_type, scalar);
6573 scalar_results.safe_push (scalar);
6575 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
6577 else
6579 bool reduce_with_shift;
6580 tree vec_temp;
6582 gcc_assert (slp_reduc || reduc_inputs.length () == 1);
6584 /* See if the target wants to do the final (shift) reduction
6585 in a vector mode of smaller size and first reduce upper/lower
6586 halves against each other. */
6587 enum machine_mode mode1 = mode;
6588 tree stype = TREE_TYPE (vectype);
6589 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6590 unsigned nunits1 = nunits;
6591 if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
6592 && reduc_inputs.length () == 1)
6594 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6595 /* For SLP reductions we have to make sure lanes match up, but
6596 since we're doing an individual-element final reduction,
6597 reducing the vector width here is even more important.
6598 ??? We could also separate lanes with permutes; for the common
6599 case of a power-of-two group size, odd/even extracts would work. */
6600 if (slp_reduc && nunits != nunits1)
6602 nunits1 = least_common_multiple (nunits1, group_size);
6603 gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
6606 if (!slp_reduc
6607 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
6608 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6610 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
6611 stype, nunits1);
6612 reduce_with_shift = have_whole_vector_shift (mode1);
6613 if (!VECTOR_MODE_P (mode1)
6614 || !directly_supported_p (code, vectype1))
6615 reduce_with_shift = false;
6617 /* First reduce the vector to the desired size on which we should
6618 do the shift reduction, by combining upper and lower halves. */
6619 gimple_seq stmts = NULL;
6620 new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
6621 code, &stmts);
6622 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6623 reduc_inputs[0] = new_temp;
6625 if (reduce_with_shift && (!slp_reduc || group_size == 1))
6627 int element_bitsize = tree_to_uhwi (bitsize);
6628 /* Enforced by vectorizable_reduction, which disallows SLP reductions
6629 for variable-length vectors and also requires direct target support
6630 for loop reductions. */
6631 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6632 int nelements = vec_size_in_bits / element_bitsize;
6633 vec_perm_builder sel;
6634 vec_perm_indices indices;
6636 int elt_offset;
6638 tree zero_vec = build_zero_cst (vectype1);
6639 /* Case 2: Create:
6640 for (offset = nelements/2; offset >= 1; offset/=2)
6642 Create: va' = vec_shift <va, offset>
6643 Create: va = vop <va, va'>
6644 } */
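/* Illustrative sketch, assuming a 4-element vector { a, b, c, d }, PLUS as
   the reduction code and a shift that moves higher-numbered lanes towards
   lane 0:
     offset 2:  va' = { c, d, 0, 0 }      va = { a+c, b+d, c, d }
     offset 1:  va' = { b+d, c, d, 0 }    va = { a+b+c+d, ... }
   so that lane 0 ends up holding the full reduction value, which the
   BIT_FIELD_REF below then extracts.  */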
6646 tree rhs;
6648 if (dump_enabled_p ())
6649 dump_printf_loc (MSG_NOTE, vect_location,
6650 "Reduce using vector shifts\n");
6652 gimple_seq stmts = NULL;
6653 new_temp = gimple_convert (&stmts, vectype1, new_temp);
6654 for (elt_offset = nelements / 2;
6655 elt_offset >= 1;
6656 elt_offset /= 2)
6658 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
6659 indices.new_vector (sel, 2, nelements);
6660 tree mask = vect_gen_perm_mask_any (vectype1, indices);
6661 new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
6662 new_temp, zero_vec, mask);
6663 new_temp = gimple_build (&stmts, code,
6664 vectype1, new_name, new_temp);
6666 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6668 /* 2.4 Extract the final scalar result. Create:
6669 s_out3 = extract_field <v_out2, bitpos> */
6671 if (dump_enabled_p ())
6672 dump_printf_loc (MSG_NOTE, vect_location,
6673 "extract scalar result\n");
6675 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
6676 bitsize, bitsize_zero_node);
6677 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
6678 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
6679 gimple_assign_set_lhs (epilog_stmt, new_temp);
6680 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6681 scalar_results.safe_push (new_temp);
6683 else
6685 /* Case 3: Create:
6686 s = extract_field <v_out2, 0>
6687 for (offset = element_size;
6688 offset < vector_size;
6689 offset += element_size;)
6691 Create: s' = extract_field <v_out2, offset>
6692 Create: s = op <s, s'> // For non SLP cases
6693 } */
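/* For a 4-element vector { a, b, c, d } and PLUS, this expands (in the
   non-SLP case) to roughly:
     s = a;  s = s + b;  s = s + c;  s = s + d;
   i.e. one extract plus one scalar operation per remaining element.  */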
6695 if (dump_enabled_p ())
6696 dump_printf_loc (MSG_NOTE, vect_location,
6697 "Reduce using scalar code.\n");
6699 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6700 int element_bitsize = tree_to_uhwi (bitsize);
6701 tree compute_type = TREE_TYPE (vectype);
6702 gimple_seq stmts = NULL;
6703 FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
6705 int bit_offset;
6706 new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
6707 vec_temp, bitsize, bitsize_zero_node);
6709 /* In SLP we don't need to apply the reduction operation, so we just
6710 collect the s' values in SCALAR_RESULTS. */
6711 if (slp_reduc)
6712 scalar_results.safe_push (new_temp);
6714 for (bit_offset = element_bitsize;
6715 bit_offset < vec_size_in_bits;
6716 bit_offset += element_bitsize)
6718 tree bitpos = bitsize_int (bit_offset);
6719 new_name = gimple_build (&stmts, BIT_FIELD_REF,
6720 compute_type, vec_temp,
6721 bitsize, bitpos);
6722 if (slp_reduc)
6724 /* In SLP we don't need to apply the reduction operation, so
6725 we just collect the s' values in SCALAR_RESULTS. */
6726 new_temp = new_name;
6727 scalar_results.safe_push (new_name);
6729 else
6730 new_temp = gimple_build (&stmts, code, compute_type,
6731 new_name, new_temp);
6735 /* The only case where we need to reduce scalar results in SLP is
6736 unrolling. If the size of SCALAR_RESULTS is greater than
6737 REDUC_GROUP_SIZE, we reduce them combining elements modulo
6738 REDUC_GROUP_SIZE. */
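/* For example, with REDUC_GROUP_SIZE == 2 and an unrolled SLP instance that
   produced SCALAR_RESULTS = { s0, s1, s2, s3 }, the loop below computes
   { s0 op s2, s1 op s3 } and then truncates the vector to those two
   results.  */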
6739 if (slp_reduc)
6741 tree res, first_res, new_res;
6743 /* Reduce multiple scalar results in case of SLP unrolling. */
6744 for (j = group_size; scalar_results.iterate (j, &res);
6745 j++)
6747 first_res = scalar_results[j % group_size];
6748 new_res = gimple_build (&stmts, code, compute_type,
6749 first_res, res);
6750 scalar_results[j % group_size] = new_res;
6752 scalar_results.truncate (group_size);
6753 for (k = 0; k < group_size; k++)
6754 scalar_results[k] = gimple_convert (&stmts, scalar_type,
6755 scalar_results[k]);
6757 else
6759 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
6760 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6761 scalar_results.safe_push (new_temp);
6764 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6767 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6768 && induc_val)
6770 /* Earlier we set the initial value to be a vector of induc_val
6771 values. Check the result and if it is induc_val then replace
6772 it with the original initial value, unless induc_val is
6773 already the same as initial_def. */
6774 tree zcompare = make_ssa_name (boolean_type_node);
6775 epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
6776 scalar_results[0], induc_val);
6777 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6778 tree initial_def = reduc_info->reduc_initial_values[0];
6779 tree tmp = make_ssa_name (new_scalar_dest);
6780 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6781 initial_def, scalar_results[0]);
6782 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6783 scalar_results[0] = tmp;
6787 /* 2.5 Adjust the final result by the initial value of the reduction
6788 variable. (When such adjustment is not needed, then
6789 'adjustment_def' is zero). For example, if code is PLUS we create:
6790 new_temp = loop_exit_def + adjustment_def */
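/* For instance, for a scalar loop of the form

     sum = 10;
     for (i = 0; i < n; i++)
       sum += a[i];

   the vector accumulator is typically started from the neutral value 0 and
   ADJUSTMENT_DEF then holds the original initial value 10, which is added
   back here (a sketch only; the exact split is decided during analysis).  */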
6792 if (adjustment_def)
6794 gcc_assert (!slp_reduc || group_size == 1);
6795 gimple_seq stmts = NULL;
6796 if (double_reduc)
6798 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
6799 adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
6800 new_temp = gimple_build (&stmts, code, vectype,
6801 reduc_inputs[0], adjustment_def);
6803 else
6805 new_temp = scalar_results[0];
6806 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
6807 adjustment_def = gimple_convert (&stmts, TREE_TYPE (vectype),
6808 adjustment_def);
6809 new_temp = gimple_convert (&stmts, TREE_TYPE (vectype), new_temp);
6810 new_temp = gimple_build (&stmts, code, TREE_TYPE (vectype),
6811 new_temp, adjustment_def);
6812 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6815 epilog_stmt = gimple_seq_last_stmt (stmts);
6816 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6817 scalar_results[0] = new_temp;
6820 /* Record this operation if it could be reused by the epilogue loop. */
6821 if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION
6822 && reduc_inputs.length () == 1)
6823 loop_vinfo->reusable_accumulators.put (scalar_results[0],
6824 { orig_reduc_input, reduc_info });
6826 if (double_reduc)
6827 loop = outer_loop;
6829 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
6830 phis with new adjusted scalar results, i.e., replace use <s_out0>
6831 with use <s_out4>.
6833 Transform:
6834 loop_exit:
6835 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6836 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6837 v_out2 = reduce <v_out1>
6838 s_out3 = extract_field <v_out2, 0>
6839 s_out4 = adjust_result <s_out3>
6840 use <s_out0>
6841 use <s_out0>
6843 into:
6845 loop_exit:
6846 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6847 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6848 v_out2 = reduce <v_out1>
6849 s_out3 = extract_field <v_out2, 0>
6850 s_out4 = adjust_result <s_out3>
6851 use <s_out4>
6852 use <s_out4> */
6854 gcc_assert (live_out_stmts.size () == scalar_results.length ());
6855 auto_vec<gimple *> phis;
6856 for (k = 0; k < live_out_stmts.size (); k++)
6858 stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
6859 scalar_dest = gimple_get_lhs (scalar_stmt_info->stmt);
6861 /* Find the loop-closed-use at the loop exit of the original scalar
6862 result. (The reduction result is expected to have two immediate uses,
6863 one at the latch block, and one at the loop exit). For double
6864 reductions we are looking for exit phis of the outer loop. */
6865 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6867 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6869 if (!is_gimple_debug (USE_STMT (use_p))
6870 && gimple_bb (USE_STMT (use_p)) == loop_exit->dest)
6871 phis.safe_push (USE_STMT (use_p));
6873 else
6875 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6877 tree phi_res = PHI_RESULT (USE_STMT (use_p));
6879 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6881 if (!flow_bb_inside_loop_p (loop,
6882 gimple_bb (USE_STMT (phi_use_p)))
6883 && !is_gimple_debug (USE_STMT (phi_use_p)))
6884 phis.safe_push (USE_STMT (phi_use_p));
6890 FOR_EACH_VEC_ELT (phis, i, exit_phi)
6892 /* Replace the uses: */
6893 orig_name = PHI_RESULT (exit_phi);
6895 /* Look for a single use at the target of the skip edge. */
6896 if (unify_with_main_loop_p)
6898 use_operand_p use_p;
6899 gimple *user;
6900 if (!single_imm_use (orig_name, &use_p, &user))
6901 gcc_unreachable ();
6902 orig_name = gimple_get_lhs (user);
6905 scalar_result = scalar_results[k];
6906 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6908 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6909 SET_USE (use_p, scalar_result);
6910 update_stmt (use_stmt);
6914 phis.truncate (0);
6918 /* Return a vector of type VECTYPE that is equal to the vector select
6919 operation "MASK ? VEC : IDENTITY". Insert the select statements
6920 before GSI. */
6922 static tree
6923 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6924 tree vec, tree identity)
6926 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6927 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6928 mask, vec, identity);
6929 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6930 return cond;
6933 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6934 order, starting with LHS. Insert the extraction statements before GSI and
6935 associate the new scalar SSA names with variable SCALAR_DEST.
6936 If MASK is nonzero, mask the input and then operate on it unconditionally.
6937 Return the SSA name for the result. */
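/* Illustrative sketch: for VECTOR_RHS = { a, b, c, d }, CODE == PLUS_EXPR
   and LHS the current accumulator acc, the emitted statements are roughly
     s0 = extract_field <rhs, 0>    acc0 = acc  + s0
     s1 = extract_field <rhs, 1>    acc1 = acc0 + s1
     ...
   preserving the strict left-to-right evaluation order that in-order
   (fold-left) reductions require.  */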
6939 static tree
6940 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6941 tree_code code, tree lhs, tree vector_rhs,
6942 tree mask)
6944 tree vectype = TREE_TYPE (vector_rhs);
6945 tree scalar_type = TREE_TYPE (vectype);
6946 tree bitsize = TYPE_SIZE (scalar_type);
6947 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6948 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6950 /* Re-create a VEC_COND_EXPR to mask the input here in order to be able
6951 to perform an unconditional element-wise reduction of it. */
6952 if (mask)
6954 tree masked_vector_rhs = make_temp_ssa_name (vectype, NULL,
6955 "masked_vector_rhs");
6956 tree neutral_op = neutral_op_for_reduction (scalar_type, code, NULL_TREE,
6957 false);
6958 tree vector_identity = build_vector_from_val (vectype, neutral_op);
6959 gassign *select = gimple_build_assign (masked_vector_rhs, VEC_COND_EXPR,
6960 mask, vector_rhs, vector_identity);
6961 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6962 vector_rhs = masked_vector_rhs;
6965 for (unsigned HOST_WIDE_INT bit_offset = 0;
6966 bit_offset < vec_size_in_bits;
6967 bit_offset += element_bitsize)
6969 tree bitpos = bitsize_int (bit_offset);
6970 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6971 bitsize, bitpos);
6973 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6974 rhs = make_ssa_name (scalar_dest, stmt);
6975 gimple_assign_set_lhs (stmt, rhs);
6976 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6978 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6979 tree new_name = make_ssa_name (scalar_dest, stmt);
6980 gimple_assign_set_lhs (stmt, new_name);
6981 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6982 lhs = new_name;
6984 return lhs;
6987 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
6988 type of the vector input. */
6990 static internal_fn
6991 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
6993 internal_fn mask_reduc_fn;
6994 internal_fn mask_len_reduc_fn;
6996 switch (reduc_fn)
6998 case IFN_FOLD_LEFT_PLUS:
6999 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
7000 mask_len_reduc_fn = IFN_MASK_LEN_FOLD_LEFT_PLUS;
7001 break;
7003 default:
7004 return IFN_LAST;
7007 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
7008 OPTIMIZE_FOR_SPEED))
7009 return mask_reduc_fn;
7010 if (direct_internal_fn_supported_p (mask_len_reduc_fn, vectype_in,
7011 OPTIMIZE_FOR_SPEED))
7012 return mask_len_reduc_fn;
7013 return IFN_LAST;
7016 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
7017 statement that sets the live-out value. REDUC_DEF_STMT is the phi
7018 statement. CODE is the operation performed by STMT_INFO and OPS are
7019 its scalar operands. REDUC_INDEX is the index of the operand in
7020 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
7021 implements in-order reduction, or IFN_LAST if we should open-code it.
7022 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
7023 that should be used to control the operation in a fully-masked loop. */
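/* As a rough sketch, for a loop like

     for (i = 0; i < n; i++)
       res += a[i];

   vectorized with a fully-masked body and IFN_MASK_LEN_FOLD_LEFT_PLUS
   available, each vector copy handled below emits something like

     res_next = .MASK_LEN_FOLD_LEFT_PLUS (res_prev, vec_a, loop_mask,
                                          loop_len, bias);

   while without target support the reduction is open-coded element by
   element via vect_expand_fold_left.  */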
7025 static bool
7026 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
7027 stmt_vec_info stmt_info,
7028 gimple_stmt_iterator *gsi,
7029 gimple **vec_stmt, slp_tree slp_node,
7030 gimple *reduc_def_stmt,
7031 code_helper code, internal_fn reduc_fn,
7032 tree *ops, int num_ops, tree vectype_in,
7033 int reduc_index, vec_loop_masks *masks,
7034 vec_loop_lens *lens)
7036 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7037 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7038 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
7040 int ncopies;
7041 if (slp_node)
7042 ncopies = 1;
7043 else
7044 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7046 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
7047 gcc_assert (ncopies == 1);
7049 bool is_cond_op = false;
7050 if (!code.is_tree_code ())
7052 code = conditional_internal_fn_code (internal_fn (code));
7053 gcc_assert (code != ERROR_MARK);
7054 is_cond_op = true;
7057 gcc_assert (TREE_CODE_LENGTH (tree_code (code)) == binary_op);
7059 if (slp_node)
7060 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
7061 TYPE_VECTOR_SUBPARTS (vectype_in)));
7063 /* The operands either come from a binary operation or an IFN_COND operation.
7064 The former is a gimple assign with binary rhs and the latter is a
7065 gimple call with four arguments. */
7066 gcc_assert (num_ops == 2 || num_ops == 4);
7068 int group_size = 1;
7069 stmt_vec_info scalar_dest_def_info;
7070 auto_vec<tree> vec_oprnds0, vec_opmask;
7071 if (slp_node)
7073 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[(is_cond_op ? 2 : 0)
7074 + (1 - reduc_index)],
7075 &vec_oprnds0);
7076 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7077 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
7078 /* For an IFN_COND_OP we also need the vector mask operand. */
7079 if (is_cond_op)
7080 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[0], &vec_opmask);
7082 else
7084 tree op0, opmask;
7085 if (!is_cond_op)
7086 op0 = ops[1 - reduc_index];
7087 else
7089 op0 = ops[2 + (1 - reduc_index)];
7090 opmask = ops[0];
7092 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7093 op0, &vec_oprnds0);
7094 scalar_dest_def_info = stmt_info;
7096 /* For an IFN_COND_OP we also need the vector mask operand. */
7097 if (is_cond_op)
7098 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7099 opmask, &vec_opmask);
7102 gimple *sdef = vect_orig_stmt (scalar_dest_def_info)->stmt;
7103 tree scalar_dest = gimple_get_lhs (sdef);
7104 tree scalar_type = TREE_TYPE (scalar_dest);
7105 tree reduc_var = gimple_phi_result (reduc_def_stmt);
7107 int vec_num = vec_oprnds0.length ();
7108 gcc_assert (vec_num == 1 || slp_node);
7109 tree vec_elem_type = TREE_TYPE (vectype_out);
7110 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
7112 tree vector_identity = NULL_TREE;
7113 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7115 vector_identity = build_zero_cst (vectype_out);
7116 if (!HONOR_SIGNED_ZEROS (vectype_out))
7118 else
7120 gcc_assert (!HONOR_SIGN_DEPENDENT_ROUNDING (vectype_out));
7121 vector_identity = const_unop (NEGATE_EXPR, vectype_out,
7122 vector_identity);
7126 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
7127 int i;
7128 tree def0;
7129 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7131 gimple *new_stmt;
7132 tree mask = NULL_TREE;
7133 tree len = NULL_TREE;
7134 tree bias = NULL_TREE;
7135 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7137 tree loop_mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
7138 vec_num, vectype_in, i);
7139 if (is_cond_op)
7140 mask = prepare_vec_mask (loop_vinfo, TREE_TYPE (loop_mask),
7141 loop_mask, vec_opmask[i], gsi);
7142 else
7143 mask = loop_mask;
7145 else if (is_cond_op)
7146 mask = vec_opmask[i];
7147 if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
7149 len = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num, vectype_in,
7150 i, 1);
7151 signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
7152 bias = build_int_cst (intQI_type_node, biasval);
7153 if (!is_cond_op)
7154 mask = build_minus_one_cst (truth_type_for (vectype_in));
7157 /* Handle MINUS by adding the negative. */
7158 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
7160 tree negated = make_ssa_name (vectype_out);
7161 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
7162 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
7163 def0 = negated;
7166 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
7167 && mask && mask_reduc_fn == IFN_LAST)
7168 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
7169 vector_identity);
7171 /* On the first iteration the input is simply the scalar phi
7172 result, and for subsequent iterations it is the output of
7173 the preceding operation. */
7174 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
7176 if (mask && len && mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
7177 new_stmt = gimple_build_call_internal (mask_reduc_fn, 5, reduc_var,
7178 def0, mask, len, bias);
7179 else if (mask && mask_reduc_fn == IFN_MASK_FOLD_LEFT_PLUS)
7180 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
7181 def0, mask);
7182 else
7183 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
7184 def0);
7185 /* For chained SLP reductions the output of the previous reduction
7186 operation serves as the input of the next. For the final statement
7187 the output cannot be a temporary - we reuse the original
7188 scalar destination of the last statement. */
7189 if (i != vec_num - 1)
7191 gimple_set_lhs (new_stmt, scalar_dest_var);
7192 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
7193 gimple_set_lhs (new_stmt, reduc_var);
7196 else
7198 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var,
7199 tree_code (code), reduc_var, def0,
7200 mask);
7201 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
7202 /* Remove the statement, so that we can use the same code paths
7203 as for statements that we've just created. */
7204 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
7205 gsi_remove (&tmp_gsi, true);
7208 if (i == vec_num - 1)
7210 gimple_set_lhs (new_stmt, scalar_dest);
7211 vect_finish_replace_stmt (loop_vinfo,
7212 scalar_dest_def_info,
7213 new_stmt);
7215 else
7216 vect_finish_stmt_generation (loop_vinfo,
7217 scalar_dest_def_info,
7218 new_stmt, gsi);
7220 if (slp_node)
7221 slp_node->push_vec_def (new_stmt);
7222 else
7224 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7225 *vec_stmt = new_stmt;
7229 return true;
7232 /* Function is_nonwrapping_integer_induction.
7234 Check if STMT_VINFO (which is part of loop LOOP) is an integer induction
7235 that increments and does not cause overflow. */
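/* For example (assuming the niter analysis gives an upper bound of 100
   executions): an 8-bit unsigned IV with base 0 and step 1 reaches at most
   100, which fits, so the function returns true; with step 4 the bound 400
   needs 9 bits of precision and the function returns false.  */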
7237 static bool
7238 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
7240 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
7241 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
7242 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
7243 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
7244 widest_int ni, max_loop_value, lhs_max;
7245 wi::overflow_type overflow = wi::OVF_NONE;
7247 /* Make sure the loop is integer based. */
7248 if (TREE_CODE (base) != INTEGER_CST
7249 || TREE_CODE (step) != INTEGER_CST)
7250 return false;
7252 /* Check that the max size of the loop will not wrap. */
7254 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
7255 return true;
7257 if (! max_stmt_executions (loop, &ni))
7258 return false;
7260 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
7261 &overflow);
7262 if (overflow)
7263 return false;
7265 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
7266 TYPE_SIGN (lhs_type), &overflow);
7267 if (overflow)
7268 return false;
7270 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
7271 <= TYPE_PRECISION (lhs_type));
7274 /* Check if masking can be supported by inserting a conditional expression.
7275 CODE is the code for the operation. COND_FN is the conditional internal
7276 function, if it exists. VECTYPE_IN is the type of the vector input. */
7277 static bool
7278 use_mask_by_cond_expr_p (code_helper code, internal_fn cond_fn,
7279 tree vectype_in)
7281 if (cond_fn != IFN_LAST
7282 && direct_internal_fn_supported_p (cond_fn, vectype_in,
7283 OPTIMIZE_FOR_SPEED))
7284 return false;
7286 if (code.is_tree_code ())
7287 switch (tree_code (code))
7289 case DOT_PROD_EXPR:
7290 case SAD_EXPR:
7291 return true;
7293 default:
7294 break;
7296 return false;
7299 /* Insert a conditional expression to enable masked vectorization. CODE is the
7300 code for the operation. VOP is the array of operands. MASK is the loop
7301 mask. GSI is a statement iterator used to place the new conditional
7302 expression. */
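/* The else values below are chosen so that inactive lanes contribute the
   neutral element of the accumulation: for DOT_PROD_EXPR a zero operand
   makes the lane's product 0, and for SAD_EXPR using VOP[0] as the else
   value makes the lane's absolute difference |x - x| == 0.  */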
7303 static void
7304 build_vect_cond_expr (code_helper code, tree vop[3], tree mask,
7305 gimple_stmt_iterator *gsi)
7307 switch (tree_code (code))
7309 case DOT_PROD_EXPR:
7311 tree vectype = TREE_TYPE (vop[1]);
7312 tree zero = build_zero_cst (vectype);
7313 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7314 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7315 mask, vop[1], zero);
7316 gsi_insert_before (gsi, select, GSI_SAME_STMT);
7317 vop[1] = masked_op1;
7318 break;
7321 case SAD_EXPR:
7323 tree vectype = TREE_TYPE (vop[1]);
7324 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7325 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7326 mask, vop[1], vop[0]);
7327 gsi_insert_before (gsi, select, GSI_SAME_STMT);
7328 vop[1] = masked_op1;
7329 break;
7332 default:
7333 gcc_unreachable ();
7337 /* Given an operation with CODE in a loop reduction path whose reduction PHI
7338 is specified by REDUC_INFO, the operation has scalar result type TYPE and
7339 its input vectype is represented by VECTYPE_IN. The vectype of the
7340 vectorized result may be different from VECTYPE_IN, either in base type or
7341 in number of lanes, as is the case for lane-reducing operations. This
7342 function checks whether, and how, partial vectorization can be performed
7343 on the operation in the context of LOOP_VINFO. */
7345 static void
7346 vect_reduction_update_partial_vector_usage (loop_vec_info loop_vinfo,
7347 stmt_vec_info reduc_info,
7348 slp_tree slp_node,
7349 code_helper code, tree type,
7350 tree vectype_in)
7352 enum vect_reduction_type reduc_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7353 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
7354 internal_fn cond_fn = get_conditional_internal_fn (code, type);
7356 if (reduc_type != FOLD_LEFT_REDUCTION
7357 && !use_mask_by_cond_expr_p (code, cond_fn, vectype_in)
7358 && (cond_fn == IFN_LAST
7359 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7360 OPTIMIZE_FOR_SPEED)))
7362 if (dump_enabled_p ())
7363 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7364 "can't operate on partial vectors because"
7365 " no conditional operation is available.\n");
7366 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7368 else if (reduc_type == FOLD_LEFT_REDUCTION
7369 && reduc_fn == IFN_LAST
7370 && !expand_vec_cond_expr_p (vectype_in, truth_type_for (vectype_in),
7371 SSA_NAME))
7373 if (dump_enabled_p ())
7374 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7375 "can't operate on partial vectors because"
7376 " no conditional operation is available.\n");
7377 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7379 else if (reduc_type == FOLD_LEFT_REDUCTION
7380 && internal_fn_mask_index (reduc_fn) == -1
7381 && FLOAT_TYPE_P (vectype_in)
7382 && HONOR_SIGN_DEPENDENT_ROUNDING (vectype_in))
7384 if (dump_enabled_p ())
7385 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7386 "can't operate on partial vectors because"
7387 " signed zeros cannot be preserved.\n");
7388 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7390 else
7392 internal_fn mask_reduc_fn
7393 = get_masked_reduction_fn (reduc_fn, vectype_in);
7394 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7395 vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
7396 unsigned nvectors = vect_get_num_copies (loop_vinfo, slp_node,
7397 vectype_in);
7399 if (mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
7400 vect_record_loop_len (loop_vinfo, lens, nvectors, vectype_in, 1);
7401 else
7402 vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype_in, NULL);
7406 /* Check if STMT_INFO is a lane-reducing operation that can be vectorized in
7407 the context of LOOP_VINFO; the vector cost is recorded in COST_VEC, and
7408 the analysis is for SLP if SLP_NODE is not NULL.
7410 For a lane-reducing operation, the loop reduction path that it lies in
7411 may contain normal operations, or other lane-reducing operations with
7412 different input type sizes, for example:
7414 int sum = 0;
7415 for (i)
7418 sum += d0[i] * d1[i]; // dot-prod <vector(16) char>
7419 sum += w[i]; // widen-sum <vector(16) char>
7420 sum += abs(s0[i] - s1[i]); // sad <vector(8) short>
7421 sum += n[i]; // normal <vector(4) int>
7425 The vectorization factor is essentially determined by the operation whose
7426 input vectype has the most lanes ("vector(16) char" in the example), while
7427 we need to choose the input vectype with the fewest lanes ("vector(4) int"
7428 in the example) to determine the effective number of vector reduction PHIs. */
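/* For instance, if the "vector(16) char" operations make the vectorization
   factor 16, the int accumulator only needs 16 / 4 == 4 lanes per vector
   statement, i.e. four "vector(4) int" reduction PHI copies (the numbers
   are purely illustrative).  */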
7430 bool
7431 vectorizable_lane_reducing (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
7432 slp_tree slp_node, stmt_vector_for_cost *cost_vec)
7434 gimple *stmt = stmt_info->stmt;
7436 if (!lane_reducing_stmt_p (stmt))
7437 return false;
7439 tree type = TREE_TYPE (gimple_assign_lhs (stmt));
7441 if (!INTEGRAL_TYPE_P (type))
7442 return false;
7444 /* Do not try to vectorize bit-precision reductions. */
7445 if (!type_has_mode_precision_p (type))
7446 return false;
7448 stmt_vec_info reduc_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
7450 /* TODO: Support lane-reducing operation that does not directly participate
7451 in loop reduction. */
7452 if (!reduc_info || STMT_VINFO_REDUC_IDX (stmt_info) < 0)
7453 return false;
7455 /* A lane-reducing pattern inside any inner loop of LOOP_VINFO is not
7456 recognized. */
7457 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_reduction_def);
7458 gcc_assert (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION);
7460 for (int i = 0; i < (int) gimple_num_ops (stmt) - 1; i++)
7462 stmt_vec_info def_stmt_info;
7463 slp_tree slp_op;
7464 tree op;
7465 tree vectype;
7466 enum vect_def_type dt;
7468 if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_node, i, &op,
7469 &slp_op, &dt, &vectype, &def_stmt_info))
7471 if (dump_enabled_p ())
7472 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7473 "use not simple.\n");
7474 return false;
7477 if (!vectype)
7479 vectype = get_vectype_for_scalar_type (loop_vinfo, TREE_TYPE (op),
7480 slp_op);
7481 if (!vectype)
7482 return false;
7485 if (slp_node && !vect_maybe_update_slp_op_vectype (slp_op, vectype))
7487 if (dump_enabled_p ())
7488 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7489 "incompatible vector types for invariants\n");
7490 return false;
7493 if (i == STMT_VINFO_REDUC_IDX (stmt_info))
7494 continue;
7496 /* There should be at most one cycle def in the stmt. */
7497 if (VECTORIZABLE_CYCLE_DEF (dt))
7498 return false;
7501 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (stmt_info);
7503 gcc_assert (vectype_in);
7505 /* Compute number of effective vector statements for costing. */
7506 unsigned int ncopies_for_cost = vect_get_num_copies (loop_vinfo, slp_node,
7507 vectype_in);
7508 gcc_assert (ncopies_for_cost >= 1);
7510 if (vect_is_emulated_mixed_dot_prod (stmt_info))
7512 /* We need two extra invariants: one that contains the minimum signed
7513 value and one that contains half of its negative. */
7514 int prologue_stmts = 2;
7515 unsigned cost = record_stmt_cost (cost_vec, prologue_stmts,
7516 scalar_to_vec, stmt_info, 0,
7517 vect_prologue);
7518 if (dump_enabled_p ())
7519 dump_printf (MSG_NOTE, "vectorizable_lane_reducing: "
7520 "extra prologue_cost = %d .\n", cost);
7522 /* Three dot-products and a subtraction. */
7523 ncopies_for_cost *= 4;
7526 record_stmt_cost (cost_vec, (int) ncopies_for_cost, vector_stmt, stmt_info,
7527 0, vect_body);
7529 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
7531 enum tree_code code = gimple_assign_rhs_code (stmt);
7532 vect_reduction_update_partial_vector_usage (loop_vinfo, reduc_info,
7533 slp_node, code, type,
7534 vectype_in);
7537 /* Transform via vect_transform_reduction. */
7538 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7539 return true;
7542 /* Function vectorizable_reduction.
7544 Check if STMT_INFO performs a reduction operation that can be vectorized.
7545 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
7546 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
7547 Return true if STMT_INFO is vectorizable in this way.
7549 This function also handles reduction idioms (patterns) that have been
7550 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
7551 may be of this form:
7552 X = pattern_expr (arg0, arg1, ..., X)
7553 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
7554 sequence that had been detected and replaced by the pattern-stmt
7555 (STMT_INFO).
7557 This function also handles reduction of condition expressions, for example:
7558 for (int i = 0; i < N; i++)
7559 if (a[i] < value)
7560 last = a[i];
7561 This is handled by vectorising the loop and creating an additional vector
7562 containing the loop indexes for which "a[i] < value" was true. In the
7563 function epilogue this is reduced to a single max value and then used to
7564 index into the vector of results.
7566 In some cases of reduction patterns, the type of the reduction variable X is
7567 different than the type of the other arguments of STMT_INFO.
7568 In such cases, the vectype that is used when transforming STMT_INFO into
7569 a vector stmt is different than the vectype that is used to determine the
7570 vectorization factor, because it consists of a different number of elements
7571 than the actual number of elements that are being operated upon in parallel.
7573 For example, consider an accumulation of shorts into an int accumulator.
7574 On some targets it's possible to vectorize this pattern operating on 8
7575 shorts at a time (hence, the vectype for purposes of determining the
7576 vectorization factor should be V8HI); on the other hand, the vectype that
7577 is used to create the vector form is actually V4SI (the type of the result).
7579 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
7580 indicates what is the actual level of parallelism (V8HI in the example), so
7581 that the right vectorization factor would be derived. This vectype
7582 corresponds to the type of arguments to the reduction stmt, and should *NOT*
7583 be used to create the vectorized stmt. The right vectype for the vectorized
7584 stmt is obtained from the type of the result X:
7585 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7587 This means that, contrary to "regular" reductions (or "regular" stmts in
7588 general), the following equation:
7589 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7590 does *NOT* necessarily hold for reduction patterns. */
7592 bool
7593 vectorizable_reduction (loop_vec_info loop_vinfo,
7594 stmt_vec_info stmt_info, slp_tree slp_node,
7595 slp_instance slp_node_instance,
7596 stmt_vector_for_cost *cost_vec)
7598 tree vectype_in = NULL_TREE;
7599 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7600 enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
7601 stmt_vec_info cond_stmt_vinfo = NULL;
7602 int i;
7603 int ncopies;
7604 bool single_defuse_cycle = false;
7605 bool nested_cycle = false;
7606 bool double_reduc = false;
7607 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
7608 tree cond_reduc_val = NULL_TREE;
7610 /* Make sure it was already recognized as a reduction computation. */
7611 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
7612 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
7613 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
7614 return false;
7616 /* The stmt we store reduction analysis meta on. */
7617 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7618 reduc_info->is_reduc_info = true;
7620 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
7622 if (is_a <gphi *> (stmt_info->stmt))
7624 if (slp_node)
7626 /* We eventually need to set a vector type on invariant
7627 arguments. */
7628 unsigned j;
7629 slp_tree child;
7630 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
7631 if (!vect_maybe_update_slp_op_vectype
7632 (child, SLP_TREE_VECTYPE (slp_node)))
7634 if (dump_enabled_p ())
7635 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7636 "incompatible vector types for "
7637 "invariants\n");
7638 return false;
7641 /* Analysis for double-reduction is done on the outer
7642 loop PHI, nested cycles have no further restrictions. */
7643 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
7645 else
7646 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7647 return true;
7650 stmt_vec_info orig_stmt_of_analysis = stmt_info;
7651 stmt_vec_info phi_info = stmt_info;
7652 if (!is_a <gphi *> (stmt_info->stmt))
7654 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7655 return true;
7657 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7659 if (gimple_bb (stmt_info->stmt) != loop->header)
7661 /* For SLP we arrive here for both the inner loop LC PHI and
7662 the outer loop PHI. The latter is what we want to analyze
7663 the reduction with. */
7664 gcc_assert (slp_node);
7665 return true;
7667 use_operand_p use_p;
7668 gimple *use_stmt;
7669 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
7670 &use_p, &use_stmt);
7671 gcc_assert (res);
7672 phi_info = loop_vinfo->lookup_stmt (use_stmt);
7675 if (slp_node)
7677 slp_node_instance->reduc_phis = slp_node;
7678 /* ??? We're leaving slp_node pointing to the PHIs; we only
7679 need it to get at the number of vector stmts, which wasn't
7680 yet initialized for the instance root. */
7683 /* PHIs should not participate in patterns. */
7684 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7685 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7687 /* Verify that following REDUC_IDX from the latch def leads us back to the PHI
7688 and compute the reduction chain length. Discover the real
7689 reduction operation stmt on the way (stmt_info and slp_for_stmt_info). */
7690 tree reduc_def
7691 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
7692 loop_latch_edge
7693 (gimple_bb (reduc_def_phi)->loop_father));
7694 unsigned reduc_chain_length = 0;
7695 bool only_slp_reduc_chain = true;
7696 stmt_info = NULL;
7697 slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
7698 /* For double-reductions we start SLP analysis at the inner loop LC PHI
7699 which is the def of the outer loop live stmt. */
7700 if (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def
7701 && slp_node)
7702 slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
7703 while (reduc_def != PHI_RESULT (reduc_def_phi))
7705 stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
7706 stmt_vec_info vdef = vect_stmt_to_vectorize (def);
7707 int reduc_idx = STMT_VINFO_REDUC_IDX (vdef);
7709 if (reduc_idx == -1)
7711 if (dump_enabled_p ())
7712 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7713 "reduction chain broken by patterns.\n");
7714 return false;
7716 if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
7717 only_slp_reduc_chain = false;
7718 /* For epilogue generation live members of the chain need
7719 to point back to the PHI via their original stmt for
7720 info_for_reduction to work. For SLP we need to look at
7721 all lanes here - even though we only will vectorize from
7722 the SLP node with live lane zero the other live lanes also
7723 need to be identified as part of a reduction to be able
7724 to skip code generation for them. */
7725 if (slp_for_stmt_info)
7727 for (auto s : SLP_TREE_SCALAR_STMTS (slp_for_stmt_info))
7728 if (STMT_VINFO_LIVE_P (s))
7729 STMT_VINFO_REDUC_DEF (vect_orig_stmt (s)) = phi_info;
7731 else if (STMT_VINFO_LIVE_P (vdef))
7732 STMT_VINFO_REDUC_DEF (def) = phi_info;
7733 gimple_match_op op;
7734 if (!gimple_extract_op (vdef->stmt, &op))
7736 if (dump_enabled_p ())
7737 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7738 "reduction chain includes unsupported"
7739 " statement type.\n");
7740 return false;
7742 if (CONVERT_EXPR_CODE_P (op.code))
7744 if (!tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
7746 if (dump_enabled_p ())
7747 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7748 "conversion in the reduction chain.\n");
7749 return false;
7752 else
7754 /* First non-conversion stmt. */
7755 if (!stmt_info)
7756 stmt_info = vdef;
7758 if (lane_reducing_op_p (op.code))
7760 enum vect_def_type dt;
7761 tree vectype_op;
7763 /* The last operand of lane-reducing operation is for
7764 reduction. */
7765 gcc_assert (reduc_idx > 0 && reduc_idx == (int) op.num_ops - 1);
7767 if (!vect_is_simple_use (op.ops[0], loop_vinfo, &dt, &vectype_op))
7768 return false;
7770 tree type_op = TREE_TYPE (op.ops[0]);
7772 if (!vectype_op)
7774 vectype_op = get_vectype_for_scalar_type (loop_vinfo,
7775 type_op);
7776 if (!vectype_op)
7777 return false;
7780 /* For lane-reducing operation vectorizable analysis needs the
7781 reduction PHI information. */
7782 STMT_VINFO_REDUC_DEF (def) = phi_info;
7784 /* Each lane-reducing operation has its own input vectype, while
7785 reduction PHI will record the input vectype with the least
7786 lanes. */
7787 STMT_VINFO_REDUC_VECTYPE_IN (vdef) = vectype_op;
7789 /* To accommodate lane-reducing operations of mixed input
7790 vectypes, choose input vectype with the least lanes for the
7791 reduction PHI statement, which would result in the most
7792 ncopies for vectorized reduction results. */
7793 if (!vectype_in
7794 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
7795 < GET_MODE_SIZE (SCALAR_TYPE_MODE (type_op))))
7796 vectype_in = vectype_op;
7798 else
7799 vectype_in = STMT_VINFO_VECTYPE (phi_info);
7802 reduc_def = op.ops[reduc_idx];
7803 reduc_chain_length++;
7804 if (!stmt_info && slp_node)
7805 slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
7807 /* PHIs should not participate in patterns. */
7808 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7810 if (nested_in_vect_loop_p (loop, stmt_info))
7812 loop = loop->inner;
7813 nested_cycle = true;
7816 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
7817 element. */
7818 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7820 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
7821 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
7823 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7824 gcc_assert (slp_node
7825 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
7827 /* 1. Is vectorizable reduction? */
7828 /* Not supportable if the reduction variable is used in the loop, unless
7829 it's a reduction chain. */
7830 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
7831 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7832 return false;
7834 /* Reductions that are not used even in an enclosing outer-loop,
7835 are expected to be "live" (used out of the loop). */
7836 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
7837 && !STMT_VINFO_LIVE_P (stmt_info))
7838 return false;
7840 /* 2. Has this been recognized as a reduction pattern?
7842 Check if STMT represents a pattern that has been recognized
7843 in earlier analysis stages. For stmts that represent a pattern,
7844 the STMT_VINFO_RELATED_STMT field records the last stmt in
7845 the original sequence that constitutes the pattern. */
7847 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
7848 if (orig_stmt_info)
7850 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
7851 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
7854 /* 3. Check the operands of the operation. The first operands are defined
7855 inside the loop body. The last operand is the reduction variable,
7856 which is defined by the loop-header-phi. */
7858 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7859 STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
7860 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
7862 gimple_match_op op;
7863 if (!gimple_extract_op (stmt_info->stmt, &op))
7864 gcc_unreachable ();
7865 bool lane_reducing = lane_reducing_op_p (op.code);
7867 if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type)
7868 && !SCALAR_FLOAT_TYPE_P (op.type))
7869 return false;
7871 /* Do not try to vectorize bit-precision reductions. */
7872 if (!type_has_mode_precision_p (op.type))
7873 return false;
7875 /* Lane-reducing ops can also never be used in an SLP reduction group
7876 since we'll mix lanes belonging to different reductions. But it's
7877 OK to use them in a reduction chain or when the reduction group
7878 has just one element. */
7879 if (lane_reducing
7880 && slp_node
7881 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7882 && SLP_TREE_LANES (slp_node) > 1)
7884 if (dump_enabled_p ())
7885 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7886 "lane-reducing reduction in reduction group.\n");
7887 return false;
7890 /* All uses but the last are expected to be defined in the loop.
7891 The last use is the reduction variable. In case of nested cycle this
7892 assumption is not true: we use reduc_index to record the index of the
7893 reduction variable. */
7894 slp_tree *slp_op = XALLOCAVEC (slp_tree, op.num_ops);
7895 tree *vectype_op = XALLOCAVEC (tree, op.num_ops);
7896 /* We need to skip an extra operand for COND_EXPRs with embedded
7897 comparison. */
7898 unsigned opno_adjust = 0;
7899 if (op.code == COND_EXPR && COMPARISON_CLASS_P (op.ops[0]))
7900 opno_adjust = 1;
7901 for (i = 0; i < (int) op.num_ops; i++)
7903 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
7904 if (i == 0 && op.code == COND_EXPR)
7905 continue;
7907 stmt_vec_info def_stmt_info;
7908 enum vect_def_type dt;
7909 if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
7910 i + opno_adjust, &op.ops[i], &slp_op[i], &dt,
7911 &vectype_op[i], &def_stmt_info))
7913 if (dump_enabled_p ())
7914 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7915 "use not simple.\n");
7916 return false;
7919 /* Skip reduction operands, and for an IFN_COND_OP we might hit the
7920 reduction operand twice (once as definition, once as else). */
7921 if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)])
7922 continue;
7924 /* There should be only one cycle def in the stmt, the one
7925 leading to reduc_def. */
7926 if (VECTORIZABLE_CYCLE_DEF (dt))
7927 return false;
7929 if (!vectype_op[i])
7930 vectype_op[i]
7931 = get_vectype_for_scalar_type (loop_vinfo,
7932 TREE_TYPE (op.ops[i]), slp_op[i]);
7934 /* Record how the non-reduction-def value of COND_EXPR is defined.
7935 ??? For a chain of multiple CONDs we'd have to match them all up. */
7936 if (op.code == COND_EXPR && reduc_chain_length == 1)
7938 if (dt == vect_constant_def)
7940 cond_reduc_dt = dt;
7941 cond_reduc_val = op.ops[i];
7943 else if (dt == vect_induction_def
7944 && def_stmt_info
7945 && is_nonwrapping_integer_induction (def_stmt_info, loop))
7947 cond_reduc_dt = dt;
7948 cond_stmt_vinfo = def_stmt_info;
7953 enum vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (phi_info);
7954 STMT_VINFO_REDUC_TYPE (reduc_info) = reduction_type;
7955 /* If we have a condition reduction, see if we can simplify it further. */
7956 if (reduction_type == COND_REDUCTION)
7958 if (slp_node && SLP_TREE_LANES (slp_node) != 1)
7959 return false;
7961 /* When the condition uses the reduction value in the condition, fail. */
7962 if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
7964 if (dump_enabled_p ())
7965 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7966 "condition depends on previous iteration\n");
7967 return false;
7970 if (reduc_chain_length == 1
7971 && (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST, vectype_in,
7972 OPTIMIZE_FOR_SPEED)
7973 || direct_internal_fn_supported_p (IFN_LEN_FOLD_EXTRACT_LAST,
7974 vectype_in,
7975 OPTIMIZE_FOR_SPEED)))
7977 if (dump_enabled_p ())
7978 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7979 "optimizing condition reduction with"
7980 " FOLD_EXTRACT_LAST.\n");
7981 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
7983 else if (cond_reduc_dt == vect_induction_def)
7985 tree base
7986 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
7987 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
7989 gcc_assert (TREE_CODE (base) == INTEGER_CST
7990 && TREE_CODE (step) == INTEGER_CST);
7991 cond_reduc_val = NULL_TREE;
7992 enum tree_code cond_reduc_op_code = ERROR_MARK;
7993 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
7994 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
7996 /* Find a suitable value: below base for MAX_EXPR, above base for
7997 MIN_EXPR; punt for now if base is the minimum value of the type for
7998 MAX_EXPR or the maximum value of the type for MIN_EXPR. */
7999 else if (tree_int_cst_sgn (step) == -1)
8001 cond_reduc_op_code = MIN_EXPR;
8002 if (tree_int_cst_sgn (base) == -1)
8003 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
8004 else if (tree_int_cst_lt (base,
8005 TYPE_MAX_VALUE (TREE_TYPE (base))))
8006 cond_reduc_val
8007 = int_const_binop (PLUS_EXPR, base, integer_one_node);
8009 else
8011 cond_reduc_op_code = MAX_EXPR;
8012 if (tree_int_cst_sgn (base) == 1)
8013 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
8014 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
8015 base))
8016 cond_reduc_val
8017 = int_const_binop (MINUS_EXPR, base, integer_one_node);
8019 if (cond_reduc_val)
8021 if (dump_enabled_p ())
8022 dump_printf_loc (MSG_NOTE, vect_location,
8023 "condition expression based on "
8024 "integer induction.\n");
8025 STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
8026 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
8027 = cond_reduc_val;
8028 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
8031 else if (cond_reduc_dt == vect_constant_def)
8033 enum vect_def_type cond_initial_dt;
8034 tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
8035 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
8036 if (cond_initial_dt == vect_constant_def
8037 && types_compatible_p (TREE_TYPE (cond_initial_val),
8038 TREE_TYPE (cond_reduc_val)))
8040 tree e = fold_binary (LE_EXPR, boolean_type_node,
8041 cond_initial_val, cond_reduc_val);
8042 if (e && (integer_onep (e) || integer_zerop (e)))
8044 if (dump_enabled_p ())
8045 dump_printf_loc (MSG_NOTE, vect_location,
8046 "condition expression based on "
8047 "compile time constant.\n");
8048 /* Record reduction code at analysis stage. */
8049 STMT_VINFO_REDUC_CODE (reduc_info)
8050 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
8051 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
8057 if (STMT_VINFO_LIVE_P (phi_info))
8058 return false;
8060 if (slp_node)
8061 ncopies = 1;
8062 else
8063 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8065 gcc_assert (ncopies >= 1);
8067 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
8069 if (nested_cycle)
8071 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
8072 == vect_double_reduction_def);
8073 double_reduc = true;
8076 /* 4.2. Check support for the epilog operation.
8078 If STMT represents a reduction pattern, then the type of the
8079 reduction variable may be different than the type of the rest
8080 of the arguments. For example, consider the case of accumulation
8081 of shorts into an int accumulator; The original code:
8082 S1: int_a = (int) short_a;
8083 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
8085 was replaced with:
8086 STMT: int_acc = widen_sum <short_a, int_acc>
8088 This means that:
8089 1. The tree-code that is used to create the vector operation in the
8090 epilog code (that reduces the partial results) is not the
8091 tree-code of STMT, but is rather the tree-code of the original
8092 stmt from the pattern that STMT is replacing. I.e, in the example
8093 above we want to use 'widen_sum' in the loop, but 'plus' in the
8094 epilog.
8095 2. The type (mode) we use to check available target support
8096 for the vector operation to be created in the *epilog*, is
8097 determined by the type of the reduction variable (in the example
8098 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
8099 However the type (mode) we use to check available target support
8100 for the vector operation to be created *inside the loop*, is
8101 determined by the type of the other arguments to STMT (in the
8102 example we'd check this: optab_handler (widen_sum_optab,
8103 vect_short_mode)).
8105 This is contrary to "regular" reductions, in which the types of all
8106 the arguments are the same as the type of the reduction variable.
8107 For "regular" reductions we can therefore use the same vector type
8108 (and also the same tree-code) when generating the epilog code and
8109 when generating the code inside the loop. */
8111 code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info);
8113 /* If-conversion might already have created a conditional operation like
8114 IFN_COND_ADD. Use the corresponding tree code for the following checks. */
8115 if (orig_code.is_internal_fn ())
8117 tree_code new_code = conditional_internal_fn_code (internal_fn (orig_code));
8118 orig_code = new_code != ERROR_MARK ? new_code : orig_code;
8121 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
8123 reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
8124 if (reduction_type == TREE_CODE_REDUCTION)
8126 /* Check whether it's ok to change the order of the computation.
8127 Generally, when vectorizing a reduction we change the order of the
8128 computation. This may change the behavior of the program in some
8129 cases, so we need to check that this is ok. One exception is when
8130 vectorizing an outer-loop: the inner-loop is executed sequentially,
8131 and therefore vectorizing reductions in the inner-loop during
8132 outer-loop vectorization is safe. Likewise when we are vectorizing
8133 a series of reductions using SLP and the VF is one, the reductions
8134 are performed in scalar order. */
8135 if (slp_node
8136 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
8137 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
8139 else if (needs_fold_left_reduction_p (op.type, orig_code))
8141 /* When vectorizing a reduction chain w/o SLP the reduction PHI
8142 is not directly used in stmt. */
8143 if (!only_slp_reduc_chain
8144 && reduc_chain_length != 1)
8146 if (dump_enabled_p ())
8147 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8148 "in-order reduction chain without SLP.\n");
8149 return false;
8151 STMT_VINFO_REDUC_TYPE (reduc_info)
8152 = reduction_type = FOLD_LEFT_REDUCTION;
8154 else if (!commutative_binary_op_p (orig_code, op.type)
8155 || !associative_binary_op_p (orig_code, op.type))
8157 if (dump_enabled_p ())
8158 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8159 "reduction: not commutative/associative\n");
8160 return false;
8164 if ((reduction_type == COND_REDUCTION
8165 || reduction_type == INTEGER_INDUC_COND_REDUCTION
8166 || reduction_type == CONST_COND_REDUCTION
8167 || reduction_type == EXTRACT_LAST_REDUCTION)
8168 && slp_node
8169 && SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node) > 1)
8171 if (dump_enabled_p ())
8172 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8173 "multiple types in condition reduction.\n");
8174 return false;
8177 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
8178 && ncopies > 1)
8180 if (dump_enabled_p ())
8181 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8182 "multiple types in double reduction or condition "
8183 "reduction or fold-left reduction.\n");
8184 return false;
8187 internal_fn reduc_fn = IFN_LAST;
8188 if (reduction_type == TREE_CODE_REDUCTION
8189 || reduction_type == FOLD_LEFT_REDUCTION
8190 || reduction_type == INTEGER_INDUC_COND_REDUCTION
8191 || reduction_type == CONST_COND_REDUCTION)
8193 if (reduction_type == FOLD_LEFT_REDUCTION
8194 ? fold_left_reduction_fn (orig_code, &reduc_fn)
8195 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
8197 if (reduc_fn != IFN_LAST
8198 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
8199 OPTIMIZE_FOR_SPEED))
8201 if (dump_enabled_p ())
8202 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8203 "reduc op not supported by target.\n");
8205 reduc_fn = IFN_LAST;
8208 else
8210 if (!nested_cycle || double_reduc)
8212 if (dump_enabled_p ())
8213 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8214 "no reduc code for scalar code.\n");
8216 return false;
8220 else if (reduction_type == COND_REDUCTION)
8222 int scalar_precision
8223 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (op.type));
8224 cr_index_scalar_type = make_unsigned_type (scalar_precision);
8225 cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
8226 vectype_out);
8228 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
8229 OPTIMIZE_FOR_SPEED))
8230 reduc_fn = IFN_REDUC_MAX;
8232 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
8234 if (reduction_type != EXTRACT_LAST_REDUCTION
8235 && (!nested_cycle || double_reduc)
8236 && reduc_fn == IFN_LAST
8237 && !nunits_out.is_constant ())
8239 if (dump_enabled_p ())
8240 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8241 "missing target support for reduction on"
8242 " variable-length vectors.\n");
8243 return false;
8246 /* For SLP reductions, see if there is a neutral value we can use. */
8247 tree neutral_op = NULL_TREE;
8248 if (slp_node)
8250 tree initial_value = NULL_TREE;
8251 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL)
8252 initial_value = vect_phi_initial_value (reduc_def_phi);
8253 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
8254 orig_code, initial_value);
8257 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
8259 /* We can't support in-order reductions of code such as this:
8261 for (int i = 0; i < n1; ++i)
8262 for (int j = 0; j < n2; ++j)
8263 l += a[j];
8265 since GCC effectively transforms the loop when vectorizing:
8267 for (int i = 0; i < n1 / VF; ++i)
8268 for (int j = 0; j < n2; ++j)
8269 for (int k = 0; k < VF; ++k)
8270 l += a[j];
8272 which is a reassociation of the original operation. */
8273 if (dump_enabled_p ())
8274 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8275 "in-order double reduction not supported.\n");
8277 return false;
8280 if (reduction_type == FOLD_LEFT_REDUCTION
8281 && (slp_node && SLP_TREE_LANES (slp_node) > 1)
8282 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
8284 /* We cannot use in-order reductions in this case because there is
8285 an implicit reassociation of the operations involved. */
8286 if (dump_enabled_p ())
8287 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8288 "in-order unchained SLP reductions not supported.\n");
8289 return false;
8292 /* For double reductions, and for SLP reductions with a neutral value,
8293 we construct a variable-length initial vector by loading a vector
8294 full of the neutral value and then shift-and-inserting the start
8295 values into the low-numbered elements. */
8296 if ((double_reduc || neutral_op)
8297 && !nunits_out.is_constant ()
8298 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
8299 vectype_out, OPTIMIZE_FOR_SPEED))
8301 if (dump_enabled_p ())
8302 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8303 "reduction on variable-length vectors requires"
8304 " target support for a vector-shift-and-insert"
8305 " operation.\n");
8306 return false;
8309 /* Check extra constraints for variable-length unchained SLP reductions. */
8310 if (slp_node
8311 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
8312 && !nunits_out.is_constant ())
8314 /* We checked above that we could build the initial vector when
8315 there's a neutral element value. Check here for the case in
8316 which each SLP statement has its own initial value and in which
8317 that value needs to be repeated for every instance of the
8318 statement within the initial vector. */
8319 unsigned int group_size = SLP_TREE_LANES (slp_node);
8320 if (!neutral_op
8321 && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
8322 TREE_TYPE (vectype_out)))
8324 if (dump_enabled_p ())
8325 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8326 "unsupported form of SLP reduction for"
8327 " variable-length vectors: cannot build"
8328 " initial vector.\n");
8329 return false;
8331 /* The epilogue code relies on the number of elements being a multiple
8332 of the group size. The duplicate-and-interleave approach to setting
8333 up the initial vector does too. */
8334 if (!multiple_p (nunits_out, group_size))
8336 if (dump_enabled_p ())
8337 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8338 "unsupported form of SLP reduction for"
8339 " variable-length vectors: the vector size"
8340 " is not a multiple of the number of results.\n");
8341 return false;
8345 if (reduction_type == COND_REDUCTION)
8347 widest_int ni;
8349 if (! max_loop_iterations (loop, &ni))
8351 if (dump_enabled_p ())
8352 dump_printf_loc (MSG_NOTE, vect_location,
8353 "loop count not known, cannot create cond "
8354 "reduction.\n");
8355 return false;
8357 /* Convert backedges to iterations. */
8358 ni += 1;
8360 /* The additional index will be the same type as the condition. Check
8361 that the loop count fits into this type less one (because we'll use up
8362 the zero slot for when there are no matches). */
8363 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
8364 if (wi::geu_p (ni, wi::to_widest (max_index)))
8366 if (dump_enabled_p ())
8367 dump_printf_loc (MSG_NOTE, vect_location,
8368 "loop size is greater than data size.\n");
8369 return false;
8373 /* In case the vectorization factor (VF) is bigger than the number
8374 of elements that we can fit in a vectype (nunits), we have to generate
8375 more than one vector stmt, i.e. we need to "unroll" the
8376 vector stmt by a factor VF/nunits. For more details see documentation
8377 in vectorizable_operation. */
8379 /* If the reduction is used in an outer loop we need to generate
8380 VF intermediate results, like so (e.g. for ncopies=2):
8381 r0 = phi (init, r0)
8382 r1 = phi (init, r1)
8383 r0 = x0 + r0;
8384 r1 = x1 + r1;
8385 (i.e. we generate VF results in 2 registers).
8386 In this case we have a separate def-use cycle for each copy, and therefore
8387 for each copy we get the vector def for the reduction variable from the
8388 respective phi node created for this copy.
8390 Otherwise (the reduction is unused in the loop nest), we can combine
8391 together intermediate results, like so (e.g. for ncopies=2):
8392 r = phi (init, r)
8393 r = x0 + r;
8394 r = x1 + r;
8395 (i.e. we generate VF/2 results in a single register).
8396 In this case for each copy we get the vector def for the reduction variable
8397 from the vectorized reduction operation generated in the previous iteration.
8399 This only works when we see both the reduction PHI and its only consumer
8400 in vectorizable_reduction and there are no intermediate stmts
8401 participating. When unrolling we want each unrolled iteration to have its
8402 own reduction accumulator since one of the main goals of unrolling a
8403 reduction is to reduce the aggregate loop-carried latency. */
8404 if ((ncopies > 1
8405 || (slp_node
8406 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
8407 && SLP_TREE_LANES (slp_node) == 1
8408 && vect_get_num_copies (loop_vinfo, vectype_in) > 1))
8409 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
8410 && reduc_chain_length == 1
8411 && loop_vinfo->suggested_unroll_factor == 1)
8412 single_defuse_cycle = true;
8414 if (single_defuse_cycle && !lane_reducing)
8416 gcc_assert (op.code != COND_EXPR);
8418 /* 4. check support for the operation in the loop
8420 This isn't necessary for the lane reduction codes, since they
8421 can only be produced by pattern matching, and it's up to the
8422 pattern matcher to test for support. The main reason for
8423 specifically skipping this step is to avoid rechecking whether
8424 mixed-sign dot-products can be implemented using signed
8425 dot-products. */
8426 machine_mode vec_mode = TYPE_MODE (vectype_in);
8427 if (!directly_supported_p (op.code, vectype_in, optab_vector))
8429 if (dump_enabled_p ())
8430 dump_printf (MSG_NOTE, "op not supported by target.\n");
8431 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
8432 || !vect_can_vectorize_without_simd_p (op.code))
8433 single_defuse_cycle = false;
8434 else
8435 if (dump_enabled_p ())
8436 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
8439 if (vect_emulated_vector_p (vectype_in)
8440 && !vect_can_vectorize_without_simd_p (op.code))
8442 if (dump_enabled_p ())
8443 dump_printf (MSG_NOTE, "using word mode not possible.\n");
8444 return false;
8447 if (dump_enabled_p () && single_defuse_cycle)
8448 dump_printf_loc (MSG_NOTE, vect_location,
8449 "using single def-use cycle for reduction by reducing "
8450 "multiple vectors to one in the loop body\n");
8451 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
8453 /* For a lane-reducing operation, the processing below related to the single
8454 defuse-cycle will be done in its own vectorizable function. Note also
8455 that such an operation must not be involved in a fold-left
8456 reduction. */
8457 single_defuse_cycle &= !lane_reducing;
8459 if (slp_node
8460 && (single_defuse_cycle || reduction_type == FOLD_LEFT_REDUCTION))
8461 for (i = 0; i < (int) op.num_ops; i++)
8462 if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i]))
8464 if (dump_enabled_p ())
8465 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8466 "incompatible vector types for invariants\n");
8467 return false;
8470 vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
8471 reduction_type, ncopies, cost_vec);
8472 /* Cost the reduction op inside the loop if transformed via
8473 vect_transform_reduction for non-lane-reducing operation. Otherwise
8474 this is costed by the separate vectorizable_* routines. */
8475 if (single_defuse_cycle)
8476 record_stmt_cost (cost_vec, ncopies, vector_stmt, stmt_info, 0, vect_body);
8478 if (dump_enabled_p ()
8479 && reduction_type == FOLD_LEFT_REDUCTION)
8480 dump_printf_loc (MSG_NOTE, vect_location,
8481 "using an in-order (fold-left) reduction.\n");
8482 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
8484 /* All but single defuse-cycle optimized and fold-left reductions go
8485 through their own vectorizable_* routines. */
8486 if (!single_defuse_cycle && reduction_type != FOLD_LEFT_REDUCTION)
8488 stmt_vec_info tem
8489 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
8490 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
8492 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
8493 tem = REDUC_GROUP_FIRST_ELEMENT (tem);
8495 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
8496 STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
8498 else if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8499 vect_reduction_update_partial_vector_usage (loop_vinfo, reduc_info,
8500 slp_node, op.code, op.type,
8501 vectype_in);
8502 return true;
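/* Illustrative sketch, not part of GCC: the two accumulation schemes that
   the single def-use cycle decision in vectorizable_reduction above chooses
   between, written out on scalars for an assumed unroll factor of 2.  The
   function names and the array X are made up for the example.  */

#include <stddef.h>

/* Each unrolled copy keeps its own accumulator (separate def-use cycles);
   the partial sums are only combined in the epilogue.  This reduces the
   aggregate loop-carried latency.  */
static int
sum_two_accumulators (const int *x, size_t n)   /* N assumed even.  */
{
  int r0 = 0, r1 = 0;
  for (size_t i = 0; i < n; i += 2)
    {
      r0 += x[i];
      r1 += x[i + 1];
    }
  return r0 + r1;
}

/* Single def-use cycle: every copy feeds the same accumulator inside the
   loop body, so only one vector register is live across the backedge.  */
static int
sum_single_cycle (const int *x, size_t n)       /* N assumed even.  */
{
  int r = 0;
  for (size_t i = 0; i < n; i += 2)
    {
      r += x[i];
      r += x[i + 1];
    }
  return r;
}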
8505 /* STMT_INFO is a dot-product reduction whose multiplication operands
8506 have different signs. Emit a sequence to emulate the operation
8507 using a series of signed DOT_PROD_EXPRs and return the last
8508 statement generated. VEC_DEST is the result of the vector operation
8509 and VOP lists its inputs. */
8511 static gassign *
8512 vect_emulate_mixed_dot_prod (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8513 gimple_stmt_iterator *gsi, tree vec_dest,
8514 tree vop[3])
8516 tree wide_vectype = signed_type_for (TREE_TYPE (vec_dest));
8517 tree narrow_vectype = signed_type_for (TREE_TYPE (vop[0]));
8518 tree narrow_elttype = TREE_TYPE (narrow_vectype);
8519 gimple *new_stmt;
8521 /* Make VOP[0] the unsigned operand and VOP[1] the signed operand. */
8522 if (!TYPE_UNSIGNED (TREE_TYPE (vop[0])))
8523 std::swap (vop[0], vop[1]);
8525 /* Convert all inputs to signed types. */
8526 for (int i = 0; i < 3; ++i)
8527 if (TYPE_UNSIGNED (TREE_TYPE (vop[i])))
8529 tree tmp = make_ssa_name (signed_type_for (TREE_TYPE (vop[i])));
8530 new_stmt = gimple_build_assign (tmp, NOP_EXPR, vop[i]);
8531 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8532 vop[i] = tmp;
8535 /* In the comments below we assume 8-bit inputs for simplicity,
8536 but the approach works for any full integer type. */
8538 /* Create a vector of -128. */
8539 tree min_narrow_elttype = TYPE_MIN_VALUE (narrow_elttype);
8540 tree min_narrow = build_vector_from_val (narrow_vectype,
8541 min_narrow_elttype);
8543 /* Create a vector of 64. */
8544 auto half_wi = wi::lrshift (wi::to_wide (min_narrow_elttype), 1);
8545 tree half_narrow = wide_int_to_tree (narrow_elttype, half_wi);
8546 half_narrow = build_vector_from_val (narrow_vectype, half_narrow);
8548 /* Emit: SUB_RES = VOP[0] - 128. */
8549 tree sub_res = make_ssa_name (narrow_vectype);
8550 new_stmt = gimple_build_assign (sub_res, PLUS_EXPR, vop[0], min_narrow);
8551 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8553 /* Emit:
8555 STAGE1 = DOT_PROD_EXPR <VOP[1], 64, VOP[2]>;
8556 STAGE2 = DOT_PROD_EXPR <VOP[1], 64, STAGE1>;
8557 STAGE3 = DOT_PROD_EXPR <SUB_RES, -128, STAGE2>;
8559 on the basis that x * y == (x - 128) * y + 64 * y + 64 * y
8560 Doing the two 64 * y steps first allows more time to compute x. */
8561 tree stage1 = make_ssa_name (wide_vectype);
8562 new_stmt = gimple_build_assign (stage1, DOT_PROD_EXPR,
8563 vop[1], half_narrow, vop[2]);
8564 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8566 tree stage2 = make_ssa_name (wide_vectype);
8567 new_stmt = gimple_build_assign (stage2, DOT_PROD_EXPR,
8568 vop[1], half_narrow, stage1);
8569 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8571 tree stage3 = make_ssa_name (wide_vectype);
8572 new_stmt = gimple_build_assign (stage3, DOT_PROD_EXPR,
8573 sub_res, vop[1], stage2);
8574 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8576 /* Convert STAGE3 to the reduction type. */
8577 return gimple_build_assign (vec_dest, CONVERT_EXPR, stage3);
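/* Illustrative sketch, not part of GCC: a scalar check of the identity that
   the three DOT_PROD stages above rely on for 8-bit inputs,
   x * y == (x - 128) * y + 64 * y + 64 * y, where x is unsigned and y is
   signed.  Every product on the right-hand side uses only signed 8-bit
   operands, which is what lets a signed-only dot-product instruction
   emulate the mixed-sign one.  The function name is made up.  */

#include <assert.h>
#include <stdint.h>

static void
check_mixed_sign_decomposition (void)
{
  for (int x = 0; x <= 255; ++x)       /* The unsigned 8-bit operand.  */
    for (int y = -128; y <= 127; ++y)  /* The signed 8-bit operand.  */
      {
        int8_t sub_res = (int8_t) (x - 128);  /* Fits in a signed byte.  */
        int8_t half = 64;
        int emulated = half * y + half * y + sub_res * y;
        assert (emulated == x * y);
      }
}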
8580 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
8581 value. */
8583 bool
8584 vect_transform_reduction (loop_vec_info loop_vinfo,
8585 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
8586 gimple **vec_stmt, slp_tree slp_node)
8588 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8589 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8590 unsigned ncopies;
8591 unsigned vec_num;
8593 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8594 gcc_assert (reduc_info->is_reduc_info);
8596 if (nested_in_vect_loop_p (loop, stmt_info))
8598 loop = loop->inner;
8599 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
8602 gimple_match_op op;
8603 if (!gimple_extract_op (stmt_info->stmt, &op))
8604 gcc_unreachable ();
8606 /* All uses but the last are expected to be defined in the loop.
8607 The last use is the reduction variable. In case of nested cycle this
8608 assumption is not true: we use reduc_index to record the index of the
8609 reduction variable. */
8610 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
8611 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
8612 int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
8613 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (stmt_info);
8615 if (!vectype_in)
8616 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
8618 if (slp_node)
8620 ncopies = 1;
8621 vec_num = vect_get_num_copies (loop_vinfo, slp_node, vectype_in);
8623 else
8625 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8626 vec_num = 1;
8629 code_helper code = canonicalize_code (op.code, op.type);
8630 internal_fn cond_fn = get_conditional_internal_fn (code, op.type);
8632 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
8633 vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
8634 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
8636 /* Transform. */
8637 tree new_temp = NULL_TREE;
8638 auto_vec<tree> vec_oprnds[3];
8640 if (dump_enabled_p ())
8641 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
8643 /* FORNOW: Multiple types are not supported for condition. */
8644 if (code == COND_EXPR)
8645 gcc_assert (ncopies == 1);
8647 /* A binary COND_OP reduction must have the same definition and else
8648 value. */
8649 bool cond_fn_p = code.is_internal_fn ()
8650 && conditional_internal_fn_code (internal_fn (code)) != ERROR_MARK;
8651 if (cond_fn_p)
8653 gcc_assert (code == IFN_COND_ADD || code == IFN_COND_SUB
8654 || code == IFN_COND_MUL || code == IFN_COND_AND
8655 || code == IFN_COND_IOR || code == IFN_COND_XOR
8656 || code == IFN_COND_MIN || code == IFN_COND_MAX);
8657 gcc_assert (op.num_ops == 4
8658 && (op.ops[reduc_index]
8659 == op.ops[internal_fn_else_index ((internal_fn) code)]));
8662 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8664 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
8665 if (reduction_type == FOLD_LEFT_REDUCTION)
8667 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
8668 gcc_assert (code.is_tree_code () || cond_fn_p);
8669 return vectorize_fold_left_reduction
8670 (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
8671 code, reduc_fn, op.ops, op.num_ops, vectype_in,
8672 reduc_index, masks, lens);
8675 bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
8676 bool lane_reducing = lane_reducing_op_p (code);
8677 gcc_assert (single_defuse_cycle || lane_reducing);
8679 if (lane_reducing)
8681 /* The last operand of lane-reducing op is for reduction. */
8682 gcc_assert (reduc_index == (int) op.num_ops - 1);
8685 /* Create the destination vector */
8686 tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
8687 tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
8689 if (lane_reducing && !slp_node && !single_defuse_cycle)
8691 /* Note: there are still vectorizable cases that cannot be handled by
8692 single-lane SLP. It will probably take some time to evolve that
8693 feature to a mature state, so we have to keep the non-SLP code path
8694 below as a failsafe for lane-reducing support. */
8695 gcc_assert (op.num_ops <= 3);
8696 for (unsigned i = 0; i < op.num_ops; i++)
8698 unsigned oprnd_ncopies = ncopies;
8700 if ((int) i == reduc_index)
8702 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8703 oprnd_ncopies = vect_get_num_copies (loop_vinfo, vectype);
8706 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, oprnd_ncopies,
8707 op.ops[i], &vec_oprnds[i]);
8710 /* Get NCOPIES vector definitions for all operands except the reduction
8711 definition. */
8712 else if (!cond_fn_p)
8714 gcc_assert (reduc_index >= 0 && reduc_index <= 2);
8715 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
8716 single_defuse_cycle && reduc_index == 0
8717 ? NULL_TREE : op.ops[0], &vec_oprnds[0],
8718 single_defuse_cycle && reduc_index == 1
8719 ? NULL_TREE : op.ops[1], &vec_oprnds[1],
8720 op.num_ops == 3
8721 && !(single_defuse_cycle && reduc_index == 2)
8722 ? op.ops[2] : NULL_TREE, &vec_oprnds[2]);
8724 else
8726 /* For a conditional operation pass the truth type as mask
8727 vectype. */
8728 gcc_assert (single_defuse_cycle
8729 && (reduc_index == 1 || reduc_index == 2));
8730 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies, op.ops[0],
8731 truth_type_for (vectype_in), &vec_oprnds[0],
8732 reduc_index == 1 ? NULL_TREE : op.ops[1],
8733 NULL_TREE, &vec_oprnds[1],
8734 reduc_index == 2 ? NULL_TREE : op.ops[2],
8735 NULL_TREE, &vec_oprnds[2]);
8738 /* For single def-use cycles get one copy of the vectorized reduction
8739 definition. */
8740 if (single_defuse_cycle)
8742 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, 1,
8743 reduc_index == 0 ? op.ops[0] : NULL_TREE,
8744 &vec_oprnds[0],
8745 reduc_index == 1 ? op.ops[1] : NULL_TREE,
8746 &vec_oprnds[1],
8747 reduc_index == 2 ? op.ops[2] : NULL_TREE,
8748 &vec_oprnds[2]);
8750 else if (lane_reducing)
8752 /* For a normal reduction, consistency between the vectorized def/use is
8753 naturally ensured when mapping from the scalar statement. But if a lane-
8754 reducing op is involved in the reduction, things become somewhat
8755 complicated, in that the op's result and its accumulation operand are
8756 limited to fewer lanes than the other operands, which causes a def/use
8757 mismatch on adjacent statements around the op unless some specific
8758 adjustment is made. One approach is to refit the lane-reducing op by
8759 introducing new trivial pass-through copies to fix any def/use gap, so
8760 that it behaves like a normal op. Vector reduction PHIs are always
8761 generated to the full extent, whether or not a lane-reducing op exists.
8762 If some copies or PHIs turn out to be superfluous, they are cleaned up
8763 by passes after vectorization. An example for single-lane SLP, with
8764 lane-reducing ops of mixed input vectypes in a reduction chain, is
8765 given below. The same handling is applicable for multiple-lane SLP
8766 as well.
8768 int sum = 1;
8769 for (i)
8771 sum += d0[i] * d1[i]; // dot-prod <vector(16) char>
8772 sum += w[i]; // widen-sum <vector(16) char>
8773 sum += abs(s0[i] - s1[i]); // sad <vector(8) short>
8774 sum += n[i]; // normal <vector(4) int>
8777 The vector size is 128 bits and the vectorization factor is 16. The
8778 reduction statements would be transformed as:
8780 vector<4> int sum_v0 = { 0, 0, 0, 1 };
8781 vector<4> int sum_v1 = { 0, 0, 0, 0 };
8782 vector<4> int sum_v2 = { 0, 0, 0, 0 };
8783 vector<4> int sum_v3 = { 0, 0, 0, 0 };
8785 for (i / 16)
8787 sum_v0 = DOT_PROD (d0_v0[i: 0 ~ 15], d1_v0[i: 0 ~ 15], sum_v0);
8788 sum_v1 = sum_v1; // copy
8789 sum_v2 = sum_v2; // copy
8790 sum_v3 = sum_v3; // copy
8792 sum_v0 = sum_v0; // copy
8793 sum_v1 = WIDEN_SUM (w_v1[i: 0 ~ 15], sum_v1);
8794 sum_v2 = sum_v2; // copy
8795 sum_v3 = sum_v3; // copy
8797 sum_v0 = sum_v0; // copy
8798 sum_v1 = SAD (s0_v1[i: 0 ~ 7 ], s1_v1[i: 0 ~ 7 ], sum_v1);
8799 sum_v2 = SAD (s0_v2[i: 8 ~ 15], s1_v2[i: 8 ~ 15], sum_v2);
8800 sum_v3 = sum_v3; // copy
8802 sum_v0 += n_v0[i: 0 ~ 3 ];
8803 sum_v1 += n_v1[i: 4 ~ 7 ];
8804 sum_v2 += n_v2[i: 8 ~ 11];
8805 sum_v3 += n_v3[i: 12 ~ 15];
8808 Moreover, to get higher instruction parallelism in the final vectorized
8809 loop, the effective vector lane-reducing ops are distributed evenly
8810 among all def-use cycles. In the above example, DOT_PROD, WIDEN_SUM
8811 and the SADs are generated into separate cycles, so the instruction
8812 dependencies among them can be eliminated. */
8813 unsigned effec_ncopies = vec_oprnds[0].length ();
8814 unsigned total_ncopies = vec_oprnds[reduc_index].length ();
8816 gcc_assert (effec_ncopies <= total_ncopies);
8818 if (effec_ncopies < total_ncopies)
8820 for (unsigned i = 0; i < op.num_ops - 1; i++)
8822 gcc_assert (vec_oprnds[i].length () == effec_ncopies);
8823 vec_oprnds[i].safe_grow_cleared (total_ncopies);
8827 tree reduc_vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
8828 gcc_assert (reduc_vectype_in);
8830 unsigned effec_reduc_ncopies
8831 = vect_get_num_copies (loop_vinfo, slp_node, reduc_vectype_in);
8833 gcc_assert (effec_ncopies <= effec_reduc_ncopies);
8835 if (effec_ncopies < effec_reduc_ncopies)
8837 /* Find suitable def-use cycles to generate vectorized statements
8838 into, and reorder operands based on the selection. */
8839 unsigned curr_pos = reduc_info->reduc_result_pos;
8840 unsigned next_pos = (curr_pos + effec_ncopies) % effec_reduc_ncopies;
8842 gcc_assert (curr_pos < effec_reduc_ncopies);
8843 reduc_info->reduc_result_pos = next_pos;
8845 if (curr_pos)
8847 unsigned count = effec_reduc_ncopies - effec_ncopies;
8848 unsigned start = curr_pos - count;
8850 if ((int) start < 0)
8852 count = curr_pos;
8853 start = 0;
8856 for (unsigned i = 0; i < op.num_ops - 1; i++)
8858 for (unsigned j = effec_ncopies; j > start; j--)
8860 unsigned k = j - 1;
8861 std::swap (vec_oprnds[i][k], vec_oprnds[i][k + count]);
8862 gcc_assert (!vec_oprnds[i][k]);
8869 bool emulated_mixed_dot_prod = vect_is_emulated_mixed_dot_prod (stmt_info);
8870 unsigned num = vec_oprnds[reduc_index == 0 ? 1 : 0].length ();
8872 for (unsigned i = 0; i < num; ++i)
8874 gimple *new_stmt;
8875 tree vop[3] = { vec_oprnds[0][i], vec_oprnds[1][i], NULL_TREE };
8876 if (!vop[0] || !vop[1])
8878 tree reduc_vop = vec_oprnds[reduc_index][i];
8880 /* If we could not generate an effective vector statement for the current
8881 portion of the reduction operand, insert a trivial copy to simply
8882 hand the operand over to the other dependent statements. */
8883 gcc_assert (reduc_vop);
8885 if (slp_node && TREE_CODE (reduc_vop) == SSA_NAME
8886 && !SSA_NAME_IS_DEFAULT_DEF (reduc_vop))
8887 new_stmt = SSA_NAME_DEF_STMT (reduc_vop);
8888 else
8890 new_temp = make_ssa_name (vec_dest);
8891 new_stmt = gimple_build_assign (new_temp, reduc_vop);
8892 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt,
8893 gsi);
8896 else if (masked_loop_p && !mask_by_cond_expr)
8898 /* No conditional ifns have been defined for lane-reducing op
8899 yet. */
8900 gcc_assert (!lane_reducing);
8902 /* Make sure that the reduction accumulator is vop[0]. */
8903 if (reduc_index == 1)
8905 gcc_assert (commutative_binary_op_p (code, op.type));
8906 std::swap (vop[0], vop[1]);
8908 tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8909 vec_num * ncopies, vectype_in, i);
8910 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
8911 vop[0], vop[1], vop[0]);
8912 new_temp = make_ssa_name (vec_dest, call);
8913 gimple_call_set_lhs (call, new_temp);
8914 gimple_call_set_nothrow (call, true);
8915 vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
8916 new_stmt = call;
8918 else
8920 if (op.num_ops >= 3)
8921 vop[2] = vec_oprnds[2][i];
8923 if (masked_loop_p && mask_by_cond_expr)
8925 tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8926 vec_num * ncopies, vectype_in, i);
8927 build_vect_cond_expr (code, vop, mask, gsi);
8930 if (emulated_mixed_dot_prod)
8931 new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
8932 vec_dest, vop);
8934 else if (code.is_internal_fn () && !cond_fn_p)
8935 new_stmt = gimple_build_call_internal (internal_fn (code),
8936 op.num_ops,
8937 vop[0], vop[1], vop[2]);
8938 else if (code.is_internal_fn () && cond_fn_p)
8939 new_stmt = gimple_build_call_internal (internal_fn (code),
8940 op.num_ops,
8941 vop[0], vop[1], vop[2],
8942 vop[1]);
8943 else
8944 new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
8945 vop[0], vop[1], vop[2]);
8946 new_temp = make_ssa_name (vec_dest, new_stmt);
8947 gimple_set_lhs (new_stmt, new_temp);
8948 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8951 if (single_defuse_cycle && i < num - 1)
8952 vec_oprnds[reduc_index].safe_push (gimple_get_lhs (new_stmt));
8953 else if (slp_node)
8954 slp_node->push_vec_def (new_stmt);
8955 else
8956 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8959 if (!slp_node)
8960 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8962 return true;
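/* Illustrative sketch, not part of GCC: the two ways the masked paths above
   keep inactive lanes from contributing to the reduction, shown on a single
   scalar lane.  Either the whole operation is predicated and the accumulator
   is passed as the "else" value (the conditional internal-function path), or
   the non-accumulator operand is first selected against the operation's
   neutral value and an ordinary operation is used (the mask-by-cond-expr
   path).  The function names are made up for the example.  */

static int
masked_add_via_cond_fn (int acc, int x, bool lane_active)
{
  /* res = COND_ADD (mask, acc, x, acc): an inactive lane keeps ACC.  */
  return lane_active ? acc + x : acc;
}

static int
masked_add_via_cond_expr (int acc, int x, bool lane_active)
{
  /* The operand is replaced by the neutral value (0 for addition) on
     inactive lanes, then a plain add runs on every lane.  */
  return acc + (lane_active ? x : 0);
}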
8965 /* Transform phase of a cycle PHI. */
8967 bool
8968 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
8969 stmt_vec_info stmt_info, gimple **vec_stmt,
8970 slp_tree slp_node, slp_instance slp_node_instance)
8972 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8973 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8974 int i;
8975 int ncopies;
8976 int j;
8977 bool nested_cycle = false;
8978 int vec_num;
8980 if (nested_in_vect_loop_p (loop, stmt_info))
8982 loop = loop->inner;
8983 nested_cycle = true;
8986 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
8987 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
8988 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8989 gcc_assert (reduc_info->is_reduc_info);
8991 if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
8992 || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
8993 /* Leave the scalar phi in place. */
8994 return true;
8996 if (slp_node)
8998 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8999 ncopies = 1;
9001 else
9003 vec_num = 1;
9004 ncopies = vect_get_num_copies (loop_vinfo,
9005 STMT_VINFO_VECTYPE (stmt_info));
9008 /* Check whether we should use a single PHI node and accumulate
9009 vectors to one before the backedge. */
9010 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
9012 ncopies = 1;
9013 vec_num = 1;
9016 /* Create the destination vector */
9017 gphi *phi = as_a <gphi *> (stmt_info->stmt);
9018 tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
9019 vectype_out);
9021 /* Get the loop-entry arguments. */
9022 tree vec_initial_def = NULL_TREE;
9023 auto_vec<tree> vec_initial_defs;
9024 if (slp_node)
9026 vec_initial_defs.reserve (vec_num);
9027 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
9028 and we can't use zero for induc_val, use initial_def. Similarly
9029 for REDUC_MIN and initial_def larger than the base. */
9030 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
9032 gcc_assert (SLP_TREE_LANES (slp_node) == 1);
9033 tree initial_def = vect_phi_initial_value (phi);
9034 reduc_info->reduc_initial_values.safe_push (initial_def);
9035 tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
9036 if (TREE_CODE (initial_def) == INTEGER_CST
9037 && !integer_zerop (induc_val)
9038 && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
9039 && tree_int_cst_lt (initial_def, induc_val))
9040 || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
9041 && tree_int_cst_lt (induc_val, initial_def))))
9043 induc_val = initial_def;
9044 /* Communicate to epilogue generation that we used the
9045 initial_def. */
9046 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
9048 vec_initial_defs.quick_push
9049 (build_vector_from_val (vectype_out, induc_val));
9051 else if (nested_cycle)
9053 unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
9054 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
9055 &vec_initial_defs);
9057 else
9059 gcc_assert (slp_node == slp_node_instance->reduc_phis);
9060 vec<tree> &initial_values = reduc_info->reduc_initial_values;
9061 vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
9063 unsigned int num_phis = stmts.length ();
9064 if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
9065 num_phis = 1;
9066 initial_values.reserve (num_phis);
9067 for (unsigned int i = 0; i < num_phis; ++i)
9069 gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
9070 initial_values.quick_push (vect_phi_initial_value (this_phi));
9072 if (vec_num == 1)
9073 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
9074 if (!initial_values.is_empty ())
9076 tree initial_value
9077 = (num_phis == 1 ? initial_values[0] : NULL_TREE);
9078 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
9079 tree neutral_op
9080 = neutral_op_for_reduction (TREE_TYPE (vectype_out),
9081 code, initial_value);
9082 get_initial_defs_for_reduction (loop_vinfo, reduc_info,
9083 &vec_initial_defs, vec_num,
9084 stmts.length (), neutral_op);
9088 else
9090 /* Get at the scalar def before the loop, that defines the initial
9091 value of the reduction variable. */
9092 tree initial_def = vect_phi_initial_value (phi);
9093 reduc_info->reduc_initial_values.safe_push (initial_def);
9094 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
9095 and we can't use zero for induc_val, use initial_def. Similarly
9096 for REDUC_MIN and initial_def larger than the base. */
9097 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
9099 tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
9100 if (TREE_CODE (initial_def) == INTEGER_CST
9101 && !integer_zerop (induc_val)
9102 && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
9103 && tree_int_cst_lt (initial_def, induc_val))
9104 || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
9105 && tree_int_cst_lt (induc_val, initial_def))))
9107 induc_val = initial_def;
9108 /* Communicate to epilogue generation that we used the
9109 initial_def. */
9110 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
9112 vec_initial_def = build_vector_from_val (vectype_out, induc_val);
9114 else if (nested_cycle)
9116 /* Do not use an adjustment def as that case is not supported
9117 correctly if ncopies is not one. */
9118 vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
9119 ncopies, initial_def,
9120 &vec_initial_defs);
9122 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == CONST_COND_REDUCTION
9123 || STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
9124 /* Fill the initial vector with the initial scalar value. */
9125 vec_initial_def
9126 = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info,
9127 initial_def, initial_def);
9128 else
9130 if (ncopies == 1)
9131 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
9132 if (!reduc_info->reduc_initial_values.is_empty ())
9134 initial_def = reduc_info->reduc_initial_values[0];
9135 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
9136 tree neutral_op
9137 = neutral_op_for_reduction (TREE_TYPE (initial_def),
9138 code, initial_def);
9139 gcc_assert (neutral_op);
9140 /* Try to simplify the vector initialization by applying an
9141 adjustment after the reduction has been performed. */
9142 if (!reduc_info->reused_accumulator
9143 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
9144 && !operand_equal_p (neutral_op, initial_def))
9146 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info)
9147 = initial_def;
9148 initial_def = neutral_op;
9150 vec_initial_def
9151 = get_initial_def_for_reduction (loop_vinfo, reduc_info,
9152 initial_def, neutral_op);
9157 if (vec_initial_def)
9159 vec_initial_defs.create (ncopies);
9160 for (i = 0; i < ncopies; ++i)
9161 vec_initial_defs.quick_push (vec_initial_def);
9164 if (auto *accumulator = reduc_info->reused_accumulator)
9166 tree def = accumulator->reduc_input;
9167 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
9169 unsigned int nreduc;
9170 bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
9171 (TREE_TYPE (def)),
9172 TYPE_VECTOR_SUBPARTS (vectype_out),
9173 &nreduc);
9174 gcc_assert (res);
9175 gimple_seq stmts = NULL;
9176 /* Reduce the single vector to a smaller one. */
9177 if (nreduc != 1)
9179 /* Perform the reduction in the appropriate type. */
9180 tree rvectype = vectype_out;
9181 if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
9182 TREE_TYPE (TREE_TYPE (def))))
9183 rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
9184 TYPE_VECTOR_SUBPARTS
9185 (vectype_out));
9186 def = vect_create_partial_epilog (def, rvectype,
9187 STMT_VINFO_REDUC_CODE
9188 (reduc_info),
9189 &stmts);
9191 /* The epilogue loop might use a different vector mode, like
9192 VNx2DI vs. V2DI. */
9193 if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def)))
9195 tree reduc_type = build_vector_type_for_mode
9196 (TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out));
9197 def = gimple_convert (&stmts, reduc_type, def);
9199 /* Adjust the input so we pick up the partially reduced value
9200 for the skip edge in vect_create_epilog_for_reduction. */
9201 accumulator->reduc_input = def;
9202 /* And the reduction could be carried out using a different sign. */
9203 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
9204 def = gimple_convert (&stmts, vectype_out, def);
9205 edge e;
9206 if ((e = loop_vinfo->main_loop_edge)
9207 || (e = loop_vinfo->skip_this_loop_edge))
9209 /* While we'd like to insert on the edge, this would split
9210 blocks and disturb bookkeeping; we will also eventually
9211 need this on the skip edge. Rely on sinking to
9212 fix up the optimal placement and insert in the pred. */
9213 gimple_stmt_iterator gsi = gsi_last_bb (e->src);
9214 /* Insert before a cond that eventually skips the
9215 epilogue. */
9216 if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
9217 gsi_prev (&gsi);
9218 gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
9220 else
9221 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
9222 stmts);
9224 if (loop_vinfo->main_loop_edge)
9225 vec_initial_defs[0]
9226 = vect_get_main_loop_result (loop_vinfo, def,
9227 vec_initial_defs[0]);
9228 else
9229 vec_initial_defs.safe_push (def);
9232 /* Generate the reduction PHIs upfront. */
9233 for (i = 0; i < vec_num; i++)
9235 tree vec_init_def = vec_initial_defs[i];
9236 for (j = 0; j < ncopies; j++)
9238 /* Create the reduction-phi that defines the reduction
9239 operand. */
9240 gphi *new_phi = create_phi_node (vec_dest, loop->header);
9242 /* Set the loop-entry arg of the reduction-phi. */
9243 if (j != 0 && nested_cycle)
9244 vec_init_def = vec_initial_defs[j];
9245 add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
9246 UNKNOWN_LOCATION);
9248 /* The loop-latch arg is set in epilogue processing. */
9250 if (slp_node)
9251 slp_node->push_vec_def (new_phi);
9252 else
9254 if (j == 0)
9255 *vec_stmt = new_phi;
9256 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
9261 return true;
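/* Illustrative sketch, not part of GCC: what reusing a wider accumulator
   from the main loop amounts to conceptually.  If the main loop accumulated
   into eight lanes but this (epilogue) loop works on four, the reused value
   is first folded in half with the reduction operation, here PLUS.  The
   fixed sizes and the function name are assumptions made for the example.  */

static void
fold_accumulator_in_half (const int wide[8], int narrow[4])
{
  for (int i = 0; i < 4; ++i)
    narrow[i] = wide[i] + wide[i + 4];  /* One halving step of the fold.  */
}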
9264 /* Vectorizes LC PHIs. */
9266 bool
9267 vectorizable_lc_phi (loop_vec_info loop_vinfo,
9268 stmt_vec_info stmt_info, gimple **vec_stmt,
9269 slp_tree slp_node)
9271 if (!loop_vinfo
9272 || !is_a <gphi *> (stmt_info->stmt)
9273 || gimple_phi_num_args (stmt_info->stmt) != 1)
9274 return false;
9276 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
9277 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
9278 return false;
9280 if (!vec_stmt) /* transformation not required. */
9282 /* Deal with copies from externs or constants that disguise as
9283 loop-closed PHI nodes (PR97886). */
9284 if (slp_node
9285 && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
9286 SLP_TREE_VECTYPE (slp_node)))
9288 if (dump_enabled_p ())
9289 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9290 "incompatible vector types for invariants\n");
9291 return false;
9293 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
9294 return true;
9297 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9298 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
9299 basic_block bb = gimple_bb (stmt_info->stmt);
9300 edge e = single_pred_edge (bb);
9301 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
9302 auto_vec<tree> vec_oprnds;
9303 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
9304 !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
9305 gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
9306 for (unsigned i = 0; i < vec_oprnds.length (); i++)
9308 /* Create the vectorized LC PHI node. */
9309 gphi *new_phi = create_phi_node (vec_dest, bb);
9310 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
9311 if (slp_node)
9312 slp_node->push_vec_def (new_phi);
9313 else
9314 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
9316 if (!slp_node)
9317 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
9319 return true;
9322 /* Vectorizes PHIs. */
9324 bool
9325 vectorizable_phi (vec_info *,
9326 stmt_vec_info stmt_info, gimple **vec_stmt,
9327 slp_tree slp_node, stmt_vector_for_cost *cost_vec)
9329 if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
9330 return false;
9332 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
9333 return false;
9335 tree vectype = SLP_TREE_VECTYPE (slp_node);
9337 if (!vec_stmt) /* transformation not required. */
9339 slp_tree child;
9340 unsigned i;
9341 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
9342 if (!child)
9344 if (dump_enabled_p ())
9345 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9346 "PHI node with unvectorized backedge def\n");
9347 return false;
9349 else if (!vect_maybe_update_slp_op_vectype (child, vectype))
9351 if (dump_enabled_p ())
9352 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9353 "incompatible vector types for invariants\n");
9354 return false;
9356 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
9357 && !useless_type_conversion_p (vectype,
9358 SLP_TREE_VECTYPE (child)))
9360 /* With bools we can have mask and non-mask precision vectors
9361 or different non-mask precisions. While pattern recognition is
9362 supposed to guarantee consistency here, bugs in it can cause
9363 mismatches (PR103489 and PR103800 for example).
9364 Deal with them here instead of ICEing later. */
9365 if (dump_enabled_p ())
9366 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9367 "incompatible vector type setup from "
9368 "bool pattern detection\n");
9369 return false;
9372 /* For single-argument PHIs assume coalescing which means zero cost
9373 for the scalar and the vector PHIs. This avoids artificially
9374 favoring the vector path (but may pessimize it in some cases). */
9375 if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
9376 record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
9377 vector_stmt, stmt_info, vectype, 0, vect_body);
9378 STMT_VINFO_TYPE (stmt_info) = phi_info_type;
9379 return true;
9382 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
9383 basic_block bb = gimple_bb (stmt_info->stmt);
9384 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
9385 auto_vec<gphi *> new_phis;
9386 for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
9388 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
9390 /* Skip not yet vectorized defs. */
9391 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
9392 && SLP_TREE_VEC_DEFS (child).is_empty ())
9393 continue;
9395 auto_vec<tree> vec_oprnds;
9396 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
9397 if (!new_phis.exists ())
9399 new_phis.create (vec_oprnds.length ());
9400 for (unsigned j = 0; j < vec_oprnds.length (); j++)
9402 /* Create the vectorized PHI node. */
9403 new_phis.quick_push (create_phi_node (vec_dest, bb));
9404 slp_node->push_vec_def (new_phis[j]);
9407 edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
9408 for (unsigned j = 0; j < vec_oprnds.length (); j++)
9409 add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
9411 /* We should have at least one already vectorized child. */
9412 gcc_assert (new_phis.exists ());
9414 return true;
9417 /* Vectorizes first order recurrences. An overview of the transformation
9418 is described below. Suppose we have the following loop.
9420 int t = 0;
9421 for (int i = 0; i < n; ++i)
9423 b[i] = a[i] - t;
9424 t = a[i];
9427 There is a first-order recurrence on 'a'. For this loop, the scalar IR
9428 looks (simplified) like:
9430 scalar.preheader:
9431 init = 0;
9433 scalar.body:
9434 i = PHI <0(scalar.preheader), i+1(scalar.body)>
9435 _2 = PHI <init(scalar.preheader), _1(scalar.body)>
9436 _1 = a[i]
9437 b[i] = _1 - _2
9438 if (i < n) goto scalar.body
9440 In this example, _2 is a recurrence because its value depends on the
9441 previous iteration. We vectorize this as (VF = 4)
9443 vector.preheader:
9444 vect_init = vect_cst(..., ..., ..., 0)
9446 vector.body
9447 i = PHI <0(vector.preheader), i+4(vector.body)>
9448 vect_1 = PHI <vect_init(vector.preheader), v2(vector.body)>
9449 vect_2 = a[i, i+1, i+2, i+3];
9450 vect_3 = vec_perm (vect_1, vect_2, { 3, 4, 5, 6 })
9451 b[i, i+1, i+2, i+3] = vect_2 - vect_3
9452 if (..) goto vector.body
9454 In this function, vectorizable_recurr, we code generate both the
9455 vector PHI node and the permute since those together compute the
9456 vectorized value of the scalar PHI. We do not yet have the
9457 backedge value to fill in there nor into the vec_perm. Those
9458 are filled in maybe_set_vectorized_backedge_value and
9459 vect_schedule_scc.
9461 TODO: Since the scalar loop does not have a use of the recurrence
9462 outside of the loop, the natural way to implement peeling via
9463 vectorizing the live value doesn't work. For now, peeling of loops
9464 with a recurrence is not implemented. For SLP the supported cases
9465 are restricted to those requiring a single vector recurrence PHI. */
9467 bool
9468 vectorizable_recurr (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
9469 gimple **vec_stmt, slp_tree slp_node,
9470 stmt_vector_for_cost *cost_vec)
9472 if (!loop_vinfo || !is_a<gphi *> (stmt_info->stmt))
9473 return false;
9475 gphi *phi = as_a<gphi *> (stmt_info->stmt);
9477 /* So far we only support first-order recurrence auto-vectorization. */
9478 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
9479 return false;
9481 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9482 unsigned ncopies;
9483 if (slp_node)
9484 ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
9485 else
9486 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9487 poly_int64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9488 unsigned dist = slp_node ? SLP_TREE_LANES (slp_node) : 1;
9489 /* We need to be able to make progress with a single vector. */
9490 if (maybe_gt (dist * 2, nunits))
9492 if (dump_enabled_p ())
9493 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9494 "first order recurrence exceeds half of "
9495 "a vector\n");
9496 return false;
9499 /* First-order recurrence autovectorization needs to handle permutation
9500 with indices = [nunits-1, nunits, nunits+1, ...]. */
9501 vec_perm_builder sel (nunits, 1, 3);
9502 for (int i = 0; i < 3; ++i)
9503 sel.quick_push (nunits - dist + i);
9504 vec_perm_indices indices (sel, 2, nunits);
9506 if (!vec_stmt) /* transformation not required. */
9508 if (!can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
9509 indices))
9510 return false;
9512 if (slp_node)
9514 /* We eventually need to set a vector type on invariant
9515 arguments. */
9516 unsigned j;
9517 slp_tree child;
9518 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
9519 if (!vect_maybe_update_slp_op_vectype
9520 (child, SLP_TREE_VECTYPE (slp_node)))
9522 if (dump_enabled_p ())
9523 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9524 "incompatible vector types for "
9525 "invariants\n");
9526 return false;
9530 /* Verify we have set up compatible types. */
9531 edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
9532 tree latch_vectype = NULL_TREE;
9533 if (slp_node)
9535 slp_tree latch_def = SLP_TREE_CHILDREN (slp_node)[le->dest_idx];
9536 latch_vectype = SLP_TREE_VECTYPE (latch_def);
9538 else
9540 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, le);
9541 if (TREE_CODE (latch_def) == SSA_NAME)
9543 stmt_vec_info latch_def_info = loop_vinfo->lookup_def (latch_def);
9544 latch_def_info = vect_stmt_to_vectorize (latch_def_info);
9545 latch_vectype = STMT_VINFO_VECTYPE (latch_def_info);
9548 if (!types_compatible_p (latch_vectype, vectype))
9549 return false;
9551 /* The recurrence costs the initialization vector and one permute
9552 for each copy. */
9553 unsigned prologue_cost = record_stmt_cost (cost_vec, 1, scalar_to_vec,
9554 stmt_info, 0, vect_prologue);
9555 unsigned inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9556 stmt_info, 0, vect_body);
9557 if (dump_enabled_p ())
9558 dump_printf_loc (MSG_NOTE, vect_location,
9559 "vectorizable_recurr: inside_cost = %d, "
9560 "prologue_cost = %d .\n", inside_cost,
9561 prologue_cost);
9563 STMT_VINFO_TYPE (stmt_info) = recurr_info_type;
9564 return true;
9567 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
9568 basic_block bb = gimple_bb (phi);
9569 tree preheader = PHI_ARG_DEF_FROM_EDGE (phi, pe);
9570 if (!useless_type_conversion_p (TREE_TYPE (vectype), TREE_TYPE (preheader)))
9572 gimple_seq stmts = NULL;
9573 preheader = gimple_convert (&stmts, TREE_TYPE (vectype), preheader);
9574 gsi_insert_seq_on_edge_immediate (pe, stmts);
9576 tree vec_init = build_vector_from_val (vectype, preheader);
9577 vec_init = vect_init_vector (loop_vinfo, stmt_info, vec_init, vectype, NULL);
9579 /* Create the vectorized first-order PHI node. */
9580 tree vec_dest = vect_get_new_vect_var (vectype,
9581 vect_simple_var, "vec_recur_");
9582 gphi *new_phi = create_phi_node (vec_dest, bb);
9583 add_phi_arg (new_phi, vec_init, pe, UNKNOWN_LOCATION);
9585 /* Insert the shuffles for the first-order recurrence autovectorization:
9586 result = VEC_PERM <vec_recur, vect_1, index[nunits-1, nunits, ...]>. */
9587 tree perm = vect_gen_perm_mask_checked (vectype, indices);
9589 /* Insert the required permute after the latch definition. The
9590 second and later operands are tentative and will be updated when we have
9591 vectorized the latch definition. */
9592 edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
9593 gimple *latch_def = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, le));
9594 gimple_stmt_iterator gsi2 = gsi_for_stmt (latch_def);
9595 gsi_next (&gsi2);
9597 for (unsigned i = 0; i < ncopies; ++i)
9599 vec_dest = make_ssa_name (vectype);
9600 gassign *vperm
9601 = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
9602 i == 0 ? gimple_phi_result (new_phi) : NULL,
9603 NULL, perm);
9604 vect_finish_stmt_generation (loop_vinfo, stmt_info, vperm, &gsi2);
9606 if (slp_node)
9607 slp_node->push_vec_def (vperm);
9608 else
9609 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (vperm);
9612 if (!slp_node)
9613 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
9614 return true;
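/* Illustrative sketch, not part of GCC: the permute described in the comment
   above vectorizable_recurr, simulated with plain arrays for VF = 4.  Each
   "vector iteration" shifts the previous load into the current one by one
   lane, which reproduces exactly the value the scalar recurrence variable T
   would have held.  N is assumed to be a multiple of 4 and the function name
   is made up.  */

static void
recurrence_by_permute (const int *a, int *b, int n)
{
  int vect_1[4] = { 0, 0, 0, 0 };   /* vect_init: only the last lane (0,
                                       the initial T) is ever used.  */
  for (int i = 0; i < n; i += 4)
    {
      int vect_2[4] = { a[i], a[i + 1], a[i + 2], a[i + 3] };
      /* vect_3 = VEC_PERM <vect_1, vect_2, { 3, 4, 5, 6 }>.  */
      int vect_3[4] = { vect_1[3], vect_2[0], vect_2[1], vect_2[2] };
      for (int k = 0; k < 4; ++k)
        b[i + k] = vect_2[k] - vect_3[k];
      for (int k = 0; k < 4; ++k)
        vect_1[k] = vect_2[k];      /* The backedge value of the PHI.  */
    }
}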
9617 /* Return true if VECTYPE represents a vector that requires lowering
9618 by the vector lowering pass. */
9620 bool
9621 vect_emulated_vector_p (tree vectype)
9623 return (!VECTOR_MODE_P (TYPE_MODE (vectype))
9624 && (!VECTOR_BOOLEAN_TYPE_P (vectype)
9625 || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
9628 /* Return true if we can emulate CODE on an integer mode representation
9629 of a vector. */
9631 bool
9632 vect_can_vectorize_without_simd_p (tree_code code)
9634 switch (code)
9636 case PLUS_EXPR:
9637 case MINUS_EXPR:
9638 case NEGATE_EXPR:
9639 case BIT_AND_EXPR:
9640 case BIT_IOR_EXPR:
9641 case BIT_XOR_EXPR:
9642 case BIT_NOT_EXPR:
9643 return true;
9645 default:
9646 return false;
9650 /* Likewise, but taking a code_helper. */
9652 bool
9653 vect_can_vectorize_without_simd_p (code_helper code)
9655 return (code.is_tree_code ()
9656 && vect_can_vectorize_without_simd_p (tree_code (code)));
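/* Illustrative sketch, not related to GCC's actual lowering: the kind of
   word-mode emulation that the codes accepted above make possible, here an
   addition of eight 8-bit lanes packed into one uint64_t.  The bitwise codes
   need no extra work; PLUS must keep carries from crossing lane boundaries,
   which the classic trick below does by adding the low seven bits of each
   lane separately and xor-ing the high bits back in.  */

#include <stdint.h>

static uint64_t
add_v8qi_in_word (uint64_t a, uint64_t b)
{
  const uint64_t high = 0x8080808080808080ULL;  /* Top bit of every lane.  */
  uint64_t low_sum = (a & ~high) + (b & ~high); /* No inter-lane carries.  */
  return low_sum ^ ((a ^ b) & high);            /* Carry-free high bits.  */
}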
9659 /* Create vector init for vectorized iv. */
9660 static tree
9661 vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9662 tree step_expr, poly_uint64 nunits,
9663 tree vectype,
9664 enum vect_induction_op_type induction_type)
9666 unsigned HOST_WIDE_INT const_nunits;
9667 tree vec_shift, vec_init, new_name;
9668 unsigned i;
9669 tree itype = TREE_TYPE (vectype);
9671 /* iv_loop is the loop to be vectorized. Create:
9672 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr). */
9673 new_name = gimple_convert (stmts, itype, init_expr);
9674 switch (induction_type)
9676 case vect_step_op_shr:
9677 case vect_step_op_shl:
9678 /* Build the initial value by shifting INIT by the series [0, S, 2*S, ...]. */
9679 vec_init = gimple_build_vector_from_val (stmts,
9680 vectype,
9681 new_name);
9682 vec_shift = gimple_build (stmts, VEC_SERIES_EXPR, vectype,
9683 build_zero_cst (itype), step_expr);
9684 vec_init = gimple_build (stmts,
9685 (induction_type == vect_step_op_shr
9686 ? RSHIFT_EXPR : LSHIFT_EXPR),
9687 vectype, vec_init, vec_shift);
9688 break;
9690 case vect_step_op_neg:
9692 vec_init = gimple_build_vector_from_val (stmts,
9693 vectype,
9694 new_name);
9695 tree vec_neg = gimple_build (stmts, NEGATE_EXPR,
9696 vectype, vec_init);
9697 /* The encoding has 2 interleaved stepped patterns. */
9698 vec_perm_builder sel (nunits, 2, 3);
9699 sel.quick_grow (6);
9700 for (i = 0; i < 3; i++)
9702 sel[2 * i] = i;
9703 sel[2 * i + 1] = i + nunits;
9705 vec_perm_indices indices (sel, 2, nunits);
9706 /* Don't use vect_gen_perm_mask_checked since can_vec_perm_const_p may
9707 fail when vec_init is a const vector. In that situation the vec_perm is
9708 not really needed. */
9709 tree perm_mask_even
9710 = vect_gen_perm_mask_any (vectype, indices);
9711 vec_init = gimple_build (stmts, VEC_PERM_EXPR,
9712 vectype,
9713 vec_init, vec_neg,
9714 perm_mask_even);
9716 break;
9718 case vect_step_op_mul:
9720 /* Use an unsigned mult to avoid undefined signed integer overflow. */
9721 gcc_assert (nunits.is_constant (&const_nunits));
9722 tree utype = unsigned_type_for (itype);
9723 tree uvectype = build_vector_type (utype,
9724 TYPE_VECTOR_SUBPARTS (vectype));
9725 new_name = gimple_convert (stmts, utype, new_name);
9726 vec_init = gimple_build_vector_from_val (stmts,
9727 uvectype,
9728 new_name);
9729 tree_vector_builder elts (uvectype, const_nunits, 1);
9730 tree elt_step = build_one_cst (utype);
9732 elts.quick_push (elt_step);
9733 for (i = 1; i < const_nunits; i++)
9735 /* Create: elt_step_i = elt_step_(i-1) * step_expr, i.e. step_expr^i. */
9736 elt_step = gimple_build (stmts, MULT_EXPR,
9737 utype, elt_step, step_expr);
9738 elts.quick_push (elt_step);
9740 /* Create a vector from [new_name_0, new_name_1, ...,
9741 new_name_nunits-1]. */
9742 tree vec_mul = gimple_build_vector (stmts, &elts);
9743 vec_init = gimple_build (stmts, MULT_EXPR, uvectype,
9744 vec_init, vec_mul);
9745 vec_init = gimple_convert (stmts, vectype, vec_init);
9747 break;
9749 default:
9750 gcc_unreachable ();
9753 return vec_init;
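/* Illustrative sketch, not part of GCC: the lane values the function above
   builds for a multiplicative induction, computed directly.  For VF = 4,
   initial value X and step S the initial vector is [X, X*S, X*S^2, X*S^3];
   unsigned arithmetic mirrors the switch to an unsigned type above, which
   avoids undefined signed overflow.  The function name is made up.  */

#include <stdint.h>

static void
mul_iv_init_lanes (uint32_t x, uint32_t s, uint32_t lanes[4])
{
  uint32_t pow = 1;            /* S^0.  */
  for (int i = 0; i < 4; ++i)
    {
      lanes[i] = x * pow;      /* Lane i starts at X * S^i.  */
      pow *= s;                /* Next power of the step.  */
    }
}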
9756 /* Peel init_expr by skip_niter for induction_type. */
9757 tree
9758 vect_peel_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9759 tree skip_niters, tree step_expr,
9760 enum vect_induction_op_type induction_type)
9762 gcc_assert (TREE_CODE (skip_niters) == INTEGER_CST);
9763 tree type = TREE_TYPE (init_expr);
9764 unsigned prec = TYPE_PRECISION (type);
9765 switch (induction_type)
9767 case vect_step_op_neg:
9768 if (TREE_INT_CST_LOW (skip_niters) % 2)
9769 init_expr = gimple_build (stmts, NEGATE_EXPR, type, init_expr);
9770 /* else no change. */
9771 break;
9773 case vect_step_op_shr:
9774 case vect_step_op_shl:
9775 skip_niters = gimple_convert (stmts, type, skip_niters);
9776 step_expr = gimple_build (stmts, MULT_EXPR, type, step_expr, skip_niters);
9777 /* When the shift amount is >= the precision, we need to avoid undefined
9778 behavior. The original loop has no such UB, and per the semantics,
9779 init_expr should become 0 for lshr and ashl, and >>= (prec - 1) for ashr. */
9780 if (!tree_fits_uhwi_p (step_expr)
9781 || tree_to_uhwi (step_expr) >= prec)
9783 if (induction_type == vect_step_op_shl
9784 || TYPE_UNSIGNED (type))
9785 init_expr = build_zero_cst (type);
9786 else
9787 init_expr = gimple_build (stmts, RSHIFT_EXPR, type,
9788 init_expr,
9789 wide_int_to_tree (type, prec - 1));
9791 else
9792 init_expr = gimple_build (stmts, (induction_type == vect_step_op_shr
9793 ? RSHIFT_EXPR : LSHIFT_EXPR),
9794 type, init_expr, step_expr);
9795 break;
9797 case vect_step_op_mul:
9799 tree utype = unsigned_type_for (type);
9800 init_expr = gimple_convert (stmts, utype, init_expr);
9801 wide_int skipn = wi::to_wide (skip_niters);
9802 wide_int begin = wi::to_wide (step_expr);
9803 auto_mpz base, exp, mod, res;
9804 wi::to_mpz (begin, base, TYPE_SIGN (type));
9805 wi::to_mpz (skipn, exp, UNSIGNED);
9806 mpz_ui_pow_ui (mod, 2, TYPE_PRECISION (type));
9807 mpz_powm (res, base, exp, mod);
9808 begin = wi::from_mpz (utype, res, true);
9809 tree mult_expr = wide_int_to_tree (utype, begin);
9810 init_expr = gimple_build (stmts, MULT_EXPR, utype,
9811 init_expr, mult_expr);
9812 init_expr = gimple_convert (stmts, type, init_expr);
9814 break;
9816 default:
9817 gcc_unreachable ();
9820 return init_expr;
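/* Illustrative sketch, not part of GCC: the step_expr^skip_niters mod 2^prec
   value that the mult case above obtains with mpz_powm, written as a plain
   square-and-multiply loop on a 32-bit unsigned type, whose natural
   wrap-around provides the "mod 2^prec".  The peeled initial value is then
   init_expr times this result.  The function name is an assumption.  */

#include <stdint.h>

static uint32_t
pow_step_mod_2p32 (uint32_t step, uint64_t skip_niters)
{
  uint32_t result = 1;
  while (skip_niters)
    {
      if (skip_niters & 1)
        result *= step;        /* Multiply in the current exponent bit.  */
      step *= step;            /* Square for the next bit.  */
      skip_niters >>= 1;
    }
  return result;
}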
9823 /* Create vector step for vectorized iv. */
9824 static tree
9825 vect_create_nonlinear_iv_step (gimple_seq* stmts, tree step_expr,
9826 poly_uint64 vf,
9827 enum vect_induction_op_type induction_type)
9829 tree expr = build_int_cst (TREE_TYPE (step_expr), vf);
9830 tree new_name = NULL;
9831 /* Step should be pow (step, vf) for mult induction. */
9832 if (induction_type == vect_step_op_mul)
9834 gcc_assert (vf.is_constant ());
9835 wide_int begin = wi::to_wide (step_expr);
9837 for (unsigned i = 0; i != vf.to_constant () - 1; i++)
9838 begin = wi::mul (begin, wi::to_wide (step_expr));
9840 new_name = wide_int_to_tree (TREE_TYPE (step_expr), begin);
9842 else if (induction_type == vect_step_op_neg)
9843 /* Do nothing. */
9845 else
9846 new_name = gimple_build (stmts, MULT_EXPR, TREE_TYPE (step_expr),
9847 expr, step_expr);
9848 return new_name;
9851 static tree
9852 vect_create_nonlinear_iv_vec_step (loop_vec_info loop_vinfo,
9853 stmt_vec_info stmt_info,
9854 tree new_name, tree vectype,
9855 enum vect_induction_op_type induction_type)
9857 /* No step is needed for neg induction. */
9858 if (induction_type == vect_step_op_neg)
9859 return NULL;
9861 tree t = unshare_expr (new_name);
9862 gcc_assert (CONSTANT_CLASS_P (new_name)
9863 || TREE_CODE (new_name) == SSA_NAME);
9864 tree new_vec = build_vector_from_val (vectype, t);
9865 tree vec_step = vect_init_vector (loop_vinfo, stmt_info,
9866 new_vec, vectype, NULL);
9867 return vec_step;
9870 /* Update vectorized iv with vect_step, induc_def is init. */
9871 static tree
9872 vect_update_nonlinear_iv (gimple_seq* stmts, tree vectype,
9873 tree induc_def, tree vec_step,
9874 enum vect_induction_op_type induction_type)
9876 tree vec_def = induc_def;
9877 switch (induction_type)
9879 case vect_step_op_mul:
9881 /* Use an unsigned mult to avoid undefined signed integer overflow. */
9882 tree uvectype
9883 = build_vector_type (unsigned_type_for (TREE_TYPE (vectype)),
9884 TYPE_VECTOR_SUBPARTS (vectype));
9885 vec_def = gimple_convert (stmts, uvectype, vec_def);
9886 vec_step = gimple_convert (stmts, uvectype, vec_step);
9887 vec_def = gimple_build (stmts, MULT_EXPR, uvectype,
9888 vec_def, vec_step);
9889 vec_def = gimple_convert (stmts, vectype, vec_def);
9891 break;
9893 case vect_step_op_shr:
9894 vec_def = gimple_build (stmts, RSHIFT_EXPR, vectype,
9895 vec_def, vec_step);
9896 break;
9898 case vect_step_op_shl:
9899 vec_def = gimple_build (stmts, LSHIFT_EXPR, vectype,
9900 vec_def, vec_step);
9901 break;
9902 case vect_step_op_neg:
9903 vec_def = induc_def;
9904 /* Do nothing. */
9905 break;
9906 default:
9907 gcc_unreachable ();
9910 return vec_def;
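/* Illustrative sketch, not part of GCC: what one trip of the vector loop does
   to a single lane of the nonlinear IV updated above.  A vector iteration
   stands for VF scalar iterations, so a multiplicative lane is multiplied by
   step^VF (the value VEC_STEP already holds), a shift lane is shifted by
   VF * step, and a negated lane is left untouched, since with an even VF the
   sign pattern simply repeats.  The helper names are made up.  */

#include <stdint.h>

static uint32_t
update_mul_lane (uint32_t lane, uint32_t step_pow_vf)
{
  return lane * step_pow_vf;     /* vect_step_op_mul.  */
}

static uint32_t
update_shr_lane (uint32_t lane, unsigned vf_times_step)
{
  return lane >> vf_times_step;  /* vect_step_op_shr; shift < 32 assumed.  */
}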
9914 /* Function vectorizable_induction
9916 Check if STMT_INFO performs a nonlinear induction computation that can be
9917 vectorized. If VEC_STMT is also passed, vectorize the induction PHI: create
9918 a vectorized phi to replace it, put it in VEC_STMT, and add it to the same
9919 basic block.
9920 Return true if STMT_INFO is vectorizable in this way. */
9922 static bool
9923 vectorizable_nonlinear_induction (loop_vec_info loop_vinfo,
9924 stmt_vec_info stmt_info,
9925 gimple **vec_stmt, slp_tree slp_node,
9926 stmt_vector_for_cost *cost_vec)
9928 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9929 unsigned ncopies;
9930 bool nested_in_vect_loop = false;
9931 class loop *iv_loop;
9932 tree vec_def;
9933 edge pe = loop_preheader_edge (loop);
9934 basic_block new_bb;
9935 tree vec_init, vec_step;
9936 tree new_name;
9937 gimple *new_stmt;
9938 gphi *induction_phi;
9939 tree induc_def, vec_dest;
9940 tree init_expr, step_expr;
9941 tree niters_skip;
9942 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9943 unsigned i;
9944 gimple_stmt_iterator si;
9946 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9948 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9949 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9950 enum vect_induction_op_type induction_type
9951 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9953 gcc_assert (induction_type > vect_step_op_add);
9955 if (slp_node)
9956 ncopies = 1;
9957 else
9958 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9959 gcc_assert (ncopies >= 1);
9961 /* FORNOW. Only handle nonlinear induction in the same loop. */
9962 if (nested_in_vect_loop_p (loop, stmt_info))
9964 if (dump_enabled_p ())
9965 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9966 "nonlinear induction in nested loop.\n");
9967 return false;
9970 iv_loop = loop;
9971 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9973 /* TODO: Support SLP for nonlinear ivs. There should be a separate vector iv
9974 update for each iv and a permutation to generate the wanted vector iv. */
9975 if (slp_node)
9977 if (dump_enabled_p ())
9978 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9979 "SLP induction not supported for nonlinear"
9980 " induction.\n");
9981 return false;
9984 if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype)))
9986 if (dump_enabled_p ())
9987 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9988 "floating point nonlinear induction vectorization"
9989 " not supported.\n");
9990 return false;
9993 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9994 init_expr = vect_phi_initial_value (phi);
9995 gcc_assert (step_expr != NULL_TREE && init_expr != NULL
9996 && TREE_CODE (step_expr) == INTEGER_CST);
9997 /* step_expr should be converted to the element type of vectype (matching init_expr),
9998 e.g. for uint64 a >> 1 the step is int, but the vector<uint64> shift is used. */
9999 step_expr = fold_convert (TREE_TYPE (vectype), step_expr);
10001 if (TREE_CODE (init_expr) == INTEGER_CST)
10002 init_expr = fold_convert (TREE_TYPE (vectype), init_expr);
10003 else if (!tree_nop_conversion_p (TREE_TYPE (vectype), TREE_TYPE (init_expr)))
10005 /* INIT_EXPR could be a bit-field; bail out in that case. */
10006 if (dump_enabled_p ())
10007 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10008 "nonlinear induction vectorization failed:"
10009 " component type of vectype is not a nop conversion"
10010 " from type of init_expr.\n");
10011 return false;
10014 switch (induction_type)
10016 case vect_step_op_neg:
10017 if (maybe_eq (TYPE_VECTOR_SUBPARTS (vectype), 1u))
10018 return false;
10019 if (TREE_CODE (init_expr) != INTEGER_CST
10020 && TREE_CODE (init_expr) != REAL_CST)
10022 /* Check for backend support of NEGATE_EXPR and vec_perm. */
10023 if (!directly_supported_p (NEGATE_EXPR, vectype))
10024 return false;
10026 /* The encoding has 2 interleaved stepped patterns. */
10027 vec_perm_builder sel (nunits, 2, 3);
10028 machine_mode mode = TYPE_MODE (vectype);
10029 sel.quick_grow (6);
10030 for (i = 0; i < 3; i++)
10032 sel[i * 2] = i;
10033 sel[i * 2 + 1] = i + nunits;
10035 vec_perm_indices indices (sel, 2, nunits);
10036 if (!can_vec_perm_const_p (mode, mode, indices))
10037 return false;
10039 break;
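/* Editor's illustration (a hedged sketch): a neg induction is a scalar iv
   updated as x = -x, so its first VF values alternate, e.g. { x, -x, x, -x }
   for VF = 4; the permutation checked here ({ 0, N, 1, N+1, ... })
   presumably interleaves the x and -x vectors to build that initial value.  */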
10041 case vect_step_op_mul:
10043 /* Check for backend support of MULT_EXPR. */
10044 if (!directly_supported_p (MULT_EXPR, vectype))
10045 return false;
10047 /* ??? How to construct the vector step for a variable-length vector:
10048 [ 1, step, pow (step, 2), pow (step, 3), .. ]. */
10049 if (!vf.is_constant ())
10050 return false;
10052 break;
10054 case vect_step_op_shr:
10055 /* Check for backend support of RSHIFT_EXPR. */
10056 if (!directly_supported_p (RSHIFT_EXPR, vectype, optab_vector))
10057 return false;
10059 /* Don't shift more than the type precision to avoid UB. */
10060 if (!tree_fits_uhwi_p (step_expr)
10061 || maybe_ge (nunits * tree_to_uhwi (step_expr),
10062 TYPE_PRECISION (TREE_TYPE (init_expr))))
10063 return false;
10064 break;
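/* Editor's worked example of the bound above: for a 16-bit element type with
   nunits = 8 and step = 2, nunits * step = 16 >= 16 bits of precision, so
   the induction is rejected; shifting an element by its full precision or
   more would be undefined.  */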
10066 case vect_step_op_shl:
10067 /* Check for backend support of LSHIFT_EXPR. */
10068 if (!directly_supported_p (LSHIFT_EXPR, vectype, optab_vector))
10069 return false;
10071 /* Don't shift more than the type precision to avoid UB. */
10072 if (!tree_fits_uhwi_p (step_expr)
10073 || maybe_ge (nunits * tree_to_uhwi (step_expr),
10074 TYPE_PRECISION (TREE_TYPE (init_expr))))
10075 return false;
10077 break;
10079 default:
10080 gcc_unreachable ();
10083 if (!vec_stmt) /* transformation not required. */
10085 unsigned inside_cost = 0, prologue_cost = 0;
10086 /* loop cost for vec_loop. */
10088 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
10089 stmt_info, 0, vect_body);
10091 /* Neg induction doesn't have any inside_cost. */
10093 if (induction_type == vect_step_op_neg)
10094 inside_cost = 0;
10096 /* prologue cost for vec_init and vec_step. */
10097 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
10098 stmt_info, 0, vect_prologue);
10100 if (dump_enabled_p ())
10101 dump_printf_loc (MSG_NOTE, vect_location,
10102 "vect_model_induction_cost: inside_cost = %d, "
10103 "prologue_cost = %d. \n", inside_cost,
10104 prologue_cost);
10106 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
10107 DUMP_VECT_SCOPE ("vectorizable_nonlinear_induction");
10108 return true;
10111 /* Transform. */
10113 /* Compute a vector variable, initialized with the first VF values of
10114 the induction variable. E.g., for an iv with IV_PHI='X' and
10115 evolution S, for a vector of 4 units, we want to compute:
10116 [X, X + S, X + 2*S, X + 3*S]. */
10118 if (dump_enabled_p ())
10119 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
10121 pe = loop_preheader_edge (iv_loop);
10122 /* Find the first insertion point in the BB. */
10123 basic_block bb = gimple_bb (phi);
10124 si = gsi_after_labels (bb);
10126 gimple_seq stmts = NULL;
10128 niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
10129 /* If we are using the loop mask to "peel" for alignment then we need
10130 to adjust the start value here. */
10131 if (niters_skip != NULL_TREE)
10132 init_expr = vect_peel_nonlinear_iv_init (&stmts, init_expr, niters_skip,
10133 step_expr, induction_type);
10135 vec_init = vect_create_nonlinear_iv_init (&stmts, init_expr,
10136 step_expr, nunits, vectype,
10137 induction_type);
10138 if (stmts)
10140 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
10141 gcc_assert (!new_bb);
10144 stmts = NULL;
10145 new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
10146 vf, induction_type);
10147 if (stmts)
10149 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
10150 gcc_assert (!new_bb);
10153 vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
10154 new_name, vectype,
10155 induction_type);
10156 /* Create the following def-use cycle:
10157 loop prolog:
10158 vec_init = ...
10159 vec_step = ...
10160 loop:
10161 vec_iv = PHI <vec_init, vec_loop>
10163 STMT
10165 vec_loop = vec_iv + vec_step; */
10167 /* Create the induction-phi that defines the induction-operand. */
10168 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
10169 induction_phi = create_phi_node (vec_dest, iv_loop->header);
10170 induc_def = PHI_RESULT (induction_phi);
10172 /* Create the iv update inside the loop. */
10173 stmts = NULL;
10174 vec_def = vect_update_nonlinear_iv (&stmts, vectype,
10175 induc_def, vec_step,
10176 induction_type);
10178 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10179 new_stmt = SSA_NAME_DEF_STMT (vec_def);
10181 /* Set the arguments of the phi node: */
10182 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
10183 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10184 UNKNOWN_LOCATION);
10186 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
10187 *vec_stmt = induction_phi;
10189 /* In case the vectorization factor (VF) is bigger than the number
10190 of elements that we can fit in a vectype (nunits), we have to generate
10191 more than one vector stmt, i.e. we need to "unroll" the
10192 vector stmt by a factor VF/nunits. For more details see documentation
10193 in vectorizable_operation. */
10195 if (ncopies > 1)
10197 stmts = NULL;
10198 /* FORNOW. This restriction should be relaxed. */
10199 gcc_assert (!nested_in_vect_loop);
10201 new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
10202 nunits, induction_type);
10204 vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
10205 new_name, vectype,
10206 induction_type);
10207 vec_def = induc_def;
10208 for (i = 1; i < ncopies; i++)
10210 /* vec_i = vec_prev <op> vec_step. */
10211 stmts = NULL;
10212 vec_def = vect_update_nonlinear_iv (&stmts, vectype,
10213 vec_def, vec_step,
10214 induction_type);
10215 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10216 new_stmt = SSA_NAME_DEF_STMT (vec_def);
10217 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
10221 if (dump_enabled_p ())
10222 dump_printf_loc (MSG_NOTE, vect_location,
10223 "transform induction: created def-use cycle: %G%G",
10224 (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
10226 return true;
10229 /* Function vectorizable_induction
10231 Check if STMT_INFO performs an induction computation that can be vectorized.
10232 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
10233 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
10234 Return true if STMT_INFO is vectorizable in this way. */
10236 bool
10237 vectorizable_induction (loop_vec_info loop_vinfo,
10238 stmt_vec_info stmt_info,
10239 gimple **vec_stmt, slp_tree slp_node,
10240 stmt_vector_for_cost *cost_vec)
10242 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10243 unsigned ncopies;
10244 bool nested_in_vect_loop = false;
10245 class loop *iv_loop;
10246 tree vec_def;
10247 edge pe = loop_preheader_edge (loop);
10248 basic_block new_bb;
10249 tree new_vec, vec_init, vec_step, t;
10250 tree new_name;
10251 gimple *new_stmt;
10252 gphi *induction_phi;
10253 tree induc_def, vec_dest;
10254 tree init_expr, step_expr;
10255 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
10256 unsigned i;
10257 tree expr;
10258 gimple_stmt_iterator si;
10259 enum vect_induction_op_type induction_type
10260 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
10262 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
10263 if (!phi)
10264 return false;
10266 if (!STMT_VINFO_RELEVANT_P (stmt_info))
10267 return false;
10269 /* Make sure it was recognized as induction computation. */
10270 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
10271 return false;
10273 /* Handle nonlinear induction in a separate place. */
10274 if (induction_type != vect_step_op_add)
10275 return vectorizable_nonlinear_induction (loop_vinfo, stmt_info,
10276 vec_stmt, slp_node, cost_vec);
10278 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
10279 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
10281 if (slp_node)
10282 ncopies = 1;
10283 else
10284 ncopies = vect_get_num_copies (loop_vinfo, vectype);
10285 gcc_assert (ncopies >= 1);
10287 /* FORNOW. These restrictions should be relaxed. */
10288 if (nested_in_vect_loop_p (loop, stmt_info))
10290 imm_use_iterator imm_iter;
10291 use_operand_p use_p;
10292 gimple *exit_phi;
10293 edge latch_e;
10294 tree loop_arg;
10296 if (ncopies > 1)
10298 if (dump_enabled_p ())
10299 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10300 "multiple types in nested loop.\n");
10301 return false;
10304 exit_phi = NULL;
10305 latch_e = loop_latch_edge (loop->inner);
10306 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
10307 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
10309 gimple *use_stmt = USE_STMT (use_p);
10310 if (is_gimple_debug (use_stmt))
10311 continue;
10313 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
10315 exit_phi = use_stmt;
10316 break;
10319 if (exit_phi)
10321 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
10322 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
10323 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
10325 if (dump_enabled_p ())
10326 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10327 "inner-loop induction only used outside "
10328 "of the outer vectorized loop.\n");
10329 return false;
10333 nested_in_vect_loop = true;
10334 iv_loop = loop->inner;
10336 else
10337 iv_loop = loop;
10338 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
10340 if (slp_node && !nunits.is_constant ())
10342 /* The current SLP code creates the step value element-by-element. */
10343 if (dump_enabled_p ())
10344 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10345 "SLP induction not supported for variable-length"
10346 " vectors.\n");
10347 return false;
10350 if (FLOAT_TYPE_P (vectype) && !param_vect_induction_float)
10352 if (dump_enabled_p ())
10353 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10354 "floating point induction vectorization disabled\n");
10355 return false;
10358 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
10359 gcc_assert (step_expr != NULL_TREE);
10360 if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
10361 && !type_has_mode_precision_p (TREE_TYPE (step_expr)))
10363 if (dump_enabled_p ())
10364 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10365 "bit-precision induction vectorization not "
10366 "supported.\n");
10367 return false;
10369 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
10371 /* Check for backend support of PLUS/MINUS_EXPR. */
10372 if (!directly_supported_p (PLUS_EXPR, step_vectype)
10373 || !directly_supported_p (MINUS_EXPR, step_vectype))
10374 return false;
10376 if (!vec_stmt) /* transformation not required. */
10378 unsigned inside_cost = 0, prologue_cost = 0;
10379 if (slp_node)
10381 /* We eventually need to set a vector type on invariant
10382 arguments. */
10383 unsigned j;
10384 slp_tree child;
10385 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
10386 if (!vect_maybe_update_slp_op_vectype
10387 (child, SLP_TREE_VECTYPE (slp_node)))
10389 if (dump_enabled_p ())
10390 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10391 "incompatible vector types for "
10392 "invariants\n");
10393 return false;
10395 /* loop cost for vec_loop. */
10396 inside_cost
10397 = record_stmt_cost (cost_vec,
10398 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
10399 vector_stmt, stmt_info, 0, vect_body);
10400 /* prologue cost for vec_init (if not nested) and step. */
10401 prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
10402 scalar_to_vec,
10403 stmt_info, 0, vect_prologue);
10405 else /* if (!slp_node) */
10407 /* loop cost for vec_loop. */
10408 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
10409 stmt_info, 0, vect_body);
10410 /* prologue cost for vec_init and vec_step. */
10411 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
10412 stmt_info, 0, vect_prologue);
10414 if (dump_enabled_p ())
10415 dump_printf_loc (MSG_NOTE, vect_location,
10416 "vect_model_induction_cost: inside_cost = %d, "
10417 "prologue_cost = %d .\n", inside_cost,
10418 prologue_cost);
10420 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
10421 DUMP_VECT_SCOPE ("vectorizable_induction");
10422 return true;
10425 /* Transform. */
10427 /* Compute a vector variable, initialized with the first VF values of
10428 the induction variable. E.g., for an iv with IV_PHI='X' and
10429 evolution S, for a vector of 4 units, we want to compute:
10430 [X, X + S, X + 2*S, X + 3*S]. */
10432 if (dump_enabled_p ())
10433 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
10435 pe = loop_preheader_edge (iv_loop);
10436 /* Find the first insertion point in the BB. */
10437 basic_block bb = gimple_bb (phi);
10438 si = gsi_after_labels (bb);
10440 /* For SLP induction we have to generate several IVs as for example
10441 with group size 3 we need
10442 [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
10443 [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */
10444 if (slp_node)
10446 /* Enforced above. */
10447 unsigned int const_nunits = nunits.to_constant ();
10449 /* The initial values are vectorized, but any lanes > group_size
10450 need adjustment. */
10451 slp_tree init_node
10452 = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
10454 /* Gather steps. Since we do not vectorize inductions as
10455 cycles we have to reconstruct the step from SCEV data. */
10456 unsigned group_size = SLP_TREE_LANES (slp_node);
10457 tree *steps = XALLOCAVEC (tree, group_size);
10458 tree *inits = XALLOCAVEC (tree, group_size);
10459 stmt_vec_info phi_info;
10460 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
10462 steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
10463 if (!init_node)
10464 inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
10465 pe->dest_idx);
10468 /* Now generate the IVs. */
10469 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
10470 gcc_assert ((const_nunits * nvects) % group_size == 0);
10471 unsigned nivs;
10472 if (nested_in_vect_loop)
10473 nivs = nvects;
10474 else
10476 /* Compute the number of distinct IVs we need. First reduce
10477 group_size if it is a multiple of const_nunits so we get
10478 one IV for a group_size of 4 but const_nunits 2. */
10479 unsigned group_sizep = group_size;
10480 if (group_sizep % const_nunits == 0)
10481 group_sizep = group_sizep / const_nunits;
10482 nivs = least_common_multiple (group_sizep,
10483 const_nunits) / const_nunits;
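/* Editor's worked examples for the formula above: group_size = 4 with
   const_nunits = 2 reduces to group_sizep = 2 and gives
   nivs = lcm (2, 2) / 2 = 1; group_size = 3 with const_nunits = 4 keeps
   group_sizep = 3 and gives nivs = lcm (3, 4) / 4 = 3.  */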
10485 tree stept = TREE_TYPE (step_vectype);
10486 tree lupdate_mul = NULL_TREE;
10487 if (!nested_in_vect_loop)
10489 /* The number of iterations covered in one vector iteration. */
10490 unsigned lup_mul = (nvects * const_nunits) / group_size;
10491 lupdate_mul
10492 = build_vector_from_val (step_vectype,
10493 SCALAR_FLOAT_TYPE_P (stept)
10494 ? build_real_from_wide (stept, lup_mul,
10495 UNSIGNED)
10496 : build_int_cstu (stept, lup_mul));
10498 tree peel_mul = NULL_TREE;
10499 gimple_seq init_stmts = NULL;
10500 if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
10502 if (SCALAR_FLOAT_TYPE_P (stept))
10503 peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
10504 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
10505 else
10506 peel_mul = gimple_convert (&init_stmts, stept,
10507 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
10508 peel_mul = gimple_build_vector_from_val (&init_stmts,
10509 step_vectype, peel_mul);
10511 unsigned ivn;
10512 auto_vec<tree> vec_steps;
10513 for (ivn = 0; ivn < nivs; ++ivn)
10515 tree_vector_builder step_elts (step_vectype, const_nunits, 1);
10516 tree_vector_builder init_elts (vectype, const_nunits, 1);
10517 tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
10518 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
10520 /* The scalar steps of the IVs. */
10521 tree elt = steps[(ivn*const_nunits + eltn) % group_size];
10522 elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
10523 step_elts.quick_push (elt);
10524 if (!init_node)
10526 /* The scalar inits of the IVs if not vectorized. */
10527 elt = inits[(ivn*const_nunits + eltn) % group_size];
10528 if (!useless_type_conversion_p (TREE_TYPE (vectype),
10529 TREE_TYPE (elt)))
10530 elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
10531 TREE_TYPE (vectype), elt);
10532 init_elts.quick_push (elt);
10534 /* The number of steps to add to the initial values. */
10535 unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
10536 mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
10537 ? build_real_from_wide (stept,
10538 mul_elt, UNSIGNED)
10539 : build_int_cstu (stept, mul_elt));
10541 vec_step = gimple_build_vector (&init_stmts, &step_elts);
10542 vec_steps.safe_push (vec_step);
10543 tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
10544 if (peel_mul)
10545 step_mul = gimple_build (&init_stmts, MINUS_EXPR, step_vectype,
10546 step_mul, peel_mul);
10547 if (!init_node)
10548 vec_init = gimple_build_vector (&init_stmts, &init_elts);
10550 /* Create the induction-phi that defines the induction-operand. */
10551 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
10552 "vec_iv_");
10553 induction_phi = create_phi_node (vec_dest, iv_loop->header);
10554 induc_def = PHI_RESULT (induction_phi);
10556 /* Create the iv update inside the loop */
10557 tree up = vec_step;
10558 if (lupdate_mul)
10559 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10560 vec_step, lupdate_mul);
10561 gimple_seq stmts = NULL;
10562 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
10563 vec_def = gimple_build (&stmts,
10564 PLUS_EXPR, step_vectype, vec_def, up);
10565 vec_def = gimple_convert (&stmts, vectype, vec_def);
10566 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10567 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10568 UNKNOWN_LOCATION);
10570 if (init_node)
10571 vec_init = vect_get_slp_vect_def (init_node, ivn);
10572 if (!nested_in_vect_loop
10573 && !integer_zerop (step_mul))
10575 vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
10576 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10577 vec_step, step_mul);
10578 vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
10579 vec_def, up);
10580 vec_init = gimple_convert (&init_stmts, vectype, vec_def);
10583 /* Set the arguments of the phi node: */
10584 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
10586 slp_node->push_vec_def (induction_phi);
10588 if (!nested_in_vect_loop)
10590 /* Fill up to the number of vectors we need for the whole group. */
10591 nivs = least_common_multiple (group_size,
10592 const_nunits) / const_nunits;
10593 vec_steps.reserve (nivs-ivn);
10594 for (; ivn < nivs; ++ivn)
10596 slp_node->push_vec_def (SLP_TREE_VEC_DEFS (slp_node)[0]);
10597 vec_steps.quick_push (vec_steps[0]);
10601 /* Re-use IVs when we can. We are generating further vector
10602 stmts by adding VF' * stride to the IVs generated above. */
10603 if (ivn < nvects)
10605 unsigned vfp
10606 = least_common_multiple (group_size, const_nunits) / group_size;
10607 tree lupdate_mul
10608 = build_vector_from_val (step_vectype,
10609 SCALAR_FLOAT_TYPE_P (stept)
10610 ? build_real_from_wide (stept,
10611 vfp, UNSIGNED)
10612 : build_int_cstu (stept, vfp));
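/* Editor's note (a sketch of the arithmetic above): with group_size = 3 and
   const_nunits = 4, vfp = lcm (3, 4) / 3 = 4, and the step vectors are
   scaled by that factor before being added to the IVs generated above.  */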
10613 for (; ivn < nvects; ++ivn)
10615 gimple *iv
10616 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (slp_node)[ivn - nivs]);
10617 tree def = gimple_get_lhs (iv);
10618 if (ivn < 2*nivs)
10619 vec_steps[ivn - nivs]
10620 = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10621 vec_steps[ivn - nivs], lupdate_mul);
10622 gimple_seq stmts = NULL;
10623 def = gimple_convert (&stmts, step_vectype, def);
10624 def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
10625 def, vec_steps[ivn % nivs]);
10626 def = gimple_convert (&stmts, vectype, def);
10627 if (gimple_code (iv) == GIMPLE_PHI)
10628 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10629 else
10631 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
10632 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
10634 slp_node->push_vec_def (def);
10638 new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
10639 gcc_assert (!new_bb);
10641 return true;
10644 init_expr = vect_phi_initial_value (phi);
10646 gimple_seq stmts = NULL;
10647 if (!nested_in_vect_loop)
10649 /* Convert the initial value to the IV update type. */
10650 tree new_type = TREE_TYPE (step_expr);
10651 init_expr = gimple_convert (&stmts, new_type, init_expr);
10653 /* If we are using the loop mask to "peel" for alignment then we need
10654 to adjust the start value here. */
10655 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
10656 if (skip_niters != NULL_TREE)
10658 if (FLOAT_TYPE_P (vectype))
10659 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
10660 skip_niters);
10661 else
10662 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
10663 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
10664 skip_niters, step_expr);
10665 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
10666 init_expr, skip_step);
10670 if (stmts)
10672 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
10673 gcc_assert (!new_bb);
10676 /* Create the vector that holds the initial_value of the induction. */
10677 if (nested_in_vect_loop)
10679 /* iv_loop is nested in the loop to be vectorized. init_expr had already
10680 been created during vectorization of previous stmts. We obtain it
10681 from the STMT_VINFO_VEC_STMT of the defining stmt. */
10682 auto_vec<tree> vec_inits;
10683 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
10684 init_expr, &vec_inits);
10685 vec_init = vec_inits[0];
10686 /* If the initial value is not of proper type, convert it. */
10687 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
10689 new_stmt
10690 = gimple_build_assign (vect_get_new_ssa_name (vectype,
10691 vect_simple_var,
10692 "vec_iv_"),
10693 VIEW_CONVERT_EXPR,
10694 build1 (VIEW_CONVERT_EXPR, vectype,
10695 vec_init));
10696 vec_init = gimple_assign_lhs (new_stmt);
10697 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
10698 new_stmt);
10699 gcc_assert (!new_bb);
10702 else
10704 /* iv_loop is the loop to be vectorized. Create:
10705 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
10706 stmts = NULL;
10707 new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
10709 unsigned HOST_WIDE_INT const_nunits;
10710 if (nunits.is_constant (&const_nunits))
10712 tree_vector_builder elts (step_vectype, const_nunits, 1);
10713 elts.quick_push (new_name);
10714 for (i = 1; i < const_nunits; i++)
10716 /* Create: new_name_i = new_name + step_expr */
10717 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
10718 new_name, step_expr);
10719 elts.quick_push (new_name);
10721 /* Create a vector from [new_name_0, new_name_1, ...,
10722 new_name_nunits-1] */
10723 vec_init = gimple_build_vector (&stmts, &elts);
10725 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
10726 /* Build the initial value directly from a VEC_SERIES_EXPR. */
10727 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
10728 new_name, step_expr);
10729 else
10731 /* Build:
10732 [base, base, base, ...]
10733 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
10734 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
10735 gcc_assert (flag_associative_math);
10736 tree index = build_index_vector (step_vectype, 0, 1);
10737 tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
10738 new_name);
10739 tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
10740 step_expr);
10741 vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
10742 vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
10743 vec_init, step_vec);
10744 vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
10745 vec_init, base_vec);
10747 vec_init = gimple_convert (&stmts, vectype, vec_init);
10749 if (stmts)
10751 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
10752 gcc_assert (!new_bb);
10757 /* Create the vector that holds the step of the induction. */
10758 gimple_stmt_iterator *step_iv_si = NULL;
10759 if (nested_in_vect_loop)
10760 /* iv_loop is nested in the loop to be vectorized. Generate:
10761 vec_step = [S, S, S, S] */
10762 new_name = step_expr;
10763 else if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
10765 /* When we're using the loop_len produced by SELECT_VL, the non-final
10766 iterations do not always process VF elements. So instead of vectorizing
10767 the induction variable update as
10769 _21 = vect_vec_iv_.6_22 + { VF, ... };
10771 we should generate:
10773 _35 = .SELECT_VL (ivtmp_33, VF);
10774 vect_cst__22 = [vec_duplicate_expr] _35;
10775 _21 = vect_vec_iv_.6_22 + vect_cst__22; */
10776 gcc_assert (!slp_node);
10777 gimple_seq seq = NULL;
10778 vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
10779 tree len = vect_get_loop_len (loop_vinfo, NULL, lens, 1, vectype, 0, 0);
10780 expr = force_gimple_operand (fold_convert (TREE_TYPE (step_expr),
10781 unshare_expr (len)),
10782 &seq, true, NULL_TREE);
10783 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr), expr,
10784 step_expr);
10785 gsi_insert_seq_before (&si, seq, GSI_SAME_STMT);
10786 step_iv_si = &si;
10788 else
10790 /* iv_loop is the loop to be vectorized. Generate:
10791 vec_step = [VF*S, VF*S, VF*S, VF*S] */
10792 gimple_seq seq = NULL;
10793 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
10795 expr = build_int_cst (integer_type_node, vf);
10796 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
10798 else
10799 expr = build_int_cst (TREE_TYPE (step_expr), vf);
10800 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
10801 expr, step_expr);
10802 if (seq)
10804 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10805 gcc_assert (!new_bb);
10809 t = unshare_expr (new_name);
10810 gcc_assert (CONSTANT_CLASS_P (new_name)
10811 || TREE_CODE (new_name) == SSA_NAME);
10812 new_vec = build_vector_from_val (step_vectype, t);
10813 vec_step = vect_init_vector (loop_vinfo, stmt_info,
10814 new_vec, step_vectype, step_iv_si);
10817 /* Create the following def-use cycle:
10818 loop prolog:
10819 vec_init = ...
10820 vec_step = ...
10821 loop:
10822 vec_iv = PHI <vec_init, vec_loop>
10824 STMT
10826 vec_loop = vec_iv + vec_step; */
10828 /* Create the induction-phi that defines the induction-operand. */
10829 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
10830 induction_phi = create_phi_node (vec_dest, iv_loop->header);
10831 induc_def = PHI_RESULT (induction_phi);
10833 /* Create the iv update inside the loop */
10834 stmts = NULL;
10835 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
10836 vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
10837 vec_def = gimple_convert (&stmts, vectype, vec_def);
10838 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10839 new_stmt = SSA_NAME_DEF_STMT (vec_def);
10841 /* Set the arguments of the phi node: */
10842 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
10843 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10844 UNKNOWN_LOCATION);
10846 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
10847 *vec_stmt = induction_phi;
10849 /* In case the vectorization factor (VF) is bigger than the number
10850 of elements that we can fit in a vectype (nunits), we have to generate
10851 more than one vector stmt, i.e. we need to "unroll" the
10852 vector stmt by a factor VF/nunits. For more details see documentation
10853 in vectorizable_operation. */
10855 if (ncopies > 1)
10857 gimple_seq seq = NULL;
10858 /* FORNOW. This restriction should be relaxed. */
10859 gcc_assert (!nested_in_vect_loop);
10860 /* We expect LOOP_VINFO_USING_SELECT_VL_P to be false if ncopies > 1. */
10861 gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));
10863 /* Create the vector that holds the step of the induction. */
10864 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
10866 expr = build_int_cst (integer_type_node, nunits);
10867 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
10869 else
10870 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
10871 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
10872 expr, step_expr);
10873 if (seq)
10875 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10876 gcc_assert (!new_bb);
10879 t = unshare_expr (new_name);
10880 gcc_assert (CONSTANT_CLASS_P (new_name)
10881 || TREE_CODE (new_name) == SSA_NAME);
10882 new_vec = build_vector_from_val (step_vectype, t);
10883 vec_step = vect_init_vector (loop_vinfo, stmt_info,
10884 new_vec, step_vectype, NULL);
10886 vec_def = induc_def;
10887 for (i = 1; i < ncopies + 1; i++)
10889 /* vec_i = vec_prev + vec_step */
10890 gimple_seq stmts = NULL;
10891 vec_def = gimple_convert (&stmts, step_vectype, vec_def);
10892 vec_def = gimple_build (&stmts,
10893 PLUS_EXPR, step_vectype, vec_def, vec_step);
10894 vec_def = gimple_convert (&stmts, vectype, vec_def);
10896 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10897 if (i < ncopies)
10899 new_stmt = SSA_NAME_DEF_STMT (vec_def);
10900 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
10902 else
10904 /* vec_1 = vec_iv + (VF/n * S)
10905 vec_2 = vec_1 + (VF/n * S)
10907 vec_n = vec_prev + (VF/n * S) = vec_iv + VF * S = vec_loop
10909 vec_n is used as vec_loop, which saves a vector register for the
10910 large step VF * S and the related operations. */
10911 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10912 UNKNOWN_LOCATION);
10917 if (dump_enabled_p ())
10918 dump_printf_loc (MSG_NOTE, vect_location,
10919 "transform induction: created def-use cycle: %G%G",
10920 (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
10922 return true;
10925 /* Function vectorizable_live_operation_1.
10927 A helper function for vectorizable_live_operation. */
10929 static tree
10930 vectorizable_live_operation_1 (loop_vec_info loop_vinfo,
10931 stmt_vec_info stmt_info, basic_block exit_bb,
10932 tree vectype, int ncopies, slp_tree slp_node,
10933 tree bitsize, tree bitstart, tree vec_lhs,
10934 tree lhs_type, gimple_stmt_iterator *exit_gsi)
10936 gcc_assert (single_pred_p (exit_bb) || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
10938 tree vec_lhs_phi = copy_ssa_name (vec_lhs);
10939 gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
10940 for (unsigned i = 0; i < gimple_phi_num_args (phi); i++)
10941 SET_PHI_ARG_DEF (phi, i, vec_lhs);
10943 gimple_seq stmts = NULL;
10944 tree new_tree;
10946 /* If bitstart is 0 then we can use a BIT_FIELD_REF. */
10947 if (integer_zerop (bitstart))
10949 tree scalar_res = gimple_build (&stmts, BIT_FIELD_REF, TREE_TYPE (vectype),
10950 vec_lhs_phi, bitsize, bitstart);
10952 /* Convert the extracted vector element to the scalar type. */
10953 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10955 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
10957 /* Emit:
10959 SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>
10961 where VEC_LHS is the vectorized live-out result and LEN is
10962 the loop length for the final iteration. */
10963 gcc_assert (ncopies == 1
10964 && (!slp_node || SLP_TREE_LANES (slp_node) == 1));
10965 gimple_seq tem = NULL;
10966 gimple_stmt_iterator gsi = gsi_last (tem);
10967 tree len = vect_get_loop_len (loop_vinfo, &gsi,
10968 &LOOP_VINFO_LENS (loop_vinfo),
10969 1, vectype, 0, 0);
10971 /* BIAS - 1. */
10972 signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
10973 tree bias_minus_one
10974 = int_const_binop (MINUS_EXPR,
10975 build_int_cst (TREE_TYPE (len), biasval),
10976 build_one_cst (TREE_TYPE (len)));
10978 /* LAST_INDEX = LEN + (BIAS - 1). */
10979 tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
10980 len, bias_minus_one);
10982 /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>. */
10983 tree scalar_res
10984 = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
10985 vec_lhs_phi, last_index);
10987 /* Convert the extracted vector element to the scalar type. */
10988 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
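/* Editor's example for the computation above: with a zero partial
   load/store bias, LAST_INDEX = LEN + (0 - 1) = LEN - 1, i.e. the last
   active lane of the final iteration is extracted.  */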
10990 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
10992 /* Emit:
10994 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
10996 where VEC_LHS is the vectorized live-out result and MASK is
10997 the loop mask for the final iteration. */
10998 gcc_assert (!slp_node || SLP_TREE_LANES (slp_node) == 1);
10999 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
11000 gimple_seq tem = NULL;
11001 gimple_stmt_iterator gsi = gsi_last (tem);
11002 tree mask = vect_get_loop_mask (loop_vinfo, &gsi,
11003 &LOOP_VINFO_MASKS (loop_vinfo),
11004 1, vectype, 0);
11005 tree scalar_res;
11006 gimple_seq_add_seq (&stmts, tem);
11008 scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
11009 mask, vec_lhs_phi);
11011 /* Convert the extracted vector element to the scalar type. */
11012 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
11014 else
11016 tree bftype = TREE_TYPE (vectype);
11017 if (VECTOR_BOOLEAN_TYPE_P (vectype))
11018 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
11019 new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs_phi, bitsize, bitstart);
11020 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
11021 &stmts, true, NULL_TREE);
11024 *exit_gsi = gsi_after_labels (exit_bb);
11025 if (stmts)
11026 gsi_insert_seq_before (exit_gsi, stmts, GSI_SAME_STMT);
11028 return new_tree;
11031 /* Function vectorizable_live_operation.
11033 STMT_INFO computes a value that is used outside the loop. Check if
11034 it can be supported. */
11036 bool
11037 vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
11038 slp_tree slp_node, slp_instance slp_node_instance,
11039 int slp_index, bool vec_stmt_p,
11040 stmt_vector_for_cost *cost_vec)
11042 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
11043 imm_use_iterator imm_iter;
11044 tree lhs, lhs_type, bitsize;
11045 tree vectype = (slp_node
11046 ? SLP_TREE_VECTYPE (slp_node)
11047 : STMT_VINFO_VECTYPE (stmt_info));
11048 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
11049 int ncopies;
11050 gimple *use_stmt;
11051 use_operand_p use_p;
11052 auto_vec<tree> vec_oprnds;
11053 int vec_entry = 0;
11054 poly_uint64 vec_index = 0;
11056 gcc_assert (STMT_VINFO_LIVE_P (stmt_info)
11057 || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
11059 /* If a stmt of a reduction is live, vectorize it via
11060 vect_create_epilog_for_reduction. vectorizable_reduction assessed
11061 validity so just trigger the transform here. */
11062 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
11064 if (!vec_stmt_p)
11065 return true;
11066 /* For SLP reductions we vectorize the epilogue for all involved stmts
11067 together. */
11068 if (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info) && slp_index != 0)
11069 return true;
11070 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
11071 gcc_assert (reduc_info->is_reduc_info);
11072 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
11073 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
11074 return true;
11076 if (!LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
11077 || !LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
11078 vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
11079 slp_node_instance,
11080 LOOP_VINFO_IV_EXIT (loop_vinfo));
11082 /* For an early-break loop we only have to materialize the reduction on the merge
11083 block, but we have to find an alternate exit first. */
11084 if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
11086 slp_tree phis_node = slp_node ? slp_node_instance->reduc_phis : NULL;
11087 for (auto exit : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
11088 if (exit != LOOP_VINFO_IV_EXIT (loop_vinfo))
11090 vect_create_epilog_for_reduction (loop_vinfo, reduc_info,
11091 phis_node, slp_node_instance,
11092 exit);
11093 break;
11095 if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
11096 vect_create_epilog_for_reduction (loop_vinfo, reduc_info,
11097 phis_node, slp_node_instance,
11098 LOOP_VINFO_IV_EXIT (loop_vinfo));
11101 return true;
11104 /* If STMT is not relevant and it is a simple assignment and its inputs are
11105 invariant then it can remain in place, unvectorized. The original last
11106 scalar value that it computes will be used. */
11107 if (!STMT_VINFO_RELEVANT_P (stmt_info))
11109 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
11110 if (dump_enabled_p ())
11111 dump_printf_loc (MSG_NOTE, vect_location,
11112 "statement is simple and uses invariant. Leaving in "
11113 "place.\n");
11114 return true;
11117 if (slp_node)
11118 ncopies = 1;
11119 else
11120 ncopies = vect_get_num_copies (loop_vinfo, vectype);
11122 if (slp_node)
11124 gcc_assert (slp_index >= 0);
11126 /* Get the last occurrence of the scalar index from the concatenation of
11127 all the slp vectors. Calculate which slp vector it is and the index
11128 within. */
11129 int num_scalar = SLP_TREE_LANES (slp_node);
11130 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
11131 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
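/* Editor's worked example (a sketch): with 2 vector stmts of 4 lanes covering
   3 scalar lanes, pos = 2*4 - 3 + slp_index; for slp_index = 1 that is 6,
   which the division below turns into vec_entry = 1 and vec_index = 2.  */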
11133 /* Calculate which vector contains the result, and which lane of
11134 that vector we need. */
11135 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
11137 if (dump_enabled_p ())
11138 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
11139 "Cannot determine which vector holds the"
11140 " final result.\n");
11141 return false;
11145 if (!vec_stmt_p)
11147 /* No transformation required. */
11148 if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
11150 if (slp_node && SLP_TREE_LANES (slp_node) != 1)
11152 if (dump_enabled_p ())
11153 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
11154 "can't operate on partial vectors "
11155 "because an SLP statement is live after "
11156 "the loop.\n");
11157 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
11159 else if (ncopies > 1
11160 || (slp_node && SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node) > 1))
11162 if (dump_enabled_p ())
11163 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
11164 "can't operate on partial vectors "
11165 "because ncopies is greater than 1.\n");
11166 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
11168 else
11170 gcc_assert (ncopies == 1
11171 && (!slp_node || SLP_TREE_LANES (slp_node) == 1));
11172 if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
11173 OPTIMIZE_FOR_SPEED))
11174 vect_record_loop_mask (loop_vinfo,
11175 &LOOP_VINFO_MASKS (loop_vinfo),
11176 1, vectype, NULL);
11177 else if (can_vec_extract_var_idx_p (
11178 TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))
11179 vect_record_loop_len (loop_vinfo,
11180 &LOOP_VINFO_LENS (loop_vinfo),
11181 1, vectype, 1);
11182 else
11184 if (dump_enabled_p ())
11185 dump_printf_loc (
11186 MSG_MISSED_OPTIMIZATION, vect_location,
11187 "can't operate on partial vectors "
11188 "because the target doesn't support extract "
11189 "last reduction.\n");
11190 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
11194 /* ??? Enable for loop costing as well. */
11195 if (!loop_vinfo)
11196 record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
11197 0, vect_epilogue);
11198 return true;
11201 /* Use the lhs of the original scalar statement. */
11202 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
11203 if (dump_enabled_p ())
11204 dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
11205 "stmt %G", stmt);
11207 lhs = gimple_get_lhs (stmt);
11208 lhs_type = TREE_TYPE (lhs);
11210 bitsize = vector_element_bits_tree (vectype);
11212 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
11213 tree vec_lhs, vec_lhs0, bitstart;
11214 gimple *vec_stmt, *vec_stmt0;
11215 if (slp_node)
11217 gcc_assert (!loop_vinfo
11218 || ((!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
11219 && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
11220 || SLP_TREE_LANES (slp_node) == 1));
11222 /* Get the correct slp vectorized stmt. */
11223 vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
11224 vec_stmt = SSA_NAME_DEF_STMT (vec_lhs);
11226 /* In case we need to vectorize an early break, also get the first stmt. */
11227 vec_lhs0 = SLP_TREE_VEC_DEFS (slp_node)[0];
11228 vec_stmt0 = SSA_NAME_DEF_STMT (vec_lhs0);
11230 /* Get entry to use. */
11231 bitstart = bitsize_int (vec_index);
11232 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
11234 else
11236 /* For multiple copies, get the last copy. */
11237 vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
11238 vec_lhs = gimple_get_lhs (vec_stmt);
11240 /* In case we need to vectorize an early break, also get the first stmt. */
11241 vec_stmt0 = STMT_VINFO_VEC_STMTS (stmt_info)[0];
11242 vec_lhs0 = gimple_get_lhs (vec_stmt0);
11244 /* Get the last lane in the vector. */
11245 bitstart = int_const_binop (MULT_EXPR, bitsize, bitsize_int (nunits - 1));
11248 if (loop_vinfo)
11250 /* To ensure the VEC_LHS for lane extraction stmts satisfies the loop-closed
11251 PHI requirement, insert one phi node for it. It looks like:
11252 loop;
11254 # lhs' = PHI <lhs>
11256 loop;
11258 # vec_lhs' = PHI <vec_lhs>
11259 new_tree = lane_extract <vec_lhs', ...>;
11260 lhs' = new_tree; */
11262 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11263 /* Check if we have a loop where the chosen exit is not the main exit;
11264 in these cases, for an early break, the scalar code restarts the iteration the
11265 vector code was executing. For the live values we want the value at the start of the iteration
11266 rather than at the end. */
11267 edge main_e = LOOP_VINFO_IV_EXIT (loop_vinfo);
11268 bool all_exits_as_early_p = LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo);
11269 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
11270 if (!is_gimple_debug (use_stmt)
11271 && !flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
11272 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
11274 edge e = gimple_phi_arg_edge (as_a <gphi *> (use_stmt),
11275 phi_arg_index_from_use (use_p));
11276 gcc_assert (loop_exit_edge_p (loop, e));
11277 bool main_exit_edge = e == main_e;
11278 tree tmp_vec_lhs = vec_lhs;
11279 tree tmp_bitstart = bitstart;
11281 /* For an early exit where the exit is not in the BB that leads
11282 to the latch, we're restarting the iteration in the
11283 scalar loop. So get the first live value. */
11284 if ((all_exits_as_early_p || !main_exit_edge)
11285 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
11287 tmp_vec_lhs = vec_lhs0;
11288 tmp_bitstart = build_zero_cst (TREE_TYPE (bitstart));
11291 gimple_stmt_iterator exit_gsi;
11292 tree new_tree
11293 = vectorizable_live_operation_1 (loop_vinfo, stmt_info,
11294 e->dest, vectype, ncopies,
11295 slp_node, bitsize,
11296 tmp_bitstart, tmp_vec_lhs,
11297 lhs_type, &exit_gsi);
11299 auto gsi = gsi_for_stmt (use_stmt);
11300 tree lhs_phi = gimple_phi_result (use_stmt);
11301 remove_phi_node (&gsi, false);
11302 gimple *copy = gimple_build_assign (lhs_phi, new_tree);
11303 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
11304 break;
11307 /* There are no further out-of-loop uses of lhs by LC-SSA construction. */
11308 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
11309 gcc_assert (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)));
11311 else
11313 /* For basic-block vectorization simply insert the lane-extraction. */
11314 tree bftype = TREE_TYPE (vectype);
11315 if (VECTOR_BOOLEAN_TYPE_P (vectype))
11316 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
11317 tree new_tree = build3 (BIT_FIELD_REF, bftype,
11318 vec_lhs, bitsize, bitstart);
11319 gimple_seq stmts = NULL;
11320 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
11321 &stmts, true, NULL_TREE);
11322 if (TREE_CODE (new_tree) == SSA_NAME
11323 && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
11324 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
11325 if (is_a <gphi *> (vec_stmt))
11327 gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
11328 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
11330 else
11332 gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
11333 gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
11336 /* Replace the use of lhs with the newly computed result. If the use stmt is a
11337 single-arg PHI, just replace all uses of the PHI result. This is necessary
11338 because the LC-SSA PHI defining lhs may come before the newly inserted stmt. */
11339 use_operand_p use_p;
11340 stmt_vec_info use_stmt_info;
11341 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
11342 if (!is_gimple_debug (use_stmt)
11343 && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
11344 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
11346 /* ??? This can happen when the live lane ends up being
11347 rooted in a vector construction code-generated by an
11348 external SLP node (and code-generation for that already
11349 happened). See gcc.dg/vect/bb-slp-47.c.
11350 Doing this is what would happen if that vector CTOR
11351 were not code-generated yet so it is not too bad.
11352 ??? In fact we'd likely want to avoid this situation
11353 in the first place. */
11354 if (TREE_CODE (new_tree) == SSA_NAME
11355 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
11356 && gimple_code (use_stmt) != GIMPLE_PHI
11357 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
11358 use_stmt))
11360 if (dump_enabled_p ())
11361 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
11362 "Using original scalar computation for "
11363 "live lane because use preceeds vector "
11364 "def\n");
11365 continue;
11367 /* ??? It can also happen that we end up pulling a def into
11368 a loop where replacing out-of-loop uses would require
11369 a new LC SSA PHI node. Retain the original scalar in
11370 those cases as well. PR98064. */
11371 if (TREE_CODE (new_tree) == SSA_NAME
11372 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
11373 && (gimple_bb (use_stmt)->loop_father
11374 != gimple_bb (vec_stmt)->loop_father)
11375 && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
11376 gimple_bb (use_stmt)->loop_father))
11378 if (dump_enabled_p ())
11379 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
11380 "Using original scalar computation for "
11381 "live lane because there is an out-of-loop "
11382 "definition for it\n");
11383 continue;
11385 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
11386 SET_USE (use_p, new_tree);
11387 update_stmt (use_stmt);
11391 return true;
11394 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
11396 static void
11397 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
11399 ssa_op_iter op_iter;
11400 imm_use_iterator imm_iter;
11401 def_operand_p def_p;
11402 gimple *ustmt;
11404 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
11406 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
11408 basic_block bb;
11410 if (!is_gimple_debug (ustmt))
11411 continue;
11413 bb = gimple_bb (ustmt);
11415 if (!flow_bb_inside_loop_p (loop, bb))
11417 if (gimple_debug_bind_p (ustmt))
11419 if (dump_enabled_p ())
11420 dump_printf_loc (MSG_NOTE, vect_location,
11421 "killing debug use\n");
11423 gimple_debug_bind_reset_value (ustmt);
11424 update_stmt (ustmt);
11426 else
11427 gcc_unreachable ();
11433 /* Given the loop represented by LOOP_VINFO, return true if the computation of
11434 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
11435 otherwise. */
11437 static bool
11438 loop_niters_no_overflow (loop_vec_info loop_vinfo)
11440 /* Constant case. */
11441 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
11443 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
11444 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
11446 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
11447 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
11448 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
11449 return true;
11452 widest_int max;
11453 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11454 /* Check the upper bound of loop niters. */
11455 if (get_max_loop_iterations (loop, &max))
11457 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
11458 signop sgn = TYPE_SIGN (type);
11459 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
11460 if (max < type_max)
11461 return true;
11463 return false;
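/* Editor's example for the checks above: if the niters type is 32-bit
   unsigned and NITERSM1 == 0xffffffff, NITERS = NITERSM1 + 1 wraps to 0, so
   the constant test NITERSM1 < NITERS fails; the upper-bound test cannot
   prove MAX < TYPE_MAX either, and false is returned.  */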
11466 /* Return a mask type with half the number of elements as OLD_TYPE,
11467 given that it should have mode NEW_MODE. */
11469 tree
11470 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
11472 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
11473 return build_truth_vector_type_for_mode (nunits, new_mode);
11476 /* Return a mask type with twice as many elements as OLD_TYPE,
11477 given that it should have mode NEW_MODE. */
11479 tree
11480 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
11482 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
11483 return build_truth_vector_type_for_mode (nunits, new_mode);
11486 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
11487 contain a sequence of NVECTORS masks that each control a vector of type
11488 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
11489 these vector masks with the vector version of SCALAR_MASK. */
11491 void
11492 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
11493 unsigned int nvectors, tree vectype, tree scalar_mask)
11495 gcc_assert (nvectors != 0);
11497 if (scalar_mask)
11499 scalar_cond_masked_key cond (scalar_mask, nvectors);
11500 loop_vinfo->scalar_cond_masked_set.add (cond);
11503 masks->mask_set.add (std::make_pair (vectype, nvectors));
11506 /* Given a complete set of masks MASKS, extract mask number INDEX
11507 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
11508 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
11510 See the comment above vec_loop_masks for more details about the mask
11511 arrangement. */
11513 tree
11514 vect_get_loop_mask (loop_vec_info loop_vinfo,
11515 gimple_stmt_iterator *gsi, vec_loop_masks *masks,
11516 unsigned int nvectors, tree vectype, unsigned int index)
11518 if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
11519 == vect_partial_vectors_while_ult)
11521 rgroup_controls *rgm = &(masks->rgc_vec)[nvectors - 1];
11522 tree mask_type = rgm->type;
11524 /* Populate the rgroup's mask array, if this is the first time we've
11525 used it. */
11526 if (rgm->controls.is_empty ())
11528 rgm->controls.safe_grow_cleared (nvectors, true);
11529 for (unsigned int i = 0; i < nvectors; ++i)
11531 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
11532 /* Provide a dummy definition until the real one is available. */
11533 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
11534 rgm->controls[i] = mask;
11538 tree mask = rgm->controls[index];
11539 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
11540 TYPE_VECTOR_SUBPARTS (vectype)))
11542 /* A loop mask for data type X can be reused for data type Y
11543 if X has N times more elements than Y and if Y's elements
11544 are N times bigger than X's. In this case each sequence
11545 of N elements in the loop mask will be all-zero or all-one.
11546 We can then view-convert the mask so that each sequence of
11547 N elements is replaced by a single element. */
11548 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
11549 TYPE_VECTOR_SUBPARTS (vectype)));
11550 gimple_seq seq = NULL;
11551 mask_type = truth_type_for (vectype);
11552 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
11553 if (seq)
11554 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11556 return mask;
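/* Editor's illustration of the reuse case above (a sketch): a mask created
   for 8 x 16-bit elements can serve a vector of 4 x 32-bit elements; each
   adjacent pair of mask lanes is known to be all-zero or all-one, so the
   VIEW_CONVERT collapses every pair into one wider mask lane.  */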
11558 else if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
11559 == vect_partial_vectors_avx512)
11561 /* The number of scalars per iteration and the number of vectors are
11562 both compile-time constants. */
11563 unsigned int nscalars_per_iter
11564 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11565 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
11567 rgroup_controls *rgm = &masks->rgc_vec[nscalars_per_iter - 1];
11569 /* The stored number of vectors (nV) depends on the mask type produced. */
11570 gcc_assert (exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11571 TYPE_VECTOR_SUBPARTS (rgm->type)).to_constant ()
11572 == rgm->factor);
11573 nvectors = rgm->factor;
11575 /* Populate the rgroup's mask array, if this is the first time we've
11576 used it. */
11577 if (rgm->controls.is_empty ())
11579 rgm->controls.safe_grow_cleared (nvectors, true);
11580 for (unsigned int i = 0; i < nvectors; ++i)
11582 tree mask = make_temp_ssa_name (rgm->type, NULL, "loop_mask");
11583 /* Provide a dummy definition until the real one is available. */
11584 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
11585 rgm->controls[i] = mask;
11588 if (known_eq (TYPE_VECTOR_SUBPARTS (rgm->type),
11589 TYPE_VECTOR_SUBPARTS (vectype)))
11590 return rgm->controls[index];
11592 /* Split the vector if needed. Since we are dealing with integer mode
11593 masks with AVX512 we can operate on the integer representation,
11594 shifting the whole vector. */
11595 unsigned HOST_WIDE_INT factor;
11596 bool ok = constant_multiple_p (TYPE_VECTOR_SUBPARTS (rgm->type),
11597 TYPE_VECTOR_SUBPARTS (vectype), &factor);
11598 gcc_assert (ok);
11599 gcc_assert (GET_MODE_CLASS (TYPE_MODE (rgm->type)) == MODE_INT);
11600 tree mask_type = truth_type_for (vectype);
11601 gcc_assert (GET_MODE_CLASS (TYPE_MODE (mask_type)) == MODE_INT);
11602 unsigned vi = index / factor;
11603 unsigned vpart = index % factor;
11604 tree vec = rgm->controls[vi];
11605 gimple_seq seq = NULL;
11606 vec = gimple_build (&seq, VIEW_CONVERT_EXPR,
11607 lang_hooks.types.type_for_mode
11608 (TYPE_MODE (rgm->type), 1), vec);
11609 /* For integer mode masks simply shift the right bits into position. */
11610 if (vpart != 0)
11611 vec = gimple_build (&seq, RSHIFT_EXPR, TREE_TYPE (vec), vec,
11612 build_int_cst (integer_type_node,
11613 (TYPE_VECTOR_SUBPARTS (vectype)
11614 * vpart)));
11615 vec = gimple_convert (&seq, lang_hooks.types.type_for_mode
11616 (TYPE_MODE (mask_type), 1), vec);
11617 vec = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, vec);
11618 if (seq)
11619 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11620 return vec;
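      /* Illustrative sketch only (plain integers standing in for the
	 integer-mode AVX512 masks): a 16-bit mask covering four sub-vectors
	 of 4 elements yields the sub-mask for group VPART by shifting and
	 truncating, which is what the shift/convert sequence above builds:

	   unsigned short wide_mask;   /* 16 mask bits, 4 groups of 4.  */
	   unsigned char sub_mask
	     = (unsigned char) (wide_mask >> (4 * vpart));
	   /* Only the low 4 bits of sub_mask are meaningful.  */

	 The real code stays in the masks' integer modes and moves between
	 mask and integer views with VIEW_CONVERT_EXPR.  */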
11622 else
11623 gcc_unreachable ();
11626 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
11627 lengths for controlling an operation on VECTYPE. The operation splits
11628 each element of VECTYPE into FACTOR separate subelements, measuring the
11629 length as a number of these subelements. */
11631 void
11632 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
11633 unsigned int nvectors, tree vectype, unsigned int factor)
11635 gcc_assert (nvectors != 0);
11636 if (lens->length () < nvectors)
11637 lens->safe_grow_cleared (nvectors, true);
11638 rgroup_controls *rgl = &(*lens)[nvectors - 1];
11640   /* The number of scalars per iteration, the bytes each scalar occupies
11641      and the number of vectors are all compile-time constants.  */
11642 unsigned int nscalars_per_iter
11643 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11644 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
11646 if (rgl->max_nscalars_per_iter < nscalars_per_iter)
11648 /* For now, we only support cases in which all loads and stores fall back
11649 to VnQI or none do. */
11650 gcc_assert (!rgl->max_nscalars_per_iter
11651 || (rgl->factor == 1 && factor == 1)
11652 || (rgl->max_nscalars_per_iter * rgl->factor
11653 == nscalars_per_iter * factor));
11654 rgl->max_nscalars_per_iter = nscalars_per_iter;
11655 rgl->type = vectype;
11656 rgl->factor = factor;
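      /* Worked example with illustrative numbers: with a vectorization
	 factor of 8, recording NVECTORS == 4 vectors of a 4-element VECTYPE
	 gives nscalars_per_iter = 4 * 4 / 8 = 2, so the lengths end up in
	 (*lens)[nvectors - 1] == (*lens)[3] with max_nscalars_per_iter of
	 at least 2.  */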
11660 /* Given a complete set of lengths LENS, extract length number INDEX
11661 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
11662 where 0 <= INDEX < NVECTORS. Return a value that contains FACTOR
11663    multiplied by the number of elements that should be processed.
11664 Insert any set-up statements before GSI. */
11666 tree
11667 vect_get_loop_len (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
11668 vec_loop_lens *lens, unsigned int nvectors, tree vectype,
11669 unsigned int index, unsigned int factor)
11671 rgroup_controls *rgl = &(*lens)[nvectors - 1];
11672 bool use_bias_adjusted_len =
11673 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
11675 /* Populate the rgroup's len array, if this is the first time we've
11676 used it. */
11677 if (rgl->controls.is_empty ())
11679 rgl->controls.safe_grow_cleared (nvectors, true);
11680 for (unsigned int i = 0; i < nvectors; ++i)
11682 tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
11683 gcc_assert (len_type != NULL_TREE);
11685 tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
11687 /* Provide a dummy definition until the real one is available. */
11688 SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
11689 rgl->controls[i] = len;
11691 if (use_bias_adjusted_len)
11693 gcc_assert (i == 0);
11694 tree adjusted_len =
11695 make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
11696 SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
11697 rgl->bias_adjusted_ctrl = adjusted_len;
11702 if (use_bias_adjusted_len)
11703 return rgl->bias_adjusted_ctrl;
11705 tree loop_len = rgl->controls[index];
11706 if (rgl->factor == 1 && factor == 1)
11708 poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
11709 poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
11710 if (maybe_ne (nunits1, nunits2))
11712 /* A loop len for data type X can be reused for data type Y
11713 if X has N times more elements than Y and if Y's elements
11714 are N times bigger than X's. */
11715 gcc_assert (multiple_p (nunits1, nunits2));
11716 factor = exact_div (nunits1, nunits2).to_constant ();
11717 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
11718 gimple_seq seq = NULL;
11719 loop_len = gimple_build (&seq, RDIV_EXPR, iv_type, loop_len,
11720 build_int_cst (iv_type, factor));
11721 if (seq)
11722 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11725 return loop_len;
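  /* Worked example with illustrative numbers: if the recorded rgroup
     length counts the elements of an 8-element vector type but the
     caller's VECTYPE has only 4 elements, each VECTYPE element spans two
     recorded elements, so the division built above returns the recorded
     length divided by 2.  */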
11728 /* Generate the tree for the loop length mask and return it.  Given LENS,
11729    NVECTORS, VECTYPE, INDEX and FACTOR, generate the length mask as:
11731    tree len_mask = VCOND_MASK_LEN (compare_mask, ones, zero, len, bias)
11733 tree
11734 vect_gen_loop_len_mask (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
11735 gimple_stmt_iterator *cond_gsi, vec_loop_lens *lens,
11736 unsigned int nvectors, tree vectype, tree stmt,
11737 unsigned int index, unsigned int factor)
11739 tree all_one_mask = build_all_ones_cst (vectype);
11740 tree all_zero_mask = build_zero_cst (vectype);
11741 tree len = vect_get_loop_len (loop_vinfo, gsi, lens, nvectors, vectype, index,
11742 factor);
11743 tree bias = build_int_cst (intQI_type_node,
11744 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo));
11745 tree len_mask = make_temp_ssa_name (TREE_TYPE (stmt), NULL, "vec_len_mask");
11746 gcall *call = gimple_build_call_internal (IFN_VCOND_MASK_LEN, 5, stmt,
11747 all_one_mask, all_zero_mask, len,
11748 bias);
11749 gimple_call_set_lhs (call, len_mask);
11750 gsi_insert_before (cond_gsi, call, GSI_SAME_STMT);
11752 return len_mask;
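  /* Rough scalar model of the mask built above (a sketch, not the formal
     IFN_VCOND_MASK_LEN semantics):

	for (i = 0; i < nunits; i++)
	  len_mask[i] = (i < len + bias) ? compare_mask[i] : 0;

     i.e. the compare mask STMT is additionally restricted to the first
     LEN (bias-adjusted) elements.  */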
11755 /* Scale profiling counters by estimation for LOOP which is vectorized
11756 by factor VF.
11757    If FLAT is true, the loop we started with had an unrealistically flat
11758    profile.  */
11760 static void
11761 scale_profile_for_vect_loop (class loop *loop, edge exit_e, unsigned vf, bool flat)
11763 /* For flat profiles do not scale down proportionally by VF and only
11764 cap by known iteration count bounds. */
11765 if (flat)
11767 if (dump_file && (dump_flags & TDF_DETAILS))
11768 fprintf (dump_file,
11769 "Vectorized loop profile seems flat; not scaling iteration "
11770 "count down by the vectorization factor %i\n", vf);
11771 scale_loop_profile (loop, profile_probability::always (),
11772 get_likely_max_loop_iterations_int (loop));
11773 return;
11775   /* The loop body executes VF times fewer iterations and the exit
	 probability increases VF times.  */
11776 profile_count entry_count = loop_preheader_edge (loop)->count ();
11778   /* If we have an unreliable loop profile, avoid dropping the entry
11779      count below the header count.  This can happen when the loop
11780      has an unrealistically low trip count.  */
11781 while (vf > 1
11782 && loop->header->count > entry_count
11783 && loop->header->count < entry_count * vf)
11785 if (dump_file && (dump_flags & TDF_DETAILS))
11786 fprintf (dump_file,
11787 "Vectorization factor %i seems too large for profile "
11788 		 "previously believed to be consistent; reducing.\n", vf);
11789 vf /= 2;
11792 if (entry_count.nonzero_p ())
11793 set_edge_probability_and_rescale_others
11794 (exit_e,
11795 entry_count.probability_in (loop->header->count / vf));
11796 /* Avoid producing very large exit probability when we do not have
11797 sensible profile. */
11798 else if (exit_e->probability < profile_probability::always () / (vf * 2))
11799 set_edge_probability_and_rescale_others (exit_e, exit_e->probability * vf);
11800 loop->latch->count = single_pred_edge (loop->latch)->count ();
11802 scale_loop_profile (loop, profile_probability::always () / vf,
11803 get_likely_max_loop_iterations_int (loop));
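/* Worked example with illustrative numbers: with an entry count of 1000,
   a header count of 8000 (about 8 scalar iterations per entry) and
   VF == 4, the exit probability and the scaling above bring the header
   count to roughly 8000 / 4 == 2000, i.e. about two vector iterations
   per entry into the loop.  */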
11806 /* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
11807 latch edge values originally defined by it. */
11809 static void
11810 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
11811 stmt_vec_info def_stmt_info)
11813 tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
11814 if (!def || TREE_CODE (def) != SSA_NAME)
11815 return;
11816 stmt_vec_info phi_info;
11817 imm_use_iterator iter;
11818 use_operand_p use_p;
11819 FOR_EACH_IMM_USE_FAST (use_p, iter, def)
11821 gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p));
11822 if (!phi)
11823 continue;
11824 if (!(gimple_bb (phi)->loop_father->header == gimple_bb (phi)
11825 && (phi_info = loop_vinfo->lookup_stmt (phi))
11826 && STMT_VINFO_RELEVANT_P (phi_info)))
11827 continue;
11828 loop_p loop = gimple_bb (phi)->loop_father;
11829 edge e = loop_latch_edge (loop);
11830 if (PHI_ARG_DEF_FROM_EDGE (phi, e) != def)
11831 continue;
11833 if (VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
11834 && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
11835 && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
11837 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
11838 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
11839 gcc_assert (phi_defs.length () == latch_defs.length ());
11840 for (unsigned i = 0; i < phi_defs.length (); ++i)
11841 add_phi_arg (as_a <gphi *> (phi_defs[i]),
11842 gimple_get_lhs (latch_defs[i]), e,
11843 gimple_phi_arg_location (phi, e->dest_idx));
11845 else if (STMT_VINFO_DEF_TYPE (phi_info) == vect_first_order_recurrence)
11847 /* For first order recurrences we have to update both uses of
11848 the latch definition, the one in the PHI node and the one
11849 in the generated VEC_PERM_EXPR. */
11850 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
11851 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
11852 gcc_assert (phi_defs.length () == latch_defs.length ());
11853 tree phidef = gimple_assign_rhs1 (phi_defs[0]);
11854 gphi *vphi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
11855 for (unsigned i = 0; i < phi_defs.length (); ++i)
11857 gassign *perm = as_a <gassign *> (phi_defs[i]);
11858 if (i > 0)
11859 gimple_assign_set_rhs1 (perm, gimple_get_lhs (latch_defs[i-1]));
11860 gimple_assign_set_rhs2 (perm, gimple_get_lhs (latch_defs[i]));
11861 update_stmt (perm);
11863 add_phi_arg (vphi, gimple_get_lhs (latch_defs.last ()), e,
11864 gimple_phi_arg_location (phi, e->dest_idx));
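/* Scalar shape of the cases handled above (illustrative only): reduction
   and induction PHIs simply receive the vectorized latch definition,
   whereas a first-order recurrence such as

     t = 0;
     for (i = 0; i < n; i++)
       {
	 b[i] = a[i] - t;
	 t = a[i];
       }

   keeps the previous iteration's value alive, so both the vector PHI and
   the VEC_PERM_EXPR that shifts the new elements into place must see the
   vectorized latch value.  */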
11869 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
11870 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
11871 stmt_vec_info. */
11873 static bool
11874 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
11875 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
11877 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11878 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11880 if (dump_enabled_p ())
11881 dump_printf_loc (MSG_NOTE, vect_location,
11882 "------>vectorizing statement: %G", stmt_info->stmt);
11884 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
11885 vect_loop_kill_debug_uses (loop, stmt_info);
11887 if (!STMT_VINFO_RELEVANT_P (stmt_info)
11888 && !STMT_VINFO_LIVE_P (stmt_info))
11890 if (is_gimple_call (stmt_info->stmt)
11891 && gimple_call_internal_p (stmt_info->stmt, IFN_MASK_CALL))
11893 gcc_assert (!gimple_call_lhs (stmt_info->stmt));
11894 *seen_store = stmt_info;
11895 return false;
11897 return false;
11900 if (STMT_VINFO_VECTYPE (stmt_info))
11902 poly_uint64 nunits
11903 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
11904 if (!STMT_SLP_TYPE (stmt_info)
11905 && maybe_ne (nunits, vf)
11906 && dump_enabled_p ())
11907 /* For SLP VF is set according to unrolling factor, and not
11908 to vector size, hence for SLP this print is not valid. */
11909 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
11912 /* Pure SLP statements have already been vectorized. We still need
11913 to apply loop vectorization to hybrid SLP statements. */
11914 if (PURE_SLP_STMT (stmt_info))
11915 return false;
11917 if (dump_enabled_p ())
11918 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
11920 if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
11921 *seen_store = stmt_info;
11923 return true;
11926 /* Helper function to pass to simplify_replace_tree to enable replacing trees
11927    that appear in the hash_map with their corresponding values.  */
11929 static tree
11930 find_in_mapping (tree t, void *context)
11932 hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
11934 tree *value = mapping->get (t);
11935 return value ? *value : t;
11938 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
11939 original loop that has now been vectorized.
11941 The inits of the data_references need to be advanced with the number of
11942 iterations of the main loop. This has been computed in vect_do_peeling and
11943    is stored in parameter ADVANCE.  We first restore the data_references'
11944    initial offsets with the values recorded in ORIG_DRS_INIT.
11946 Since the loop_vec_info of this EPILOGUE was constructed for the original
11947 loop, its stmt_vec_infos all point to the original statements. These need
11948 to be updated to point to their corresponding copies as well as the SSA_NAMES
11949 in their PATTERN_DEF_SEQs and RELATED_STMTs.
11951    The data_references' connections also need to be updated.  Their
11952    corresponding dr_vec_infos need to be reconnected to the EPILOGUE's
11953    stmt_vec_infos, their statements need to point to their corresponding copy,
11954    and if they are gather loads or scatter stores then their reference needs
11955    to be updated to point to its corresponding copy.  */
11957 static void
11958 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
11960 loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
11961 auto_vec<gimple *> stmt_worklist;
11962 hash_map<tree,tree> mapping;
11963 gimple *orig_stmt, *new_stmt;
11964 gimple_stmt_iterator epilogue_gsi;
11965 gphi_iterator epilogue_phi_gsi;
11966 stmt_vec_info stmt_vinfo = NULL, related_vinfo;
11967 basic_block *epilogue_bbs = get_loop_body (epilogue);
11968 unsigned i;
11970 free (LOOP_VINFO_BBS (epilogue_vinfo));
11971 LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
11972 LOOP_VINFO_NBBS (epilogue_vinfo) = epilogue->num_nodes;
11974 /* Advance data_reference's with the number of iterations of the previous
11975 loop and its prologue. */
11976 vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
11979   /* The EPILOGUE loop is a copy of the original loop, so they share the same
11980      gimple UIDs.  In this loop we update the loop_vec_info of the EPILOGUE to
11981      point to the copied statements.  We also create a mapping from each LHS in
11982      the original loop to the corresponding LHS in the EPILOGUE, and create
11983      worklists to update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs.  */
11984 for (unsigned i = 0; i < epilogue->num_nodes; ++i)
11986 for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
11987 !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
11989 new_stmt = epilogue_phi_gsi.phi ();
11991 gcc_assert (gimple_uid (new_stmt) > 0);
11992 stmt_vinfo
11993 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11995 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11996 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11998 mapping.put (gimple_phi_result (orig_stmt),
11999 gimple_phi_result (new_stmt));
12000 	  /* PHI nodes cannot have patterns or related statements.  */
12001 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
12002 && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
12005 for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
12006 !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
12008 new_stmt = gsi_stmt (epilogue_gsi);
12009 if (is_gimple_debug (new_stmt))
12010 continue;
12012 gcc_assert (gimple_uid (new_stmt) > 0);
12013 stmt_vinfo
12014 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
12016 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
12017 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
12019 if (tree old_lhs = gimple_get_lhs (orig_stmt))
12020 mapping.put (old_lhs, gimple_get_lhs (new_stmt));
12022 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
12024 gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
12025 for (gimple_stmt_iterator gsi = gsi_start (seq);
12026 !gsi_end_p (gsi); gsi_next (&gsi))
12027 stmt_worklist.safe_push (gsi_stmt (gsi));
12030 related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
12031 if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
12033 gimple *stmt = STMT_VINFO_STMT (related_vinfo);
12034 stmt_worklist.safe_push (stmt);
12035 /* Set BB such that the assert in
12036 'get_initial_def_for_reduction' is able to determine that
12037 the BB of the related stmt is inside this loop. */
12038 gimple_set_bb (stmt,
12039 gimple_bb (new_stmt));
12040 related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
12041 gcc_assert (related_vinfo == NULL
12042 || related_vinfo == stmt_vinfo);
12047 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
12048 using the original main loop and thus need to be updated to refer to the
12049 cloned variables used in the epilogue. */
12050 for (unsigned i = 0; i < stmt_worklist.length (); ++i)
12052 gimple *stmt = stmt_worklist[i];
12053 tree *new_op;
12055 for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
12057 tree op = gimple_op (stmt, j);
12058 if ((new_op = mapping.get(op)))
12059 gimple_set_op (stmt, j, *new_op);
12060 else
12062 /* PR92429: The last argument of simplify_replace_tree disables
12063 folding when replacing arguments. This is required as
12064 otherwise you might end up with different statements than the
12065 ones analyzed in vect_loop_analyze, leading to different
12066 vectorization. */
12067 op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
12068 &find_in_mapping, &mapping, false);
12069 gimple_set_op (stmt, j, op);
12074 struct data_reference *dr;
12075 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
12076 FOR_EACH_VEC_ELT (datarefs, i, dr)
12078 orig_stmt = DR_STMT (dr);
12079 gcc_assert (gimple_uid (orig_stmt) > 0);
12080 stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
12081       /* Data references for gather loads and scatter stores do not use the
12082 	 updated offset we set using ADVANCE.  Instead we have to make sure the
12083 	 reference in the data reference points to the corresponding copy of
12084 	 the original in the epilogue.  Make sure to update both the
12085 	 gather/scatters recognized by dataref analysis and also other
12086 	 refs that get_load_store_type classified as VMAT_GATHER_SCATTER.  */
12087 auto vstmt_vinfo = vect_stmt_to_vectorize (stmt_vinfo);
12088 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vstmt_vinfo) == VMAT_GATHER_SCATTER
12089 || STMT_VINFO_GATHER_SCATTER_P (vstmt_vinfo))
12091 DR_REF (dr)
12092 = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
12093 &find_in_mapping, &mapping);
12094 DR_BASE_ADDRESS (dr)
12095 = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
12096 &find_in_mapping, &mapping);
12098 DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
12099 stmt_vinfo->dr_aux.stmt = stmt_vinfo;
12102 epilogue_vinfo->shared->datarefs_copy.release ();
12103 epilogue_vinfo->shared->save_datarefs ();
12106 /* When vectorizing early break statements, instructions that happen before
12107    the early break in the current BB need to be moved to after the early
12108    break.  This function deals with that and assumes that any validity
12109    checks have already been performed.
12111    While moving the instructions, if a VUSE or VDEF is encountered the
12112    VUSEs are corrected as the statements are moved along.  The statements
12113    are inserted at the destination recorded in LOOP_VINFO_EARLY_BRK_DEST_BB.  */
12115 static void
12116 move_early_exit_stmts (loop_vec_info loop_vinfo)
12118 DUMP_VECT_SCOPE ("move_early_exit_stmts");
12120 if (LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo).is_empty ())
12121 return;
12123 /* Move all stmts that need moving. */
12124 basic_block dest_bb = LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo);
12125 gimple_stmt_iterator dest_gsi = gsi_after_labels (dest_bb);
12127 tree last_seen_vuse = NULL_TREE;
12128 for (gimple *stmt : LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo))
12130 /* We have to update crossed degenerate virtual PHIs. Simply
12131 elide them. */
12132 if (gphi *vphi = dyn_cast <gphi *> (stmt))
12134 tree vdef = gimple_phi_result (vphi);
12135 tree vuse = gimple_phi_arg_def (vphi, 0);
12136 imm_use_iterator iter;
12137 use_operand_p use_p;
12138 gimple *use_stmt;
12139 FOR_EACH_IMM_USE_STMT (use_stmt, iter, vdef)
12141 FOR_EACH_IMM_USE_ON_STMT (use_p, iter)
12142 SET_USE (use_p, vuse);
12144 auto gsi = gsi_for_stmt (stmt);
12145 remove_phi_node (&gsi, true);
12146 last_seen_vuse = vuse;
12147 continue;
12150 /* Check to see if statement is still required for vect or has been
12151 elided. */
12152 auto stmt_info = loop_vinfo->lookup_stmt (stmt);
12153 if (!stmt_info)
12154 continue;
12156 if (dump_enabled_p ())
12157 dump_printf_loc (MSG_NOTE, vect_location, "moving stmt %G", stmt);
12159 gimple_stmt_iterator stmt_gsi = gsi_for_stmt (stmt);
12160 gsi_move_before (&stmt_gsi, &dest_gsi, GSI_NEW_STMT);
12161 last_seen_vuse = gimple_vuse (stmt);
12164 /* Update all the stmts with their new reaching VUSES. */
12165 for (auto p : LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo))
12167 if (dump_enabled_p ())
12168 dump_printf_loc (MSG_NOTE, vect_location,
12169 "updating vuse to %T for load %G",
12170 last_seen_vuse, p);
12171 gimple_set_vuse (p, last_seen_vuse);
12172 update_stmt (p);
12175 /* And update the LC PHIs on exits. */
12176 for (edge e : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
12177 if (!dominated_by_p (CDI_DOMINATORS, e->src, dest_bb))
12178 if (gphi *phi = get_virtual_phi (e->dest))
12179 SET_PHI_ARG_DEF_ON_EDGE (phi, e, last_seen_vuse);
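/* Source-level sketch of the effect (illustrative only): in an
   early-break loop such as

     for (i = 0; i < n; i++)
       {
	 a[i] = x[i];		/* Store before the break.  */
	 if (y[i] == z)
	   break;
       }

   the store is sunk below the break check so that the vectorized loop
   only performs it once the exit condition for the whole vector
   iteration is known; the recorded loads and the exit PHIs are then
   rewired to the last moved virtual definition.  */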
12182 /* Function vect_transform_loop.
12184 The analysis phase has determined that the loop is vectorizable.
12185    Vectorize the loop - create vectorized stmts to replace the scalar
12186    stmts in the loop, and update the loop exit condition.
12187    Returns the scalar epilogue loop if any.  */
12189 class loop *
12190 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
12192 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
12193 class loop *epilogue = NULL;
12194 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
12195 int nbbs = loop->num_nodes;
12196 int i;
12197 tree niters_vector = NULL_TREE;
12198 tree step_vector = NULL_TREE;
12199 tree niters_vector_mult_vf = NULL_TREE;
12200 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
12201 unsigned int lowest_vf = constant_lower_bound (vf);
12202 gimple *stmt;
12203 bool check_profitability = false;
12204 unsigned int th;
12205 bool flat = maybe_flat_loop_profile (loop);
12207 DUMP_VECT_SCOPE ("vec_transform_loop");
12209 loop_vinfo->shared->check_datarefs ();
12211   /* Use the more conservative vectorization threshold.  If the number
12212      of iterations is constant, assume the cost check has been performed
12213      by our caller.  If the threshold makes all loops profitable that
12214      run at least the (estimated) vectorization factor number of times,
12215      checking is pointless too.  */
12216 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
12217 if (vect_apply_runtime_profitability_check_p (loop_vinfo))
12219 if (dump_enabled_p ())
12220 dump_printf_loc (MSG_NOTE, vect_location,
12221 "Profitability threshold is %d loop iterations.\n",
12222 th);
12223 check_profitability = true;
12226 /* Make sure there exists a single-predecessor exit bb. Do this before
12227 versioning. */
12228 edge e = LOOP_VINFO_IV_EXIT (loop_vinfo);
12229 if (! single_pred_p (e->dest) && !LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
12231 split_loop_exit_edge (e, true);
12232 if (dump_enabled_p ())
12233 dump_printf (MSG_NOTE, "split exit edge\n");
12236 /* Version the loop first, if required, so the profitability check
12237 comes first. */
12239 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
12241 class loop *sloop
12242 = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
12243 sloop->force_vectorize = false;
12244 check_profitability = false;
12247 /* Make sure there exists a single-predecessor exit bb also on the
12248 scalar loop copy. Do this after versioning but before peeling
12249 so CFG structure is fine for both scalar and if-converted loop
12250 to make slpeel_duplicate_current_defs_from_edges face matched
12251 loop closed PHI nodes on the exit. */
12252 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
12254 e = LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo);
12255 if (! single_pred_p (e->dest))
12257 split_loop_exit_edge (e, true);
12258 if (dump_enabled_p ())
12259 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
12263 tree niters = vect_build_loop_niters (loop_vinfo);
12264 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
12265 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
12266 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
12267 tree advance;
12268 drs_init_vec orig_drs_init;
12270 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
12271 &step_vector, &niters_vector_mult_vf, th,
12272 check_profitability, niters_no_overflow,
12273 &advance);
12274 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
12275 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
12277       /* Ifcvt duplicates the loop preheader and loop body and produces a basic
12278 	 block after the loop exit.  We need to scale all of that.  */
12279 basic_block preheader
12280 = loop_preheader_edge (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))->src;
12281 preheader->count
12282 = preheader->count.apply_probability
12283 (LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
12284 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
12285 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
12286 LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo)->dest->count = preheader->count;
12289 if (niters_vector == NULL_TREE)
12291 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
12292 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
12293 && known_eq (lowest_vf, vf))
12295 niters_vector
12296 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
12297 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
12298 step_vector = build_one_cst (TREE_TYPE (niters));
12300 else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
12301 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
12302 &step_vector, niters_no_overflow);
12303 else
12304 /* vect_do_peeling subtracted the number of peeled prologue
12305 iterations from LOOP_VINFO_NITERS. */
12306 vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
12307 &niters_vector, &step_vector,
12308 niters_no_overflow);
12311 /* 1) Make sure the loop header has exactly two entries
12312 2) Make sure we have a preheader basic block. */
12314 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
12316 split_edge (loop_preheader_edge (loop));
12318 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
12319 /* This will deal with any possible peeling. */
12320 vect_prepare_for_masked_peels (loop_vinfo);
12322 /* Handle any code motion that we need to for early-break vectorization after
12323 we've done peeling but just before we start vectorizing. */
12324 if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
12325 move_early_exit_stmts (loop_vinfo);
12327 /* Schedule the SLP instances first, then handle loop vectorization
12328 below. */
12329 if (!loop_vinfo->slp_instances.is_empty ())
12331 DUMP_VECT_SCOPE ("scheduling SLP instances");
12332 vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
12335   /* FORNOW: the vectorizer supports only loops whose body consists
12336      of one basic block (header + empty latch).  When the vectorizer
12337      supports more involved loop forms, the order in which the BBs are
12338      traversed needs to be reconsidered.  */
12340 for (i = 0; i < nbbs; i++)
12342 basic_block bb = bbs[i];
12343 stmt_vec_info stmt_info;
12345 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
12346 gsi_next (&si))
12348 gphi *phi = si.phi ();
12349 if (dump_enabled_p ())
12350 dump_printf_loc (MSG_NOTE, vect_location,
12351 "------>vectorizing phi: %G", (gimple *) phi);
12352 stmt_info = loop_vinfo->lookup_stmt (phi);
12353 if (!stmt_info)
12354 continue;
12356 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
12357 vect_loop_kill_debug_uses (loop, stmt_info);
12359 if (!STMT_VINFO_RELEVANT_P (stmt_info)
12360 && !STMT_VINFO_LIVE_P (stmt_info))
12361 continue;
12363 if (STMT_VINFO_VECTYPE (stmt_info)
12364 && (maybe_ne
12365 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
12366 && dump_enabled_p ())
12367 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
12369 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
12370 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
12371 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
12372 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
12373 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence
12374 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
12375 && ! PURE_SLP_STMT (stmt_info))
12377 if (dump_enabled_p ())
12378 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
12379 vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
12383 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
12384 gsi_next (&si))
12386 gphi *phi = si.phi ();
12387 stmt_info = loop_vinfo->lookup_stmt (phi);
12388 if (!stmt_info)
12389 continue;
12391 if (!STMT_VINFO_RELEVANT_P (stmt_info)
12392 && !STMT_VINFO_LIVE_P (stmt_info))
12393 continue;
12395 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
12396 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
12397 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
12398 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
12399 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
12400 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence)
12401 && ! PURE_SLP_STMT (stmt_info))
12402 maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
12405 for (gimple_stmt_iterator si = gsi_start_bb (bb);
12406 !gsi_end_p (si);)
12408 stmt = gsi_stmt (si);
12409 /* During vectorization remove existing clobber stmts and
12410 prefetches. */
12411 if (gimple_clobber_p (stmt)
12412 || gimple_call_builtin_p (stmt, BUILT_IN_PREFETCH))
12414 unlink_stmt_vdef (stmt);
12415 gsi_remove (&si, true);
12416 release_defs (stmt);
12418 else
12420 /* Ignore vector stmts created in the outer loop. */
12421 stmt_info = loop_vinfo->lookup_stmt (stmt);
12423 /* vector stmts created in the outer-loop during vectorization of
12424 stmts in an inner-loop may not have a stmt_info, and do not
12425 need to be vectorized. */
12426 stmt_vec_info seen_store = NULL;
12427 if (stmt_info)
12429 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
12431 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
12432 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
12433 !gsi_end_p (subsi); gsi_next (&subsi))
12435 stmt_vec_info pat_stmt_info
12436 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
12437 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
12438 &si, &seen_store);
12440 stmt_vec_info pat_stmt_info
12441 = STMT_VINFO_RELATED_STMT (stmt_info);
12442 if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
12443 &si, &seen_store))
12444 maybe_set_vectorized_backedge_value (loop_vinfo,
12445 pat_stmt_info);
12447 else
12449 if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
12450 &seen_store))
12451 maybe_set_vectorized_backedge_value (loop_vinfo,
12452 stmt_info);
12455 gsi_next (&si);
12456 if (seen_store)
12458 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
12459 		/* Interleaving.  The vectorization of the
12460 		   interleaving chain was completed; free all
12461 		   the stores in the chain.  */
12462 vect_remove_stores (loop_vinfo,
12463 DR_GROUP_FIRST_ELEMENT (seen_store));
12464 else
12465 /* Free the attached stmt_vec_info and remove the stmt. */
12466 loop_vinfo->remove_stmt (stmt_info);
12471 /* Stub out scalar statements that must not survive vectorization.
12472 Doing this here helps with grouped statements, or statements that
12473 are involved in patterns. */
12474 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
12475 !gsi_end_p (gsi); gsi_next (&gsi))
12477 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
12478 if (!call || !gimple_call_internal_p (call))
12479 continue;
12480 internal_fn ifn = gimple_call_internal_fn (call);
12481 if (ifn == IFN_MASK_LOAD)
12483 tree lhs = gimple_get_lhs (call);
12484 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
12486 tree zero = build_zero_cst (TREE_TYPE (lhs));
12487 gimple *new_stmt = gimple_build_assign (lhs, zero);
12488 gsi_replace (&gsi, new_stmt, true);
12491 else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
12493 tree lhs = gimple_get_lhs (call);
12494 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
12496 tree else_arg
12497 = gimple_call_arg (call, gimple_call_num_args (call) - 1);
12498 gimple *new_stmt = gimple_build_assign (lhs, else_arg);
12499 gsi_replace (&gsi, new_stmt, true);
12503 } /* BBs in loop */
12505   /* The vectorization factor is always > 1, so if we use an IV increment of 1,
12506      a zero NITERS becomes a nonzero NITERS_VECTOR.  */
12507 if (integer_onep (step_vector))
12508 niters_no_overflow = true;
12509 vect_set_loop_condition (loop, LOOP_VINFO_IV_EXIT (loop_vinfo), loop_vinfo,
12510 niters_vector, step_vector, niters_vector_mult_vf,
12511 !niters_no_overflow);
12513 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
12515 /* True if the final iteration might not handle a full vector's
12516 worth of scalar iterations. */
12517 bool final_iter_may_be_partial
12518 = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
12519 || LOOP_VINFO_EARLY_BREAKS (loop_vinfo);
12521 /* +1 to convert latch counts to loop iteration counts. */
12522 int bias_for_lowest = 1;
12524 /* When we are peeling for gaps then we take away one scalar iteration
12525 from the vector loop. Thus we can adjust the upper bound by one
12526 scalar iteration. But only when we know the bound applies to the
12527 IV exit test which might not be true when we have multiple exits. */
12528 if (!LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
12529 bias_for_lowest -= LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
12531 int bias_for_assumed = bias_for_lowest;
12532 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
12533 if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
12535 /* When the amount of peeling is known at compile time, the first
12536 iteration will have exactly alignment_npeels active elements.
12537 In the worst case it will have at least one. */
12538 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
12539 bias_for_lowest += lowest_vf - min_first_active;
12540 bias_for_assumed += assumed_vf - min_first_active;
12542 /* In these calculations the "- 1" converts loop iteration counts
12543 back to latch counts. */
12544 if (loop->any_upper_bound)
12546 loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
12547 loop->nb_iterations_upper_bound
12548 = (final_iter_may_be_partial
12549 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
12550 lowest_vf) - 1
12551 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
12552 lowest_vf) - 1);
12553 if (main_vinfo
12554 /* Both peeling for alignment and peeling for gaps can end up
12555 with the scalar epilogue running for more than VF-1 iterations. */
12556 && !main_vinfo->peeling_for_alignment
12557 && !main_vinfo->peeling_for_gaps)
12559 unsigned int bound;
12560 poly_uint64 main_iters
12561 = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
12562 LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
12563 main_iters
12564 = upper_bound (main_iters,
12565 LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
12566 if (can_div_away_from_zero_p (main_iters,
12567 LOOP_VINFO_VECT_FACTOR (loop_vinfo),
12568 &bound))
12569 loop->nb_iterations_upper_bound
12570 = wi::umin ((bound_wide_int) (bound - 1),
12571 loop->nb_iterations_upper_bound);
12574 if (loop->any_likely_upper_bound)
12575 loop->nb_iterations_likely_upper_bound
12576 = (final_iter_may_be_partial
12577 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
12578 + bias_for_lowest, lowest_vf) - 1
12579 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
12580 + bias_for_lowest, lowest_vf) - 1);
12581 if (loop->any_estimate)
12582 loop->nb_iterations_estimate
12583 = (final_iter_may_be_partial
12584 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
12585 assumed_vf) - 1
12586 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
12587 assumed_vf) - 1);
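  /* Worked example with illustrative numbers, assuming bias_for_lowest
     is 1: with a scalar latch bound of 10 (at most 11 iterations) and
     lowest_vf == 4, a loop whose final iteration is always full gets
     floor ((10 + 1) / 4) - 1 == 1, i.e. at most two vector iterations,
     while a loop whose final iteration may be partial gets
     ceil ((10 + 1) / 4) - 1 == 2, allowing a final partial vector
     iteration.  */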
12588 scale_profile_for_vect_loop (loop, LOOP_VINFO_IV_EXIT (loop_vinfo),
12589 assumed_vf, flat);
12591 if (dump_enabled_p ())
12593 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
12595 dump_printf_loc (MSG_NOTE, vect_location,
12596 "LOOP VECTORIZED\n");
12597 if (loop->inner)
12598 dump_printf_loc (MSG_NOTE, vect_location,
12599 "OUTER LOOP VECTORIZED\n");
12600 dump_printf (MSG_NOTE, "\n");
12602 else
12603 dump_printf_loc (MSG_NOTE, vect_location,
12604 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
12605 GET_MODE_NAME (loop_vinfo->vector_mode));
12608 /* Loops vectorized with a variable factor won't benefit from
12609 unrolling/peeling. */
12610 if (!vf.is_constant ())
12612 loop->unroll = 1;
12613 if (dump_enabled_p ())
12614 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
12615 " variable-length vectorization factor\n");
12617 /* Free SLP instances here because otherwise stmt reference counting
12618 won't work. */
12619 slp_instance instance;
12620 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
12621 vect_free_slp_instance (instance);
12622 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
12623   /* Clear the safelen field since its value is invalid after vectorization:
12624      the vectorized loop can have loop-carried dependencies.  */
12625 loop->safelen = 0;
12627 if (epilogue)
12629 update_epilogue_loop_vinfo (epilogue, advance);
12631 epilogue->simduid = loop->simduid;
12632 epilogue->force_vectorize = loop->force_vectorize;
12633 epilogue->dont_vectorize = false;
12636 return epilogue;
12639 /* The code below tries to perform a simple optimization - reverting
12640    if-conversion for masked stores: if the mask of a store is zero, do not
12641    perform the store and, if possible, skip the stored-value producers too.
12642 For example,
12643 for (i=0; i<n; i++)
12644 if (c[i])
12646 p1[i] += 1;
12647 p2[i] = p3[i] +2;
12649 this transformation will produce the following semi-hammock:
12651 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
12653 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
12654 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
12655 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
12656 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
12657 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
12658 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
12662 void
12663 optimize_mask_stores (class loop *loop)
12665 basic_block *bbs = get_loop_body (loop);
12666 unsigned nbbs = loop->num_nodes;
12667 unsigned i;
12668 basic_block bb;
12669 class loop *bb_loop;
12670 gimple_stmt_iterator gsi;
12671 gimple *stmt;
12672 auto_vec<gimple *> worklist;
12673 auto_purge_vect_location sentinel;
12675 vect_location = find_loop_location (loop);
12676 /* Pick up all masked stores in loop if any. */
12677 for (i = 0; i < nbbs; i++)
12679 bb = bbs[i];
12680 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
12681 gsi_next (&gsi))
12683 stmt = gsi_stmt (gsi);
12684 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
12685 worklist.safe_push (stmt);
12689 free (bbs);
12690 if (worklist.is_empty ())
12691 return;
12693 /* Loop has masked stores. */
12694 while (!worklist.is_empty ())
12696 gimple *last, *last_store;
12697 edge e, efalse;
12698 tree mask;
12699 basic_block store_bb, join_bb;
12700 gimple_stmt_iterator gsi_to;
12701 tree vdef, new_vdef;
12702 gphi *phi;
12703 tree vectype;
12704 tree zero;
12706 last = worklist.pop ();
12707 mask = gimple_call_arg (last, 2);
12708 bb = gimple_bb (last);
12709       /* Create then_bb and an if-then structure in the CFG; then_bb belongs to
12710 	 the same loop as if_bb.  It can be different from LOOP when a two-level
12711 	 loop nest is vectorized and the mask_store belongs to the inner
12712 	 one.  */
12713 e = split_block (bb, last);
12714 bb_loop = bb->loop_father;
12715 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
12716 join_bb = e->dest;
12717 store_bb = create_empty_bb (bb);
12718 add_bb_to_loop (store_bb, bb_loop);
12719 e->flags = EDGE_TRUE_VALUE;
12720 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
12721       /* Make the edge to STORE_BB the likely one.  */
12722 efalse->probability = profile_probability::likely ();
12723 e->probability = efalse->probability.invert ();
12724 store_bb->count = efalse->count ();
12725 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
12726 if (dom_info_available_p (CDI_DOMINATORS))
12727 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
12728 if (dump_enabled_p ())
12729 dump_printf_loc (MSG_NOTE, vect_location,
12730 "Create new block %d to sink mask stores.",
12731 store_bb->index);
12732 /* Create vector comparison with boolean result. */
12733 vectype = TREE_TYPE (mask);
12734 zero = build_zero_cst (vectype);
12735 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
12736 gsi = gsi_last_bb (bb);
12737 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
12738 /* Create new PHI node for vdef of the last masked store:
12739 .MEM_2 = VDEF <.MEM_1>
12740 will be converted to
12741 .MEM.3 = VDEF <.MEM_1>
12742 and new PHI node will be created in join bb
12743 .MEM_2 = PHI <.MEM_1, .MEM_3>
12745 vdef = gimple_vdef (last);
12746 new_vdef = make_ssa_name (gimple_vop (cfun), last);
12747 gimple_set_vdef (last, new_vdef);
12748 phi = create_phi_node (vdef, join_bb);
12749 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
12751 /* Put all masked stores with the same mask to STORE_BB if possible. */
12752 while (true)
12754 gimple_stmt_iterator gsi_from;
12755 gimple *stmt1 = NULL;
12757 /* Move masked store to STORE_BB. */
12758 last_store = last;
12759 gsi = gsi_for_stmt (last);
12760 gsi_from = gsi;
12761 /* Shift GSI to the previous stmt for further traversal. */
12762 gsi_prev (&gsi);
12763 gsi_to = gsi_start_bb (store_bb);
12764 gsi_move_before (&gsi_from, &gsi_to);
12765 	  /* Set GSI_TO to the start of the now non-empty block.  */
12766 gsi_to = gsi_start_bb (store_bb);
12767 if (dump_enabled_p ())
12768 dump_printf_loc (MSG_NOTE, vect_location,
12769 "Move stmt to created bb\n%G", last);
12770 /* Move all stored value producers if possible. */
12771 while (!gsi_end_p (gsi))
12773 tree lhs;
12774 imm_use_iterator imm_iter;
12775 use_operand_p use_p;
12776 bool res;
12778 /* Skip debug statements. */
12779 if (is_gimple_debug (gsi_stmt (gsi)))
12781 gsi_prev (&gsi);
12782 continue;
12784 stmt1 = gsi_stmt (gsi);
12785 /* Do not consider statements writing to memory or having
12786 volatile operand. */
12787 if (gimple_vdef (stmt1)
12788 || gimple_has_volatile_ops (stmt1))
12789 break;
12790 gsi_from = gsi;
12791 gsi_prev (&gsi);
12792 lhs = gimple_get_lhs (stmt1);
12793 if (!lhs)
12794 break;
12796 /* LHS of vectorized stmt must be SSA_NAME. */
12797 if (TREE_CODE (lhs) != SSA_NAME)
12798 break;
12800 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
12802 /* Remove dead scalar statement. */
12803 if (has_zero_uses (lhs))
12805 gsi_remove (&gsi_from, true);
12806 continue;
12810 /* Check that LHS does not have uses outside of STORE_BB. */
12811 res = true;
12812 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
12814 gimple *use_stmt;
12815 use_stmt = USE_STMT (use_p);
12816 if (is_gimple_debug (use_stmt))
12817 continue;
12818 if (gimple_bb (use_stmt) != store_bb)
12820 res = false;
12821 break;
12824 if (!res)
12825 break;
12827 if (gimple_vuse (stmt1)
12828 && gimple_vuse (stmt1) != gimple_vuse (last_store))
12829 break;
12831 /* Can move STMT1 to STORE_BB. */
12832 if (dump_enabled_p ())
12833 dump_printf_loc (MSG_NOTE, vect_location,
12834 "Move stmt to created bb\n%G", stmt1);
12835 gsi_move_before (&gsi_from, &gsi_to);
12836 /* Shift GSI_TO for further insertion. */
12837 gsi_prev (&gsi_to);
12839 /* Put other masked stores with the same mask to STORE_BB. */
12840 if (worklist.is_empty ()
12841 || gimple_call_arg (worklist.last (), 2) != mask
12842 || worklist.last () != stmt1)
12843 break;
12844 last = worklist.pop ();
12846 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
12850 /* Decide whether it is possible to use a zero-based induction variable
12851 when vectorizing LOOP_VINFO with partial vectors. If it is, return
12852 the value that the induction variable must be able to hold in order
12853 to ensure that the rgroups eventually have no active vector elements.
12854 Return -1 otherwise. */
12856 widest_int
12857 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
12859 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
12860 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
12861 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
12863 /* Calculate the value that the induction variable must be able
12864 to hit in order to ensure that we end the loop with an all-false mask.
12865 This involves adding the maximum number of inactive trailing scalar
12866 iterations. */
12867 widest_int iv_limit = -1;
12868 if (max_loop_iterations (loop, &iv_limit))
12870 if (niters_skip)
12872 /* Add the maximum number of skipped iterations to the
12873 maximum iteration count. */
12874 if (TREE_CODE (niters_skip) == INTEGER_CST)
12875 iv_limit += wi::to_widest (niters_skip);
12876 else
12877 iv_limit += max_vf - 1;
12879 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
12880 /* Make a conservatively-correct assumption. */
12881 iv_limit += max_vf - 1;
12883 /* IV_LIMIT is the maximum number of latch iterations, which is also
12884 the maximum in-range IV value. Round this value down to the previous
12885 vector alignment boundary and then add an extra full iteration. */
12886 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
12887 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
12889 return iv_limit;
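/* Worked example with illustrative numbers: with at most 100 latch
   iterations, a compile-time niters_skip of 3 and VF == 16, the code
   above computes iv_limit = 100 + 3 = 103, rounds it down to the previous
   multiple of 16 (96) and adds max_vf, so the IV must be able to hold
   112.  */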
12892 /* For the given rgroup_controls RGC, check whether an induction variable
12893 would ever hit a value that produces a set of all-false masks or zero
12894 lengths before wrapping around. Return true if it's possible to wrap
12895 around before hitting the desirable value, otherwise return false. */
12897 bool
12898 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
12900 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
12902 if (iv_limit == -1)
12903 return true;
12905 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
12906 unsigned int compare_precision = TYPE_PRECISION (compare_type);
12907 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
12909 if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
12910 return true;
12912 return false;
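/* Worked example with illustrative numbers: continuing from the iv_limit
   example above (iv_limit == 112), an rgroup with max_nscalars_per_iter
   == 2 and factor == 1 needs the IV to reach 112 * 2 == 224, which
   requires 8 bits; any compare type with at least that precision cannot
   wrap before the all-false masks / zero lengths are produced, so the
   function returns false.  */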