/* Data References Analysis and Manipulation Utilities for Vectorization.
   Copyright (C) 2003-2025 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#include "coretypes.h"
#include "optabs-tree.h"
#include "pretty-print.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "gimple-iterator.h"
#include "gimplify-me.h"
#include "tree-ssa-loop-ivopts.h"
#include "tree-ssa-loop-manip.h"
#include "tree-ssa-loop.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "tree-hash-traits.h"
#include "vec-perm-indices.h"
#include "internal-fn.h"
#include "gimple-fold.h"
#include "optabs-query.h"
/* Return true if load- or store-lanes optab OPTAB is implemented for
   COUNT vectors of type VECTYPE.  NAME is the name of OPTAB.

   If it is implemented and ELSVALS is nonzero, store the possible else
   values in the vector it points to.  */
vect_lanes_optab_supported_p (const char *name, convert_optab optab,
			      tree vectype, unsigned HOST_WIDE_INT count,
			      vec<int> *elsvals = nullptr)
  machine_mode mode, array_mode;
  mode = TYPE_MODE (vectype);
  if (!targetm.array_mode (mode, count).exists (&array_mode))
      poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
      limit_p = !targetm.array_mode_supported_p (mode, count);
      if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "no array mode for %s[%wu]\n",
			     GET_MODE_NAME (mode), count);
  if ((icode = convert_optab_handler (optab, array_mode, mode))
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "cannot use %s<%s><%s>\n", name,
			 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
		     GET_MODE_NAME (mode));
    get_supported_else_vals (icode,
			     internal_fn_else_index (IFN_MASK_LEN_LOAD_LANES),
/* Helper function to identify a simd clone call.  If this is a call to a
   function with simd clones then return the corresponding cgraph_node,
   otherwise return NULL.  */
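/* Note on the IFN_MASK_CALL handling below: for such an internal call the
   called function is not the call's fndecl but is passed (as an address) in
   the first call argument, which is why one level of TREE_OPERAND is
   stripped from argument 0 before looking up the cgraph node.  */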
simd_clone_call_p (gimple *stmt)
  gcall *call = dyn_cast <gcall *> (stmt);
  tree fndecl = NULL_TREE;
  if (gimple_call_internal_p (call, IFN_MASK_CALL))
    fndecl = TREE_OPERAND (gimple_call_arg (stmt, 0), 0);
    fndecl = gimple_call_fndecl (stmt);
  if (fndecl == NULL_TREE)
  cgraph_node *node = cgraph_node::get (fndecl);
  if (node && node->simd_clones != NULL)
/* Return the smallest scalar part of STMT_INFO.
   This is used to determine the vectype of the stmt.  We generally set the
   vectype according to the type of the result (lhs).  For stmts whose
   result-type is different than the type of the arguments (e.g., demotion,
   promotion), vectype will be reset appropriately (later).  Note that we have
   to visit the smallest datatype in this function, because that determines the
   VF.  If the smallest datatype in the loop is present only as the rhs of a
   promotion operation, we'd miss it.
   Such a case, where a variable of this datatype does not appear in the lhs
   anywhere in the loop, can only occur if it's an invariant: e.g.:
   'int_x = (int) short_inv', which we'd expect to have been optimized away by
   invariant motion.  However, we cannot rely on invariant motion to always
   take invariants out of the loop, and so in the case of promotion we also
   have to check the rhs.
   LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
   types.  */
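/* Illustrative example of the promotion case described above: with

     int_x = (int) short_inv;

   where short_inv is loop-invariant, the type "short" occurs only on the
   rhs of the conversion, so inspecting lhs types alone would miss it and
   the VF would be computed from "int" rather than from "short".  */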
vect_get_smallest_scalar_type (stmt_vec_info stmt_info, tree scalar_type)
  HOST_WIDE_INT lhs, rhs;

  /* During the analysis phase, this function is called on arbitrary
     statements that might not have scalar results.  */
  if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
  lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
  gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
      scalar_type = TREE_TYPE (gimple_assign_lhs (assign));
      if (gimple_assign_cast_p (assign)
	  || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
	  || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
	  || gimple_assign_rhs_code (assign) == SAD_EXPR
	  || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
	  || gimple_assign_rhs_code (assign) == WIDEN_MULT_PLUS_EXPR
	  || gimple_assign_rhs_code (assign) == WIDEN_MULT_MINUS_EXPR
	  || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
	  || gimple_assign_rhs_code (assign) == FLOAT_EXPR)
	  tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
	  rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
	    scalar_type = rhs_type;
  else if (cgraph_node *node = simd_clone_call_p (stmt_info->stmt))
      auto clone = node->simd_clones->simdclone;
      for (unsigned int i = 0; i < clone->nargs; ++i)
	if (clone->args[i].arg_type == SIMD_CLONE_ARG_TYPE_VECTOR)
	    tree arg_scalar_type = TREE_TYPE (clone->args[i].vector_type);
	    rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (arg_scalar_type));
		scalar_type = arg_scalar_type;
  else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
      if (gimple_call_internal_p (call))
	  internal_fn ifn = gimple_call_internal_fn (call);
	  if (internal_load_fn_p (ifn))
	    /* For loads the LHS type does the trick.  */
	  else if (internal_store_fn_p (ifn))
	    /* For stores use the type of the stored value.  */
	      i = internal_fn_stored_value_index (ifn);
	      scalar_type = TREE_TYPE (gimple_call_arg (call, i));
	  else if (internal_fn_mask_index (ifn) == 0)
      if (i < gimple_call_num_args (call))
	  tree rhs_type = TREE_TYPE (gimple_call_arg (call, i));
	  if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type)))
	      rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
		  scalar_type = rhs_type;
/* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
   tested at run-time.  Return TRUE if DDR was successfully inserted.
   Return false if versioning is not supported.  */
vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  if ((unsigned) param_vect_max_version_for_alias_checks == 0)
    return opt_result::failure_at (vect_location,
				   "will not create alias checks, as"
				   " --param vect-max-version-for-alias-checks"
    = runtime_alias_check_p (ddr, loop,
			     optimize_loop_nest_for_speed_p (loop));
  LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
  return opt_result::success ();
/* Record that loop LOOP_VINFO needs to check that VALUE is nonzero.  */

vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
  const vec<tree> &checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
  for (unsigned int i = 0; i < checks.length (); ++i)
    if (checks[i] == value)
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "need run-time check that %T is nonzero\n",
  LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
/* Return true if we know that the order of vectorized DR_INFO_A and
   vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
   DR_INFO_B.  At least one of the accesses is a write.  */
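/* Illustrative example of the group-position rule used below: for a store
   group { A[0], A[1] } interleaved with a load group { B[0], B[1] } as

     A[0] = ...;
     ...  = B[0];
     A[1] = ...;
     ...  = B[1];

   the vectorized store is emitted at the position of the last scalar store
   (A[1]) and the vectorized load at the position of the first scalar load
   (B[0]), so whether the scalar order of the two accesses is preserved has
   to be re-checked against those positions, which is what this function
   computes.  */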
vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
  stmt_vec_info stmtinfo_a = dr_info_a->stmt;
  stmt_vec_info stmtinfo_b = dr_info_b->stmt;

  /* Single statements are always kept in their original order.  */
  if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
      && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
  /* If there is a loop invariant read involved we might vectorize it in
     the prologue, breaking scalar order with respect to the in-loop store.  */
  if ((DR_IS_READ (dr_info_a->dr) && integer_zerop (DR_STEP (dr_info_a->dr)))
      || (DR_IS_READ (dr_info_b->dr) && integer_zerop (DR_STEP (dr_info_b->dr))))

  /* STMT_A and STMT_B belong to overlapping groups.  All loads are
     emitted at the position of the first scalar load.
     Stores in a group are emitted at the position of the last scalar store.
     Compute that position and check whether the resulting order matches
  stmt_vec_info il_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a);
  if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a)))
    for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
	 s = DR_GROUP_NEXT_ELEMENT (s))
      il_a = get_later_stmt (il_a, s);
  else /* DR_IS_READ */
    for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
	 s = DR_GROUP_NEXT_ELEMENT (s))
      if (get_later_stmt (il_a, s) == il_a)
  stmt_vec_info il_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b);
  if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b)))
    for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
	 s = DR_GROUP_NEXT_ELEMENT (s))
      il_b = get_later_stmt (il_b, s);
  else /* DR_IS_READ */
    for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
	 s = DR_GROUP_NEXT_ELEMENT (s))
      if (get_later_stmt (il_b, s) == il_b)
  bool a_after_b = (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a);
  return (get_later_stmt (il_a, il_b) == il_a) == a_after_b;
/* A subroutine of vect_analyze_data_ref_dependence.  Handle
   DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
   distances.  These distances are conservatively correct but they don't
   reflect a guaranteed dependence.

   Return true if this function does all the work necessary to avoid
   an alias or false if the caller should use the dependence distances
   to limit the vectorization factor in the usual way.  LOOP_DEPTH is
   the depth of the loop described by LOOP_VINFO and the other arguments
   are as for vect_analyze_data_ref_dependence.  */
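/* Illustrative example: for two references p[i] and q[i] a dependence
   distance of 0 is conservatively correct, but it only describes a real
   dependence if p and q refer to the same object; if they turn out to be
   independent objects the distance is irrelevant, which is why a runtime
   alias check can be preferable to limiting the VF.  */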
vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
				       loop_vec_info loop_vinfo,
				       int loop_depth, unsigned int *max_vf)
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  for (lambda_vector &dist_v : DDR_DIST_VECTS (ddr))
      int dist = dist_v[loop_depth];
      if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
	  /* If the user asserted safelen >= DIST consecutive iterations
	     can be executed concurrently, assume independence.

	     ??? An alternative would be to add the alias check even
	     in this case, and vectorize the fallback loop with the
	     maximum VF set to safelen.  However, if the user has
	     explicitly given a length, it's less likely that that
	  if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
	      if ((unsigned int) loop->safelen < *max_vf)
		*max_vf = loop->safelen;
	      LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;

	  /* For dependence distances of 2 or more, we have the option
	     of limiting VF or checking for an alias at runtime.
	     Prefer to check at runtime if we can, to avoid limiting
	     the VF unnecessarily when the bases are in fact independent.

	     Note that the alias checks will be removed if the VF ends up
	     being small enough.  */
	  dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
	  dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
	  return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
		  && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
		  && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
/* Function vect_analyze_data_ref_dependence.

   FIXME: I needed to change the sense of the returned flag.

   Return FALSE if there (might) exist a dependence between a memory-reference
   DRA and a memory-reference DRB.  When versioning for alias may check a
   dependence at run-time, return TRUE.  Adjust *MAX_VF according to
   the data dependence.  */
vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
				  loop_vec_info loop_vinfo,
				  unsigned int *max_vf)
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  struct data_reference *dra = DDR_A (ddr);
  struct data_reference *drb = DDR_B (ddr);
  dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
  dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
  stmt_vec_info stmtinfo_a = dr_info_a->stmt;
  stmt_vec_info stmtinfo_b = dr_info_b->stmt;
  lambda_vector dist_v;
  unsigned int loop_depth;

  /* If user asserted safelen consecutive iterations can be
     executed concurrently, assume independence.  */
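  /* loop->safelen is typically set from an annotation such as
     "#pragma omp simd safelen(N)" (illustrative example), which asserts
     that N consecutive iterations may execute concurrently.  */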
  auto apply_safelen = [&]()
      if (loop->safelen >= 2)
	  if ((unsigned int) loop->safelen < *max_vf)
	    *max_vf = loop->safelen;
	  LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;

  /* In loop analysis all data references should be vectorizable.  */
  if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
      || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))

  /* Independent data accesses.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
    return opt_result::success ();

      || (DR_IS_READ (dra) && DR_IS_READ (drb)))
    return opt_result::success ();

  /* We do not have to consider dependences between accesses that belong
     to the same group, unless the stride could be smaller than the
  if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
      && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
	  == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
      && !STMT_VINFO_STRIDED_P (stmtinfo_a))
    return opt_result::success ();

  /* Even if we have an anti-dependence then, as the vectorized loop covers at
     least two scalar iterations, there is always also a true dependence.
     As the vectorizer does not re-order loads and stores we can ignore
     the anti-dependence if TBAA can disambiguate both DRs similar to the
     case with known negative distance anti-dependences (positive
     distance anti-dependences would violate TBAA constraints).  */
  if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
       || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
      && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
				 get_alias_set (DR_REF (drb))))
    return opt_result::success ();

  if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
      || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
      if (apply_safelen ())
	return opt_result::success ();

      return opt_result::failure_at
	 "possible alias involving gather/scatter between %T and %T\n",
	 DR_REF (dra), DR_REF (drb));

  /* Unknown data dependence.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
      if (apply_safelen ())
	return opt_result::success ();

      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
			 "versioning for alias required: "
			 "can't determine dependence between %T and %T\n",
			 DR_REF (dra), DR_REF (drb));

      /* Add to list of ddrs that need to be tested at run-time.  */
      return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);

  /* Known data dependence.  */
  if (DDR_NUM_DIST_VECTS (ddr) == 0)
      if (apply_safelen ())
	return opt_result::success ();

      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
			 "versioning for alias required: "
			 "bad dist vector for %T and %T\n",
			 DR_REF (dra), DR_REF (drb));
      /* Add to list of ddrs that need to be tested at run-time.  */
      return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);

  loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));

  if (DDR_COULD_BE_INDEPENDENT_P (ddr)
      && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
    return opt_result::success ();

  FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
      int dist = dist_v[loop_depth];

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "dependence distance = %d.\n", dist);

	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "dependence distance == 0 between %T and %T\n",
			     DR_REF (dra), DR_REF (drb));
	  /* When we perform grouped accesses and perform implicit CSE
	     by detecting equal accesses and doing disambiguation with
	     runtime alias tests like for
	     where we will end up loading { a[i], a[i+1] } once, make
	     sure that inserting group loads before the first load and
	     stores after the last store will do the right thing.
	     Similar for groups like
	     where loads from the group interleave with the store.  */
	  if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
	    return opt_result::failure_at (stmtinfo_a->stmt,
					   "READ_WRITE dependence"
					   " in interleaving.\n");

	  if (loop->safelen < 2)
	      tree indicator = dr_zero_step_indicator (dra);
	      if (!indicator || integer_zerop (indicator))
		return opt_result::failure_at (stmtinfo_a->stmt,
					       "access also has a zero step\n");
	      else if (TREE_CODE (indicator) != INTEGER_CST)
		vect_check_nonzero_value (loop_vinfo, indicator);

      if (dist > 0 && DDR_REVERSED_P (ddr))
	  /* If DDR_REVERSED_P the order of the data-refs in DDR was
	     reversed (to make distance vector positive), and the actual
	     distance is negative.  */
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "dependence distance negative.\n");
	  /* When doing outer loop vectorization, we need to check if there is
	     a backward dependence at the inner loop level if the dependence
	     at the outer loop is reversed.  See PR81740.  */
	  if (nested_in_vect_loop_p (loop, stmtinfo_a)
	      || nested_in_vect_loop_p (loop, stmtinfo_b))
	      unsigned inner_depth = index_in_loop_nest (loop->inner->num,
							 DDR_LOOP_NEST (ddr));
	      if (dist_v[inner_depth] < 0)
		return opt_result::failure_at (stmtinfo_a->stmt,
					       "not vectorized, dependence "
					       "between data-refs %T and %T\n",
					       DR_REF (dra), DR_REF (drb));

	  /* Record a negative dependence distance to later limit the
	     amount of stmt copying / unrolling we can perform.
	     Only need to handle read-after-write dependence.  */
	      && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
		  || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
	    STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;

      unsigned int abs_dist = abs (dist);
      if (abs_dist >= 2 && abs_dist < *max_vf)
	  /* The dependence distance requires reduction of the maximal
	     vectorization factor.  */
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "adjusting maximal vectorization factor to %i\n",

      if (abs_dist >= *max_vf)
	  /* Dependence distance does not create dependence, as far as
	     vectorization is concerned, in this case.  */
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "dependence distance >= VF.\n");

      return opt_result::failure_at (stmtinfo_a->stmt,
				     "not vectorized, possible dependence "
				     "between data-refs %T and %T\n",
				     DR_REF (dra), DR_REF (drb));

  return opt_result::success ();
/* Function vect_analyze_early_break_dependences.

   Examine all the data references in the loop and make sure that if we have
   multiple exits that we are able to safely move stores such that they become
   safe for vectorization.  The function also calculates the place where to
   move the instructions to and computes what the new vUSE chain should be.

   This works in tandem with the CFG that will be produced by
   slpeel_tree_duplicate_loop_to_edge_cfg later on.

   This function tries to validate whether an early break vectorization
   is possible for the current instruction sequence.  Returns true if
   possible, otherwise false.

   - Any memory access must be to a fixed size buffer.
   - There must not be any loads and stores to the same object.
   - Multiple loads are allowed as long as they don't alias.

   This implementation is very conservative.  Any overlapping loads/stores
   that take place before the early break statement gets rejected aside from
   is which is the common case.  */
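/* Illustrative example (not taken from the sources) of the kind of loop this
   analysis targets:

     for (i = 0; i < N; i++)
       {
	 if (a[i] == key)	/* early break, second exit  */
	   break;
	 b[i] = a[i] * 2;	/* store that must be safe to sink  */
       }

   The store to b[i] may only execute for a full vector iteration if it can
   be moved past the loads feeding the break condition, which is what the
   checks below validate.  */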
679 vect_analyze_early_break_dependences (loop_vec_info loop_vinfo
)
681 DUMP_VECT_SCOPE ("vect_analyze_early_break_dependences");
683 /* List of all load data references found during traversal. */
684 auto_vec
<data_reference
*> bases
;
685 basic_block dest_bb
= NULL
;
687 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
688 class loop
*loop_nest
= loop_outer (loop
);
690 if (dump_enabled_p ())
691 dump_printf_loc (MSG_NOTE
, vect_location
,
692 "loop contains multiple exits, analyzing"
693 " statement dependencies.\n");
695 if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo
))
696 if (dump_enabled_p ())
697 dump_printf_loc (MSG_NOTE
, vect_location
,
698 "alternate exit has been chosen as main exit.\n");
700 /* Since we don't support general control flow, the location we'll move the
701 side-effects to is always the latch connected exit. When we support
702 general control flow we can do better but for now this is fine. Move
703 side-effects to the in-loop destination of the last early exit. For the
704 PEELED case we move the side-effects to the latch block as this is
705 guaranteed to be the last block to be executed when a vector iteration
707 if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo
))
708 dest_bb
= loop
->latch
;
710 dest_bb
= single_pred (loop
->latch
);
712 /* We start looking from dest_bb, for the non-PEELED case we don't want to
713 move any stores already present, but we do want to read and validate the
715 basic_block bb
= dest_bb
;
717 /* We move stores across all loads to the beginning of dest_bb, so
718 the first block processed below doesn't need dependence checking. */
719 bool check_deps
= false;
723 gimple_stmt_iterator gsi
= gsi_last_bb (bb
);
725 /* Now analyze all the remaining statements and try to determine which
726 instructions are allowed/needed to be moved. */
727 while (!gsi_end_p (gsi
))
729 gimple
*stmt
= gsi_stmt (gsi
);
731 if (is_gimple_debug (stmt
))
734 stmt_vec_info stmt_vinfo
= loop_vinfo
->lookup_stmt (stmt
);
735 auto dr_ref
= STMT_VINFO_DATA_REF (stmt_vinfo
);
739 /* We know everything below dest_bb is safe since we know we
740 had a full vector iteration when reaching it. Either by
741 the loop entry / IV exit test being last or because this
742 is the loop latch itself. */
746 /* Check if vector accesses to the object will be within bounds.
747 must be a constant or assume loop will be versioned or niters
748 bounded by VF so accesses are within range. We only need to check
749 the reads since writes are moved to a safe place where if we get
750 there we know they are safe to perform. */
751 if (DR_IS_READ (dr_ref
)
752 && !ref_within_array_bound (stmt
, DR_REF (dr_ref
)))
754 if (STMT_VINFO_GATHER_SCATTER_P (stmt_vinfo
)
755 || STMT_VINFO_STRIDED_P (stmt_vinfo
))
758 = "early break not supported: cannot peel "
759 "for alignment, vectorization would read out of "
761 return opt_result::failure_at (stmt
, msg
, stmt
);
764 dr_vec_info
*dr_info
= STMT_VINFO_DR_INFO (stmt_vinfo
);
765 dr_info
->need_peeling_for_alignment
= true;
767 if (dump_enabled_p ())
768 dump_printf_loc (MSG_NOTE
, vect_location
,
769 "marking DR (read) as needing peeling for "
770 "alignment at %G", stmt
);
773 if (DR_IS_READ (dr_ref
))
774 bases
.safe_push (dr_ref
);
775 else if (DR_IS_WRITE (dr_ref
))
777 /* We are moving writes down in the CFG. To be sure that this
778 is valid after vectorization we have to check all the loads
779 we are sinking the stores past to see if any of them may
780 alias or are the same object.
782 Same objects will not be an issue because unless the store
783 is marked volatile the value can be forwarded. If the
784 store is marked volatile we don't vectorize the loop
787 That leaves the check for aliasing. We don't really need
788 to care about the stores aliasing with each other since the
789 stores are moved in order so the effects are still observed
790 correctly. This leaves the check for WAR dependencies
791 which we would be introducing here if the DR can alias.
792 The check is quadratic in loads/stores but I have not found
793 a better API to do this. I believe all loads and stores
794 must be checked. We also must check them when we
795 encountered the store, since we don't care about loads past
798 for (auto dr_read
: bases
)
799 if (dr_may_alias_p (dr_ref
, dr_read
, loop_nest
))
801 if (dump_enabled_p ())
802 dump_printf_loc (MSG_MISSED_OPTIMIZATION
,
804 "early breaks not supported: "
805 "overlapping loads and stores "
806 "found before the break "
809 return opt_result::failure_at (stmt
,
810 "can't safely apply code motion to dependencies"
811 " to vectorize the early exit. %G may alias with"
812 " %G\n", stmt
, dr_read
->stmt
);
816 if (gimple_vdef (stmt
))
818 if (dump_enabled_p ())
819 dump_printf_loc (MSG_NOTE
, vect_location
,
820 "==> recording stmt %G", stmt
);
822 LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo
).safe_push (stmt
);
824 else if (gimple_vuse (stmt
))
826 LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo
).safe_insert (0, stmt
);
827 if (dump_enabled_p ())
828 dump_printf_loc (MSG_NOTE
, vect_location
,
829 "marked statement for vUSE update: %G", stmt
);
833 if (!single_pred_p (bb
))
835 gcc_assert (bb
== loop
->header
);
839 /* If we possibly sink through a virtual PHI make sure to elide that. */
840 if (gphi
*vphi
= get_virtual_phi (bb
))
841 LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo
).safe_push (vphi
);
843 /* All earlier blocks need dependence checking. */
845 bb
= single_pred (bb
);
849 /* We don't allow outer -> inner loop transitions which should have been
850 trapped already during loop form analysis. */
851 gcc_assert (dest_bb
->loop_father
== loop
);
853 /* Check that the destination block we picked has only one pred. To relax this we
854 have to take special care when moving the statements. We don't currently support
855 such control flow however this check is there to simplify how we handle
856 labels that may be present anywhere in the IL. This check is to ensure that the
857 labels aren't significant for the CFG. */
858 if (!single_pred (dest_bb
))
859 return opt_result::failure_at (vect_location
,
860 "chosen loop exit block (BB %d) does not have a "
861 "single predecessor which is currently not "
862 "supported for early break vectorization.\n",
865 LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo
) = dest_bb
;
867 if (!LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo
).is_empty ())
869 /* All uses shall be updated to that of the first load. Entries are
870 stored in reverse order. */
871 tree vuse
= gimple_vuse (LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo
).last ());
872 for (auto g
: LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo
))
874 if (dump_enabled_p ())
875 dump_printf_loc (MSG_NOTE
, vect_location
,
876 "will update use: %T, mem_ref: %G", vuse
, g
);
880 if (dump_enabled_p ())
881 dump_printf_loc (MSG_NOTE
, vect_location
,
882 "recorded statements to be moved to BB %d\n",
883 LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo
)->index
);
885 return opt_result::success ();
/* Function vect_analyze_data_ref_dependences.

   Examine all the data references in the loop, and make sure there do not
   exist any data dependences between them.  Set *MAX_VF according to
   the maximum vectorization factor the data dependences allow.  */
895 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo
,
896 unsigned int *max_vf
)
899 struct data_dependence_relation
*ddr
;
901 DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
903 if (!LOOP_VINFO_DDRS (loop_vinfo
).exists ())
905 LOOP_VINFO_DDRS (loop_vinfo
)
906 .create (LOOP_VINFO_DATAREFS (loop_vinfo
).length ()
907 * LOOP_VINFO_DATAREFS (loop_vinfo
).length ());
908 /* We do not need read-read dependences. */
909 bool res
= compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo
),
910 &LOOP_VINFO_DDRS (loop_vinfo
),
911 LOOP_VINFO_LOOP_NEST (loop_vinfo
),
916 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo
) = true;
918 /* For epilogues we either have no aliases or alias versioning
919 was applied to original loop. Therefore we may just get max_vf
920 using VF of original loop. */
921 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo
))
922 *max_vf
= LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo
);
924 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo
), i
, ddr
)
927 = vect_analyze_data_ref_dependence (ddr
, loop_vinfo
, max_vf
);
  /* If we have early break statements in the loop, check to see if they
     are of a form we can vectorize.  */
934 if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo
))
935 return vect_analyze_early_break_dependences (loop_vinfo
);
937 return opt_result::success ();
/* Function vect_slp_analyze_data_ref_dependence.

   Return TRUE if there (might) exist a dependence between a memory-reference
   DRA and a memory-reference DRB for VINFO.  When versioning for alias
   may check a dependence at run-time, return FALSE.  Adjust *MAX_VF
   according to the data dependence.  */
949 vect_slp_analyze_data_ref_dependence (vec_info
*vinfo
,
950 struct data_dependence_relation
*ddr
)
952 struct data_reference
*dra
= DDR_A (ddr
);
953 struct data_reference
*drb
= DDR_B (ddr
);
954 dr_vec_info
*dr_info_a
= vinfo
->lookup_dr (dra
);
955 dr_vec_info
*dr_info_b
= vinfo
->lookup_dr (drb
);
957 /* We need to check dependences of statements marked as unvectorizable
958 as well, they still can prohibit vectorization. */
960 /* Independent data accesses. */
961 if (DDR_ARE_DEPENDENT (ddr
) == chrec_known
)
967 /* Read-read is OK. */
968 if (DR_IS_READ (dra
) && DR_IS_READ (drb
))
971 /* If dra and drb are part of the same interleaving chain consider
973 if (STMT_VINFO_GROUPED_ACCESS (dr_info_a
->stmt
)
974 && (DR_GROUP_FIRST_ELEMENT (dr_info_a
->stmt
)
975 == DR_GROUP_FIRST_ELEMENT (dr_info_b
->stmt
)))
978 /* Unknown data dependence. */
979 if (DDR_ARE_DEPENDENT (ddr
) == chrec_dont_know
)
981 if (dump_enabled_p ())
982 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
983 "can't determine dependence between %T and %T\n",
984 DR_REF (dra
), DR_REF (drb
));
986 else if (dump_enabled_p ())
987 dump_printf_loc (MSG_NOTE
, vect_location
,
988 "determined dependence between %T and %T\n",
989 DR_REF (dra
), DR_REF (drb
));
/* Analyze dependences involved in the transform of a store SLP NODE.  */
998 vect_slp_analyze_store_dependences (vec_info
*vinfo
, slp_tree node
)
1000 /* This walks over all stmts involved in the SLP store done
1001 in NODE verifying we can sink them up to the last stmt in the
1003 stmt_vec_info last_access_info
= vect_find_last_scalar_stmt_in_slp (node
);
1004 gcc_assert (DR_IS_WRITE (STMT_VINFO_DATA_REF (last_access_info
)));
1006 for (unsigned k
= 0; k
< SLP_TREE_SCALAR_STMTS (node
).length (); ++k
)
1008 stmt_vec_info access_info
1009 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node
)[k
]);
1010 if (access_info
== last_access_info
)
1012 data_reference
*dr_a
= STMT_VINFO_DATA_REF (access_info
);
1014 bool ref_initialized_p
= false;
1015 for (gimple_stmt_iterator gsi
= gsi_for_stmt (access_info
->stmt
);
1016 gsi_stmt (gsi
) != last_access_info
->stmt
; gsi_next (&gsi
))
1018 gimple
*stmt
= gsi_stmt (gsi
);
1019 if (! gimple_vuse (stmt
))
1022 /* If we couldn't record a (single) data reference for this
1023 stmt we have to resort to the alias oracle. */
1024 stmt_vec_info stmt_info
= vinfo
->lookup_stmt (stmt
);
1025 data_reference
*dr_b
= STMT_VINFO_DATA_REF (stmt_info
);
1028 /* We are moving a store - this means
1029 we cannot use TBAA for disambiguation. */
1030 if (!ref_initialized_p
)
1031 ao_ref_init (&ref
, DR_REF (dr_a
));
1032 if (stmt_may_clobber_ref_p_1 (stmt
, &ref
, false)
1033 || ref_maybe_used_by_stmt_p (stmt
, &ref
, false))
1038 gcc_assert (!gimple_visited_p (stmt
));
1040 ddr_p ddr
= initialize_data_dependence_relation (dr_a
,
1042 bool dependent
= vect_slp_analyze_data_ref_dependence (vinfo
, ddr
);
1043 free_dependence_relation (ddr
);
/* Analyze dependences involved in the transform of a load SLP NODE.  STORES
   contain the vector of scalar stores of this instance if we are
   disambiguating the loads.  */
1056 vect_slp_analyze_load_dependences (vec_info
*vinfo
, slp_tree node
,
1057 vec
<stmt_vec_info
> stores
,
1058 stmt_vec_info last_store_info
)
1060 /* This walks over all stmts involved in the SLP load done
1061 in NODE verifying we can hoist them up to the first stmt in the
1063 stmt_vec_info first_access_info
= vect_find_first_scalar_stmt_in_slp (node
);
1064 gcc_assert (DR_IS_READ (STMT_VINFO_DATA_REF (first_access_info
)));
1066 for (unsigned k
= 0; k
< SLP_TREE_SCALAR_STMTS (node
).length (); ++k
)
1068 if (! SLP_TREE_SCALAR_STMTS (node
)[k
])
1070 stmt_vec_info access_info
1071 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node
)[k
]);
1072 if (access_info
== first_access_info
)
1074 data_reference
*dr_a
= STMT_VINFO_DATA_REF (access_info
);
1076 bool ref_initialized_p
= false;
1077 hash_set
<stmt_vec_info
> grp_visited
;
1078 for (gimple_stmt_iterator gsi
= gsi_for_stmt (access_info
->stmt
);
1079 gsi_stmt (gsi
) != first_access_info
->stmt
; gsi_prev (&gsi
))
1081 gimple
*stmt
= gsi_stmt (gsi
);
1082 if (! gimple_vdef (stmt
))
1085 stmt_vec_info stmt_info
= vinfo
->lookup_stmt (stmt
);
1087 /* If we run into a store of this same instance (we've just
1088 marked those) then delay dependence checking until we run
1089 into the last store because this is where it will have
1090 been sunk to (and we verified that we can do that already). */
1091 if (gimple_visited_p (stmt
))
1093 if (stmt_info
!= last_store_info
)
1096 for (stmt_vec_info
&store_info
: stores
)
1098 data_reference
*store_dr
= STMT_VINFO_DATA_REF (store_info
);
1099 ddr_p ddr
= initialize_data_dependence_relation
1100 (dr_a
, store_dr
, vNULL
);
1102 = vect_slp_analyze_data_ref_dependence (vinfo
, ddr
);
1103 free_dependence_relation (ddr
);
1110 auto check_hoist
= [&] (stmt_vec_info stmt_info
) -> bool
1112 /* We are hoisting a load - this means we can use TBAA for
1114 if (!ref_initialized_p
)
1115 ao_ref_init (&ref
, DR_REF (dr_a
));
1116 if (stmt_may_clobber_ref_p_1 (stmt_info
->stmt
, &ref
, true))
1118 /* If we couldn't record a (single) data reference for this
1119 stmt we have to give up now. */
1120 data_reference
*dr_b
= STMT_VINFO_DATA_REF (stmt_info
);
1123 ddr_p ddr
= initialize_data_dependence_relation (dr_a
,
1126 = vect_slp_analyze_data_ref_dependence (vinfo
, ddr
);
1127 free_dependence_relation (ddr
);
1131 /* No dependence. */
1134 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1136 /* When we run into a store group we have to honor
1137 that earlier stores might be moved here. We don't
1138 know exactly which and where to since we lack a
1139 back-mapping from DR to SLP node, so assume all
1140 earlier stores are sunk here. It's enough to
1141 consider the last stmt of a group for this.
1142 ??? Both this and the fact that we disregard that
1143 the conflicting instance might be removed later
1144 is overly conservative. */
1145 if (!grp_visited
.add (DR_GROUP_FIRST_ELEMENT (stmt_info
)))
1146 for (auto store_info
= DR_GROUP_FIRST_ELEMENT (stmt_info
);
1148 store_info
= DR_GROUP_NEXT_ELEMENT (store_info
))
1149 if ((store_info
== stmt_info
1150 || get_later_stmt (store_info
, stmt_info
) == stmt_info
)
1151 && !check_hoist (store_info
))
1156 if (!check_hoist (stmt_info
))
/* Function vect_slp_analyze_instance_dependence.

   Examine all the data references in the SLP instance, and make sure there
   do not exist any data dependences between them that would prevent the
   instance from being vectorized.  */
1172 vect_slp_analyze_instance_dependence (vec_info
*vinfo
, slp_instance instance
)
1174 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
1176 /* The stores of this instance are at the root of the SLP tree. */
1177 slp_tree store
= NULL
;
1178 if (SLP_INSTANCE_KIND (instance
) == slp_inst_kind_store
)
1179 store
= SLP_INSTANCE_TREE (instance
);
1181 /* Verify we can sink stores to the vectorized stmt insert location. */
1182 stmt_vec_info last_store_info
= NULL
;
1185 if (! vect_slp_analyze_store_dependences (vinfo
, store
))
1188 /* Mark stores in this instance and remember the last one. */
1189 last_store_info
= vect_find_last_scalar_stmt_in_slp (store
);
1190 for (unsigned k
= 0; k
< SLP_TREE_SCALAR_STMTS (store
).length (); ++k
)
1191 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store
)[k
]->stmt
, true);
1196 /* Verify we can sink loads to the vectorized stmt insert location,
1197 special-casing stores of this instance. */
1198 for (slp_tree
&load
: SLP_INSTANCE_LOADS (instance
))
1199 if (! vect_slp_analyze_load_dependences (vinfo
, load
,
1201 ? SLP_TREE_SCALAR_STMTS (store
)
1202 : vNULL
, last_store_info
))
1208 /* Unset the visited flag. */
1210 for (unsigned k
= 0; k
< SLP_TREE_SCALAR_STMTS (store
).length (); ++k
)
1211 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store
)[k
]->stmt
, false);
/* Return the misalignment of DR_INFO accessed in VECTYPE with OFFSET
   applied.  */
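/* Illustrative example: for a group member whose DR_INIT is 4 bytes past
   the first element of its group, and a first element with misalignment 8
   relative to a 16-byte target alignment, the misalignment returned is
   (8 + 4) % 16 == 12, plus any OFFSET passed in.  */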
1220 dr_misalignment (dr_vec_info
*dr_info
, tree vectype
, poly_int64 offset
)
1222 HOST_WIDE_INT diff
= 0;
1223 /* Alignment is only analyzed for the first element of a DR group,
1224 use that but adjust misalignment by the offset of the access. */
1225 if (STMT_VINFO_GROUPED_ACCESS (dr_info
->stmt
))
1227 dr_vec_info
*first_dr
1228 = STMT_VINFO_DR_INFO (DR_GROUP_FIRST_ELEMENT (dr_info
->stmt
));
1229 /* vect_analyze_data_ref_accesses guarantees that DR_INIT are
1230 INTEGER_CSTs and the first element in the group has the lowest
1232 diff
= (TREE_INT_CST_LOW (DR_INIT (dr_info
->dr
))
1233 - TREE_INT_CST_LOW (DR_INIT (first_dr
->dr
)));
1234 gcc_assert (diff
>= 0);
1238 int misalign
= dr_info
->misalignment
;
1239 gcc_assert (misalign
!= DR_MISALIGNMENT_UNINITIALIZED
);
1240 if (misalign
== DR_MISALIGNMENT_UNKNOWN
)
1243 /* If the access is only aligned for a vector type with smaller alignment
1244 requirement the access has unknown misalignment. */
1245 if (maybe_lt (dr_info
->target_alignment
* BITS_PER_UNIT
,
1246 targetm
.vectorize
.preferred_vector_alignment (vectype
)))
1247 return DR_MISALIGNMENT_UNKNOWN
;
1249 /* Apply the offset from the DR group start and the externally supplied
1250 offset which can for example result from a negative stride access. */
1251 poly_int64 misalignment
= misalign
+ diff
+ offset
;
1253 /* Below we reject compile-time non-constant target alignments, but if
1254 our misalignment is zero, then we are known to already be aligned
1255 w.r.t. any such possible target alignment. */
1256 if (known_eq (misalignment
, 0))
1259 unsigned HOST_WIDE_INT target_alignment_c
;
1260 if (!dr_info
->target_alignment
.is_constant (&target_alignment_c
)
1261 || !known_misalignment (misalignment
, target_alignment_c
, &misalign
))
1262 return DR_MISALIGNMENT_UNKNOWN
;
/* Record the base alignment guarantee given by DRB, which occurs
   in STMT_INFO.  */
1270 vect_record_base_alignment (vec_info
*vinfo
, stmt_vec_info stmt_info
,
1271 innermost_loop_behavior
*drb
)
1274 std::pair
<stmt_vec_info
, innermost_loop_behavior
*> &entry
1275 = vinfo
->base_alignments
.get_or_insert (drb
->base_address
, &existed
);
1276 if (!existed
|| entry
.second
->base_alignment
< drb
->base_alignment
)
1278 entry
= std::make_pair (stmt_info
, drb
);
1279 if (dump_enabled_p ())
1280 dump_printf_loc (MSG_NOTE
, vect_location
,
1281 "recording new base alignment for %T\n"
1283 " misalignment: %d\n"
1286 drb
->base_alignment
,
1287 drb
->base_misalignment
,
/* If the region we're going to vectorize is reached, all unconditional
   data references occur at least once.  We can therefore pool the base
   alignment guarantees from each unconditional reference.  Do this by
   going through all the data references in VINFO and checking whether
   the containing statement makes the reference unconditionally.  If so,
   record the alignment of the base address in VINFO so that it can be
   used for all other references with the same base.  */
1301 vect_record_base_alignments (vec_info
*vinfo
)
1303 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
1304 class loop
*loop
= loop_vinfo
? LOOP_VINFO_LOOP (loop_vinfo
) : NULL
;
1305 for (data_reference
*dr
: vinfo
->shared
->datarefs
)
1307 dr_vec_info
*dr_info
= vinfo
->lookup_dr (dr
);
1308 stmt_vec_info stmt_info
= dr_info
->stmt
;
1309 if (!DR_IS_CONDITIONAL_IN_STMT (dr
)
1310 && STMT_VINFO_VECTORIZABLE (stmt_info
)
1311 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info
))
1313 vect_record_base_alignment (vinfo
, stmt_info
, &DR_INNERMOST (dr
));
1315 /* If DR is nested in the loop that is being vectorized, we can also
1316 record the alignment of the base wrt the outer loop. */
1317 if (loop
&& nested_in_vect_loop_p (loop
, stmt_info
))
1318 vect_record_base_alignment
1319 (vinfo
, stmt_info
, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info
));
/* Function vect_compute_data_ref_alignment

   Compute the misalignment of the data reference DR_INFO when vectorizing

   RESULT is non-NULL iff VINFO is a loop_vec_info.  In that case, *RESULT will
   be set appropriately on failure (but is otherwise left unchanged).

   1. initialized misalignment info for DR_INFO

   FOR NOW: No analysis is actually performed.  Misalignment is calculated
   only for trivial cases.  TODO.  */
1339 vect_compute_data_ref_alignment (vec_info
*vinfo
, dr_vec_info
*dr_info
,
1340 tree vectype
, opt_result
*result
= nullptr)
1342 stmt_vec_info stmt_info
= dr_info
->stmt
;
1343 vec_base_alignments
*base_alignments
= &vinfo
->base_alignments
;
1344 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
1345 class loop
*loop
= NULL
;
1346 tree ref
= DR_REF (dr_info
->dr
);
1348 if (dump_enabled_p ())
1349 dump_printf_loc (MSG_NOTE
, vect_location
,
1350 "vect_compute_data_ref_alignment:\n");
1353 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1355 /* Initialize misalignment to unknown. */
1356 SET_DR_MISALIGNMENT (dr_info
, DR_MISALIGNMENT_UNKNOWN
);
1358 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info
))
1361 innermost_loop_behavior
*drb
= vect_dr_behavior (vinfo
, dr_info
);
1362 bool step_preserves_misalignment_p
;
1364 poly_uint64 vector_alignment
1365 = exact_div (targetm
.vectorize
.preferred_vector_alignment (vectype
),
1368 /* If this DR needs peeling for alignment for correctness, we must
1369 ensure the target alignment is a constant power-of-two multiple of the
1370 amount read per vector iteration (overriding the above hook where
1372 if (dr_info
->need_peeling_for_alignment
)
1374 /* Vector size in bytes. */
1375 poly_uint64 safe_align
= tree_to_poly_uint64 (TYPE_SIZE_UNIT (vectype
));
1377 /* We can only peel for loops, of course. */
1378 gcc_checking_assert (loop_vinfo
);
1380 /* Calculate the number of vectors read per vector iteration. If
1381 it is a power of two, multiply through to get the required
1382 alignment in bytes. Otherwise, fail analysis since alignment
1383 peeling wouldn't work in such a case. */
1384 poly_uint64 num_scalars
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
1385 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1386 num_scalars
*= DR_GROUP_SIZE (stmt_info
);
1388 auto num_vectors
= vect_get_num_vectors (num_scalars
, vectype
);
1389 if (!pow2p_hwi (num_vectors
))
1391 *result
= opt_result::failure_at (vect_location
,
1392 "non-power-of-two num vectors %u "
1393 "for DR needing peeling for "
1395 num_vectors
, stmt_info
->stmt
);
1399 safe_align
*= num_vectors
;
1400 if (maybe_gt (safe_align
, 4096U))
1403 pp_wide_integer (&pp
, safe_align
);
1404 *result
= opt_result::failure_at (vect_location
,
1405 "alignment required for correctness"
1406 " (%s) may exceed page size",
1407 pp_formatted_text (&pp
));
1411 unsigned HOST_WIDE_INT multiple
;
1412 if (!constant_multiple_p (vector_alignment
, safe_align
, &multiple
)
1413 || !pow2p_hwi (multiple
))
1415 if (dump_enabled_p ())
1417 dump_printf_loc (MSG_NOTE
, vect_location
,
1418 "forcing alignment for DR from preferred (");
1419 dump_dec (MSG_NOTE
, vector_alignment
);
1420 dump_printf (MSG_NOTE
, ") to safe align (");
1421 dump_dec (MSG_NOTE
, safe_align
);
1422 dump_printf (MSG_NOTE
, ") for stmt: %G", stmt_info
->stmt
);
1424 vector_alignment
= safe_align
;
1428 SET_DR_TARGET_ALIGNMENT (dr_info
, vector_alignment
);
1430 /* If the main loop has peeled for alignment we have no way of knowing
1431 whether the data accesses in the epilogues are aligned. We can't at
1432 compile time answer the question whether we have entered the main loop or
1433 not. Fixes PR 92351. */
1436 loop_vec_info orig_loop_vinfo
= LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo
);
1438 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo
) != 0)
1442 unsigned HOST_WIDE_INT vect_align_c
;
1443 if (!vector_alignment
.is_constant (&vect_align_c
))
1446 /* No step for BB vectorization. */
1449 gcc_assert (integer_zerop (drb
->step
));
1450 step_preserves_misalignment_p
= true;
1455 /* We can only use base and misalignment information relative to
1456 an innermost loop if the misalignment stays the same throughout the
1457 execution of the loop. As above, this is the case if the stride of
1458 the dataref evenly divides by the alignment. */
1459 poly_uint64 vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
1460 step_preserves_misalignment_p
1461 = multiple_p (drb
->step_alignment
* vf
, vect_align_c
);
1463 if (!step_preserves_misalignment_p
&& dump_enabled_p ())
1464 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1465 "step doesn't divide the vector alignment.\n");
1467 /* In case the dataref is in an inner-loop of the loop that is being
1468 vectorized (LOOP), we use the base and misalignment information
1469 relative to the outer-loop (LOOP). This is ok only if the
1470 misalignment stays the same throughout the execution of the
1471 inner-loop, which is why we have to check that the stride of the
1472 dataref in the inner-loop evenly divides by the vector alignment. */
1473 if (step_preserves_misalignment_p
1474 && nested_in_vect_loop_p (loop
, stmt_info
))
1476 step_preserves_misalignment_p
1477 = (DR_STEP_ALIGNMENT (dr_info
->dr
) % vect_align_c
) == 0;
1479 if (dump_enabled_p ())
1481 if (step_preserves_misalignment_p
)
1482 dump_printf_loc (MSG_NOTE
, vect_location
,
1483 "inner step divides the vector alignment.\n");
1485 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1486 "inner step doesn't divide the vector"
1492 unsigned int base_alignment
= drb
->base_alignment
;
1493 unsigned int base_misalignment
= drb
->base_misalignment
;
1495 /* Calculate the maximum of the pooled base address alignment and the
1496 alignment that we can compute for DR itself. */
1497 std::pair
<stmt_vec_info
, innermost_loop_behavior
*> *entry
1498 = base_alignments
->get (drb
->base_address
);
1500 && base_alignment
< (*entry
).second
->base_alignment
1502 || (dominated_by_p (CDI_DOMINATORS
, gimple_bb (stmt_info
->stmt
),
1503 gimple_bb (entry
->first
->stmt
))
1504 && (gimple_bb (stmt_info
->stmt
) != gimple_bb (entry
->first
->stmt
)
1505 || (entry
->first
->dr_aux
.group
<= dr_info
->group
)))))
1507 base_alignment
= entry
->second
->base_alignment
;
1508 base_misalignment
= entry
->second
->base_misalignment
;
1511 if (drb
->offset_alignment
< vect_align_c
1512 || !step_preserves_misalignment_p
1513 /* We need to know whether the step wrt the vectorized loop is
1514 negative when computing the starting misalignment below. */
1515 || TREE_CODE (drb
->step
) != INTEGER_CST
)
1517 if (dump_enabled_p ())
1518 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1519 "Unknown alignment for access: %T\n", ref
);
1523 if (base_alignment
< vect_align_c
)
1525 unsigned int max_alignment
;
1526 tree base
= get_base_for_alignment (drb
->base_address
, &max_alignment
);
1527 if (max_alignment
< vect_align_c
1528 || !vect_can_force_dr_alignment_p (base
,
1529 vect_align_c
* BITS_PER_UNIT
))
1531 if (dump_enabled_p ())
1532 dump_printf_loc (MSG_NOTE
, vect_location
,
1533 "can't force alignment of ref: %T\n", ref
);
1537 /* Force the alignment of the decl.
1538 NOTE: This is the only change to the code we make during
1539 the analysis phase, before deciding to vectorize the loop. */
1540 if (dump_enabled_p ())
1541 dump_printf_loc (MSG_NOTE
, vect_location
,
1542 "force alignment of %T\n", ref
);
1544 dr_info
->base_decl
= base
;
1545 dr_info
->base_misaligned
= true;
1546 base_misalignment
= 0;
1548 poly_int64 misalignment
1549 = base_misalignment
+ wi::to_poly_offset (drb
->init
).force_shwi ();
1551 unsigned int const_misalignment
;
1552 if (!known_misalignment (misalignment
, vect_align_c
, &const_misalignment
))
1554 if (dump_enabled_p ())
1555 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1556 "Non-constant misalignment for access: %T\n", ref
);
1560 SET_DR_MISALIGNMENT (dr_info
, const_misalignment
);
1562 if (dump_enabled_p ())
1563 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1564 "misalign = %d bytes of ref %T\n",
1565 const_misalignment
, ref
);
/* Return whether DR_INFO, which is related to DR_PEEL_INFO in
   that it only differs in DR_INIT, is aligned if DR_PEEL_INFO
   is made aligned via peeling.  */
1575 vect_dr_aligned_if_related_peeled_dr_is (dr_vec_info
*dr_info
,
1576 dr_vec_info
*dr_peel_info
)
1578 if (multiple_p (DR_TARGET_ALIGNMENT (dr_peel_info
),
1579 DR_TARGET_ALIGNMENT (dr_info
)))
1581 poly_offset_int diff
1582 = (wi::to_poly_offset (DR_INIT (dr_peel_info
->dr
))
1583 - wi::to_poly_offset (DR_INIT (dr_info
->dr
)));
1584 if (known_eq (diff
, 0)
1585 || multiple_p (diff
, DR_TARGET_ALIGNMENT (dr_info
)))
/* Return whether DR_INFO is aligned if DR_PEEL_INFO is made
   aligned via peeling.  */
1595 vect_dr_aligned_if_peeled_dr_is (dr_vec_info
*dr_info
,
1596 dr_vec_info
*dr_peel_info
)
1598 if (!operand_equal_p (DR_BASE_ADDRESS (dr_info
->dr
),
1599 DR_BASE_ADDRESS (dr_peel_info
->dr
), 0)
1600 || !operand_equal_p (DR_OFFSET (dr_info
->dr
),
1601 DR_OFFSET (dr_peel_info
->dr
), 0)
1602 || !operand_equal_p (DR_STEP (dr_info
->dr
),
1603 DR_STEP (dr_peel_info
->dr
), 0))
1606 return vect_dr_aligned_if_related_peeled_dr_is (dr_info
, dr_peel_info
);
/* Compute the value for dr_info->misalign so that the access appears
   aligned.  This is used by peeling to compensate for dr_misalignment
   applying the offset for negative step.  */
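/* Illustrative example: for a negative-step access with a vector type of
   4 elements of 4 bytes and a 16-byte target alignment, the offset being
   compensated for is (4 - 1) * 4 = 12 bytes, so the value returned is 12.  */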
1614 vect_dr_misalign_for_aligned_access (dr_vec_info
*dr_info
)
1616 if (tree_int_cst_sgn (DR_STEP (dr_info
->dr
)) >= 0)
1619 tree vectype
= STMT_VINFO_VECTYPE (dr_info
->stmt
);
1620 poly_int64 misalignment
1621 = ((TYPE_VECTOR_SUBPARTS (vectype
) - 1)
1622 * TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype
))));
1624 unsigned HOST_WIDE_INT target_alignment_c
;
1626 if (!dr_info
->target_alignment
.is_constant (&target_alignment_c
)
1627 || !known_misalignment (misalignment
, target_alignment_c
, &misalign
))
1628 return DR_MISALIGNMENT_UNKNOWN
;
/* Function vect_update_misalignment_for_peel.
   Sets DR_INFO's misalignment
   - to 0 if it has the same alignment as DR_PEEL_INFO,
   - to the misalignment computed using NPEEL if DR_INFO's alignment is known,
   - to -1 (unknown) otherwise.

   DR_INFO - the data reference whose misalignment is to be adjusted.
   DR_PEEL_INFO - the data reference whose misalignment is being made
		  zero in the vector loop by the peel.
   NPEEL - the number of iterations in the peel loop if the misalignment
	   of DR_PEEL_INFO is known at compile time.  */
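/* Illustrative example of the known-alignment update below: with a target
   alignment of 16 bytes, a current misalignment of 8, a step of 4 bytes and
   NPEEL == 2, the new misalignment is (8 + 2 * 4) & 15 == 0.  */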
1645 vect_update_misalignment_for_peel (dr_vec_info
*dr_info
,
1646 dr_vec_info
*dr_peel_info
, int npeel
)
  /* If dr_info is aligned if dr_peel_info is, then mark it so.  */
1649 if (vect_dr_aligned_if_peeled_dr_is (dr_info
, dr_peel_info
))
1651 SET_DR_MISALIGNMENT (dr_info
,
1652 vect_dr_misalign_for_aligned_access (dr_peel_info
));
1656 unsigned HOST_WIDE_INT alignment
;
1657 if (DR_TARGET_ALIGNMENT (dr_info
).is_constant (&alignment
)
1658 && known_alignment_for_access_p (dr_info
,
1659 STMT_VINFO_VECTYPE (dr_info
->stmt
))
1660 && known_alignment_for_access_p (dr_peel_info
,
1661 STMT_VINFO_VECTYPE (dr_peel_info
->stmt
)))
1663 int misal
= dr_info
->misalignment
;
1664 misal
+= npeel
* TREE_INT_CST_LOW (DR_STEP (dr_info
->dr
));
1665 misal
&= alignment
- 1;
1666 set_dr_misalignment (dr_info
, misal
);
1670 if (dump_enabled_p ())
1671 dump_printf_loc (MSG_NOTE
, vect_location
, "Setting misalignment " \
1672 "to unknown (-1).\n");
1673 SET_DR_MISALIGNMENT (dr_info
, DR_MISALIGNMENT_UNKNOWN
);
/* Return true if alignment is relevant for DR_INFO.  */
1679 vect_relevant_for_alignment_p (dr_vec_info
*dr_info
)
1681 stmt_vec_info stmt_info
= dr_info
->stmt
;
1683 if (!STMT_VINFO_RELEVANT_P (stmt_info
))
1686 /* For interleaving, only the alignment of the first access matters. */
1687 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1688 && DR_GROUP_FIRST_ELEMENT (stmt_info
) != stmt_info
)
1691 /* Scatter-gather and invariant accesses continue to address individual
1692 scalars, so vector-level alignment is irrelevant. */
1693 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info
)
1694 || integer_zerop (DR_STEP (dr_info
->dr
)))
1697 /* Strided accesses perform only component accesses, alignment is
1698 irrelevant for them. */
1699 if (STMT_VINFO_STRIDED_P (stmt_info
)
1700 && !STMT_VINFO_GROUPED_ACCESS (stmt_info
))
/* Given a memory reference EXP return whether its alignment is less
   than its size.  */
1710 not_size_aligned (tree exp
)
1712 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp
))))
1715 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp
)))
1716 > get_object_alignment (exp
));
/* Function vector_alignment_reachable_p

   Return true if vector alignment for DR_INFO is reachable by peeling
   a few loop iterations.  Return false otherwise.  */
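/* Illustrative example for the grouped-access check below: with 8 elements
   per vector and a misalignment of 2 elements, the prologue would peel
   8 - 2 = 6 iterations; if the interleaving group size is 4, 6 is not a
   multiple of 4, so vector alignment is not considered reachable.  */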
1725 vector_alignment_reachable_p (dr_vec_info
*dr_info
)
1727 stmt_vec_info stmt_info
= dr_info
->stmt
;
1728 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
1730 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1732 /* For interleaved access we peel only if number of iterations in
1733 the prolog loop ({VF - misalignment}), is a multiple of the
1734 number of the interleaved accesses. */
1735 int elem_size
, mis_in_elements
;
1737 /* FORNOW: handle only known alignment. */
1738 if (!known_alignment_for_access_p (dr_info
, vectype
))
1741 poly_uint64 nelements
= TYPE_VECTOR_SUBPARTS (vectype
);
1742 poly_uint64 vector_size
= GET_MODE_SIZE (TYPE_MODE (vectype
));
1743 elem_size
= vector_element_size (vector_size
, nelements
);
1744 mis_in_elements
= dr_misalignment (dr_info
, vectype
) / elem_size
;
1746 if (!multiple_p (nelements
- mis_in_elements
, DR_GROUP_SIZE (stmt_info
)))
1750 /* If misalignment is known at the compile time then allow peeling
1751 only if natural alignment is reachable through peeling. */
1752 if (known_alignment_for_access_p (dr_info
, vectype
)
1753 && !aligned_access_p (dr_info
, vectype
))
1755 HOST_WIDE_INT elmsize
=
1756 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype
)));
1757 if (dump_enabled_p ())
1759 dump_printf_loc (MSG_NOTE
, vect_location
,
1760 "data size = %wd. misalignment = %d.\n", elmsize
,
1761 dr_misalignment (dr_info
, vectype
));
1763 if (dr_misalignment (dr_info
, vectype
) % elmsize
)
1765 if (dump_enabled_p ())
1766 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1767 "data size does not divide the misalignment.\n");
1772 if (!known_alignment_for_access_p (dr_info
, vectype
))
1774 tree type
= TREE_TYPE (DR_REF (dr_info
->dr
));
1775 bool is_packed
= not_size_aligned (DR_REF (dr_info
->dr
));
1776 if (dump_enabled_p ())
1777 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1778 "Unknown misalignment, %snaturally aligned\n",
1779 is_packed
? "not " : "");
1780 return targetm
.vectorize
.vector_alignment_reachable (type
, is_packed
);
/* Calculate the cost of the memory access represented by DR_INFO.  */
1790 vect_get_data_access_cost (vec_info
*vinfo
, dr_vec_info
*dr_info
,
1791 dr_alignment_support alignment_support_scheme
,
1793 unsigned int *inside_cost
,
1794 unsigned int *outside_cost
,
1795 stmt_vector_for_cost
*body_cost_vec
,
1796 stmt_vector_for_cost
*prologue_cost_vec
)
1798 stmt_vec_info stmt_info
= dr_info
->stmt
;
1799 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
1802 if (PURE_SLP_STMT (stmt_info
))
1805 ncopies
= vect_get_num_copies (loop_vinfo
, STMT_VINFO_VECTYPE (stmt_info
));
1807 if (DR_IS_READ (dr_info
->dr
))
1808 vect_get_load_cost (vinfo
, stmt_info
, NULL
, ncopies
,
1809 alignment_support_scheme
, misalignment
, true,
1810 inside_cost
, outside_cost
, prologue_cost_vec
,
1811 body_cost_vec
, false);
1813 vect_get_store_cost (vinfo
,stmt_info
, NULL
, ncopies
,
1814 alignment_support_scheme
, misalignment
, inside_cost
,
1817 if (dump_enabled_p ())
1818 dump_printf_loc (MSG_NOTE
, vect_location
,
1819 "vect_get_data_access_cost: inside_cost = %d, "
1820 "outside_cost = %d.\n", *inside_cost
, *outside_cost
);
typedef struct _vect_peel_info
{
  dr_vec_info *dr_info;
  int npeel;
  unsigned int count;
} *vect_peel_info;

typedef struct _vect_peel_extended_info
{
  vec_info *vinfo;
  struct _vect_peel_info peel_info;
  unsigned int inside_cost;
  unsigned int outside_cost;
} *vect_peel_extended_info;


/* Peeling hashtable helpers.  */

struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
{
  static inline hashval_t hash (const _vect_peel_info *);
  static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
};

inline hashval_t
peel_info_hasher::hash (const _vect_peel_info *peel_info)
{
  return (hashval_t) peel_info->npeel;
}

inline bool
peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
{
  return (a->npeel == b->npeel);
}
/* Insert DR_INFO into peeling hash table with NPEEL as key.  */

static void
vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
			  loop_vec_info loop_vinfo, dr_vec_info *dr_info,
			  int npeel, bool supportable_if_not_aligned)
{
  struct _vect_peel_info elem, *slot;
  _vect_peel_info **new_slot;

  elem.npeel = npeel;
  slot = peeling_htab->find (&elem);
  if (slot)
    slot->count++;
  else
    {
      slot = XNEW (struct _vect_peel_info);
      slot->npeel = npeel;
      slot->dr_info = dr_info;
      slot->count = 1;
      new_slot = peeling_htab->find_slot (slot, INSERT);
      *new_slot = slot;
    }

  /* If this DR is not supported with unknown misalignment then bias
     this slot when the cost model is disabled.  */
  if (!supportable_if_not_aligned
      && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
    slot->count += VECT_MAX_COST;
}
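/* Usage sketch (illustrative, not part of the pass): if two data-refs both
   need NPEEL == 2 to become aligned, both insertions land in the same hash
   slot and its COUNT reaches 2, so that peeling amount looks more
   attractive to vect_peeling_hash_get_most_frequent below.  A data-ref
   that is unsupported when unaligned instead gets its slot biased by
   VECT_MAX_COST when the cost model is disabled, as the code above shows.  */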
/* Traverse peeling hash table to find the peeling option that aligns the
   maximum number of data accesses.  */

int
vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
				     _vect_peel_extended_info *max)
{
  vect_peel_info elem = *slot;

  if (elem->count > max->peel_info.count
      || (elem->count == max->peel_info.count
	  && max->peel_info.npeel > elem->npeel))
    {
      max->peel_info.npeel = elem->npeel;
      max->peel_info.count = elem->count;
      max->peel_info.dr_info = elem->dr_info;
    }

  return 1;
}
1914 /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
1915 data access costs for all data refs. If UNKNOWN_MISALIGNMENT is true,
1916 npeel is computed at runtime but DR0_INFO's misalignment will be zero
1920 vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo
,
1921 dr_vec_info
*dr0_info
,
1922 unsigned int *inside_cost
,
1923 unsigned int *outside_cost
,
1924 stmt_vector_for_cost
*body_cost_vec
,
1925 stmt_vector_for_cost
*prologue_cost_vec
,
1928 vec
<data_reference_p
> datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
1930 bool dr0_alignment_known_p
1932 && known_alignment_for_access_p (dr0_info
,
1933 STMT_VINFO_VECTYPE (dr0_info
->stmt
)));
1935 for (data_reference
*dr
: datarefs
)
1937 dr_vec_info
*dr_info
= loop_vinfo
->lookup_dr (dr
);
1938 if (!vect_relevant_for_alignment_p (dr_info
))
1941 tree vectype
= STMT_VINFO_VECTYPE (dr_info
->stmt
);
1942 dr_alignment_support alignment_support_scheme
;
1944 unsigned HOST_WIDE_INT alignment
;
1946 bool negative
= tree_int_cst_compare (DR_STEP (dr_info
->dr
),
1947 size_zero_node
) < 0;
1950 off
= ((TYPE_VECTOR_SUBPARTS (vectype
) - 1)
1951 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype
))));
1954 misalignment
= dr_misalignment (dr_info
, vectype
, off
);
1955 else if (dr_info
== dr0_info
1956 || vect_dr_aligned_if_peeled_dr_is (dr_info
, dr0_info
))
1958 else if (!dr0_alignment_known_p
1959 || !known_alignment_for_access_p (dr_info
, vectype
)
1960 || !DR_TARGET_ALIGNMENT (dr_info
).is_constant (&alignment
))
1961 misalignment
= DR_MISALIGNMENT_UNKNOWN
;
1964 misalignment
= dr_misalignment (dr_info
, vectype
, off
);
1965 misalignment
+= npeel
* TREE_INT_CST_LOW (DR_STEP (dr_info
->dr
));
1966 misalignment
&= alignment
- 1;
1968 alignment_support_scheme
1969 = vect_supportable_dr_alignment (loop_vinfo
, dr_info
, vectype
,
1972 vect_get_data_access_cost (loop_vinfo
, dr_info
,
1973 alignment_support_scheme
, misalignment
,
1974 inside_cost
, outside_cost
,
1975 body_cost_vec
, prologue_cost_vec
);
1979 /* Traverse peeling hash table and calculate cost for each peeling option.
1980 Find the one with the lowest cost. */
1983 vect_peeling_hash_get_lowest_cost (_vect_peel_info
**slot
,
1984 _vect_peel_extended_info
*min
)
1986 vect_peel_info elem
= *slot
;
1988 unsigned int inside_cost
= 0, outside_cost
= 0;
1989 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (min
->vinfo
);
1990 stmt_vector_for_cost prologue_cost_vec
, body_cost_vec
,
1993 prologue_cost_vec
.create (2);
1994 body_cost_vec
.create (2);
1995 epilogue_cost_vec
.create (2);
1997 vect_get_peeling_costs_all_drs (loop_vinfo
, elem
->dr_info
, &inside_cost
,
1998 &outside_cost
, &body_cost_vec
,
1999 &prologue_cost_vec
, elem
->npeel
);
2001 body_cost_vec
.release ();
2003 outside_cost
+= vect_get_known_peeling_cost
2004 (loop_vinfo
, elem
->npeel
, &dummy
,
2005 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo
),
2006 &prologue_cost_vec
, &epilogue_cost_vec
);
2008 /* Prologue and epilogue costs are added to the target model later.
2009 These costs depend only on the scalar iteration cost, the
2010 number of peeling iterations finally chosen, and the number of
2011 misaligned statements. So discard the information found here. */
2012 prologue_cost_vec
.release ();
2013 epilogue_cost_vec
.release ();
2015 if (inside_cost
< min
->inside_cost
2016 || (inside_cost
== min
->inside_cost
2017 && outside_cost
< min
->outside_cost
))
2019 min
->inside_cost
= inside_cost
;
2020 min
->outside_cost
= outside_cost
;
2021 min
->peel_info
.dr_info
= elem
->dr_info
;
2022 min
->peel_info
.npeel
= elem
->npeel
;
2023 min
->peel_info
.count
= elem
->count
;
2030 /* Choose best peeling option by traversing peeling hash table and either
2031 choosing an option with the lowest cost (if cost model is enabled) or the
2032 option that aligns as many accesses as possible. */
2034 static struct _vect_peel_extended_info
2035 vect_peeling_hash_choose_best_peeling (hash_table
<peel_info_hasher
> *peeling_htab
,
2036 loop_vec_info loop_vinfo
)
2038 struct _vect_peel_extended_info res
;
2040 res
.peel_info
.dr_info
= NULL
;
2041 res
.vinfo
= loop_vinfo
;
2043 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
2045 res
.inside_cost
= INT_MAX
;
2046 res
.outside_cost
= INT_MAX
;
2047 peeling_htab
->traverse
<_vect_peel_extended_info
*,
2048 vect_peeling_hash_get_lowest_cost
> (&res
);
2052 res
.peel_info
.count
= 0;
2053 peeling_htab
->traverse
<_vect_peel_extended_info
*,
2054 vect_peeling_hash_get_most_frequent
> (&res
);
2055 res
.inside_cost
= 0;
2056 res
.outside_cost
= 0;
2062 /* Return true if the new peeling NPEEL is supported. */
2065 vect_peeling_supportable (loop_vec_info loop_vinfo
, dr_vec_info
*dr0_info
,
2068 vec
<data_reference_p
> datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
2069 enum dr_alignment_support supportable_dr_alignment
;
2071 bool dr0_alignment_known_p
2072 = known_alignment_for_access_p (dr0_info
,
2073 STMT_VINFO_VECTYPE (dr0_info
->stmt
));
2075 /* Ensure that all data refs can be vectorized after the peel. */
2076 for (data_reference
*dr
: datarefs
)
2078 if (dr
== dr0_info
->dr
)
2081 dr_vec_info
*dr_info
= loop_vinfo
->lookup_dr (dr
);
2082 if (!vect_relevant_for_alignment_p (dr_info
)
2083 || vect_dr_aligned_if_peeled_dr_is (dr_info
, dr0_info
))
2086 tree vectype
= STMT_VINFO_VECTYPE (dr_info
->stmt
);
2088 unsigned HOST_WIDE_INT alignment
;
2089 if (!dr0_alignment_known_p
2090 || !known_alignment_for_access_p (dr_info
, vectype
)
2091 || !DR_TARGET_ALIGNMENT (dr_info
).is_constant (&alignment
))
2092 misalignment
= DR_MISALIGNMENT_UNKNOWN
;
2095 misalignment
= dr_misalignment (dr_info
, vectype
);
2096 misalignment
+= npeel
* TREE_INT_CST_LOW (DR_STEP (dr_info
->dr
));
2097 misalignment
&= alignment
- 1;
2099 supportable_dr_alignment
2100 = vect_supportable_dr_alignment (loop_vinfo
, dr_info
, vectype
,
2102 if (supportable_dr_alignment
== dr_unaligned_unsupported
)
/* Compare two data-references DRA and DRB to group them into chunks
   with related alignment.  */

static int
dr_align_group_sort_cmp (const void *dra_, const void *drb_)
{
  data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
  data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
  int cmp;

  /* Stabilize sort.  */
  if (dra == drb)
    return 0;

  /* Ordering of DRs according to base.  */
  cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
			       DR_BASE_ADDRESS (drb));
  if (cmp != 0)
    return cmp;

  /* And according to DR_OFFSET.  */
  cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
  if (cmp != 0)
    return cmp;

  /* And after step.  */
  cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
  if (cmp != 0)
    return cmp;

  /* Then sort after DR_INIT.  In case of identical DRs sort after stmt UID.  */
  cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
  if (cmp == 0)
    return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
  return cmp;
}
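/* Ordering sketch (illustrative example, not compiler output): for accesses
   a[i], a[i+1] and b[i] in the same loop, the two a[] references share
   DR_BASE_ADDRESS, DR_OFFSET and DR_STEP and therefore end up adjacent in
   the sorted vector, ordered by their DR_INIT (0 versus the element size),
   while b[i] sorts apart already on the base-address comparison.  This is
   what makes the subgroup scan in vect_enhance_data_refs_alignment below
   linear.  */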
2146 /* Function vect_enhance_data_refs_alignment
2148 This pass will use loop versioning and loop peeling in order to enhance
2149 the alignment of data references in the loop.
2151 FOR NOW: we assume that whatever versioning/peeling takes place, only the
2152 original loop is to be vectorized. Any other loops that are created by
2153 the transformations performed in this pass are not supposed to be
2154 vectorized. This restriction will be relaxed.
2156 This pass will require a cost model to guide it whether to apply peeling
2157 or versioning or a combination of the two. For example, the scheme that
2158 Intel uses when given a loop with several memory accesses is as follows:
2159 choose one memory access ('p') whose alignment you want to force by doing
2160 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
2161 other accesses are not necessarily aligned, or (2) use loop versioning to
2162 generate one loop in which all accesses are aligned, and another loop in
2163 which only 'p' is necessarily aligned.
2165 ("Automatic Intra-Register Vectorization for the Intel Architecture",
2166 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
2167 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
2169 Devising a cost model is the most critical aspect of this work. It will
2170 guide us on which access to peel for, whether to use loop versioning, how
2171 many versions to create, etc. The cost model will probably consist of
2172 generic considerations as well as target specific considerations (on
2173 powerpc for example, misaligned stores are more painful than misaligned
2176 Here are the general steps involved in alignment enhancements:
2178 -- original loop, before alignment analysis:
2179 for (i=0; i<N; i++){
2180 x = q[i]; # DR_MISALIGNMENT(q) = unknown
2181 p[i] = y; # DR_MISALIGNMENT(p) = unknown
2184 -- After vect_compute_data_refs_alignment:
2185 for (i=0; i<N; i++){
2186 x = q[i]; # DR_MISALIGNMENT(q) = 3
2187 p[i] = y; # DR_MISALIGNMENT(p) = unknown
2190 -- Possibility 1: we do loop versioning:
2192 for (i=0; i<N; i++){ # loop 1A
2193 x = q[i]; # DR_MISALIGNMENT(q) = 3
2194 p[i] = y; # DR_MISALIGNMENT(p) = 0
2198 for (i=0; i<N; i++){ # loop 1B
2199 x = q[i]; # DR_MISALIGNMENT(q) = 3
2200 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
2204 -- Possibility 2: we do loop peeling:
2205 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
2209 for (i = 3; i < N; i++){ # loop 2A
2210 x = q[i]; # DR_MISALIGNMENT(q) = 0
2211 p[i] = y; # DR_MISALIGNMENT(p) = unknown
2214 -- Possibility 3: combination of loop peeling and versioning:
2215 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
2220 for (i = 3; i<N; i++){ # loop 3A
2221 x = q[i]; # DR_MISALIGNMENT(q) = 0
2222 p[i] = y; # DR_MISALIGNMENT(p) = 0
2226 for (i = 3; i<N; i++){ # loop 3B
2227 x = q[i]; # DR_MISALIGNMENT(q) = 0
2228 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
2232 These loops are later passed to loop_transform to be vectorized. The
2233 vectorizer will use the alignment information to guide the transformation
2234 (whether to generate regular loads/stores, or with special handling for
2238 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo
)
2240 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
2241 dr_vec_info
*first_store
= NULL
;
2242 dr_vec_info
*dr0_info
= NULL
;
2243 struct data_reference
*dr
;
2245 bool do_peeling
= false;
2246 bool do_versioning
= false;
2247 unsigned int npeel
= 0;
2248 bool one_misalignment_known
= false;
2249 bool one_misalignment_unknown
= false;
2250 bool one_dr_unsupportable
= false;
2251 dr_vec_info
*unsupportable_dr_info
= NULL
;
2252 unsigned int dr0_same_align_drs
= 0, first_store_same_align_drs
= 0;
2253 hash_table
<peel_info_hasher
> peeling_htab (1);
2255 DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
2257 /* Reset data so we can safely be called multiple times. */
2258 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).truncate (0);
2259 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
) = 0;
2261 if (LOOP_VINFO_DATAREFS (loop_vinfo
).is_empty ())
2262 return opt_result::success ();
2264 /* Sort the vector of datarefs so DRs that have the same or dependent
2265 alignment are next to each other. */
2266 auto_vec
<data_reference_p
> datarefs
2267 = LOOP_VINFO_DATAREFS (loop_vinfo
).copy ();
2268 datarefs
.qsort (dr_align_group_sort_cmp
);
2270 /* Compute the number of DRs that become aligned when we peel
2271 a dataref so it becomes aligned. */
2272 auto_vec
<unsigned> n_same_align_refs (datarefs
.length ());
2273 n_same_align_refs
.quick_grow_cleared (datarefs
.length ());
2275 for (i0
= 0; i0
< datarefs
.length (); ++i0
)
2276 if (DR_BASE_ADDRESS (datarefs
[i0
]))
2278 for (i
= i0
+ 1; i
<= datarefs
.length (); ++i
)
2280 if (i
== datarefs
.length ()
2281 || !operand_equal_p (DR_BASE_ADDRESS (datarefs
[i0
]),
2282 DR_BASE_ADDRESS (datarefs
[i
]), 0)
2283 || !operand_equal_p (DR_OFFSET (datarefs
[i0
]),
2284 DR_OFFSET (datarefs
[i
]), 0)
2285 || !operand_equal_p (DR_STEP (datarefs
[i0
]),
2286 DR_STEP (datarefs
[i
]), 0))
2288 /* The subgroup [i0, i-1] now only differs in DR_INIT and
2289 possibly DR_TARGET_ALIGNMENT. Still the whole subgroup
2290 will get known misalignment if we align one of the refs
2291 with the largest DR_TARGET_ALIGNMENT. */
2292 for (unsigned j
= i0
; j
< i
; ++j
)
2294 dr_vec_info
*dr_infoj
= loop_vinfo
->lookup_dr (datarefs
[j
]);
2295 for (unsigned k
= i0
; k
< i
; ++k
)
2299 dr_vec_info
*dr_infok
= loop_vinfo
->lookup_dr (datarefs
[k
]);
2300 if (vect_dr_aligned_if_related_peeled_dr_is (dr_infok
,
2302 n_same_align_refs
[j
]++;
2309 /* While cost model enhancements are expected in the future, the high level
2310 view of the code at this time is as follows:
2312 A) If there is a misaligned access then see if peeling to align
2313 this access can make all data references satisfy
2314 vect_supportable_dr_alignment. If so, update data structures
2315 as needed and return true.
2317 B) If peeling wasn't possible and there is a data reference with an
2318 unknown misalignment that does not satisfy vect_supportable_dr_alignment
2319 then see if loop versioning checks can be used to make all data
2320 references satisfy vect_supportable_dr_alignment. If so, update
2321 data structures as needed and return true.
2323 C) If neither peeling nor versioning were successful then return false if
2324 any data reference does not satisfy vect_supportable_dr_alignment.
2326 D) Return true (all data references satisfy vect_supportable_dr_alignment).
2328 Note, Possibility 3 above (which is peeling and versioning together) is not
2329 being done at this time. */
2331 /* (1) Peeling to force alignment. */
2333 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
2335 + How many accesses will become aligned due to the peeling
2336 - How many accesses will become unaligned due to the peeling,
2337 and the cost of misaligned accesses.
2338 - The cost of peeling (the extra runtime checks, the increase
2341 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
2343 dr_vec_info
*dr_info
= loop_vinfo
->lookup_dr (dr
);
2344 if (!vect_relevant_for_alignment_p (dr_info
))
2347 stmt_vec_info stmt_info
= dr_info
->stmt
;
2348 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
2349 do_peeling
= vector_alignment_reachable_p (dr_info
);
2352 if (known_alignment_for_access_p (dr_info
, vectype
))
2354 unsigned int npeel_tmp
= 0;
2355 bool negative
= tree_int_cst_compare (DR_STEP (dr
),
2356 size_zero_node
) < 0;
2358 /* If known_alignment_for_access_p then we have set
2359 DR_MISALIGNMENT which is only done if we know it at compiler
2360 time, so it is safe to assume target alignment is constant.
2362 unsigned int target_align
=
2363 DR_TARGET_ALIGNMENT (dr_info
).to_constant ();
2364 unsigned HOST_WIDE_INT dr_size
= vect_get_scalar_dr_size (dr_info
);
2367 off
= (TYPE_VECTOR_SUBPARTS (vectype
) - 1) * -dr_size
;
2368 unsigned int mis
= dr_misalignment (dr_info
, vectype
, off
);
2369 mis
= negative
? mis
: -mis
;
2371 npeel_tmp
= (mis
& (target_align
- 1)) / dr_size
;
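	      /* Worked example (illustrative numbers only): with a 16-byte
		 target alignment, 4-byte elements and a forward access that
		 is misaligned by 4 bytes, MIS is negated to -4,
		 (-4 & 15) == 12 and 12 / 4 == 3, i.e. three scalar
		 iterations would have to be peeled for this data-ref to
		 become aligned.  */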
2373 /* For multiple types, it is possible that the bigger type access
2374 will have more than one peeling option. E.g., a loop with two
2375 types: one of size (vector size / 4), and the other one of
2376 size (vector size / 8). Vectorization factor will be 8. If both
2377 accesses are misaligned by 3, the first one needs one scalar
2378 iteration to be aligned, and the second one needs 5. But the
2379 first one will be aligned also by peeling 5 scalar
2380 iterations, and in that case both accesses will be aligned.
2381 Hence, except for the immediate peeling amount, we also want
2382 to try to add full vector size, while we don't exceed
2383 vectorization factor.
2384 We do this automatically for cost model, since we calculate
2385 cost for every peeling option. */
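	      /* Concrete sketch of the candidate peel amounts collected
		 below (illustrative numbers): with TARGET_ALIGN == 16 and
		 DR_SIZE == 4 the loop steps NPEEL_TMP by 4, so starting
		 from an initial value of 3, and with NSCALARS raised to
		 VF * GROUP_SIZE == 8 when the cost model is disabled, the
		 hash table is fed the candidates 3 and 7; with the cost
		 model enabled only the initial value is recorded.  */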
2386 poly_uint64 nscalars
= npeel_tmp
;
2387 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
2389 poly_uint64 vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
2390 unsigned group_size
= 1;
2391 if (STMT_SLP_TYPE (stmt_info
)
2392 && STMT_VINFO_GROUPED_ACCESS (stmt_info
))
2393 group_size
= DR_GROUP_SIZE (stmt_info
);
2394 nscalars
= vf
* group_size
;
2397 /* Save info about DR in the hash table. Also include peeling
2398 amounts according to the explanation above. Indicate
2399 the alignment status when the ref is not aligned.
2400 ??? Rather than using unknown alignment here we should
2401 prune all entries from the peeling hashtable which cause
2402 DRs to be not supported. */
2403 bool supportable_if_not_aligned
2404 = vect_supportable_dr_alignment
2405 (loop_vinfo
, dr_info
, vectype
, DR_MISALIGNMENT_UNKNOWN
);
2406 while (known_le (npeel_tmp
, nscalars
))
2408 vect_peeling_hash_insert (&peeling_htab
, loop_vinfo
,
2410 supportable_if_not_aligned
);
2411 npeel_tmp
+= MAX (1, target_align
/ dr_size
);
2414 one_misalignment_known
= true;
2418 /* If we don't know any misalignment values, we prefer
2419 peeling for the data-ref that has the maximum number of data-refs
2420 with the same alignment, unless the target prefers to align
2421 stores over loads. */
2422 unsigned same_align_drs
= n_same_align_refs
[i
];
2424 || dr0_same_align_drs
< same_align_drs
)
2426 dr0_same_align_drs
= same_align_drs
;
2429 /* For data-refs with the same number of related
2430 accesses prefer the one where the misalign
2431 computation will be invariant in the outermost loop. */
2432 else if (dr0_same_align_drs
== same_align_drs
)
2434 class loop
*ivloop0
, *ivloop
;
2435 ivloop0
= outermost_invariant_loop_for_expr
2436 (loop
, DR_BASE_ADDRESS (dr0_info
->dr
));
2437 ivloop
= outermost_invariant_loop_for_expr
2438 (loop
, DR_BASE_ADDRESS (dr
));
2439 if ((ivloop
&& !ivloop0
)
2440 || (ivloop
&& ivloop0
2441 && flow_loop_nested_p (ivloop
, ivloop0
)))
2445 one_misalignment_unknown
= true;
2447 /* Check for data refs with unsupportable alignment that
2449 enum dr_alignment_support supportable_dr_alignment
2450 = vect_supportable_dr_alignment (loop_vinfo
, dr_info
, vectype
,
2451 DR_MISALIGNMENT_UNKNOWN
);
2452 if (supportable_dr_alignment
== dr_unaligned_unsupported
)
2454 one_dr_unsupportable
= true;
2455 unsupportable_dr_info
= dr_info
;
2458 if (!first_store
&& DR_IS_WRITE (dr
))
2460 first_store
= dr_info
;
2461 first_store_same_align_drs
= same_align_drs
;
2467 if (!aligned_access_p (dr_info
, vectype
))
2469 if (dump_enabled_p ())
2470 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2471 "vector alignment may not be reachable\n");
2477 /* Check if we can possibly peel the loop. */
2478 if (!vect_can_advance_ivs_p (loop_vinfo
)
2479 || !slpeel_can_duplicate_loop_p (loop
, LOOP_VINFO_IV_EXIT (loop_vinfo
),
2480 loop_preheader_edge (loop
))
2482 || LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo
))
2485 struct _vect_peel_extended_info peel_for_known_alignment
;
2486 struct _vect_peel_extended_info peel_for_unknown_alignment
;
2487 struct _vect_peel_extended_info best_peel
;
2489 peel_for_unknown_alignment
.inside_cost
= INT_MAX
;
2490 peel_for_unknown_alignment
.outside_cost
= INT_MAX
;
2491 peel_for_unknown_alignment
.peel_info
.count
= 0;
2494 && one_misalignment_unknown
)
2496 /* Check whether the target prefers stores over loads, i.e., whether
2497 misaligned stores are more expensive than misaligned loads (taking
2498 DRs with the same alignment into account). */
2499 unsigned int load_inside_cost
= 0;
2500 unsigned int load_outside_cost
= 0;
2501 unsigned int store_inside_cost
= 0;
2502 unsigned int store_outside_cost
= 0;
2503 unsigned int estimated_npeels
= vect_vf_for_cost (loop_vinfo
) / 2;
2505 stmt_vector_for_cost dummy
;
2507 vect_get_peeling_costs_all_drs (loop_vinfo
, dr0_info
,
2510 &dummy
, &dummy
, estimated_npeels
);
2516 vect_get_peeling_costs_all_drs (loop_vinfo
, first_store
,
2518 &store_outside_cost
,
2525 store_inside_cost
= INT_MAX
;
2526 store_outside_cost
= INT_MAX
;
2529 if (load_inside_cost
> store_inside_cost
2530 || (load_inside_cost
== store_inside_cost
2531 && load_outside_cost
> store_outside_cost
))
2533 dr0_info
= first_store
;
2534 dr0_same_align_drs
= first_store_same_align_drs
;
2535 peel_for_unknown_alignment
.inside_cost
= store_inside_cost
;
2536 peel_for_unknown_alignment
.outside_cost
= store_outside_cost
;
2540 peel_for_unknown_alignment
.inside_cost
= load_inside_cost
;
2541 peel_for_unknown_alignment
.outside_cost
= load_outside_cost
;
2544 stmt_vector_for_cost prologue_cost_vec
, epilogue_cost_vec
;
2545 prologue_cost_vec
.create (2);
2546 epilogue_cost_vec
.create (2);
2549 peel_for_unknown_alignment
.outside_cost
+= vect_get_known_peeling_cost
2550 (loop_vinfo
, estimated_npeels
, &dummy2
,
2551 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo
),
2552 &prologue_cost_vec
, &epilogue_cost_vec
);
2554 prologue_cost_vec
.release ();
2555 epilogue_cost_vec
.release ();
2557 peel_for_unknown_alignment
.peel_info
.count
= dr0_same_align_drs
+ 1;
2560 peel_for_unknown_alignment
.peel_info
.npeel
= 0;
2561 peel_for_unknown_alignment
.peel_info
.dr_info
= dr0_info
;
2563 best_peel
= peel_for_unknown_alignment
;
2565 peel_for_known_alignment
.inside_cost
= INT_MAX
;
2566 peel_for_known_alignment
.outside_cost
= INT_MAX
;
2567 peel_for_known_alignment
.peel_info
.count
= 0;
2568 peel_for_known_alignment
.peel_info
.dr_info
= NULL
;
2570 if (do_peeling
&& one_misalignment_known
)
2572 /* Peeling is possible, but there is no data access that is not supported
2573 unless aligned. So we try to choose the best possible peeling from
2575 peel_for_known_alignment
= vect_peeling_hash_choose_best_peeling
2576 (&peeling_htab
, loop_vinfo
);
2579 /* Compare costs of peeling for known and unknown alignment. */
2580 if (peel_for_known_alignment
.peel_info
.dr_info
!= NULL
2581 && peel_for_unknown_alignment
.inside_cost
2582 >= peel_for_known_alignment
.inside_cost
)
2584 best_peel
= peel_for_known_alignment
;
2586 /* If the best peeling for known alignment has NPEEL == 0, perform no
2587 peeling at all except if there is an unsupportable dr that we can
2589 if (best_peel
.peel_info
.npeel
== 0 && !one_dr_unsupportable
)
2593 /* If there is an unsupportable data ref, prefer this over all choices so far
2594 since we'd have to discard a chosen peeling except when it accidentally
2595 aligned the unsupportable data ref. */
2596 if (one_dr_unsupportable
)
2597 dr0_info
= unsupportable_dr_info
;
2598 else if (do_peeling
)
2600 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
2601 TODO: Use nopeel_outside_cost or get rid of it? */
2602 unsigned nopeel_inside_cost
= 0;
2603 unsigned nopeel_outside_cost
= 0;
2605 stmt_vector_for_cost dummy
;
2607 vect_get_peeling_costs_all_drs (loop_vinfo
, NULL
, &nopeel_inside_cost
,
2608 &nopeel_outside_cost
, &dummy
, &dummy
, 0);
2611 /* Add epilogue costs. As we do not peel for alignment here, no prologue
2612 costs will be recorded. */
2613 stmt_vector_for_cost prologue_cost_vec
, epilogue_cost_vec
;
2614 prologue_cost_vec
.create (2);
2615 epilogue_cost_vec
.create (2);
2618 nopeel_outside_cost
+= vect_get_known_peeling_cost
2619 (loop_vinfo
, 0, &dummy2
,
2620 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo
),
2621 &prologue_cost_vec
, &epilogue_cost_vec
);
2623 prologue_cost_vec
.release ();
2624 epilogue_cost_vec
.release ();
2626 npeel
= best_peel
.peel_info
.npeel
;
2627 dr0_info
= best_peel
.peel_info
.dr_info
;
2629 /* If not peeling is no more expensive than the best peeling we
2630 have found so far, don't perform any peeling. */
2631 if (nopeel_inside_cost
<= best_peel
.inside_cost
)
2637 stmt_vec_info stmt_info
= dr0_info
->stmt
;
2638 if (known_alignment_for_access_p (dr0_info
,
2639 STMT_VINFO_VECTYPE (stmt_info
)))
2641 bool negative
= tree_int_cst_compare (DR_STEP (dr0_info
->dr
),
2642 size_zero_node
) < 0;
2645 /* Since it's known at compile time, compute the number of
2646 iterations in the peeled loop (the peeling factor) for use in
2647 updating DR_MISALIGNMENT values. The peeling factor is the
2648 vectorization factor minus the misalignment as an element
2650 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
2653 off
= ((TYPE_VECTOR_SUBPARTS (vectype
) - 1)
2654 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype
))));
2656 = dr_misalignment (dr0_info
, vectype
, off
);
2657 mis
= negative
? mis
: -mis
;
2658 /* If known_alignment_for_access_p then we have set
2659 DR_MISALIGNMENT which is only done if we know it at compiler
2660 time, so it is safe to assume target alignment is constant.
2662 unsigned int target_align
=
2663 DR_TARGET_ALIGNMENT (dr0_info
).to_constant ();
2664 npeel
= ((mis
& (target_align
- 1))
2665 / vect_get_scalar_dr_size (dr0_info
));
2668 /* For interleaved data access every iteration accesses all the
2669 members of the group, therefore we divide the number of iterations
2670 by the group size. */
2671 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
2672 npeel
/= DR_GROUP_SIZE (stmt_info
);
2674 if (dump_enabled_p ())
2675 dump_printf_loc (MSG_NOTE
, vect_location
,
2676 "Try peeling by %d\n", npeel
);
2679 /* Ensure that all datarefs can be vectorized after the peel. */
2680 if (!vect_peeling_supportable (loop_vinfo
, dr0_info
, npeel
))
2683 /* Check if all datarefs are supportable and log. */
2686 && known_alignment_for_access_p (dr0_info
,
2687 STMT_VINFO_VECTYPE (stmt_info
)))
2688 return opt_result::success ();
2690 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2693 unsigned max_allowed_peel
2694 = param_vect_max_peeling_for_alignment
;
2695 if (loop_cost_model (loop
) <= VECT_COST_MODEL_CHEAP
)
2696 max_allowed_peel
= 0;
2697 if (max_allowed_peel
!= (unsigned)-1)
2699 unsigned max_peel
= npeel
;
2702 poly_uint64 target_align
= DR_TARGET_ALIGNMENT (dr0_info
);
2703 unsigned HOST_WIDE_INT target_align_c
;
2704 if (target_align
.is_constant (&target_align_c
))
2706 target_align_c
/ vect_get_scalar_dr_size (dr0_info
) - 1;
2710 if (dump_enabled_p ())
2711 dump_printf_loc (MSG_NOTE
, vect_location
,
2712 "Disable peeling, max peels set and vector"
2713 " alignment unknown\n");
2716 if (max_peel
> max_allowed_peel
)
2719 if (dump_enabled_p ())
2720 dump_printf_loc (MSG_NOTE
, vect_location
,
2721 "Disable peeling, max peels reached: %d\n", max_peel
);
2726 /* Cost model #2 - if peeling may result in a remaining loop not
2727 iterating enough to be vectorized then do not peel. Since this
2728 is a cost heuristic rather than a correctness decision, use the
2729 most likely runtime value for variable vectorization factors. */
2731 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
))
2733 unsigned int assumed_vf
= vect_vf_for_cost (loop_vinfo
);
2734 unsigned int max_peel
= npeel
== 0 ? assumed_vf
- 1 : npeel
;
2735 if ((unsigned HOST_WIDE_INT
) LOOP_VINFO_INT_NITERS (loop_vinfo
)
2736 < assumed_vf
+ max_peel
)
2742 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2743 If the misalignment of DR_i is identical to that of dr0 then set
2744 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2745 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2746 by the peeling factor times the element size of DR_i (MOD the
2747 vectorization factor times the size). Otherwise, the
2748 misalignment of DR_i must be set to unknown. */
2749 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
2750 if (dr
!= dr0_info
->dr
)
2752 dr_vec_info
*dr_info
= loop_vinfo
->lookup_dr (dr
);
2753 if (!vect_relevant_for_alignment_p (dr_info
))
2756 vect_update_misalignment_for_peel (dr_info
, dr0_info
, npeel
);
2759 LOOP_VINFO_UNALIGNED_DR (loop_vinfo
) = dr0_info
;
2761 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
) = npeel
;
2763 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
) = -1;
2764 SET_DR_MISALIGNMENT (dr0_info
,
2765 vect_dr_misalign_for_aligned_access (dr0_info
));
2766 if (dump_enabled_p ())
2768 dump_printf_loc (MSG_NOTE
, vect_location
,
2769 "Alignment of access forced using peeling.\n");
2770 dump_printf_loc (MSG_NOTE
, vect_location
,
2771 "Peeling for alignment will be applied.\n");
2774 /* The inside-loop cost will be accounted for in vectorizable_load
2775 and vectorizable_store correctly with adjusted alignments.
2776 Drop the body_cst_vec on the floor here. */
2777 return opt_result::success ();
2781 /* (2) Versioning to force alignment. */
2783 /* Try versioning if:
2784 1) optimize loop for speed and the cost-model is not cheap
2785 2) there is at least one unsupported misaligned data ref with an unknown
2787 3) all misaligned data refs with a known misalignment are supported, and
2788 4) the number of runtime alignment checks is within reason. */
2791 = (optimize_loop_nest_for_speed_p (loop
)
2792 && !loop
->inner
/* FORNOW */
2793 && loop_cost_model (loop
) > VECT_COST_MODEL_CHEAP
);
2797 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
2799 dr_vec_info
*dr_info
= loop_vinfo
->lookup_dr (dr
);
2800 if (!vect_relevant_for_alignment_p (dr_info
))
2803 stmt_vec_info stmt_info
= dr_info
->stmt
;
2804 if (STMT_VINFO_STRIDED_P (stmt_info
))
2806 do_versioning
= false;
2810 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
2811 bool negative
= tree_int_cst_compare (DR_STEP (dr
),
2812 size_zero_node
) < 0;
2815 off
= ((TYPE_VECTOR_SUBPARTS (vectype
) - 1)
2816 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype
))));
2818 if ((misalignment
= dr_misalignment (dr_info
, vectype
, off
)) == 0)
2821 enum dr_alignment_support supportable_dr_alignment
2822 = vect_supportable_dr_alignment (loop_vinfo
, dr_info
, vectype
,
2824 if (supportable_dr_alignment
== dr_unaligned_unsupported
)
2826 if (misalignment
!= DR_MISALIGNMENT_UNKNOWN
2827 || (LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).length ()
2828 >= (unsigned) param_vect_max_version_for_alignment_checks
))
2830 do_versioning
= false;
2834 /* At present we don't support versioning for alignment
2835 with variable VF, since there's no guarantee that the
2836 VF is a power of two. We could relax this if we added
2837 a way of enforcing a power-of-two size. */
2838 unsigned HOST_WIDE_INT size
;
2839 if (!GET_MODE_SIZE (TYPE_MODE (vectype
)).is_constant (&size
))
2841 do_versioning
= false;
2845 /* Forcing alignment in the first iteration is no good if
2846 we don't keep it across iterations. For now, just disable
2847 versioning in this case.
2848 ?? We could actually unroll the loop to achieve the required
2849 overall step alignment, and forcing the alignment could be
2850 done by doing some iterations of the non-vectorized loop. */
2851 if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo
)
2852 * DR_STEP_ALIGNMENT (dr
),
2853 DR_TARGET_ALIGNMENT (dr_info
)))
2855 do_versioning
= false;
2859 /* The rightmost bits of an aligned address must be zeros.
2860 Construct the mask needed for this test. For example,
2861 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2862 mask must be 15 = 0xf. */
2863 int mask
= size
- 1;
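	  /* Runtime-check sketch (illustrative; the actual GIMPLE for the
	     check is emitted later when the loop gets versioned): for V4SI
	     the alignment test is conceptually

	       (((uintptr_t) p | (uintptr_t) q) & 0xf) == 0

	     where p and q stand for the potentially misaligned addresses;
	     all of them are OR-ed together and tested against this one
	     shared mask, which is why LOOP_VINFO_PTR_MASK has to agree for
	     every reference.  */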
2865 /* FORNOW: use the same mask to test all potentially unaligned
2866 references in the loop. */
2867 if (LOOP_VINFO_PTR_MASK (loop_vinfo
)
2868 && LOOP_VINFO_PTR_MASK (loop_vinfo
) != mask
)
2870 do_versioning
= false;
2874 LOOP_VINFO_PTR_MASK (loop_vinfo
) = mask
;
2875 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).safe_push (stmt_info
);
2879 /* Versioning requires at least one misaligned data reference. */
2880 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo
))
2881 do_versioning
= false;
2882 else if (!do_versioning
)
2883 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).truncate (0);
2888 const vec
<stmt_vec_info
> &may_misalign_stmts
2889 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
);
2890 stmt_vec_info stmt_info
;
2892 /* It can now be assumed that the data references in the statements
2893 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2894 of the loop being vectorized. */
2895 FOR_EACH_VEC_ELT (may_misalign_stmts
, i
, stmt_info
)
2897 dr_vec_info
*dr_info
= STMT_VINFO_DR_INFO (stmt_info
);
2898 SET_DR_MISALIGNMENT (dr_info
,
2899 vect_dr_misalign_for_aligned_access (dr_info
));
2900 if (dump_enabled_p ())
2901 dump_printf_loc (MSG_NOTE
, vect_location
,
2902 "Alignment of access forced using versioning.\n");
2905 if (dump_enabled_p ())
2906 dump_printf_loc (MSG_NOTE
, vect_location
,
2907 "Versioning for alignment will be applied.\n");
2909 /* Peeling and versioning can't be done together at this time. */
2910 gcc_assert (! (do_peeling
&& do_versioning
));
2912 return opt_result::success ();
2915 /* This point is reached if neither peeling nor versioning is being done. */
2916 gcc_assert (! (do_peeling
|| do_versioning
));
2918 return opt_result::success ();
2922 /* Function vect_analyze_data_refs_alignment
2924 Analyze the alignment of the data-references in the loop.
2925 Return FALSE if a data reference is found that cannot be vectorized. */
2928 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo
)
2930 DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2932 vec
<data_reference_p
> datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
2933 struct data_reference
*dr
;
2936 vect_record_base_alignments (loop_vinfo
);
2937 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
2939 dr_vec_info
*dr_info
= loop_vinfo
->lookup_dr (dr
);
2940 if (STMT_VINFO_VECTORIZABLE (dr_info
->stmt
))
2942 if (STMT_VINFO_GROUPED_ACCESS (dr_info
->stmt
)
2943 && DR_GROUP_FIRST_ELEMENT (dr_info
->stmt
) != dr_info
->stmt
)
2945 opt_result res
= opt_result::success ();
2946 vect_compute_data_ref_alignment (loop_vinfo
, dr_info
,
2947 STMT_VINFO_VECTYPE (dr_info
->stmt
),
2954 return opt_result::success ();
2958 /* Analyze alignment of DRs of stmts in NODE. */
2961 vect_slp_analyze_node_alignment (vec_info
*vinfo
, slp_tree node
)
2963 /* Alignment is maintained in the first element of the group. */
2964 stmt_vec_info first_stmt_info
= SLP_TREE_SCALAR_STMTS (node
)[0];
2965 first_stmt_info
= DR_GROUP_FIRST_ELEMENT (first_stmt_info
);
2966 dr_vec_info
*dr_info
= STMT_VINFO_DR_INFO (first_stmt_info
);
2967 tree vectype
= SLP_TREE_VECTYPE (node
);
2968 poly_uint64 vector_alignment
2969 = exact_div (targetm
.vectorize
.preferred_vector_alignment (vectype
),
2971 if (dr_info
->misalignment
== DR_MISALIGNMENT_UNINITIALIZED
)
2972 vect_compute_data_ref_alignment (vinfo
, dr_info
, SLP_TREE_VECTYPE (node
));
2973 /* Re-analyze alignment when we're facing a vectorization with a bigger
2974 alignment requirement. */
2975 else if (known_lt (dr_info
->target_alignment
, vector_alignment
))
2977 poly_uint64 old_target_alignment
= dr_info
->target_alignment
;
2978 int old_misalignment
= dr_info
->misalignment
;
2979 vect_compute_data_ref_alignment (vinfo
, dr_info
, SLP_TREE_VECTYPE (node
));
2980 /* But keep knowledge about a smaller alignment. */
2981 if (old_misalignment
!= DR_MISALIGNMENT_UNKNOWN
2982 && dr_info
->misalignment
== DR_MISALIGNMENT_UNKNOWN
)
2984 dr_info
->target_alignment
= old_target_alignment
;
2985 dr_info
->misalignment
= old_misalignment
;
2988 /* When we ever face unordered target alignments the first one wins in terms
2989 of analyzing and the other will become unknown in dr_misalignment. */
2993 /* Function vect_slp_analyze_instance_alignment
2995 Analyze the alignment of the data-references in the SLP instance.
2996 Return FALSE if a data reference is found that cannot be vectorized. */
2999 vect_slp_analyze_instance_alignment (vec_info
*vinfo
,
3000 slp_instance instance
)
3002 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_alignment");
3006 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance
), i
, node
)
3007 if (! vect_slp_analyze_node_alignment (vinfo
, node
))
3010 if (SLP_INSTANCE_KIND (instance
) == slp_inst_kind_store
3011 && ! vect_slp_analyze_node_alignment
3012 (vinfo
, SLP_INSTANCE_TREE (instance
)))
3019 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
3020 accesses of legal size, step, etc. Detect gaps, single element
3021 interleaving, and other special cases. Set grouped access info.
3022 Collect groups of strided stores for further use in SLP analysis.
3023 Worker for vect_analyze_group_access. */
3026 vect_analyze_group_access_1 (vec_info
*vinfo
, dr_vec_info
*dr_info
)
3028 data_reference
*dr
= dr_info
->dr
;
3029 tree step
= DR_STEP (dr
);
3030 tree scalar_type
= TREE_TYPE (DR_REF (dr
));
3031 HOST_WIDE_INT type_size
= TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type
));
3032 stmt_vec_info stmt_info
= dr_info
->stmt
;
3033 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
3034 bb_vec_info bb_vinfo
= dyn_cast
<bb_vec_info
> (vinfo
);
3035 HOST_WIDE_INT dr_step
= -1;
3036 HOST_WIDE_INT groupsize
, last_accessed_element
= 1;
3037 bool slp_impossible
= false;
3039 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
3040 size of the interleaving group (including gaps). */
3041 if (tree_fits_shwi_p (step
))
3043 dr_step
= tree_to_shwi (step
);
3044 /* Check that STEP is a multiple of type size. Otherwise there is
3045 a non-element-sized gap at the end of the group which we
3046 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
3047 ??? As we can handle non-constant step fine here we should
3048 simply remove uses of DR_GROUP_GAP between the last and first
3049 element and instead rely on DR_STEP. DR_GROUP_SIZE then would
3050 simply not include that gap. */
3051 if ((dr_step
% type_size
) != 0)
3053 if (dump_enabled_p ())
3054 dump_printf_loc (MSG_NOTE
, vect_location
,
3055 "Step %T is not a multiple of the element size"
3060 groupsize
= absu_hwi (dr_step
) / type_size
;
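      /* Illustrative example: for a loop that reads a[4*i], a[4*i+1], ...
	 of 4-byte elements, DR_STEP is 16 bytes and GROUPSIZE is
	 16 / 4 == 4, counting both the accessed elements and any gaps
	 inside the group.  */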
3065 /* A non-consecutive access is possible only if it is part of interleaving. */
3066 if (!DR_GROUP_FIRST_ELEMENT (stmt_info
))
3068 /* Check if this DR is a part of interleaving, and is a single
3069 element of the group that is accessed in the loop. */
3071 /* Gaps are supported only for loads. STEP must be a multiple of the type
3074 && (dr_step
% type_size
) == 0
3076 /* This could be UINT_MAX but as we are generating code in a very
3077 inefficient way we have to cap earlier.
3078 See PR91403 for example. */
3079 && groupsize
<= 4096)
3081 DR_GROUP_FIRST_ELEMENT (stmt_info
) = stmt_info
;
3082 DR_GROUP_SIZE (stmt_info
) = groupsize
;
3083 DR_GROUP_GAP (stmt_info
) = groupsize
- 1;
3084 if (dump_enabled_p ())
3085 dump_printf_loc (MSG_NOTE
, vect_location
,
3086 "Detected single element interleaving %T"
3093 if (dump_enabled_p ())
3094 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3095 "not consecutive access %G", stmt_info
->stmt
);
3099 /* Mark the statement as unvectorizable. */
3100 STMT_VINFO_VECTORIZABLE (stmt_info
) = false;
3104 if (dump_enabled_p ())
3105 dump_printf_loc (MSG_NOTE
, vect_location
, "using strided accesses\n");
3106 STMT_VINFO_STRIDED_P (stmt_info
) = true;
3110 if (DR_GROUP_FIRST_ELEMENT (stmt_info
) == stmt_info
)
3112 /* First stmt in the interleaving chain. Check the chain. */
3113 stmt_vec_info next
= DR_GROUP_NEXT_ELEMENT (stmt_info
);
3114 struct data_reference
*data_ref
= dr
;
3115 unsigned int count
= 1;
3116 tree prev_init
= DR_INIT (data_ref
);
3117 HOST_WIDE_INT diff
, gaps
= 0;
3119 /* By construction, all group members have INTEGER_CST DR_INITs. */
3122 /* We never have the same DR multiple times. */
3123 gcc_assert (tree_int_cst_compare (DR_INIT (data_ref
),
3124 DR_INIT (STMT_VINFO_DATA_REF (next
))) != 0);
3126 data_ref
= STMT_VINFO_DATA_REF (next
);
3128 /* All group members have the same STEP by construction. */
3129 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref
), step
, 0));
3131 /* Check that the distance between two accesses is equal to the type
3132 size. Otherwise, we have gaps. */
3133 diff
= (TREE_INT_CST_LOW (DR_INIT (data_ref
))
3134 - TREE_INT_CST_LOW (prev_init
)) / type_size
;
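	  /* Example (illustrative): for int accesses with DR_INITs of 0 and
	     8 bytes, DIFF is (8 - 0) / 4 == 2, i.e. there is a gap of one
	     element between the two group members, which is recorded in
	     DR_GROUP_GAP below and also counted into GROUPSIZE via GAPS.  */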
3135 if (diff
< 1 || diff
> UINT_MAX
)
3137 /* For artificial testcases with array accesses with large
3138 constant indices we can run into overflow issues which
3139 can end up fooling the groupsize constraint below so
3140 check the individual gaps (which are represented as
3141 unsigned int) as well. */
3142 if (dump_enabled_p ())
3143 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3144 "interleaved access with gap larger "
3145 "than representable\n");
3150 /* FORNOW: SLP of accesses with gaps is not supported. */
3151 slp_impossible
= true;
3152 if (DR_IS_WRITE (data_ref
))
3154 if (dump_enabled_p ())
3155 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3156 "interleaved store with gaps\n");
3163 last_accessed_element
+= diff
;
3165 /* Store the gap from the previous member of the group. If there is no
3166 gap in the access, DR_GROUP_GAP is always 1. */
3167 DR_GROUP_GAP (next
) = diff
;
3169 prev_init
= DR_INIT (data_ref
);
3170 next
= DR_GROUP_NEXT_ELEMENT (next
);
3171 /* Count the number of data-refs in the chain. */
3176 groupsize
= count
+ gaps
;
3178 /* This could be UINT_MAX but as we are generating code in a very
3179 inefficient way we have to cap earlier. See PR78699 for example. */
3180 if (groupsize
> 4096)
3182 if (dump_enabled_p ())
3183 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3184 "group is too large\n");
3188 /* Check that the size of the interleaving is equal to count for stores,
3189 i.e., that there are no gaps. */
3190 if (groupsize
!= count
3191 && !DR_IS_READ (dr
))
3194 STMT_VINFO_STRIDED_P (stmt_info
) = true;
3197 /* If there is a gap after the last load in the group it is the
3198 difference between the groupsize and the last accessed
3200 When there is no gap, this difference should be 0. */
3201 DR_GROUP_GAP (stmt_info
) = groupsize
- last_accessed_element
;
3203 DR_GROUP_SIZE (stmt_info
) = groupsize
;
3204 if (dump_enabled_p ())
3206 dump_printf_loc (MSG_NOTE
, vect_location
,
3207 "Detected interleaving ");
3208 if (DR_IS_READ (dr
))
3209 dump_printf (MSG_NOTE
, "load ");
3210 else if (STMT_VINFO_STRIDED_P (stmt_info
))
3211 dump_printf (MSG_NOTE
, "strided store ");
3213 dump_printf (MSG_NOTE
, "store ");
3214 dump_printf (MSG_NOTE
, "of size %u\n",
3215 (unsigned)groupsize
);
3216 dump_printf_loc (MSG_NOTE
, vect_location
, "\t%G", stmt_info
->stmt
);
3217 next
= DR_GROUP_NEXT_ELEMENT (stmt_info
);
3220 if (DR_GROUP_GAP (next
) != 1)
3221 dump_printf_loc (MSG_NOTE
, vect_location
,
3222 "\t<gap of %d elements>\n",
3223 DR_GROUP_GAP (next
) - 1);
3224 dump_printf_loc (MSG_NOTE
, vect_location
, "\t%G", next
->stmt
);
3225 next
= DR_GROUP_NEXT_ELEMENT (next
);
3227 if (DR_GROUP_GAP (stmt_info
) != 0)
3228 dump_printf_loc (MSG_NOTE
, vect_location
,
3229 "\t<gap of %d elements>\n",
3230 DR_GROUP_GAP (stmt_info
));
3233 /* SLP: create an SLP data structure for every interleaving group of
3234 stores for further analysis in vect_analyse_slp. */
3235 if (DR_IS_WRITE (dr
) && !slp_impossible
)
3238 LOOP_VINFO_GROUPED_STORES (loop_vinfo
).safe_push (stmt_info
);
3240 BB_VINFO_GROUPED_STORES (bb_vinfo
).safe_push (stmt_info
);
3247 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
3248 accesses of legal size, step, etc. Detect gaps, single element
3249 interleaving, and other special cases. Set grouped access info.
3250 Collect groups of strided stores for further use in SLP analysis. */
3253 vect_analyze_group_access (vec_info
*vinfo
, dr_vec_info
*dr_info
)
3255 if (!vect_analyze_group_access_1 (vinfo
, dr_info
))
3257 /* Dissolve the group if present. */
3258 stmt_vec_info stmt_info
= DR_GROUP_FIRST_ELEMENT (dr_info
->stmt
);
3261 stmt_vec_info next
= DR_GROUP_NEXT_ELEMENT (stmt_info
);
3262 DR_GROUP_FIRST_ELEMENT (stmt_info
) = NULL
;
3263 DR_GROUP_NEXT_ELEMENT (stmt_info
) = NULL
;
3271 /* Analyze the access pattern of the data-reference DR_INFO.
3272 In case of non-consecutive accesses call vect_analyze_group_access() to
3273 analyze groups of accesses. */
3276 vect_analyze_data_ref_access (vec_info
*vinfo
, dr_vec_info
*dr_info
)
3278 data_reference
*dr
= dr_info
->dr
;
3279 tree step
= DR_STEP (dr
);
3280 tree scalar_type
= TREE_TYPE (DR_REF (dr
));
3281 stmt_vec_info stmt_info
= dr_info
->stmt
;
3282 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
3283 class loop
*loop
= NULL
;
3285 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info
))
3289 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
3291 if (loop_vinfo
&& !step
)
3293 if (dump_enabled_p ())
3294 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3295 "bad data-ref access in loop\n");
3299 /* Allow loads with zero step in inner-loop vectorization. */
3300 if (loop_vinfo
&& integer_zerop (step
))
3302 DR_GROUP_FIRST_ELEMENT (stmt_info
) = NULL
;
3303 DR_GROUP_NEXT_ELEMENT (stmt_info
) = NULL
;
3304 if (!nested_in_vect_loop_p (loop
, stmt_info
))
3305 return DR_IS_READ (dr
);
3306 /* Allow references with zero step for outer loops marked
3307 with pragma omp simd only - it guarantees absence of
3308 loop-carried dependencies between inner loop iterations. */
3309 if (loop
->safelen
< 2)
3311 if (dump_enabled_p ())
3312 dump_printf_loc (MSG_NOTE
, vect_location
,
3313 "zero step in inner loop of nest\n");
3318 if (loop
&& nested_in_vect_loop_p (loop
, stmt_info
))
3320 /* Interleaved accesses are not yet supported within outer-loop
3321 vectorization for references in the inner-loop. */
3322 DR_GROUP_FIRST_ELEMENT (stmt_info
) = NULL
;
3323 DR_GROUP_NEXT_ELEMENT (stmt_info
) = NULL
;
3325 /* For the rest of the analysis we use the outer-loop step. */
3326 step
= STMT_VINFO_DR_STEP (stmt_info
);
3327 if (integer_zerop (step
))
3329 if (dump_enabled_p ())
3330 dump_printf_loc (MSG_NOTE
, vect_location
,
3331 "zero step in outer loop.\n");
3332 return DR_IS_READ (dr
);
3337 if (TREE_CODE (step
) == INTEGER_CST
)
3339 HOST_WIDE_INT dr_step
= TREE_INT_CST_LOW (step
);
3340 if (!tree_int_cst_compare (step
, TYPE_SIZE_UNIT (scalar_type
))
3342 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type
), -dr_step
)))
3344 /* Mark that it is not interleaving. */
3345 DR_GROUP_FIRST_ELEMENT (stmt_info
) = NULL
;
3346 DR_GROUP_NEXT_ELEMENT (stmt_info
) = NULL
;
3351 if (loop
&& nested_in_vect_loop_p (loop
, stmt_info
))
3353 if (dump_enabled_p ())
3354 dump_printf_loc (MSG_NOTE
, vect_location
,
3355 "grouped access in outer loop.\n");
3360 /* Assume this is a DR handled by non-constant strided load case. */
3361 if (TREE_CODE (step
) != INTEGER_CST
)
3362 return (STMT_VINFO_STRIDED_P (stmt_info
)
3363 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info
)
3364 || vect_analyze_group_access (vinfo
, dr_info
)));
3366 /* Not consecutive access - check if it's a part of interleaving group. */
3367 return vect_analyze_group_access (vinfo
, dr_info
);
/* Compare two data-references DRA and DRB to group them into chunks
   suitable for grouping.  */

static int
dr_group_sort_cmp (const void *dra_, const void *drb_)
{
  dr_vec_info *dra_info = *(dr_vec_info **)const_cast<void *>(dra_);
  dr_vec_info *drb_info = *(dr_vec_info **)const_cast<void *>(drb_);
  data_reference_p dra = dra_info->dr;
  data_reference_p drb = drb_info->dr;
  int cmp;

  /* Stabilize sort.  */
  if (dra == drb)
    return 0;

  /* DRs with different group IDs never belong to the same group.  */
  if (dra_info->group != drb_info->group)
    return dra_info->group < drb_info->group ? -1 : 1;

  /* Ordering of DRs according to base.  */
  cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
			       DR_BASE_ADDRESS (drb));
  if (cmp != 0)
    return cmp;

  /* And according to DR_OFFSET.  */
  cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
  if (cmp != 0)
    return cmp;

  /* Put reads before writes.  */
  if (DR_IS_READ (dra) != DR_IS_READ (drb))
    return DR_IS_READ (dra) ? -1 : 1;

  /* Then sort after access size.  */
  cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
			       TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
  if (cmp != 0)
    return cmp;

  /* And after step.  */
  cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
  if (cmp != 0)
    return cmp;

  /* Then sort after DR_INIT.  In case of identical DRs sort after stmt UID.  */
  cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
  if (cmp == 0)
    return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
  return cmp;
}
/* If OP is the result of a conversion, return the unconverted value,
   otherwise return null.  */

static tree
strip_conversion (tree op)
{
  if (TREE_CODE (op) != SSA_NAME)
    return NULL_TREE;
  gimple *stmt = SSA_NAME_DEF_STMT (op);
  if (!is_gimple_assign (stmt)
      || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
    return NULL_TREE;
  return gimple_assign_rhs1 (stmt);
}
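/* Illustrative GIMPLE sketch (hypothetical SSA names): given

     mask.5_3 = (vector(4) <signed-boolean:32>) mask_1;

   strip_conversion (mask.5_3) would return mask_1, so two masked accesses
   whose masks only differ by such a cast can still be recognized as using
   the same mask by can_group_stmts_p below.  */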
3438 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
3439 and STMT2_INFO being in a single group. When ALLOW_SLP_P, masked loads can
3440 be grouped in SLP mode. */
3443 can_group_stmts_p (stmt_vec_info stmt1_info
, stmt_vec_info stmt2_info
,
3446 if (gimple_assign_single_p (stmt1_info
->stmt
))
3447 return gimple_assign_single_p (stmt2_info
->stmt
);
3449 gcall
*call1
= dyn_cast
<gcall
*> (stmt1_info
->stmt
);
3450 if (call1
&& gimple_call_internal_p (call1
))
3452 /* Check for two masked loads or two masked stores. */
3453 gcall
*call2
= dyn_cast
<gcall
*> (stmt2_info
->stmt
);
3454 if (!call2
|| !gimple_call_internal_p (call2
))
3456 internal_fn ifn
= gimple_call_internal_fn (call1
);
3457 if (ifn
!= IFN_MASK_LOAD
&& ifn
!= IFN_MASK_STORE
)
3459 if (ifn
!= gimple_call_internal_fn (call2
))
3462 /* Check that the masks are the same. Cope with casts of masks,
3463 like those created by build_mask_conversion. */
3464 tree mask1
= gimple_call_arg (call1
, 2);
3465 tree mask2
= gimple_call_arg (call2
, 2);
3466 if (!operand_equal_p (mask1
, mask2
, 0) && !allow_slp_p
)
3468 mask1
= strip_conversion (mask1
);
3471 mask2
= strip_conversion (mask2
);
3474 if (!operand_equal_p (mask1
, mask2
, 0))
3483 /* Function vect_analyze_data_ref_accesses.
3485 Analyze the access pattern of all the data references in the loop.
3487 FORNOW: the only access pattern that is considered vectorizable is a
3488 simple step 1 (consecutive) access.
3490 FORNOW: handle only arrays and pointer accesses. */
3493 vect_analyze_data_ref_accesses (vec_info
*vinfo
,
3494 vec
<int> *dataref_groups
)
3497 vec
<data_reference_p
> datarefs
= vinfo
->shared
->datarefs
;
3499 DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
3501 if (datarefs
.is_empty ())
3502 return opt_result::success ();
3504 /* Sort the array of datarefs to make building the interleaving chains
3505 linear. Don't modify the original vector's order, it is needed for
3506 determining what dependencies are reversed. */
3507 vec
<dr_vec_info
*> datarefs_copy
;
3508 datarefs_copy
.create (datarefs
.length ());
3509 for (unsigned i
= 0; i
< datarefs
.length (); i
++)
3511 dr_vec_info
*dr_info
= vinfo
->lookup_dr (datarefs
[i
]);
3512 /* If the caller computed DR grouping use that, otherwise group by
3515 dr_info
->group
= (*dataref_groups
)[i
];
3517 dr_info
->group
= gimple_bb (DR_STMT (datarefs
[i
]))->index
;
3518 datarefs_copy
.quick_push (dr_info
);
3520 datarefs_copy
.qsort (dr_group_sort_cmp
);
3521 hash_set
<stmt_vec_info
> to_fixup
;
3523 /* Build the interleaving chains. */
3524 for (i
= 0; i
< datarefs_copy
.length () - 1;)
3526 dr_vec_info
*dr_info_a
= datarefs_copy
[i
];
3527 data_reference_p dra
= dr_info_a
->dr
;
3528 int dra_group_id
= dr_info_a
->group
;
3529 stmt_vec_info stmtinfo_a
= dr_info_a
->stmt
;
3530 stmt_vec_info lastinfo
= NULL
;
3531 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a
)
3532 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a
))
3537 for (i
= i
+ 1; i
< datarefs_copy
.length (); ++i
)
3539 dr_vec_info
*dr_info_b
= datarefs_copy
[i
];
3540 data_reference_p drb
= dr_info_b
->dr
;
3541 int drb_group_id
= dr_info_b
->group
;
3542 stmt_vec_info stmtinfo_b
= dr_info_b
->stmt
;
3543 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b
)
3544 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b
))
3547 /* ??? Imperfect sorting (non-compatible types, non-modulo
3548 accesses, same accesses) can lead to a group being artificially
3549 split here as we don't just skip over those. If it really
3550 matters we can push those to a worklist and re-iterate
3551 over them. Then we can just skip ahead to the next DR here. */
3553 /* DRs in a different DR group should not be put into the same
3554 interleaving group. */
3555 if (dra_group_id
!= drb_group_id
)
3558 /* Check that the data-refs have same first location (except init)
3559 and they are both either store or load (not load and store,
3560 not masked loads or stores). */
3561 if (DR_IS_READ (dra
) != DR_IS_READ (drb
)
3562 || data_ref_compare_tree (DR_BASE_ADDRESS (dra
),
3563 DR_BASE_ADDRESS (drb
)) != 0
3564 || data_ref_compare_tree (DR_OFFSET (dra
), DR_OFFSET (drb
)) != 0
3565 || !can_group_stmts_p (stmtinfo_a
, stmtinfo_b
, true))
3568 /* Check that the data-refs have the same constant size. */
3569 tree sza
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra
)));
3570 tree szb
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb
)));
3571 if (!tree_fits_uhwi_p (sza
)
3572 || !tree_fits_uhwi_p (szb
)
3573 || !tree_int_cst_equal (sza
, szb
))
3576 /* Check that the data-refs have the same step. */
3577 if (data_ref_compare_tree (DR_STEP (dra
), DR_STEP (drb
)) != 0)
3580 /* Check the types are compatible.
3581 ??? We don't distinguish this during sorting. */
3582 if (!types_compatible_p (TREE_TYPE (DR_REF (dra
)),
3583 TREE_TYPE (DR_REF (drb
))))
3586 /* Check that the DR_INITs are compile-time constants. */
3587 if (!tree_fits_shwi_p (DR_INIT (dra
))
3588 || !tree_fits_shwi_p (DR_INIT (drb
)))
3591 /* Different .GOMP_SIMD_LANE calls still give the same lane,
3592 just hold extra information. */
3593 if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_a
)
3594 && STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_b
)
3595 && data_ref_compare_tree (DR_INIT (dra
), DR_INIT (drb
)) == 0)
3598 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
3599 HOST_WIDE_INT init_a
= TREE_INT_CST_LOW (DR_INIT (dra
));
3600 HOST_WIDE_INT init_b
= TREE_INT_CST_LOW (DR_INIT (drb
));
3601 HOST_WIDE_INT init_prev
3602 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy
[i
-1]->dr
));
3603 gcc_assert (init_a
<= init_b
3604 && init_a
<= init_prev
3605 && init_prev
<= init_b
);
3607 /* Do not place the same access in the interleaving chain twice. */
3608 if (init_b
== init_prev
)
3610 gcc_assert (gimple_uid (DR_STMT (datarefs_copy
[i
-1]->dr
))
3611 < gimple_uid (DR_STMT (drb
)));
3612 /* Simply link in duplicates and fix up the chain below. */
3616 /* If init_b == init_a + the size of the type * k, we have an
3617 interleaving, and DRA is accessed before DRB. */
3618 unsigned HOST_WIDE_INT type_size_a
= tree_to_uhwi (sza
);
3619 if (type_size_a
== 0
3620 || (((unsigned HOST_WIDE_INT
)init_b
- init_a
)
3621 % type_size_a
!= 0))
3624 /* If we have a store, the accesses are adjacent. This splits
3625 groups into chunks we support (we don't support vectorization
3626 of stores with gaps). */
3627 if (!DR_IS_READ (dra
)
3628 && (((unsigned HOST_WIDE_INT
)init_b
- init_prev
)
3632 /* If the step (if not zero or non-constant) is smaller than the
3633 difference between data-refs' inits this splits groups into
3635 if (tree_fits_shwi_p (DR_STEP (dra
)))
3637 unsigned HOST_WIDE_INT step
3638 = absu_hwi (tree_to_shwi (DR_STEP (dra
)));
3640 && step
<= ((unsigned HOST_WIDE_INT
)init_b
- init_a
))
3645 if (dump_enabled_p ())
3646 dump_printf_loc (MSG_NOTE
, vect_location
,
3648 ? "Detected interleaving load %T and %T\n"
3649 : "Detected interleaving store %T and %T\n",
3650 DR_REF (dra
), DR_REF (drb
));
3652 /* Link the found element into the group list. */
3653 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a
))
3655 DR_GROUP_FIRST_ELEMENT (stmtinfo_a
) = stmtinfo_a
;
3656 lastinfo
= stmtinfo_a
;
3658 DR_GROUP_FIRST_ELEMENT (stmtinfo_b
) = stmtinfo_a
;
3659 DR_GROUP_NEXT_ELEMENT (lastinfo
) = stmtinfo_b
;
3660 lastinfo
= stmtinfo_b
;
3662 if (! STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a
))
3664 STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a
)
3665 = !can_group_stmts_p (stmtinfo_a
, stmtinfo_b
, false);
3667 if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a
))
3668 dump_printf_loc (MSG_NOTE
, vect_location
,
3669 "Load suitable for SLP vectorization only.\n");
3672 if (init_b
== init_prev
3673 && !to_fixup
.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a
))
3674 && dump_enabled_p ())
3675 dump_printf_loc (MSG_NOTE
, vect_location
,
3676 "Queuing group with duplicate access for fixup\n");
3680 /* Fixup groups with duplicate entries by splitting it. */
3683 hash_set
<stmt_vec_info
>::iterator it
= to_fixup
.begin ();
3684 if (!(it
!= to_fixup
.end ()))
3686 stmt_vec_info grp
= *it
;
3687 to_fixup
.remove (grp
);
3689 /* Find the earliest duplicate group member. */
3690 unsigned first_duplicate
= -1u;
3691 stmt_vec_info next
, g
= grp
;
3692 while ((next
= DR_GROUP_NEXT_ELEMENT (g
)))
3694 if (tree_int_cst_equal (DR_INIT (STMT_VINFO_DR_INFO (next
)->dr
),
3695 DR_INIT (STMT_VINFO_DR_INFO (g
)->dr
))
3696 && gimple_uid (STMT_VINFO_STMT (next
)) < first_duplicate
)
3697 first_duplicate
= gimple_uid (STMT_VINFO_STMT (next
));
3700 if (first_duplicate
== -1U)
3703 /* Then move all stmts after the first duplicate to a new group.
3704 Note this is a heuristic but one with the property that *it
3705 is fixed up completely. */
3707 stmt_vec_info newgroup
= NULL
, ng
= grp
;
3708 while ((next
= DR_GROUP_NEXT_ELEMENT (g
)))
3710 if (gimple_uid (STMT_VINFO_STMT (next
)) >= first_duplicate
)
3712 DR_GROUP_NEXT_ELEMENT (g
) = DR_GROUP_NEXT_ELEMENT (next
);
3716 STMT_VINFO_SLP_VECT_ONLY (newgroup
)
3717 = STMT_VINFO_SLP_VECT_ONLY (grp
);
3720 DR_GROUP_NEXT_ELEMENT (ng
) = next
;
3722 DR_GROUP_FIRST_ELEMENT (ng
) = newgroup
;
3725 g
= DR_GROUP_NEXT_ELEMENT (g
);
3727 DR_GROUP_NEXT_ELEMENT (ng
) = NULL
;
3729 /* Fixup the new group which still may contain duplicates. */
3730 to_fixup
.add (newgroup
);
3733 dr_vec_info
*dr_info
;
3734 FOR_EACH_VEC_ELT (datarefs_copy
, i
, dr_info
)
3736 if (STMT_VINFO_VECTORIZABLE (dr_info
->stmt
)
3737 && !vect_analyze_data_ref_access (vinfo
, dr_info
))
3739 if (dump_enabled_p ())
3740 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3741 "not vectorized: complicated access pattern.\n");
3743 if (is_a
<bb_vec_info
> (vinfo
))
3745 /* Mark the statement as not vectorizable. */
3746 STMT_VINFO_VECTORIZABLE (dr_info
->stmt
) = false;
3751 datarefs_copy
.release ();
3752 return opt_result::failure_at (dr_info
->stmt
->stmt
,
3754 " complicated access pattern.\n");
3759 datarefs_copy
.release ();
3760 return opt_result::success ();
3763 /* Function vect_vfa_segment_size.
3766 DR_INFO: The data reference.
3767 LENGTH_FACTOR: segment length to consider.
3769 Return a value suitable for the dr_with_seg_len::seg_len field.
3770 This is the "distance travelled" by the pointer from the first
3771 iteration in the segment to the last. Note that it does not include
3772 the size of the access; in effect it only describes the first byte. */
3775 vect_vfa_segment_size (dr_vec_info
*dr_info
, tree length_factor
)
3777 length_factor
= size_binop (MINUS_EXPR
,
3778 fold_convert (sizetype
, length_factor
),
3780 return size_binop (MULT_EXPR
, fold_convert (sizetype
, DR_STEP (dr_info
->dr
)),
3784 /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3785 gives the worst-case number of bytes covered by the segment. */
3787 static unsigned HOST_WIDE_INT
3788 vect_vfa_access_size (vec_info
*vinfo
, dr_vec_info
*dr_info
)
3790 stmt_vec_info stmt_vinfo
= dr_info
->stmt
;
3791 tree ref_type
= TREE_TYPE (DR_REF (dr_info
->dr
));
3792 unsigned HOST_WIDE_INT ref_size
= tree_to_uhwi (TYPE_SIZE_UNIT (ref_type
));
3793 unsigned HOST_WIDE_INT access_size
= ref_size
;
3794 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo
))
3796 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo
) == stmt_vinfo
);
3797 access_size
*= DR_GROUP_SIZE (stmt_vinfo
) - DR_GROUP_GAP (stmt_vinfo
);
3799 tree vectype
= STMT_VINFO_VECTYPE (stmt_vinfo
);
3801 if (STMT_VINFO_VEC_STMTS (stmt_vinfo
).exists ()
3802 && ((misalignment
= dr_misalignment (dr_info
, vectype
)), true)
3803 && (vect_supportable_dr_alignment (vinfo
, dr_info
, vectype
, misalignment
)
3804 == dr_explicit_realign_optimized
))
3806 /* We might access a full vector's worth. */
3807 access_size
+= tree_to_uhwi (TYPE_SIZE_UNIT (vectype
)) - ref_size
;
3812 /* Get the minimum alignment for all the scalar accesses that DR_INFO
3816 vect_vfa_align (dr_vec_info
*dr_info
)
3818 return dr_alignment (dr_info
->dr
);
3821 /* Function vect_no_alias_p.
3823 Given data references A and B with equal base and offset, see whether
3824 the alias relation can be decided at compilation time. Return 1 if
3825 it can and the references alias, 0 if it can and the references do
3826 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3827 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3828 of dr_with_seg_len::{seg_len,access_size} for A and B. */
3831 vect_compile_time_alias (dr_vec_info
*a
, dr_vec_info
*b
,
3832 tree segment_length_a
, tree segment_length_b
,
3833 unsigned HOST_WIDE_INT access_size_a
,
3834 unsigned HOST_WIDE_INT access_size_b
)
3836 poly_offset_int offset_a
= wi::to_poly_offset (DR_INIT (a
->dr
));
3837 poly_offset_int offset_b
= wi::to_poly_offset (DR_INIT (b
->dr
));
3838 poly_uint64 const_length_a
;
3839 poly_uint64 const_length_b
;
3841 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3842 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3844 if (tree_int_cst_compare (DR_STEP (a
->dr
), size_zero_node
) < 0)
3846 const_length_a
= (-wi::to_poly_wide (segment_length_a
)).force_uhwi ();
3847 offset_a
-= const_length_a
;
3850 const_length_a
= tree_to_poly_uint64 (segment_length_a
);
3851 if (tree_int_cst_compare (DR_STEP (b
->dr
), size_zero_node
) < 0)
3853 const_length_b
= (-wi::to_poly_wide (segment_length_b
)).force_uhwi ();
3854 offset_b
-= const_length_b
;
3857 const_length_b
= tree_to_poly_uint64 (segment_length_b
);
3859 const_length_a
+= access_size_a
;
3860 const_length_b
+= access_size_b
;
3862 if (ranges_known_overlap_p (offset_a
, const_length_a
,
3863 offset_b
, const_length_b
))
3866 if (!ranges_maybe_overlap_p (offset_a
, const_length_a
,
3867 offset_b
, const_length_b
))
3873 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3877 dependence_distance_ge_vf (data_dependence_relation
*ddr
,
3878 unsigned int loop_depth
, poly_uint64 vf
)
3880 if (DDR_ARE_DEPENDENT (ddr
) != NULL_TREE
3881 || DDR_NUM_DIST_VECTS (ddr
) == 0)
3884 /* If the dependence is exact, we should have limited the VF instead. */
3885 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr
));
3888 lambda_vector dist_v
;
3889 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr
), i
, dist_v
)
3891 HOST_WIDE_INT dist
= dist_v
[loop_depth
];
3893 && !(dist
> 0 && DDR_REVERSED_P (ddr
))
3894 && maybe_lt ((unsigned HOST_WIDE_INT
) abs_hwi (dist
), vf
))
3898 if (dump_enabled_p ())
3899 dump_printf_loc (MSG_NOTE
, vect_location
,
3900 "dependence distance between %T and %T is >= VF\n",
3901 DR_REF (DDR_A (ddr
)), DR_REF (DDR_B (ddr
)));
3906 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3909 dump_lower_bound (dump_flags_t dump_kind
, const vec_lower_bound
&lower_bound
)
3911 dump_printf (dump_kind
, "%s (%T) >= ",
3912 lower_bound
.unsigned_p
? "unsigned" : "abs",
3914 dump_dec (dump_kind
, lower_bound
.min_value
);
3917 /* Record that the vectorized loop requires the vec_lower_bound described
3918 by EXPR, UNSIGNED_P and MIN_VALUE. */
3921 vect_check_lower_bound (loop_vec_info loop_vinfo
, tree expr
, bool unsigned_p
,
3922 poly_uint64 min_value
)
3924 vec
<vec_lower_bound
> &lower_bounds
3925 = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo
);
3926 for (unsigned int i
= 0; i
< lower_bounds
.length (); ++i
)
3927 if (operand_equal_p (lower_bounds
[i
].expr
, expr
, 0))
3929 unsigned_p
&= lower_bounds
[i
].unsigned_p
;
3930 min_value
= upper_bound (lower_bounds
[i
].min_value
, min_value
);
3931 if (lower_bounds
[i
].unsigned_p
!= unsigned_p
3932 || maybe_lt (lower_bounds
[i
].min_value
, min_value
))
3934 lower_bounds
[i
].unsigned_p
= unsigned_p
;
3935 lower_bounds
[i
].min_value
= min_value
;
3936 if (dump_enabled_p ())
3938 dump_printf_loc (MSG_NOTE
, vect_location
,
3939 "updating run-time check to ");
3940 dump_lower_bound (MSG_NOTE
, lower_bounds
[i
]);
3941 dump_printf (MSG_NOTE
, "\n");
3947 vec_lower_bound
lower_bound (expr
, unsigned_p
, min_value
);
3948 if (dump_enabled_p ())
3950 dump_printf_loc (MSG_NOTE
, vect_location
, "need a run-time check that ");
3951 dump_lower_bound (MSG_NOTE
, lower_bound
);
3952 dump_printf (MSG_NOTE
, "\n");
3954 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo
).safe_push (lower_bound
);
3957 /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3958 will span fewer than GAP bytes. */
3961 vect_small_gap_p (loop_vec_info loop_vinfo
, dr_vec_info
*dr_info
,
3964 stmt_vec_info stmt_info
= dr_info
->stmt
;
3966 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo
));
3967 if (DR_GROUP_FIRST_ELEMENT (stmt_info
))
3968 count
*= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info
));
3969 return (estimated_poly_value (gap
)
3970 <= count
* vect_get_scalar_dr_size (dr_info
));
3973 /* Return true if we know that there is no alias between DR_INFO_A and
3974 DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3975 When returning true, set *LOWER_BOUND_OUT to this N. */
3978 vectorizable_with_step_bound_p (dr_vec_info
*dr_info_a
, dr_vec_info
*dr_info_b
,
3979 poly_uint64
*lower_bound_out
)
3981 /* Check that there is a constant gap of known sign between DR_A
3983 data_reference
*dr_a
= dr_info_a
->dr
;
3984 data_reference
*dr_b
= dr_info_b
->dr
;
3985 poly_int64 init_a
, init_b
;
3986 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a
), DR_BASE_ADDRESS (dr_b
), 0)
3987 || !operand_equal_p (DR_OFFSET (dr_a
), DR_OFFSET (dr_b
), 0)
3988 || !operand_equal_p (DR_STEP (dr_a
), DR_STEP (dr_b
), 0)
3989 || !poly_int_tree_p (DR_INIT (dr_a
), &init_a
)
3990 || !poly_int_tree_p (DR_INIT (dr_b
), &init_b
)
3991 || !ordered_p (init_a
, init_b
))
3994 /* Sort DR_A and DR_B by the address they access. */
3995 if (maybe_lt (init_b
, init_a
))
3997 std::swap (init_a
, init_b
);
3998 std::swap (dr_info_a
, dr_info_b
);
3999 std::swap (dr_a
, dr_b
);
4002 /* If the two accesses could be dependent within a scalar iteration,
4003 make sure that we'd retain their order. */
4004 if (maybe_gt (init_a
+ vect_get_scalar_dr_size (dr_info_a
), init_b
)
4005 && !vect_preserves_scalar_order_p (dr_info_a
, dr_info_b
))
4008 /* There is no alias if abs (DR_STEP) is greater than or equal to
4009 the bytes spanned by the combination of the two accesses. */
4010 *lower_bound_out
= init_b
+ vect_get_scalar_dr_size (dr_info_b
) - init_a
;
4014 /* Function vect_prune_runtime_alias_test_list.
4016 Prune a list of ddrs to be tested at run-time by versioning for alias.
4017 Merge several alias checks into one if possible.
4018 Return FALSE if resulting list of ddrs is longer then allowed by
4019 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
4022 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo
)
4024 typedef pair_hash
<tree_operand_hash
, tree_operand_hash
> tree_pair_hash
;
4025 hash_set
<tree_pair_hash
> compared_objects
;
4027 const vec
<ddr_p
> &may_alias_ddrs
= LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo
);
4028 vec
<dr_with_seg_len_pair_t
> &comp_alias_ddrs
4029 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo
);
4030 const vec
<vec_object_pair
> &check_unequal_addrs
4031 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo
);
4032 poly_uint64 vect_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
4033 tree scalar_loop_iters
= LOOP_VINFO_NITERS (loop_vinfo
);
4039 DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
4041 /* Step values are irrelevant for aliasing if the number of vector
4042 iterations is equal to the number of scalar iterations (which can
4043 happen for fully-SLP loops). */
4044 bool vf_one_p
= known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo
), 1U);
4048 /* Convert the checks for nonzero steps into bound tests. */
4050 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo
), i
, value
)
4051 vect_check_lower_bound (loop_vinfo
, value
, true, 1);
4054 if (may_alias_ddrs
.is_empty ())
4055 return opt_result::success ();
4057 comp_alias_ddrs
.create (may_alias_ddrs
.length ());
4059 unsigned int loop_depth
4060 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo
)->num
,
4061 LOOP_VINFO_LOOP_NEST (loop_vinfo
));
4063 /* First, we collect all data ref pairs for aliasing checks. */
4064 FOR_EACH_VEC_ELT (may_alias_ddrs
, i
, ddr
)
4066 poly_uint64 lower_bound
;
4067 tree segment_length_a
, segment_length_b
;
4068 unsigned HOST_WIDE_INT access_size_a
, access_size_b
;
4069 unsigned int align_a
, align_b
;
4071 /* Ignore the alias if the VF we chose ended up being no greater
4072 than the dependence distance. */
4073 if (dependence_distance_ge_vf (ddr
, loop_depth
, vect_factor
))
4076 if (DDR_OBJECT_A (ddr
))
4078 vec_object_pair
new_pair (DDR_OBJECT_A (ddr
), DDR_OBJECT_B (ddr
));
4079 if (!compared_objects
.add (new_pair
))
4081 if (dump_enabled_p ())
4082 dump_printf_loc (MSG_NOTE
, vect_location
,
4083 "checking that %T and %T"
4084 " have different addresses\n",
4085 new_pair
.first
, new_pair
.second
);
4086 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo
).safe_push (new_pair
);
4091 dr_vec_info
*dr_info_a
= loop_vinfo
->lookup_dr (DDR_A (ddr
));
4092 stmt_vec_info stmt_info_a
= dr_info_a
->stmt
;
4094 dr_vec_info
*dr_info_b
= loop_vinfo
->lookup_dr (DDR_B (ddr
));
4095 stmt_vec_info stmt_info_b
= dr_info_b
->stmt
;
4097 bool preserves_scalar_order_p
4098 = vect_preserves_scalar_order_p (dr_info_a
, dr_info_b
);
4101 && (preserves_scalar_order_p
4102 || operand_equal_p (DR_STEP (dr_info_a
->dr
),
4103 DR_STEP (dr_info_b
->dr
))));
4105 /* Skip the pair if inter-iteration dependencies are irrelevant
4106 and intra-iteration dependencies are guaranteed to be honored. */
4108 && (preserves_scalar_order_p
4109 || vectorizable_with_step_bound_p (dr_info_a
, dr_info_b
,
4112 if (dump_enabled_p ())
4113 dump_printf_loc (MSG_NOTE
, vect_location
,
4114 "no need for alias check between "
4115 "%T and %T when VF is 1\n",
4116 DR_REF (dr_info_a
->dr
), DR_REF (dr_info_b
->dr
));
4120 /* See whether we can handle the alias using a bounds check on
4121 the step, and whether that's likely to be the best approach.
4122 (It might not be, for example, if the minimum step is much larger
4123 than the number of bytes handled by one vector iteration.) */
4125 && TREE_CODE (DR_STEP (dr_info_a
->dr
)) != INTEGER_CST
4126 && vectorizable_with_step_bound_p (dr_info_a
, dr_info_b
,
4128 && (vect_small_gap_p (loop_vinfo
, dr_info_a
, lower_bound
)
4129 || vect_small_gap_p (loop_vinfo
, dr_info_b
, lower_bound
)))
4131 bool unsigned_p
= dr_known_forward_stride_p (dr_info_a
->dr
);
4132 if (dump_enabled_p ())
4134 dump_printf_loc (MSG_NOTE
, vect_location
, "no alias between "
4135 "%T and %T when the step %T is outside ",
4136 DR_REF (dr_info_a
->dr
),
4137 DR_REF (dr_info_b
->dr
),
4138 DR_STEP (dr_info_a
->dr
));
4140 dump_printf (MSG_NOTE
, "[0");
4143 dump_printf (MSG_NOTE
, "(");
4144 dump_dec (MSG_NOTE
, poly_int64 (-lower_bound
));
4146 dump_printf (MSG_NOTE
, ", ");
4147 dump_dec (MSG_NOTE
, lower_bound
);
4148 dump_printf (MSG_NOTE
, ")\n");
4150 vect_check_lower_bound (loop_vinfo
, DR_STEP (dr_info_a
->dr
),
4151 unsigned_p
, lower_bound
);
4155 stmt_vec_info dr_group_first_a
= DR_GROUP_FIRST_ELEMENT (stmt_info_a
);
4156 if (dr_group_first_a
)
4158 stmt_info_a
= dr_group_first_a
;
4159 dr_info_a
= STMT_VINFO_DR_INFO (stmt_info_a
);
4162 stmt_vec_info dr_group_first_b
= DR_GROUP_FIRST_ELEMENT (stmt_info_b
);
4163 if (dr_group_first_b
)
4165 stmt_info_b
= dr_group_first_b
;
4166 dr_info_b
= STMT_VINFO_DR_INFO (stmt_info_b
);
4171 segment_length_a
= size_zero_node
;
4172 segment_length_b
= size_zero_node
;
4176 if (!operand_equal_p (DR_STEP (dr_info_a
->dr
),
4177 DR_STEP (dr_info_b
->dr
), 0))
4178 length_factor
= scalar_loop_iters
;
4180 length_factor
= size_int (vect_factor
);
4181 segment_length_a
= vect_vfa_segment_size (dr_info_a
, length_factor
);
4182 segment_length_b
= vect_vfa_segment_size (dr_info_b
, length_factor
);
4184 access_size_a
= vect_vfa_access_size (loop_vinfo
, dr_info_a
);
4185 access_size_b
= vect_vfa_access_size (loop_vinfo
, dr_info_b
);
4186 align_a
= vect_vfa_align (dr_info_a
);
4187 align_b
= vect_vfa_align (dr_info_b
);
4189 /* See whether the alias is known at compilation time. */
4190 if (operand_equal_p (DR_BASE_ADDRESS (dr_info_a
->dr
),
4191 DR_BASE_ADDRESS (dr_info_b
->dr
), 0)
4192 && operand_equal_p (DR_OFFSET (dr_info_a
->dr
),
4193 DR_OFFSET (dr_info_b
->dr
), 0)
4194 && TREE_CODE (DR_STEP (dr_info_a
->dr
)) == INTEGER_CST
4195 && TREE_CODE (DR_STEP (dr_info_b
->dr
)) == INTEGER_CST
4196 && poly_int_tree_p (segment_length_a
)
4197 && poly_int_tree_p (segment_length_b
))
4199 int res
= vect_compile_time_alias (dr_info_a
, dr_info_b
,
4204 if (res
>= 0 && dump_enabled_p ())
4206 dump_printf_loc (MSG_NOTE
, vect_location
,
4207 "can tell at compile time that %T and %T",
4208 DR_REF (dr_info_a
->dr
), DR_REF (dr_info_b
->dr
));
4210 dump_printf (MSG_NOTE
, " do not alias\n");
4212 dump_printf (MSG_NOTE
, " alias\n");
4219 return opt_result::failure_at (stmt_info_b
->stmt
,
4221 " compilation time alias: %G%G",
4226 dr_with_seg_len
dr_a (dr_info_a
->dr
, segment_length_a
,
4227 access_size_a
, align_a
);
4228 dr_with_seg_len
dr_b (dr_info_b
->dr
, segment_length_b
,
4229 access_size_b
, align_b
);
4230 /* Canonicalize the order to be the one that's needed for accurate
4231 RAW, WAR and WAW flags, in cases where the data references are
4232 well-ordered. The order doesn't really matter otherwise,
4233 but we might as well be consistent. */
4234 if (get_later_stmt (stmt_info_a
, stmt_info_b
) == stmt_info_a
)
4235 std::swap (dr_a
, dr_b
);
4237 dr_with_seg_len_pair_t dr_with_seg_len_pair
4238 (dr_a
, dr_b
, (preserves_scalar_order_p
4239 ? dr_with_seg_len_pair_t::WELL_ORDERED
4240 : dr_with_seg_len_pair_t::REORDERED
));
4242 comp_alias_ddrs
.safe_push (dr_with_seg_len_pair
);
4245 prune_runtime_alias_test_list (&comp_alias_ddrs
, vect_factor
);
4247 unsigned int count
= (comp_alias_ddrs
.length ()
4248 + check_unequal_addrs
.length ());
4251 && (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo
))
4252 == VECT_COST_MODEL_VERY_CHEAP
))
4253 return opt_result::failure_at
4254 (vect_location
, "would need a runtime alias check\n");
4256 if (dump_enabled_p ())
4257 dump_printf_loc (MSG_NOTE
, vect_location
,
4258 "improved number of alias checks from %d to %d\n",
4259 may_alias_ddrs
.length (), count
);
4260 unsigned limit
= param_vect_max_version_for_alias_checks
;
4261 if (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)) == VECT_COST_MODEL_CHEAP
)
4262 limit
= param_vect_max_version_for_alias_checks
* 6 / 10;
4264 return opt_result::failure_at
4266 "number of versioning for alias run-time tests exceeds %d "
4267 "(--param vect-max-version-for-alias-checks)\n", limit
);
4269 return opt_result::success ();
4272 /* Check whether we can use an internal function for a gather load
4273 or scatter store. READ_P is true for loads and false for stores.
4274 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
4275 the type of the memory elements being loaded or stored. OFFSET_TYPE
4276 is the type of the offset that is being applied to the invariant
4277 base address. SCALE is the amount by which the offset should
4278 be multiplied *after* it has been converted to address width.
4280 Return true if the function is supported, storing the function id in
4281 *IFN_OUT and the vector type for the offset in *OFFSET_VECTYPE_OUT.
4283 If we can use gather and store the possible else values in ELSVALS. */
4286 vect_gather_scatter_fn_p (vec_info
*vinfo
, bool read_p
, bool masked_p
,
4287 tree vectype
, tree memory_type
, tree offset_type
,
4288 int scale
, internal_fn
*ifn_out
,
4289 tree
*offset_vectype_out
, vec
<int> *elsvals
)
4291 unsigned int memory_bits
= tree_to_uhwi (TYPE_SIZE (memory_type
));
4292 unsigned int element_bits
= vector_element_bits (vectype
);
4293 if (element_bits
!= memory_bits
)
4294 /* For now the vector elements must be the same width as the
4298 /* Work out which function we need. */
4299 internal_fn ifn
, alt_ifn
, alt_ifn2
;
4302 ifn
= masked_p
? IFN_MASK_GATHER_LOAD
: IFN_GATHER_LOAD
;
4303 alt_ifn
= IFN_MASK_GATHER_LOAD
;
4304 /* When target supports MASK_LEN_GATHER_LOAD, we always
4305 use MASK_LEN_GATHER_LOAD regardless whether len and
4306 mask are valid or not. */
4307 alt_ifn2
= IFN_MASK_LEN_GATHER_LOAD
;
4311 ifn
= masked_p
? IFN_MASK_SCATTER_STORE
: IFN_SCATTER_STORE
;
4312 alt_ifn
= IFN_MASK_SCATTER_STORE
;
4313 /* When target supports MASK_LEN_SCATTER_STORE, we always
4314 use MASK_LEN_SCATTER_STORE regardless whether len and
4315 mask are valid or not. */
4316 alt_ifn2
= IFN_MASK_LEN_SCATTER_STORE
;
4321 tree offset_vectype
= get_vectype_for_scalar_type (vinfo
, offset_type
);
4322 if (!offset_vectype
)
4325 /* Test whether the target supports this combination. */
4326 if (internal_gather_scatter_fn_supported_p (ifn
, vectype
, memory_type
,
4327 offset_vectype
, scale
,
4331 *offset_vectype_out
= offset_vectype
;
4335 && internal_gather_scatter_fn_supported_p (alt_ifn
, vectype
,
4341 *offset_vectype_out
= offset_vectype
;
4344 else if (internal_gather_scatter_fn_supported_p (alt_ifn2
, vectype
,
4346 offset_vectype
, scale
,
4349 *ifn_out
= alt_ifn2
;
4350 *offset_vectype_out
= offset_vectype
;
4354 if (TYPE_PRECISION (offset_type
) >= POINTER_SIZE
4355 && TYPE_PRECISION (offset_type
) >= element_bits
)
4358 offset_type
= build_nonstandard_integer_type
4359 (TYPE_PRECISION (offset_type
) * 2, TYPE_UNSIGNED (offset_type
));
4363 /* STMT_INFO is a call to an internal gather load or scatter store function.
4364 Describe the operation in INFO. */
4367 vect_describe_gather_scatter_call (stmt_vec_info stmt_info
,
4368 gather_scatter_info
*info
)
4370 gcall
*call
= as_a
<gcall
*> (stmt_info
->stmt
);
4371 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
4372 data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
4374 info
->ifn
= gimple_call_internal_fn (call
);
4375 info
->decl
= NULL_TREE
;
4376 info
->base
= gimple_call_arg (call
, 0);
4377 info
->offset
= gimple_call_arg (call
, 1);
4378 info
->offset_dt
= vect_unknown_def_type
;
4379 info
->offset_vectype
= NULL_TREE
;
4380 info
->scale
= TREE_INT_CST_LOW (gimple_call_arg (call
, 2));
4381 info
->element_type
= TREE_TYPE (vectype
);
4382 info
->memory_type
= TREE_TYPE (DR_REF (dr
));
4385 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
4386 gather load or scatter store. Describe the operation in *INFO if so.
4387 If it is suitable and ELSVALS is nonzero store the supported else values
4388 in the vector it points to. */
4391 vect_check_gather_scatter (stmt_vec_info stmt_info
, loop_vec_info loop_vinfo
,
4392 gather_scatter_info
*info
, vec
<int> *elsvals
)
4394 HOST_WIDE_INT scale
= 1;
4395 poly_int64 pbitpos
, pbitsize
;
4396 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
4397 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
4398 tree offtype
= NULL_TREE
;
4399 tree decl
= NULL_TREE
, base
, off
;
4400 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
4401 tree memory_type
= TREE_TYPE (DR_REF (dr
));
4403 int punsignedp
, reversep
, pvolatilep
= 0;
4405 tree offset_vectype
;
4406 bool masked_p
= false;
4408 /* See whether this is already a call to a gather/scatter internal function.
4409 If not, see whether it's a masked load or store. */
4410 gcall
*call
= dyn_cast
<gcall
*> (stmt_info
->stmt
);
4411 if (call
&& gimple_call_internal_p (call
))
4413 ifn
= gimple_call_internal_fn (call
);
4414 if (internal_gather_scatter_fn_p (ifn
))
4416 vect_describe_gather_scatter_call (stmt_info
, info
);
4418 /* In pattern recog we simply used a ZERO else value that
4419 we need to correct here. To that end just re-use the
4420 (already succesful) check if we support a gather IFN
4421 and have it populate the else values. */
4422 if (DR_IS_READ (dr
) && internal_fn_mask_index (ifn
) >= 0 && elsvals
)
4423 supports_vec_gather_load_p (TYPE_MODE (vectype
), elsvals
);
4426 masked_p
= (ifn
== IFN_MASK_LOAD
|| ifn
== IFN_MASK_STORE
);
4429 /* ??? For epilogues we adjust DR_REF to make the following stmt-based
4430 analysis work, but this adjustment doesn't work for epilogues of
4431 epilogues during transform, so disable gather/scatter in that case. */
4432 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo
)
4433 && LOOP_VINFO_EPILOGUE_P (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo
)))
4436 /* True if we should aim to use internal functions rather than
4437 built-in functions. */
4438 bool use_ifn_p
= (DR_IS_READ (dr
)
4439 ? supports_vec_gather_load_p (TYPE_MODE (vectype
),
4441 : supports_vec_scatter_store_p (TYPE_MODE (vectype
)));
4444 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
4445 see if we can use the def stmt of the address. */
4447 && TREE_CODE (base
) == MEM_REF
4448 && TREE_CODE (TREE_OPERAND (base
, 0)) == SSA_NAME
4449 && integer_zerop (TREE_OPERAND (base
, 1))
4450 && !expr_invariant_in_loop_p (loop
, TREE_OPERAND (base
, 0)))
4452 gimple
*def_stmt
= SSA_NAME_DEF_STMT (TREE_OPERAND (base
, 0));
4453 if (is_gimple_assign (def_stmt
)
4454 && gimple_assign_rhs_code (def_stmt
) == ADDR_EXPR
)
4455 base
= TREE_OPERAND (gimple_assign_rhs1 (def_stmt
), 0);
4458 /* The gather and scatter builtins need address of the form
4459 loop_invariant + vector * {1, 2, 4, 8}
4461 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
4462 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
4463 of loop invariants/SSA_NAMEs defined in the loop, with casts,
4464 multiplications and additions in it. To get a vector, we need
4465 a single SSA_NAME that will be defined in the loop and will
4466 contain everything that is not loop invariant and that can be
4467 vectorized. The following code attempts to find such a preexistng
4468 SSA_NAME OFF and put the loop invariants into a tree BASE
4469 that can be gimplified before the loop. */
4470 base
= get_inner_reference (base
, &pbitsize
, &pbitpos
, &off
, &pmode
,
4471 &punsignedp
, &reversep
, &pvolatilep
);
4475 /* PR 107346. Packed structs can have fields at offsets that are not
4476 multiples of BITS_PER_UNIT. Do not use gather/scatters in such cases. */
4477 if (!multiple_p (pbitpos
, BITS_PER_UNIT
))
4480 /* We need to be able to form an address to the base which for example
4481 isn't possible for hard registers. */
4482 if (may_be_nonaddressable_p (base
))
4485 poly_int64 pbytepos
= exact_div (pbitpos
, BITS_PER_UNIT
);
4487 if (TREE_CODE (base
) == MEM_REF
)
4489 if (!integer_zerop (TREE_OPERAND (base
, 1)))
4491 if (off
== NULL_TREE
)
4492 off
= wide_int_to_tree (sizetype
, mem_ref_offset (base
));
4494 off
= size_binop (PLUS_EXPR
, off
,
4495 fold_convert (sizetype
, TREE_OPERAND (base
, 1)));
4497 base
= TREE_OPERAND (base
, 0);
4500 base
= build_fold_addr_expr (base
);
4502 if (off
== NULL_TREE
)
4503 off
= size_zero_node
;
4505 /* If base is not loop invariant, either off is 0, then we start with just
4506 the constant offset in the loop invariant BASE and continue with base
4507 as OFF, otherwise give up.
4508 We could handle that case by gimplifying the addition of base + off
4509 into some SSA_NAME and use that as off, but for now punt. */
4510 if (!expr_invariant_in_loop_p (loop
, base
))
4512 if (!integer_zerop (off
))
4515 base
= size_int (pbytepos
);
4517 /* Otherwise put base + constant offset into the loop invariant BASE
4518 and continue with OFF. */
4521 base
= fold_convert (sizetype
, base
);
4522 base
= size_binop (PLUS_EXPR
, base
, size_int (pbytepos
));
4525 /* OFF at this point may be either a SSA_NAME or some tree expression
4526 from get_inner_reference. Try to peel off loop invariants from it
4527 into BASE as long as possible. */
4529 while (offtype
== NULL_TREE
)
4531 enum tree_code code
;
4532 tree op0
, op1
, add
= NULL_TREE
;
4534 if (TREE_CODE (off
) == SSA_NAME
)
4536 gimple
*def_stmt
= SSA_NAME_DEF_STMT (off
);
4538 if (expr_invariant_in_loop_p (loop
, off
))
4541 if (gimple_code (def_stmt
) != GIMPLE_ASSIGN
)
4544 op0
= gimple_assign_rhs1 (def_stmt
);
4545 code
= gimple_assign_rhs_code (def_stmt
);
4546 op1
= gimple_assign_rhs2 (def_stmt
);
4550 if (get_gimple_rhs_class (TREE_CODE (off
)) == GIMPLE_TERNARY_RHS
)
4552 code
= TREE_CODE (off
);
4553 extract_ops_from_tree (off
, &code
, &op0
, &op1
);
4557 case POINTER_PLUS_EXPR
:
4559 if (expr_invariant_in_loop_p (loop
, op0
))
4564 add
= fold_convert (sizetype
, add
);
4566 add
= size_binop (MULT_EXPR
, add
, size_int (scale
));
4567 base
= size_binop (PLUS_EXPR
, base
, add
);
4570 if (expr_invariant_in_loop_p (loop
, op1
))
4578 if (expr_invariant_in_loop_p (loop
, op1
))
4580 add
= fold_convert (sizetype
, op1
);
4581 add
= size_binop (MINUS_EXPR
, size_zero_node
, add
);
4587 if (scale
== 1 && tree_fits_shwi_p (op1
))
4589 int new_scale
= tree_to_shwi (op1
);
4590 /* Only treat this as a scaling operation if the target
4591 supports it for at least some offset type. */
4593 && !vect_gather_scatter_fn_p (loop_vinfo
, DR_IS_READ (dr
),
4594 masked_p
, vectype
, memory_type
,
4595 signed_char_type_node
,
4599 && !vect_gather_scatter_fn_p (loop_vinfo
, DR_IS_READ (dr
),
4600 masked_p
, vectype
, memory_type
,
4601 unsigned_char_type_node
,
4615 if (!POINTER_TYPE_P (TREE_TYPE (op0
))
4616 && !INTEGRAL_TYPE_P (TREE_TYPE (op0
)))
4619 /* Don't include the conversion if the target is happy with
4620 the current offset type. */
4622 && TREE_CODE (off
) == SSA_NAME
4623 && !POINTER_TYPE_P (TREE_TYPE (off
))
4624 && vect_gather_scatter_fn_p (loop_vinfo
, DR_IS_READ (dr
),
4625 masked_p
, vectype
, memory_type
,
4626 TREE_TYPE (off
), scale
, &ifn
,
4627 &offset_vectype
, elsvals
))
4630 if (TYPE_PRECISION (TREE_TYPE (op0
))
4631 == TYPE_PRECISION (TREE_TYPE (off
)))
4637 /* Include the conversion if it is widening and we're using
4638 the IFN path or the target can handle the converted from
4639 offset or the current size is not already the same as the
4640 data vector element size. */
4641 if ((TYPE_PRECISION (TREE_TYPE (op0
))
4642 < TYPE_PRECISION (TREE_TYPE (off
)))
4645 ? (targetm
.vectorize
.builtin_gather
4646 && targetm
.vectorize
.builtin_gather (vectype
,
4649 : (targetm
.vectorize
.builtin_scatter
4650 && targetm
.vectorize
.builtin_scatter (vectype
,
4653 || !operand_equal_p (TYPE_SIZE (TREE_TYPE (off
)),
4654 TYPE_SIZE (TREE_TYPE (vectype
)), 0)))
4657 offtype
= TREE_TYPE (off
);
4668 /* If at the end OFF still isn't a SSA_NAME or isn't
4669 defined in the loop, punt. */
4670 if (TREE_CODE (off
) != SSA_NAME
4671 || expr_invariant_in_loop_p (loop
, off
))
4674 if (offtype
== NULL_TREE
)
4675 offtype
= TREE_TYPE (off
);
4679 if (!vect_gather_scatter_fn_p (loop_vinfo
, DR_IS_READ (dr
), masked_p
,
4680 vectype
, memory_type
, offtype
, scale
,
4681 &ifn
, &offset_vectype
, elsvals
))
4687 if (DR_IS_READ (dr
))
4689 if (targetm
.vectorize
.builtin_gather
)
4690 decl
= targetm
.vectorize
.builtin_gather (vectype
, offtype
, scale
);
4694 if (targetm
.vectorize
.builtin_scatter
)
4695 decl
= targetm
.vectorize
.builtin_scatter (vectype
, offtype
, scale
);
4698 /* The offset vector type will be read from DECL when needed. */
4699 offset_vectype
= NULL_TREE
;
4706 info
->offset_dt
= vect_unknown_def_type
;
4707 info
->offset_vectype
= offset_vectype
;
4708 info
->scale
= scale
;
4709 info
->element_type
= TREE_TYPE (vectype
);
4710 info
->memory_type
= memory_type
;
4714 /* Find the data references in STMT, analyze them with respect to LOOP and
4715 append them to DATAREFS. Return false if datarefs in this stmt cannot
4719 vect_find_stmt_data_reference (loop_p loop
, gimple
*stmt
,
4720 vec
<data_reference_p
> *datarefs
,
4721 vec
<int> *dataref_groups
, int group_id
)
4723 /* We can ignore clobbers for dataref analysis - they are removed during
4724 loop vectorization and BB vectorization checks dependences with a
4726 if (gimple_clobber_p (stmt
))
4727 return opt_result::success ();
4729 if (gimple_has_volatile_ops (stmt
))
4730 return opt_result::failure_at (stmt
, "not vectorized: volatile type: %G",
4733 if (stmt_can_throw_internal (cfun
, stmt
))
4734 return opt_result::failure_at (stmt
,
4736 " statement can throw an exception: %G",
4739 auto_vec
<data_reference_p
, 2> refs
;
4740 opt_result res
= find_data_references_in_stmt (loop
, stmt
, &refs
);
4744 if (refs
.is_empty ())
4745 return opt_result::success ();
4747 if (refs
.length () > 1)
4749 while (!refs
.is_empty ())
4750 free_data_ref (refs
.pop ());
4751 return opt_result::failure_at (stmt
,
4752 "not vectorized: more than one "
4753 "data ref in stmt: %G", stmt
);
4756 data_reference_p dr
= refs
.pop ();
4757 if (gcall
*call
= dyn_cast
<gcall
*> (stmt
))
4758 if (!gimple_call_internal_p (call
)
4759 || (gimple_call_internal_fn (call
) != IFN_MASK_LOAD
4760 && gimple_call_internal_fn (call
) != IFN_MASK_STORE
))
4763 return opt_result::failure_at (stmt
,
4764 "not vectorized: dr in a call %G", stmt
);
4767 if (TREE_CODE (DR_REF (dr
)) == COMPONENT_REF
4768 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr
), 1)))
4771 return opt_result::failure_at (stmt
,
4773 " statement is an unsupported"
4774 " bitfield access %G", stmt
);
4777 if (DR_BASE_ADDRESS (dr
)
4778 && TREE_CODE (DR_BASE_ADDRESS (dr
)) == INTEGER_CST
)
4781 return opt_result::failure_at (stmt
,
4783 " base addr of dr is a constant\n");
4786 /* Check whether this may be a SIMD lane access and adjust the
4787 DR to make it easier for us to handle it. */
4790 && (!DR_BASE_ADDRESS (dr
)
4795 struct data_reference
*newdr
4796 = create_data_ref (NULL
, loop_containing_stmt (stmt
), DR_REF (dr
), stmt
,
4797 DR_IS_READ (dr
), DR_IS_CONDITIONAL_IN_STMT (dr
));
4798 if (DR_BASE_ADDRESS (newdr
)
4799 && DR_OFFSET (newdr
)
4802 && TREE_CODE (DR_INIT (newdr
)) == INTEGER_CST
4803 && integer_zerop (DR_STEP (newdr
)))
4805 tree base_address
= DR_BASE_ADDRESS (newdr
);
4806 tree off
= DR_OFFSET (newdr
);
4807 tree step
= ssize_int (1);
4808 if (integer_zerop (off
)
4809 && TREE_CODE (base_address
) == POINTER_PLUS_EXPR
)
4811 off
= TREE_OPERAND (base_address
, 1);
4812 base_address
= TREE_OPERAND (base_address
, 0);
4815 if (TREE_CODE (off
) == MULT_EXPR
4816 && tree_fits_uhwi_p (TREE_OPERAND (off
, 1)))
4818 step
= TREE_OPERAND (off
, 1);
4819 off
= TREE_OPERAND (off
, 0);
4822 if (CONVERT_EXPR_P (off
)
4823 && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off
, 0)))
4824 < TYPE_PRECISION (TREE_TYPE (off
))))
4825 off
= TREE_OPERAND (off
, 0);
4826 if (TREE_CODE (off
) == SSA_NAME
)
4828 gimple
*def
= SSA_NAME_DEF_STMT (off
);
4829 /* Look through widening conversion. */
4830 if (is_gimple_assign (def
)
4831 && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def
)))
4833 tree rhs1
= gimple_assign_rhs1 (def
);
4834 if (TREE_CODE (rhs1
) == SSA_NAME
4835 && INTEGRAL_TYPE_P (TREE_TYPE (rhs1
))
4836 && (TYPE_PRECISION (TREE_TYPE (off
))
4837 > TYPE_PRECISION (TREE_TYPE (rhs1
))))
4838 def
= SSA_NAME_DEF_STMT (rhs1
);
4840 if (is_gimple_call (def
)
4841 && gimple_call_internal_p (def
)
4842 && (gimple_call_internal_fn (def
) == IFN_GOMP_SIMD_LANE
))
4844 tree arg
= gimple_call_arg (def
, 0);
4845 tree reft
= TREE_TYPE (DR_REF (newdr
));
4846 gcc_assert (TREE_CODE (arg
) == SSA_NAME
);
4847 arg
= SSA_NAME_VAR (arg
);
4848 if (arg
== loop
->simduid
4850 && tree_int_cst_equal (TYPE_SIZE_UNIT (reft
), step
))
4852 DR_BASE_ADDRESS (newdr
) = base_address
;
4853 DR_OFFSET (newdr
) = ssize_int (0);
4854 DR_STEP (newdr
) = step
;
4855 DR_OFFSET_ALIGNMENT (newdr
) = BIGGEST_ALIGNMENT
;
4856 DR_STEP_ALIGNMENT (newdr
) = highest_pow2_factor (step
);
4857 /* Mark as simd-lane access. */
4858 tree arg2
= gimple_call_arg (def
, 1);
4859 newdr
->aux
= (void *) (-1 - tree_to_uhwi (arg2
));
4861 datarefs
->safe_push (newdr
);
4863 dataref_groups
->safe_push (group_id
);
4864 return opt_result::success ();
4869 free_data_ref (newdr
);
4872 datarefs
->safe_push (dr
);
4874 dataref_groups
->safe_push (group_id
);
4875 return opt_result::success ();
4878 /* Function vect_analyze_data_refs.
4880 Find all the data references in the loop or basic block.
4882 The general structure of the analysis of data refs in the vectorizer is as
4884 1- vect_analyze_data_refs(loop/bb): call
4885 compute_data_dependences_for_loop/bb to find and analyze all data-refs
4886 in the loop/bb and their dependences.
4887 2- vect_analyze_dependences(): apply dependence testing using ddrs.
4888 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4889 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4894 vect_analyze_data_refs (vec_info
*vinfo
, poly_uint64
*min_vf
, bool *fatal
)
4896 class loop
*loop
= NULL
;
4898 struct data_reference
*dr
;
4901 DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4903 if (loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
))
4904 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
4906 /* Go through the data-refs, check that the analysis succeeded. Update
4907 pointer from stmt_vec_info struct to DR and vectype. */
4909 vec
<data_reference_p
> datarefs
= vinfo
->shared
->datarefs
;
4910 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
4912 enum { SG_NONE
, GATHER
, SCATTER
} gatherscatter
= SG_NONE
;
4915 gcc_assert (DR_REF (dr
));
4916 stmt_vec_info stmt_info
= vinfo
->lookup_stmt (DR_STMT (dr
));
4917 gcc_assert (!stmt_info
->dr_aux
.dr
);
4918 stmt_info
->dr_aux
.dr
= dr
;
4919 stmt_info
->dr_aux
.stmt
= stmt_info
;
4921 /* Check that analysis of the data-ref succeeded. */
4922 if (!DR_BASE_ADDRESS (dr
) || !DR_OFFSET (dr
) || !DR_INIT (dr
)
4927 && !TREE_THIS_VOLATILE (DR_REF (dr
));
4930 && !TREE_THIS_VOLATILE (DR_REF (dr
));
4932 /* If target supports vector gather loads or scatter stores,
4933 see if they can't be used. */
4934 if (is_a
<loop_vec_info
> (vinfo
)
4935 && !nested_in_vect_loop_p (loop
, stmt_info
))
4937 if (maybe_gather
|| maybe_scatter
)
4940 gatherscatter
= GATHER
;
4942 gatherscatter
= SCATTER
;
4946 if (gatherscatter
== SG_NONE
)
4948 if (dump_enabled_p ())
4949 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4950 "not vectorized: data ref analysis "
4951 "failed %G", stmt_info
->stmt
);
4952 if (is_a
<bb_vec_info
> (vinfo
))
4954 /* In BB vectorization the ref can still participate
4955 in dependence analysis, we just can't vectorize it. */
4956 STMT_VINFO_VECTORIZABLE (stmt_info
) = false;
4959 return opt_result::failure_at (stmt_info
->stmt
,
4961 " data ref analysis failed: %G",
4966 /* See if this was detected as SIMD lane access. */
4967 if (dr
->aux
== (void *)-1
4968 || dr
->aux
== (void *)-2
4969 || dr
->aux
== (void *)-3
4970 || dr
->aux
== (void *)-4)
4972 if (nested_in_vect_loop_p (loop
, stmt_info
))
4973 return opt_result::failure_at (stmt_info
->stmt
,
4975 " data ref analysis failed: %G",
4977 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info
)
4978 = -(uintptr_t) dr
->aux
;
4981 tree base
= get_base_address (DR_REF (dr
));
4982 if (base
&& VAR_P (base
) && DECL_NONALIASED (base
))
4984 if (dump_enabled_p ())
4985 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4986 "not vectorized: base object not addressable "
4987 "for stmt: %G", stmt_info
->stmt
);
4988 if (is_a
<bb_vec_info
> (vinfo
))
4990 /* In BB vectorization the ref can still participate
4991 in dependence analysis, we just can't vectorize it. */
4992 STMT_VINFO_VECTORIZABLE (stmt_info
) = false;
4995 return opt_result::failure_at (stmt_info
->stmt
,
4996 "not vectorized: base object not"
4997 " addressable for stmt: %G",
5001 if (is_a
<loop_vec_info
> (vinfo
)
5003 && TREE_CODE (DR_STEP (dr
)) != INTEGER_CST
)
5005 if (nested_in_vect_loop_p (loop
, stmt_info
))
5006 return opt_result::failure_at (stmt_info
->stmt
,
5008 "not suitable for strided load %G",
5010 STMT_VINFO_STRIDED_P (stmt_info
) = true;
5013 /* Update DR field in stmt_vec_info struct. */
5015 /* If the dataref is in an inner-loop of the loop that is considered for
5016 for vectorization, we also want to analyze the access relative to
5017 the outer-loop (DR contains information only relative to the
5018 inner-most enclosing loop). We do that by building a reference to the
5019 first location accessed by the inner-loop, and analyze it relative to
5021 if (loop
&& nested_in_vect_loop_p (loop
, stmt_info
))
5023 /* Build a reference to the first location accessed by the
5024 inner loop: *(BASE + INIT + OFFSET). By construction,
5025 this address must be invariant in the inner loop, so we
5026 can consider it as being used in the outer loop. */
5027 tree base
= unshare_expr (DR_BASE_ADDRESS (dr
));
5028 tree offset
= unshare_expr (DR_OFFSET (dr
));
5029 tree init
= unshare_expr (DR_INIT (dr
));
5030 tree init_offset
= fold_build2 (PLUS_EXPR
, TREE_TYPE (offset
),
5032 tree init_addr
= fold_build_pointer_plus (base
, init_offset
);
5033 tree init_ref
= build_fold_indirect_ref (init_addr
);
5035 if (dump_enabled_p ())
5036 dump_printf_loc (MSG_NOTE
, vect_location
,
5037 "analyze in outer loop: %T\n", init_ref
);
5040 = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info
),
5041 init_ref
, loop
, stmt_info
->stmt
);
5043 /* dr_analyze_innermost already explained the failure. */
5046 if (dump_enabled_p ())
5047 dump_printf_loc (MSG_NOTE
, vect_location
,
5048 "\touter base_address: %T\n"
5049 "\touter offset from base address: %T\n"
5050 "\touter constant offset from base address: %T\n"
5051 "\touter step: %T\n"
5052 "\touter base alignment: %d\n\n"
5053 "\touter base misalignment: %d\n"
5054 "\touter offset alignment: %d\n"
5055 "\touter step alignment: %d\n",
5056 STMT_VINFO_DR_BASE_ADDRESS (stmt_info
),
5057 STMT_VINFO_DR_OFFSET (stmt_info
),
5058 STMT_VINFO_DR_INIT (stmt_info
),
5059 STMT_VINFO_DR_STEP (stmt_info
),
5060 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info
),
5061 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info
),
5062 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info
),
5063 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info
));
5066 /* Set vectype for STMT. */
5067 scalar_type
= TREE_TYPE (DR_REF (dr
));
5068 tree vectype
= get_vectype_for_scalar_type (vinfo
, scalar_type
);
5071 if (dump_enabled_p ())
5073 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5074 "not vectorized: no vectype for stmt: %G",
5076 dump_printf (MSG_MISSED_OPTIMIZATION
, " scalar_type: ");
5077 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_DETAILS
,
5079 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
5082 if (is_a
<bb_vec_info
> (vinfo
))
5084 /* No vector type is fine, the ref can still participate
5085 in dependence analysis, we just can't vectorize it. */
5086 STMT_VINFO_VECTORIZABLE (stmt_info
) = false;
5091 return opt_result::failure_at (stmt_info
->stmt
,
5093 " no vectype for stmt: %G"
5094 " scalar_type: %T\n",
5095 stmt_info
->stmt
, scalar_type
);
5099 if (dump_enabled_p ())
5100 dump_printf_loc (MSG_NOTE
, vect_location
,
5101 "got vectype for stmt: %G%T\n",
5102 stmt_info
->stmt
, vectype
);
5105 /* Adjust the minimal vectorization factor according to the
5107 vf
= TYPE_VECTOR_SUBPARTS (vectype
);
5108 *min_vf
= upper_bound (*min_vf
, vf
);
5110 /* Leave the BB vectorizer to pick the vector type later, based on
5111 the final dataref group size and SLP node size. */
5112 if (is_a
<loop_vec_info
> (vinfo
))
5113 STMT_VINFO_VECTYPE (stmt_info
) = vectype
;
5115 if (gatherscatter
!= SG_NONE
)
5117 gather_scatter_info gs_info
;
5118 if (!vect_check_gather_scatter (stmt_info
,
5119 as_a
<loop_vec_info
> (vinfo
),
5121 || !get_vectype_for_scalar_type (vinfo
,
5122 TREE_TYPE (gs_info
.offset
)))
5126 return opt_result::failure_at
5128 (gatherscatter
== GATHER
)
5129 ? "not vectorized: not suitable for gather load %G"
5130 : "not vectorized: not suitable for scatter store %G",
5133 STMT_VINFO_GATHER_SCATTER_P (stmt_info
) = gatherscatter
;
5137 /* We used to stop processing and prune the list here. Verify we no
5139 gcc_assert (i
== datarefs
.length ());
5141 return opt_result::success ();
5145 /* Function vect_get_new_vect_var.
5147 Returns a name for a new variable. The current naming scheme appends the
5148 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
5149 the name of vectorizer generated variables, and appends that to NAME if
5153 vect_get_new_vect_var (tree type
, enum vect_var_kind var_kind
, const char *name
)
5160 case vect_simple_var
:
5163 case vect_scalar_var
:
5169 case vect_pointer_var
:
5178 char* tmp
= concat (prefix
, "_", name
, NULL
);
5179 new_vect_var
= create_tmp_reg (type
, tmp
);
5183 new_vect_var
= create_tmp_reg (type
, prefix
);
5185 return new_vect_var
;
5188 /* Like vect_get_new_vect_var but return an SSA name. */
5191 vect_get_new_ssa_name (tree type
, enum vect_var_kind var_kind
, const char *name
)
5198 case vect_simple_var
:
5201 case vect_scalar_var
:
5204 case vect_pointer_var
:
5213 char* tmp
= concat (prefix
, "_", name
, NULL
);
5214 new_vect_var
= make_temp_ssa_name (type
, NULL
, tmp
);
5218 new_vect_var
= make_temp_ssa_name (type
, NULL
, prefix
);
5220 return new_vect_var
;
5223 /* Duplicate points-to info on NAME from DR_INFO. */
5226 vect_duplicate_ssa_name_ptr_info (tree name
, dr_vec_info
*dr_info
)
5228 duplicate_ssa_name_ptr_info (name
, DR_PTR_INFO (dr_info
->dr
));
5229 /* DR_PTR_INFO is for a base SSA name, not including constant or
5230 variable offsets in the ref so its alignment info does not apply. */
5231 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name
));
5234 /* Function vect_create_addr_base_for_vector_ref.
5236 Create an expression that computes the address of the first memory location
5237 that will be accessed for a data reference.
5240 STMT_INFO: The statement containing the data reference.
5241 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
5242 OFFSET: Optional. If supplied, it is be added to the initial address.
5243 LOOP: Specify relative to which loop-nest should the address be computed.
5244 For example, when the dataref is in an inner-loop nested in an
5245 outer-loop that is now being vectorized, LOOP can be either the
5246 outer-loop, or the inner-loop. The first memory location accessed
5247 by the following dataref ('in' points to short):
5254 if LOOP=i_loop: &in (relative to i_loop)
5255 if LOOP=j_loop: &in+i*2B (relative to j_loop)
5258 1. Return an SSA_NAME whose value is the address of the memory location of
5259 the first vector of the data reference.
5260 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
5261 these statement(s) which define the returned SSA_NAME.
5263 FORNOW: We are only handling array accesses with step 1. */
5266 vect_create_addr_base_for_vector_ref (vec_info
*vinfo
, stmt_vec_info stmt_info
,
5267 gimple_seq
*new_stmt_list
,
5270 dr_vec_info
*dr_info
= STMT_VINFO_DR_INFO (stmt_info
);
5271 struct data_reference
*dr
= dr_info
->dr
;
5272 const char *base_name
;
5275 gimple_seq seq
= NULL
;
5277 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
5278 innermost_loop_behavior
*drb
= vect_dr_behavior (vinfo
, dr_info
);
5280 tree data_ref_base
= unshare_expr (drb
->base_address
);
5281 tree base_offset
= unshare_expr (get_dr_vinfo_offset (vinfo
, dr_info
, true));
5282 tree init
= unshare_expr (drb
->init
);
5285 base_name
= get_name (data_ref_base
);
5288 base_offset
= ssize_int (0);
5289 init
= ssize_int (0);
5290 base_name
= get_name (DR_REF (dr
));
5293 /* Create base_offset */
5294 base_offset
= size_binop (PLUS_EXPR
,
5295 fold_convert (sizetype
, base_offset
),
5296 fold_convert (sizetype
, init
));
5300 offset
= fold_convert (sizetype
, offset
);
5301 base_offset
= fold_build2 (PLUS_EXPR
, sizetype
,
5302 base_offset
, offset
);
5305 /* base + base_offset */
5307 addr_base
= fold_build_pointer_plus (data_ref_base
, base_offset
);
5309 addr_base
= build1 (ADDR_EXPR
,
5310 build_pointer_type (TREE_TYPE (DR_REF (dr
))),
5311 /* Strip zero offset components since we don't need
5312 them and they can confuse late diagnostics if
5313 we CSE them wrongly. See PR106904 for example. */
5314 unshare_expr (strip_zero_offset_components
5317 vect_ptr_type
= build_pointer_type (TREE_TYPE (DR_REF (dr
)));
5318 dest
= vect_get_new_vect_var (vect_ptr_type
, vect_pointer_var
, base_name
);
5319 addr_base
= force_gimple_operand (addr_base
, &seq
, true, dest
);
5320 gimple_seq_add_seq (new_stmt_list
, seq
);
5322 if (DR_PTR_INFO (dr
)
5323 && TREE_CODE (addr_base
) == SSA_NAME
5324 /* We should only duplicate pointer info to newly created SSA names. */
5325 && SSA_NAME_VAR (addr_base
) == dest
)
5327 gcc_assert (!SSA_NAME_PTR_INFO (addr_base
));
5328 vect_duplicate_ssa_name_ptr_info (addr_base
, dr_info
);
5331 if (dump_enabled_p ())
5332 dump_printf_loc (MSG_NOTE
, vect_location
, "created %T\n", addr_base
);
5338 /* Function vect_create_data_ref_ptr.
5340 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
5341 location accessed in the loop by STMT_INFO, along with the def-use update
5342 chain to appropriately advance the pointer through the loop iterations.
5343 Also set aliasing information for the pointer. This pointer is used by
5344 the callers to this function to create a memory reference expression for
5345 vector load/store access.
5348 1. STMT_INFO: a stmt that references memory. Expected to be of the form
5349 GIMPLE_ASSIGN <name, data-ref> or
5350 GIMPLE_ASSIGN <data-ref, name>.
5351 2. AGGR_TYPE: the type of the reference, which should be either a vector
5353 3. AT_LOOP: the loop where the vector memref is to be created.
5354 4. OFFSET (optional): a byte offset to be added to the initial address
5355 accessed by the data-ref in STMT_INFO.
5356 5. BSI: location where the new stmts are to be placed if there is no loop
5357 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
5358 pointing to the initial address.
5359 8. IV_STEP (optional, defaults to NULL): the amount that should be added
5360 to the IV during each iteration of the loop. NULL says to move
5361 by one copy of AGGR_TYPE up or down, depending on the step of the
5365 1. Declare a new ptr to vector_type, and have it point to the base of the
5366 data reference (initial addressed accessed by the data reference).
5367 For example, for vector of type V8HI, the following code is generated:
5370 ap = (v8hi *)initial_address;
5372 if OFFSET is not supplied:
5373 initial_address = &a[init];
5374 if OFFSET is supplied:
5375 initial_address = &a[init] + OFFSET;
5376 if BYTE_OFFSET is supplied:
5377 initial_address = &a[init] + BYTE_OFFSET;
5379 Return the initial_address in INITIAL_ADDRESS.
5381 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
5382 update the pointer in each iteration of the loop.
5384 Return the increment stmt that updates the pointer in PTR_INCR.
5386 3. Return the pointer. */
5389 vect_create_data_ref_ptr (vec_info
*vinfo
, stmt_vec_info stmt_info
,
5390 tree aggr_type
, class loop
*at_loop
, tree offset
,
5391 tree
*initial_address
, gimple_stmt_iterator
*gsi
,
5392 gimple
**ptr_incr
, bool only_init
,
5395 const char *base_name
;
5396 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
5397 class loop
*loop
= NULL
;
5398 bool nested_in_vect_loop
= false;
5399 class loop
*containing_loop
= NULL
;
5403 gimple_seq new_stmt_list
= NULL
;
5407 dr_vec_info
*dr_info
= STMT_VINFO_DR_INFO (stmt_info
);
5408 struct data_reference
*dr
= dr_info
->dr
;
5410 gimple_stmt_iterator incr_gsi
;
5412 tree indx_before_incr
, indx_after_incr
;
5414 bb_vec_info bb_vinfo
= dyn_cast
<bb_vec_info
> (vinfo
);
5416 gcc_assert (iv_step
!= NULL_TREE
5417 || TREE_CODE (aggr_type
) == ARRAY_TYPE
5418 || TREE_CODE (aggr_type
) == VECTOR_TYPE
);
5422 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
5423 nested_in_vect_loop
= nested_in_vect_loop_p (loop
, stmt_info
);
5424 containing_loop
= (gimple_bb (stmt_info
->stmt
))->loop_father
;
5425 pe
= loop_preheader_edge (loop
);
5429 gcc_assert (bb_vinfo
);
5434 /* Create an expression for the first address accessed by this load
5436 base_name
= get_name (DR_BASE_ADDRESS (dr
));
5438 if (dump_enabled_p ())
5440 tree dr_base_type
= TREE_TYPE (DR_BASE_OBJECT (dr
));
5441 dump_printf_loc (MSG_NOTE
, vect_location
,
5442 "create %s-pointer variable to type: %T",
5443 get_tree_code_name (TREE_CODE (aggr_type
)),
5445 if (TREE_CODE (dr_base_type
) == ARRAY_TYPE
)
5446 dump_printf (MSG_NOTE
, " vectorizing an array ref: ");
5447 else if (TREE_CODE (dr_base_type
) == VECTOR_TYPE
)
5448 dump_printf (MSG_NOTE
, " vectorizing a vector ref: ");
5449 else if (TREE_CODE (dr_base_type
) == RECORD_TYPE
)
5450 dump_printf (MSG_NOTE
, " vectorizing a record based array ref: ");
5452 dump_printf (MSG_NOTE
, " vectorizing a pointer ref: ");
5453 dump_printf (MSG_NOTE
, "%T\n", DR_BASE_OBJECT (dr
));
5456 /* (1) Create the new aggregate-pointer variable.
5457 Vector and array types inherit the alias set of their component
5458 type by default so we need to use a ref-all pointer if the data
5459 reference does not conflict with the created aggregated data
5460 reference because it is not addressable. */
5461 bool need_ref_all
= false;
5462 if (!alias_sets_conflict_p (get_alias_set (aggr_type
),
5463 get_alias_set (DR_REF (dr
))))
5464 need_ref_all
= true;
5465 /* Likewise for any of the data references in the stmt group. */
5466 else if (DR_GROUP_SIZE (stmt_info
) > 1)
5468 stmt_vec_info sinfo
= DR_GROUP_FIRST_ELEMENT (stmt_info
);
5471 struct data_reference
*sdr
= STMT_VINFO_DATA_REF (sinfo
);
5472 if (!alias_sets_conflict_p (get_alias_set (aggr_type
),
5473 get_alias_set (DR_REF (sdr
))))
5475 need_ref_all
= true;
5478 sinfo
= DR_GROUP_NEXT_ELEMENT (sinfo
);
5482 aggr_ptr_type
= build_pointer_type_for_mode (aggr_type
, VOIDmode
,
5484 aggr_ptr
= vect_get_new_vect_var (aggr_ptr_type
, vect_pointer_var
, base_name
);
5487 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
5488 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
5489 def-use update cycles for the pointer: one relative to the outer-loop
5490 (LOOP), which is what steps (3) and (4) below do. The other is relative
5491 to the inner-loop (which is the inner-most loop containing the dataref),
5492 and this is done be step (5) below.
5494 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
5495 inner-most loop, and so steps (3),(4) work the same, and step (5) is
5496 redundant. Steps (3),(4) create the following:
5499 LOOP: vp1 = phi(vp0,vp2)
5505 If there is an inner-loop nested in loop, then step (5) will also be
5506 applied, and an additional update in the inner-loop will be created:
5509 LOOP: vp1 = phi(vp0,vp2)
5511 inner: vp3 = phi(vp1,vp4)
5512 vp4 = vp3 + inner_step
5518 /* (2) Calculate the initial address of the aggregate-pointer, and set
5519 the aggregate-pointer to point to it before the loop. */
5521 /* Create: (&(base[init_val]+offset) in the loop preheader. */
5523 new_temp
= vect_create_addr_base_for_vector_ref (vinfo
,
5524 stmt_info
, &new_stmt_list
,
5530 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, new_stmt_list
);
5531 gcc_assert (!new_bb
);
5534 gsi_insert_seq_before (gsi
, new_stmt_list
, GSI_SAME_STMT
);
5537 *initial_address
= new_temp
;
5538 aggr_ptr_init
= new_temp
;
5540 /* (3) Handle the updating of the aggregate-pointer inside the loop.
5541 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
5542 inner-loop nested in LOOP (during outer-loop vectorization). */
5544 /* No update in loop is required. */
5545 if (only_init
&& (!loop_vinfo
|| at_loop
== loop
))
5546 aptr
= aggr_ptr_init
;
5549 /* Accesses to invariant addresses should be handled specially
5551 tree step
= vect_dr_behavior (vinfo
, dr_info
)->step
;
5552 gcc_assert (!integer_zerop (step
));
5554 if (iv_step
== NULL_TREE
)
5556 /* The step of the aggregate pointer is the type size,
5557 negated for downward accesses. */
5558 iv_step
= TYPE_SIZE_UNIT (aggr_type
);
5559 if (tree_int_cst_sgn (step
) == -1)
5560 iv_step
= fold_build1 (NEGATE_EXPR
, TREE_TYPE (iv_step
), iv_step
);
5563 standard_iv_increment_position (loop
, &incr_gsi
, &insert_after
);
5565 create_iv (aggr_ptr_init
, PLUS_EXPR
,
5566 fold_convert (aggr_ptr_type
, iv_step
),
5567 aggr_ptr
, loop
, &incr_gsi
, insert_after
,
5568 &indx_before_incr
, &indx_after_incr
);
5569 incr
= gsi_stmt (incr_gsi
);
5571 /* Copy the points-to information if it exists. */
5572 if (DR_PTR_INFO (dr))
5574 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
5575 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
5580 aptr
= indx_before_incr
;
5583 if (!nested_in_vect_loop
|| only_init
)
5587 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
5588 nested in LOOP, if it exists. */
5590 gcc_assert (nested_in_vect_loop
);
5593 standard_iv_increment_position (containing_loop
, &incr_gsi
,
5595 create_iv (aptr
, PLUS_EXPR
, fold_convert (aggr_ptr_type
, DR_STEP (dr
)),
5596 aggr_ptr
, containing_loop
, &incr_gsi
, insert_after
,
5597 &indx_before_incr
, &indx_after_incr
);
5598 incr
= gsi_stmt (incr_gsi
);
5600 /* Copy the points-to information if it exists. */
5601 if (DR_PTR_INFO (dr))
5603 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
5604 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
5609 return indx_before_incr
;
5616 /* Function bump_vector_ptr
5618 Increment a pointer (to a vector type) by vector-size. If requested,
5619 i.e. if PTR-INCR is given, then also connect the new increment stmt
5620 to the existing def-use update-chain of the pointer, by modifying
5621 the PTR_INCR as illustrated below:
5623 The pointer def-use update-chain before this function:
5624 DATAREF_PTR = phi (p_0, p_2)
5626 PTR_INCR: p_2 = DATAREF_PTR + step
5628 The pointer def-use update-chain after this function:
5629 DATAREF_PTR = phi (p_0, p_2)
5631 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
5633 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
5636 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
5638 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
5639 the loop. The increment amount across iterations is expected
5641 BSI - location where the new update stmt is to be placed.
5642 STMT_INFO - the original scalar memory-access stmt that is being vectorized.
5643 BUMP - optional. The offset by which to bump the pointer. If not given,
5644 the offset is assumed to be vector_size.
5646 Output: Return NEW_DATAREF_PTR as illustrated above.
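/* Illustrative sketch (not part of GCC): the def-use rewiring shown above,
   played out with plain integers standing in for the pointer SSA names.
   The values BUMP == STEP == 16 are hypothetical (e.g. two 16-byte vector
   accesses per iteration); chaining PTR_INCR off NEW_DATAREF_PTR makes the
   two accesses of each iteration land on consecutive vectors.  */

#include <cstdio>

int
main ()
{
  const long step = 16, bump = 16;   /* hypothetical byte sizes */
  long p_2 = 0;                      /* plays the role of p_0 on loop entry */
  for (int iter = 0; iter < 3; iter++)
    {
      long dataref_ptr = p_2;                      /* DATAREF_PTR = phi (p_0, p_2) */
      long new_dataref_ptr = dataref_ptr + bump;   /* NEW_DATAREF_PTR = DATAREF_PTR + BUMP */
      p_2 = new_dataref_ptr + step;                /* PTR_INCR: p_2 = NEW_DATAREF_PTR + step */
      printf ("iteration %d accesses offsets %ld and %ld\n",
              iter, dataref_ptr, new_dataref_ptr);
    }
  return 0;
}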
5651 bump_vector_ptr (vec_info
*vinfo
,
5652 tree dataref_ptr
, gimple
*ptr_incr
, gimple_stmt_iterator
*gsi
,
5653 stmt_vec_info stmt_info
, tree bump
)
5655 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
5656 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
5657 tree update
= TYPE_SIZE_UNIT (vectype
);
5660 use_operand_p use_p
;
5661 tree new_dataref_ptr
;
5666 if (TREE_CODE (dataref_ptr
) == SSA_NAME
)
5667 new_dataref_ptr
= copy_ssa_name (dataref_ptr
);
5668 else if (is_gimple_min_invariant (dataref_ptr
))
5669 /* When possible avoid emitting a separate increment stmt that would
5670 force the addressed object to become addressable. */
5671 return build1 (ADDR_EXPR
, TREE_TYPE (dataref_ptr
),
5672 fold_build2 (MEM_REF
,
5673 TREE_TYPE (TREE_TYPE (dataref_ptr
)),
5675 fold_convert (ptr_type_node
, update
)));
5677 new_dataref_ptr
= make_ssa_name (TREE_TYPE (dataref_ptr
));
5678 incr_stmt
= gimple_build_assign (new_dataref_ptr
, POINTER_PLUS_EXPR
,
5679 dataref_ptr
, update
);
5680 vect_finish_stmt_generation (vinfo
, stmt_info
, incr_stmt
, gsi
);
5681 /* Fold the increment, avoiding excessive chains of use-def links
5682 that would cause compile-time issues for passes run until the next
5683 forwprop pass, which would do this folding as well. */
5684 gimple_stmt_iterator fold_gsi
= gsi_for_stmt (incr_stmt
);
5685 if (fold_stmt (&fold_gsi
, follow_all_ssa_edges
))
5687 incr_stmt
= gsi_stmt (fold_gsi
);
5688 update_stmt (incr_stmt
);
5691 /* Copy the points-to information if it exists. */
5692 if (DR_PTR_INFO (dr))
5694 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
5695 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
5699 return new_dataref_ptr
;
5701 /* Update the vector-pointer's cross-iteration increment. */
5702 FOR_EACH_SSA_USE_OPERAND (use_p
, ptr_incr
, iter
, SSA_OP_USE
)
5704 tree use
= USE_FROM_PTR (use_p
);
5706 if (use
== dataref_ptr
)
5707 SET_USE (use_p
, new_dataref_ptr
);
5709 gcc_assert (operand_equal_p (use
, update
, 0));
5712 return new_dataref_ptr
;
5716 /* Copy memory reference info such as base/clique from the SRC reference
5717 to the DEST MEM_REF. */
5720 vect_copy_ref_info (tree dest, tree src)
5722 if (TREE_CODE (dest) != MEM_REF)
5725 tree src_base = src;
5726 while (handled_component_p (src_base))
5727 src_base = TREE_OPERAND (src_base, 0);
5728 if (TREE_CODE (src_base) != MEM_REF
5729 && TREE_CODE (src_base) != TARGET_MEM_REF)
5732 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5733 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5737 /* Function vect_create_destination_var.
5739 Create a new temporary of type VECTYPE. */
5742 vect_create_destination_var (tree scalar_dest
, tree vectype
)
5748 enum vect_var_kind kind
;
5751 ? VECTOR_BOOLEAN_TYPE_P (vectype
)
5755 type
= vectype
? vectype
: TREE_TYPE (scalar_dest
);
5757 gcc_assert (TREE_CODE (scalar_dest
) == SSA_NAME
);
5759 name
= get_name (scalar_dest
);
5761 new_name
= xasprintf ("%s_%u", name
, SSA_NAME_VERSION (scalar_dest
));
5763 new_name
= xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest
));
5764 vec_dest
= vect_get_new_vect_var (type
, kind
, new_name
);
5770 /* Function vect_grouped_store_supported.
5772 Returns TRUE if interleave high and interleave low permutations
5773 are supported, and FALSE otherwise. */
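/* Illustrative sketch (not GCC code): build the interleave-high/low selector
   vectors whose support this function queries, using a plain std::vector
   instead of vec_perm_builder.  By VEC_PERM_EXPR convention, indices
   0..NELT-1 select elements of the first input vector and NELT..2*NELT-1
   elements of the second.  For NELT == 4 this yields {0, 4, 1, 5} and
   {2, 6, 3, 7}, matching the interleave_high/low example given in the
   vect_permute_store_chain comment further below.  */

#include <vector>

static std::vector<unsigned>
interleave_selector (unsigned nelt, bool high)
{
  std::vector<unsigned> sel (nelt);
  unsigned base = high ? 0 : nelt / 2;   /* which half of each input to take */
  for (unsigned i = 0; i < nelt / 2; i++)
    {
      sel[2 * i] = base + i;              /* element of the first input  */
      sel[2 * i + 1] = nelt + base + i;   /* element of the second input */
    }
  return sel;
}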
5776 vect_grouped_store_supported (tree vectype
, unsigned HOST_WIDE_INT count
)
5778 machine_mode mode
= TYPE_MODE (vectype
);
5780 /* vect_permute_store_chain requires the group size to be equal to 3 or
5781 be a power of two. */
5782 if (count
!= 3 && exact_log2 (count
) == -1)
5784 if (dump_enabled_p ())
5785 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5786 "the size of the group of accesses"
5787 " is not a power of 2 or not eqaul to 3\n");
5791 /* Check that the permutation is supported. */
5792 if (VECTOR_MODE_P (mode
))
5797 unsigned int j0
= 0, j1
= 0, j2
= 0;
5801 if (!GET_MODE_NUNITS (mode
).is_constant (&nelt
))
5803 if (dump_enabled_p ())
5804 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5805 "cannot handle groups of 3 stores for"
5806 " variable-length vectors\n");
5810 vec_perm_builder
sel (nelt
, nelt
, 1);
5811 sel
.quick_grow (nelt
);
5812 vec_perm_indices indices
;
5813 for (j
= 0; j
< 3; j
++)
5815 int nelt0
= ((3 - j
) * nelt
) % 3;
5816 int nelt1
= ((3 - j
) * nelt
+ 1) % 3;
5817 int nelt2
= ((3 - j
) * nelt
+ 2) % 3;
5818 for (i
= 0; i
< nelt
; i
++)
5820 if (3 * i
+ nelt0
< nelt
)
5821 sel
[3 * i
+ nelt0
] = j0
++;
5822 if (3 * i
+ nelt1
< nelt
)
5823 sel
[3 * i
+ nelt1
] = nelt
+ j1
++;
5824 if (3 * i
+ nelt2
< nelt
)
5825 sel
[3 * i
+ nelt2
] = 0;
5827 indices
.new_vector (sel
, 2, nelt
);
5828 if (!can_vec_perm_const_p (mode
, mode
, indices
))
5830 if (dump_enabled_p ())
5831 dump_printf (MSG_MISSED_OPTIMIZATION
,
5832 "permutation op not supported by target.\n");
5836 for (i
= 0; i
< nelt
; i
++)
5838 if (3 * i
+ nelt0
< nelt
)
5839 sel
[3 * i
+ nelt0
] = 3 * i
+ nelt0
;
5840 if (3 * i
+ nelt1
< nelt
)
5841 sel
[3 * i
+ nelt1
] = 3 * i
+ nelt1
;
5842 if (3 * i
+ nelt2
< nelt
)
5843 sel
[3 * i
+ nelt2
] = nelt
+ j2
++;
5845 indices
.new_vector (sel
, 2, nelt
);
5846 if (!can_vec_perm_const_p (mode
, mode
, indices
))
5848 if (dump_enabled_p ())
5849 dump_printf (MSG_MISSED_OPTIMIZATION
,
5850 "permutation op not supported by target.\n");
5858 /* If length is not equal to 3 then only a power of 2 is supported. */
5859 gcc_assert (pow2p_hwi (count
));
5860 poly_uint64 nelt
= GET_MODE_NUNITS (mode
);
5862 /* The encoding has 2 interleaved stepped patterns. */
5863 if(!multiple_p (nelt
, 2))
5865 vec_perm_builder
sel (nelt
, 2, 3);
5867 for (i
= 0; i
< 3; i
++)
5870 sel
[i
* 2 + 1] = i
+ nelt
;
5872 vec_perm_indices
indices (sel
, 2, nelt
);
5873 if (can_vec_perm_const_p (mode
, mode
, indices
))
5875 for (i
= 0; i
< 6; i
++)
5876 sel
[i
] += exact_div (nelt
, 2);
5877 indices
.new_vector (sel
, 2, nelt
);
5878 if (can_vec_perm_const_p (mode
, mode
, indices
))
5884 if (dump_enabled_p ())
5885 dump_printf (MSG_MISSED_OPTIMIZATION
,
5886 "permutation op not supported by target.\n");
5890 /* Return FN if vec_{mask_,mask_len_}store_lanes is available for COUNT vectors
5891 of type VECTYPE. MASKED_P says whether the masked form is needed. */
5894 vect_store_lanes_supported (tree vectype
, unsigned HOST_WIDE_INT count
,
5897 if (vect_lanes_optab_supported_p ("vec_mask_len_store_lanes",
5898 vec_mask_len_store_lanes_optab
, vectype
,
5900 return IFN_MASK_LEN_STORE_LANES
;
5903 if (vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5904 vec_mask_store_lanes_optab
, vectype
,
5906 return IFN_MASK_STORE_LANES
;
5910 if (vect_lanes_optab_supported_p ("vec_store_lanes",
5911 vec_store_lanes_optab
, vectype
, count
))
5912 return IFN_STORE_LANES
;
5918 /* Function vect_permute_store_chain.
5920 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5921 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5922 the data correctly for the stores. Return the final references for stores
5925 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5926 The input is 4 vectors each containing 8 elements. We assign a number to
5927 each element, the input sequence is:
5929 1st vec: 0 1 2 3 4 5 6 7
5930 2nd vec: 8 9 10 11 12 13 14 15
5931 3rd vec: 16 17 18 19 20 21 22 23
5932 4th vec: 24 25 26 27 28 29 30 31
5934 The output sequence should be:
5936 1st vec: 0 8 16 24 1 9 17 25
5937 2nd vec: 2 10 18 26 3 11 19 27
5938 3rd vec: 4 12 20 28 5 13 21 29
5939 4th vec: 6 14 22 30 7 15 23 31
5941 i.e., we interleave the contents of the four vectors in their order.
5943 We use interleave_high/low instructions to create such output. The input of
5944 each interleave_high/low operation is two vectors:
5947 the even elements of the result vector are obtained left-to-right from the
5948 high/low elements of the first vector. The odd elements of the result are
5949 obtained left-to-right from the high/low elements of the second vector.
5950 The output of interleave_high will be: 0 4 1 5
5951 and of interleave_low: 2 6 3 7
5954 The permutation is done in log LENGTH stages. In each stage interleave_high
5955 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5956 where the first argument is taken from the first half of DR_CHAIN and the
5957 second argument from its second half.
5960 I1: interleave_high (1st vec, 3rd vec)
5961 I2: interleave_low (1st vec, 3rd vec)
5962 I3: interleave_high (2nd vec, 4th vec)
5963 I4: interleave_low (2nd vec, 4th vec)
5965 The output for the first stage is:
5967 I1: 0 16 1 17 2 18 3 19
5968 I2: 4 20 5 21 6 22 7 23
5969 I3: 8 24 9 25 10 26 11 27
5970 I4: 12 28 13 29 14 30 15 31
5972 The output of the second stage, i.e. the final result is:
5974 I1: 0 8 16 24 1 9 17 25
5975 I2: 2 10 18 26 3 11 19 27
5976 I3: 4 12 20 28 5 13 21 29
5977 I4: 6 14 22 30 7 15 23 31. */
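/* Illustrative sketch (not part of GCC): simulate the log2(LENGTH) interleave
   stages described above on plain arrays, for LENGTH == 4 and NELT == 8.
   Each stage pairs vector J with vector J + LENGTH/2 and writes the
   interleave of their first halves to slot 2*J and of their second halves to
   slot 2*J + 1.  Running this prints the final sequence 0 8 16 24 1 9 17 25,
   2 10 18 26 ..., as listed above.  */

#include <array>
#include <cstdio>

int
main ()
{
  const unsigned nelt = 8, length = 4, log_length = 2;
  std::array<std::array<int, nelt>, length> chain, result;
  for (unsigned v = 0; v < length; v++)
    for (unsigned i = 0; i < nelt; i++)
      chain[v][i] = v * nelt + i;

  for (unsigned stage = 0; stage < log_length; stage++)
    {
      for (unsigned j = 0; j < length / 2; j++)
        {
          const std::array<int, nelt> &v1 = chain[j], &v2 = chain[j + length / 2];
          for (unsigned i = 0; i < nelt / 2; i++)
            {
              result[2 * j][2 * i] = v1[i];                   /* interleave_high ... */
              result[2 * j][2 * i + 1] = v2[i];
              result[2 * j + 1][2 * i] = v1[nelt / 2 + i];    /* ... interleave_low  */
              result[2 * j + 1][2 * i + 1] = v2[nelt / 2 + i];
            }
        }
      chain = result;
    }

  for (unsigned v = 0; v < length; v++)
    {
      for (unsigned i = 0; i < nelt; i++)
        printf (" %2d", chain[v][i]);
      printf ("\n");
    }
  return 0;
}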
5980 vect_permute_store_chain (vec_info
*vinfo
, vec
<tree
> &dr_chain
,
5981 unsigned int length
,
5982 stmt_vec_info stmt_info
,
5983 gimple_stmt_iterator
*gsi
,
5984 vec
<tree
> *result_chain
)
5986 tree vect1
, vect2
, high
, low
;
5988 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
5989 tree perm_mask_low
, perm_mask_high
;
5991 tree perm3_mask_low
, perm3_mask_high
;
5992 unsigned int i
, j
, n
, log_length
= exact_log2 (length
);
5994 result_chain
->quick_grow (length
);
5995 memcpy (result_chain
->address (), dr_chain
.address (),
5996 length
* sizeof (tree
));
6000 /* vect_grouped_store_supported ensures that this is constant. */
6001 unsigned int nelt
= TYPE_VECTOR_SUBPARTS (vectype
).to_constant ();
6002 unsigned int j0
= 0, j1
= 0, j2
= 0;
6004 vec_perm_builder
sel (nelt
, nelt
, 1);
6005 sel
.quick_grow (nelt
);
6006 vec_perm_indices indices
;
6007 for (j
= 0; j
< 3; j
++)
6009 int nelt0
= ((3 - j
) * nelt
) % 3;
6010 int nelt1
= ((3 - j
) * nelt
+ 1) % 3;
6011 int nelt2
= ((3 - j
) * nelt
+ 2) % 3;
6013 for (i
= 0; i
< nelt
; i
++)
6015 if (3 * i
+ nelt0
< nelt
)
6016 sel
[3 * i
+ nelt0
] = j0
++;
6017 if (3 * i
+ nelt1
< nelt
)
6018 sel
[3 * i
+ nelt1
] = nelt
+ j1
++;
6019 if (3 * i
+ nelt2
< nelt
)
6020 sel
[3 * i
+ nelt2
] = 0;
6022 indices
.new_vector (sel
, 2, nelt
);
6023 perm3_mask_low
= vect_gen_perm_mask_checked (vectype
, indices
);
6025 for (i
= 0; i
< nelt
; i
++)
6027 if (3 * i
+ nelt0
< nelt
)
6028 sel
[3 * i
+ nelt0
] = 3 * i
+ nelt0
;
6029 if (3 * i
+ nelt1
< nelt
)
6030 sel
[3 * i
+ nelt1
] = 3 * i
+ nelt1
;
6031 if (3 * i
+ nelt2
< nelt
)
6032 sel
[3 * i
+ nelt2
] = nelt
+ j2
++;
6034 indices
.new_vector (sel
, 2, nelt
);
6035 perm3_mask_high
= vect_gen_perm_mask_checked (vectype
, indices
);
6037 vect1
= dr_chain
[0];
6038 vect2
= dr_chain
[1];
6040 /* Create interleaving stmt:
6041 low = VEC_PERM_EXPR <vect1, vect2,
6042 {j, nelt, *, j + 1, nelt + j + 1, *,
6043 j + 2, nelt + j + 2, *, ...}> */
6044 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_low");
6045 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect1
,
6046 vect2
, perm3_mask_low
);
6047 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6050 vect2
= dr_chain
[2];
6051 /* Create interleaving stmt:
6052 low = VEC_PERM_EXPR <vect1, vect2,
6053 {0, 1, nelt + j, 3, 4, nelt + j + 1,
6054 6, 7, nelt + j + 2, ...}> */
6055 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_high");
6056 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect1
,
6057 vect2
, perm3_mask_high
);
6058 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6059 (*result_chain
)[j
] = data_ref
;
6064 /* If length is not equal to 3 then only a power of 2 is supported. */
6065 gcc_assert (pow2p_hwi (length
));
6067 /* The encoding has 2 interleaved stepped patterns. */
6068 poly_uint64 nelt
= TYPE_VECTOR_SUBPARTS (vectype
);
6069 vec_perm_builder
sel (nelt
, 2, 3);
6071 for (i
= 0; i
< 3; i
++)
6074 sel
[i
* 2 + 1] = i
+ nelt
;
6076 vec_perm_indices
indices (sel
, 2, nelt
);
6077 perm_mask_high
= vect_gen_perm_mask_checked (vectype
, indices
);
6079 for (i
= 0; i
< 6; i
++)
6080 sel
[i
] += exact_div (nelt
, 2);
6081 indices
.new_vector (sel
, 2, nelt
);
6082 perm_mask_low
= vect_gen_perm_mask_checked (vectype
, indices
);
6084 for (i
= 0, n
= log_length
; i
< n
; i
++)
6086 for (j
= 0; j
< length
/2; j
++)
6088 vect1
= dr_chain
[j
];
6089 vect2
= dr_chain
[j
+length
/2];
6091 /* Create interleaving stmt:
6092 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
6094 high
= make_temp_ssa_name (vectype
, NULL
, "vect_inter_high");
6095 perm_stmt
= gimple_build_assign (high
, VEC_PERM_EXPR
, vect1
,
6096 vect2
, perm_mask_high
);
6097 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6098 (*result_chain
)[2*j
] = high
;
6100 /* Create interleaving stmt:
6101 low = VEC_PERM_EXPR <vect1, vect2,
6102 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
6104 low
= make_temp_ssa_name (vectype
, NULL
, "vect_inter_low");
6105 perm_stmt
= gimple_build_assign (low
, VEC_PERM_EXPR
, vect1
,
6106 vect2
, perm_mask_low
);
6107 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6108 (*result_chain
)[2*j
+1] = low
;
6110 memcpy (dr_chain
.address (), result_chain
->address (),
6111 length
* sizeof (tree
));
6116 /* Function vect_setup_realignment
6118 This function is called when vectorizing an unaligned load using
6119 the dr_explicit_realign[_optimized] scheme.
6120 This function generates the following code at the loop prolog:
6123 x msq_init = *(floor(p)); # prolog load
6124 realignment_token = call target_builtin;
6126 x msq = phi (msq_init, ---)
6128 The stmts marked with x are generated only for the case of
6129 dr_explicit_realign_optimized.
6131 The code above sets up a new (vector) pointer, pointing to the first
6132 location accessed by STMT_INFO, and a "floor-aligned" load using that
6133 pointer. It also generates code to compute the "realignment-token"
6134 (if the relevant target hook was defined), and creates a phi-node at the
6135 loop-header bb whose arguments are the result of the prolog-load (created
6136 by this function) and the result of a load that takes place in the loop
6137 (to be created by the caller to this function).
6139 For the case of dr_explicit_realign_optimized:
6140 The caller to this function uses the phi-result (msq) to create the
6141 realignment code inside the loop, and sets up the missing phi argument,
6144 msq = phi (msq_init, lsq)
6145 lsq = *(floor(p')); # load in loop
6146 result = realign_load (msq, lsq, realignment_token);
6148 For the case of dr_explicit_realign:
6150 msq = *(floor(p)); # load in loop
6152 lsq = *(floor(p')); # load in loop
6153 result = realign_load (msq, lsq, realignment_token);
6156 STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
6157 a memory location that may be unaligned.
6158 BSI - place where new code is to be inserted.
6159 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
6163 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
6164 target hook, if defined.
6165 Return value - the result of the loop-header phi node. */
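/* Illustrative sketch (not GCC code) of what the realign_load above computes,
   using "vectors" of VS == 4 ints over a plain array.  floor (p) is p rounded
   down to a vector boundary, and the realignment token is modelled here as
   just the element misalignment p % VS; a real target encodes it as a
   permutation mask produced by the builtin_mask_for_load hook.  */

#include <cstdio>

static const int VS = 4;                /* elements per vector */

static void
realign_load (const int *msq, const int *lsq, int token, int *out)
{
  /* The last VS - TOKEN elements of MSQ followed by the first TOKEN elements
     of LSQ: the unaligned vector that starts TOKEN elements into MSQ.  */
  for (int i = 0; i < VS; i++)
    out[i] = i < VS - token ? msq[token + i] : lsq[i - (VS - token)];
}

int
main ()
{
  int a[16], result[VS];
  for (int i = 0; i < 16; i++)
    a[i] = i;

  int p = 6;                            /* unaligned start index            */
  int token = p % VS;                   /* misalignment, fixed for the loop */
  const int *msq = &a[p - token];       /* msq = *(floor (p))               */
  const int *lsq = msq + VS;            /* lsq = *(floor (p')), p' = p + VS */

  realign_load (msq, lsq, token, result);
  for (int i = 0; i < VS; i++)
    printf (" %d", result[i]);          /* prints 6 7 8 9 */
  printf ("\n");
  return 0;
}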
6168 vect_setup_realignment (vec_info
*vinfo
, stmt_vec_info stmt_info
,
6169 gimple_stmt_iterator
*gsi
, tree
*realignment_token
,
6170 enum dr_alignment_support alignment_support_scheme
,
6172 class loop
**at_loop
)
6174 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
6175 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
6176 dr_vec_info
*dr_info
= STMT_VINFO_DR_INFO (stmt_info
);
6177 struct data_reference
*dr
= dr_info
->dr
;
6178 class loop
*loop
= NULL
;
6180 tree scalar_dest
= gimple_assign_lhs (stmt_info
->stmt
);
6186 tree msq_init
= NULL_TREE
;
6189 tree msq
= NULL_TREE
;
6190 gimple_seq stmts
= NULL
;
6191 bool compute_in_loop
= false;
6192 bool nested_in_vect_loop
= false;
6193 class loop
*containing_loop
= (gimple_bb (stmt_info
->stmt
))->loop_father
;
6194 class loop
*loop_for_initial_load
= NULL
;
6198 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
6199 nested_in_vect_loop
= nested_in_vect_loop_p (loop
, stmt_info
);
6202 gcc_assert (alignment_support_scheme
== dr_explicit_realign
6203 || alignment_support_scheme
== dr_explicit_realign_optimized
);
6205 /* We need to generate three things:
6206 1. the misalignment computation
6207 2. the extra vector load (for the optimized realignment scheme).
6208 3. the phi node for the two vectors from which the realignment is
6209 done (for the optimized realignment scheme). */
6211 /* 1. Determine where to generate the misalignment computation.
6213 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
6214 calculation will be generated by this function, outside the loop (in the
6215 preheader). Otherwise, INIT_ADDR had already been computed for us by the
6216 caller, inside the loop.
6218 Background: If the misalignment remains fixed throughout the iterations of
6219 the loop, then both realignment schemes are applicable, and also the
6220 misalignment computation can be done outside LOOP. This is because we are
6221 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
6222 are a multiple of VS (the Vector Size), and therefore the misalignment in
6223 different vectorized LOOP iterations is always the same.
6224 The problem arises only if the memory access is in an inner-loop nested
6225 inside LOOP, which is now being vectorized using outer-loop vectorization.
6226 This is the only case when the misalignment of the memory access may not
6227 remain fixed throughout the iterations of the inner-loop (as explained in
6228 detail in vect_supportable_dr_alignment). In this case, not only is the
6229 optimized realignment scheme not applicable, but also the misalignment
6230 computation (and generation of the realignment token that is passed to
6231 REALIGN_LOAD) have to be done inside the loop.
6233 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
6234 or not, which in turn determines if the misalignment is computed inside
6235 the inner-loop, or outside LOOP. */
6237 if (init_addr
!= NULL_TREE
|| !loop_vinfo
)
6239 compute_in_loop
= true;
6240 gcc_assert (alignment_support_scheme
== dr_explicit_realign
);
6244 /* 2. Determine where to generate the extra vector load.
6246 For the optimized realignment scheme, instead of generating two vector
6247 loads in each iteration, we generate a single extra vector load in the
6248 preheader of the loop, and in each iteration reuse the result of the
6249 vector load from the previous iteration. In case the memory access is in
6250 an inner-loop nested inside LOOP, which is now being vectorized using
6251 outer-loop vectorization, we need to determine whether this initial vector
6252 load should be generated at the preheader of the inner-loop, or can be
6253 generated at the preheader of LOOP. If the memory access has no evolution
6254 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
6255 to be generated inside LOOP (in the preheader of the inner-loop). */
6257 if (nested_in_vect_loop
)
6259 tree outerloop_step
= STMT_VINFO_DR_STEP (stmt_info
);
6260 bool invariant_in_outerloop
=
6261 (tree_int_cst_compare (outerloop_step
, size_zero_node
) == 0);
6262 loop_for_initial_load
= (invariant_in_outerloop
? loop
: loop
->inner
);
6265 loop_for_initial_load
= loop
;
6267 *at_loop
= loop_for_initial_load
;
6269 tree vuse
= NULL_TREE
;
6270 if (loop_for_initial_load
)
6272 pe
= loop_preheader_edge (loop_for_initial_load
);
6273 if (gphi
*vphi
= get_virtual_phi (loop_for_initial_load
->header
))
6274 vuse
= PHI_ARG_DEF_FROM_EDGE (vphi
, pe
);
6277 vuse
= gimple_vuse (gsi_stmt (*gsi
));
6279 /* 3. For the case of the optimized realignment, create the first vector
6280 load at the loop preheader. */
6282 if (alignment_support_scheme
== dr_explicit_realign_optimized
)
6284 /* Create msq_init = *(floor(p1)) in the loop preheader */
6287 gcc_assert (!compute_in_loop
);
6288 vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
6289 ptr
= vect_create_data_ref_ptr (vinfo
, stmt_info
, vectype
,
6290 loop_for_initial_load
, NULL_TREE
,
6291 &init_addr
, NULL
, &inc
, true);
6292 if (TREE_CODE (ptr
) == SSA_NAME
)
6293 new_temp
= copy_ssa_name (ptr
);
6295 new_temp
= make_ssa_name (TREE_TYPE (ptr
));
6296 poly_uint64 align
= DR_TARGET_ALIGNMENT (dr_info
);
6297 tree type
= TREE_TYPE (ptr
);
6298 new_stmt
= gimple_build_assign
6299 (new_temp
, BIT_AND_EXPR
, ptr
,
6300 fold_build2 (MINUS_EXPR
, type
,
6301 build_int_cst (type
, 0),
6302 build_int_cst (type
, align
)));
6303 new_bb
= gsi_insert_on_edge_immediate (pe
, new_stmt
);
6304 gcc_assert (!new_bb
);
6306 = build2 (MEM_REF
, TREE_TYPE (vec_dest
), new_temp
,
6307 build_int_cst (reference_alias_ptr_type (DR_REF (dr
)), 0));
6308 vect_copy_ref_info (data_ref
, DR_REF (dr
));
6309 new_stmt
= gimple_build_assign (vec_dest
, data_ref
);
6310 new_temp
= make_ssa_name (vec_dest
, new_stmt
);
6311 gimple_assign_set_lhs (new_stmt
, new_temp
);
6312 gimple_set_vuse (new_stmt
, vuse
);
6315 new_bb
= gsi_insert_on_edge_immediate (pe
, new_stmt
);
6316 gcc_assert (!new_bb
);
6319 gsi_insert_before (gsi
, new_stmt
, GSI_SAME_STMT
);
6321 msq_init
= gimple_assign_lhs (new_stmt
);
6324 /* 4. Create realignment token using a target builtin, if available.
6325 It is done either inside the containing loop, or before LOOP (as
6326 determined above). */
6328 if (targetm
.vectorize
.builtin_mask_for_load
)
6333 /* Compute INIT_ADDR - the initial addressed accessed by this memref. */
6336 /* Generate the INIT_ADDR computation outside LOOP. */
6337 init_addr
= vect_create_addr_base_for_vector_ref (vinfo
,
6342 pe
= loop_preheader_edge (loop
);
6343 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, stmts
);
6344 gcc_assert (!new_bb
);
6347 gsi_insert_seq_before (gsi
, stmts
, GSI_SAME_STMT
);
6350 builtin_decl
= targetm
.vectorize
.builtin_mask_for_load ();
6351 new_stmt
= gimple_build_call (builtin_decl
, 1, init_addr
);
6353 vect_create_destination_var (scalar_dest
,
6354 gimple_call_return_type (new_stmt
));
6355 new_temp
= make_ssa_name (vec_dest
, new_stmt
);
6356 gimple_call_set_lhs (new_stmt
, new_temp
);
6358 if (compute_in_loop
)
6359 gsi_insert_before (gsi
, new_stmt
, GSI_SAME_STMT
);
6362 /* Generate the misalignment computation outside LOOP. */
6363 pe
= loop_preheader_edge (loop
);
6364 new_bb
= gsi_insert_on_edge_immediate (pe
, new_stmt
);
6365 gcc_assert (!new_bb
);
6368 *realignment_token
= gimple_call_lhs (new_stmt
);
6370 /* The result of the CALL_EXPR to this builtin is determined from
6371 the value of the parameter and no global variables are touched
6372 which makes the builtin a "const" function. Requiring the
6373 builtin to have the "const" attribute makes it unnecessary
6374 to call mark_call_clobbered. */
6375 gcc_assert (TREE_READONLY (builtin_decl
));
6378 if (alignment_support_scheme
== dr_explicit_realign
)
6381 gcc_assert (!compute_in_loop
);
6382 gcc_assert (alignment_support_scheme
== dr_explicit_realign_optimized
);
6385 /* 5. Create msq = phi <msq_init, lsq> in loop */
6387 pe
= loop_preheader_edge (containing_loop
);
6388 vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
6389 msq
= make_ssa_name (vec_dest
);
6390 phi_stmt
= create_phi_node (msq
, containing_loop
->header
);
6391 add_phi_arg (phi_stmt
, msq_init
, pe
, UNKNOWN_LOCATION
);
6397 /* Function vect_grouped_load_supported.
6399 COUNT is the size of the load group (the number of statements plus the
6400 number of gaps). SINGLE_ELEMENT_P is true if there is actually
6401 only one statement, with a gap of COUNT - 1.
6403 Returns true if a suitable permute exists. */
6406 vect_grouped_load_supported (tree vectype
, bool single_element_p
,
6407 unsigned HOST_WIDE_INT count
)
6409 machine_mode mode
= TYPE_MODE (vectype
);
6411 /* If this is single-element interleaving with an element distance
6412 that leaves unused vector loads around, punt; we at least create
6413 very sub-optimal code in that case (and blow up memory,
6415 if (single_element_p
&& maybe_gt (count
, TYPE_VECTOR_SUBPARTS (vectype
)))
6417 if (dump_enabled_p ())
6418 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6419 "single-element interleaving not supported "
6420 "for not adjacent vector loads\n");
6424 /* vect_permute_load_chain requires the group size to be equal to 3 or
6425 be a power of two. */
6426 if (count
!= 3 && exact_log2 (count
) == -1)
6428 if (dump_enabled_p ())
6429 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6430 "the size of the group of accesses"
6431 " is not a power of 2 or not equal to 3\n");
6435 /* Check that the permutation is supported. */
6436 if (VECTOR_MODE_P (mode
))
6442 if (!GET_MODE_NUNITS (mode
).is_constant (&nelt
))
6444 if (dump_enabled_p ())
6445 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6446 "cannot handle groups of 3 loads for"
6447 " variable-length vectors\n");
6451 vec_perm_builder
sel (nelt
, nelt
, 1);
6452 sel
.quick_grow (nelt
);
6453 vec_perm_indices indices
;
6455 for (k
= 0; k
< 3; k
++)
6457 for (i
= 0; i
< nelt
; i
++)
6458 if (3 * i
+ k
< 2 * nelt
)
6462 indices
.new_vector (sel
, 2, nelt
);
6463 if (!can_vec_perm_const_p (mode
, mode
, indices
))
6465 if (dump_enabled_p ())
6466 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6467 "shuffle of 3 loads is not supported by"
6471 for (i
= 0, j
= 0; i
< nelt
; i
++)
6472 if (3 * i
+ k
< 2 * nelt
)
6475 sel
[i
] = nelt
+ ((nelt
+ k
) % 3) + 3 * (j
++);
6476 indices
.new_vector (sel
, 2, nelt
);
6477 if (!can_vec_perm_const_p (mode
, mode
, indices
))
6479 if (dump_enabled_p ())
6480 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6481 "shuffle of 3 loads is not supported by"
6490 /* If length is not equal to 3 then only a power of 2 is supported. */
6491 gcc_assert (pow2p_hwi (count
));
6492 poly_uint64 nelt
= GET_MODE_NUNITS (mode
);
6494 /* The encoding has a single stepped pattern. */
6495 vec_perm_builder
sel (nelt
, 1, 3);
6497 for (i
= 0; i
< 3; i
++)
6499 vec_perm_indices
indices (sel
, 2, nelt
);
6500 if (can_vec_perm_const_p (mode
, mode
, indices
))
6502 for (i
= 0; i
< 3; i
++)
6504 indices
.new_vector (sel
, 2, nelt
);
6505 if (can_vec_perm_const_p (mode
, mode
, indices
))
6511 if (dump_enabled_p ())
6512 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6513 "extract even/odd not supported by target\n");
6517 /* Return FN if vec_{masked_,mask_len_}load_lanes is available for COUNT vectors
6518 of type VECTYPE. MASKED_P says whether the masked form is needed.
6519 If it is available and ELSVALS is nonzero store the possible else values
6520 in the vector it points to. */
6523 vect_load_lanes_supported (tree vectype
, unsigned HOST_WIDE_INT count
,
6524 bool masked_p
, vec
<int> *elsvals
)
6526 if (vect_lanes_optab_supported_p ("vec_mask_len_load_lanes",
6527 vec_mask_len_load_lanes_optab
, vectype
,
6529 return IFN_MASK_LEN_LOAD_LANES
;
6532 if (vect_lanes_optab_supported_p ("vec_mask_load_lanes",
6533 vec_mask_load_lanes_optab
, vectype
,
6535 return IFN_MASK_LOAD_LANES
;
6539 if (vect_lanes_optab_supported_p ("vec_load_lanes", vec_load_lanes_optab
,
6540 vectype
, count
, elsvals
))
6541 return IFN_LOAD_LANES
;
6546 /* Function vect_permute_load_chain.
6548 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
6549 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
6550 the input data correctly. Return the final references for loads in
6553 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
6554 The input is 4 vectors each containing 8 elements. We assign a number to each
6555 element, the input sequence is:
6557 1st vec: 0 1 2 3 4 5 6 7
6558 2nd vec: 8 9 10 11 12 13 14 15
6559 3rd vec: 16 17 18 19 20 21 22 23
6560 4th vec: 24 25 26 27 28 29 30 31
6562 The output sequence should be:
6564 1st vec: 0 4 8 12 16 20 24 28
6565 2nd vec: 1 5 9 13 17 21 25 29
6566 3rd vec: 2 6 10 14 18 22 26 30
6567 4th vec: 3 7 11 15 19 23 27 31
6569 i.e., the first output vector should contain the first elements of each
6570 interleaving group, etc.
6572 We use extract_even/odd instructions to create such output. The input of
6573 each extract_even/odd operation is two vectors
6577 and the output is the vector of extracted even/odd elements. The output of
6578 extract_even will be: 0 2 4 6
6579 and of extract_odd: 1 3 5 7
6582 The permutation is done in log LENGTH stages. In each stage extract_even
6583 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
6584 their order. In our example,
6586 E1: extract_even (1st vec, 2nd vec)
6587 E2: extract_odd (1st vec, 2nd vec)
6588 E3: extract_even (3rd vec, 4th vec)
6589 E4: extract_odd (3rd vec, 4th vec)
6591 The output for the first stage will be:
6593 E1: 0 2 4 6 8 10 12 14
6594 E2: 1 3 5 7 9 11 13 15
6595 E3: 16 18 20 22 24 26 28 30
6596 E4: 17 19 21 23 25 27 29 31
6598 In order to proceed and create the correct sequence for the next stage (or
6599 for the correct output, if the second stage is the last one, as in our
6600 example), we first put the output of extract_even operation and then the
6601 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
6602 The input for the second stage is:
6604 1st vec (E1): 0 2 4 6 8 10 12 14
6605 2nd vec (E3): 16 18 20 22 24 26 28 30
6606 3rd vec (E2): 1 3 5 7 9 11 13 15
6607 4th vec (E4): 17 19 21 23 25 27 29 31
6609 The output of the second stage:
6611 E1: 0 4 8 12 16 20 24 28
6612 E2: 2 6 10 14 18 22 26 30
6613 E3: 1 5 9 13 17 21 25 29
6614 E4: 3 7 11 15 19 23 27 31
6616 And RESULT_CHAIN after reordering:
6618 1st vec (E1): 0 4 8 12 16 20 24 28
6619 2nd vec (E3): 1 5 9 13 17 21 25 29
6620 3rd vec (E2): 2 6 10 14 18 22 26 30
6621 4th vec (E4): 3 7 11 15 19 23 27 31. */
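/* Illustrative sketch (not part of GCC): simulate the log2(LENGTH)
   extract_even/extract_odd stages described above on plain arrays, for
   LENGTH == 4 and NELT == 8, including the placement of the even outputs in
   the first half of RESULT_CHAIN and the odd outputs in the second half.
   Running this prints the final sequence 0 4 8 12 16 20 24 28, 1 5 9 ...,
   2 6 10 ..., 3 7 11 ..., as listed above.  */

#include <array>
#include <cstdio>

int
main ()
{
  const unsigned nelt = 8, length = 4, log_length = 2;
  std::array<std::array<int, nelt>, length> chain, result;
  for (unsigned v = 0; v < length; v++)
    for (unsigned i = 0; i < nelt; i++)
      chain[v][i] = v * nelt + i;

  for (unsigned stage = 0; stage < log_length; stage++)
    {
      for (unsigned j = 0; j < length; j += 2)
        {
          const std::array<int, nelt> &v1 = chain[j], &v2 = chain[j + 1];
          for (unsigned i = 0; i < nelt / 2; i++)
            {
              /* extract_even (v1, v2) goes to slot j/2 ...  */
              result[j / 2][i] = v1[2 * i];
              result[j / 2][nelt / 2 + i] = v2[2 * i];
              /* ... and extract_odd (v1, v2) to slot j/2 + length/2.  */
              result[j / 2 + length / 2][i] = v1[2 * i + 1];
              result[j / 2 + length / 2][nelt / 2 + i] = v2[2 * i + 1];
            }
        }
      chain = result;
    }

  for (unsigned v = 0; v < length; v++)
    {
      for (unsigned i = 0; i < nelt; i++)
        printf (" %2d", chain[v][i]);
      printf ("\n");
    }
  return 0;
}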
6624 vect_permute_load_chain (vec_info
*vinfo
, vec
<tree
> dr_chain
,
6625 unsigned int length
,
6626 stmt_vec_info stmt_info
,
6627 gimple_stmt_iterator
*gsi
,
6628 vec
<tree
> *result_chain
)
6630 tree data_ref
, first_vect
, second_vect
;
6631 tree perm_mask_even
, perm_mask_odd
;
6632 tree perm3_mask_low
, perm3_mask_high
;
6634 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
6635 unsigned int i
, j
, log_length
= exact_log2 (length
);
6637 result_chain
->quick_grow (length
);
6638 memcpy (result_chain
->address (), dr_chain
.address (),
6639 length
* sizeof (tree
));
6643 /* vect_grouped_load_supported ensures that this is constant. */
6644 unsigned nelt
= TYPE_VECTOR_SUBPARTS (vectype
).to_constant ();
6647 vec_perm_builder
sel (nelt
, nelt
, 1);
6648 sel
.quick_grow (nelt
);
6649 vec_perm_indices indices
;
6650 for (k
= 0; k
< 3; k
++)
6652 for (i
= 0; i
< nelt
; i
++)
6653 if (3 * i
+ k
< 2 * nelt
)
6657 indices
.new_vector (sel
, 2, nelt
);
6658 perm3_mask_low
= vect_gen_perm_mask_checked (vectype
, indices
);
6660 for (i
= 0, j
= 0; i
< nelt
; i
++)
6661 if (3 * i
+ k
< 2 * nelt
)
6664 sel
[i
] = nelt
+ ((nelt
+ k
) % 3) + 3 * (j
++);
6665 indices
.new_vector (sel
, 2, nelt
);
6666 perm3_mask_high
= vect_gen_perm_mask_checked (vectype
, indices
);
6668 first_vect
= dr_chain
[0];
6669 second_vect
= dr_chain
[1];
6671 /* Create interleaving stmt (low part of):
6672 low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
6674 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_low");
6675 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, first_vect
,
6676 second_vect
, perm3_mask_low
);
6677 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6679 /* Create interleaving stmt (high part of):
6680 high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
6682 first_vect
= data_ref
;
6683 second_vect
= dr_chain
[2];
6684 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_high");
6685 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, first_vect
,
6686 second_vect
, perm3_mask_high
);
6687 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6688 (*result_chain
)[k
] = data_ref
;
6693 /* If length is not equal to 3 then only a power of 2 is supported. */
6694 gcc_assert (pow2p_hwi (length
));
6696 /* The encoding has a single stepped pattern. */
6697 poly_uint64 nelt
= TYPE_VECTOR_SUBPARTS (vectype
);
6698 vec_perm_builder
sel (nelt
, 1, 3);
6700 for (i
= 0; i
< 3; ++i
)
6702 vec_perm_indices
indices (sel
, 2, nelt
);
6703 perm_mask_even
= vect_gen_perm_mask_checked (vectype
, indices
);
6705 for (i
= 0; i
< 3; ++i
)
6707 indices
.new_vector (sel
, 2, nelt
);
6708 perm_mask_odd
= vect_gen_perm_mask_checked (vectype
, indices
);
6710 for (i
= 0; i
< log_length
; i
++)
6712 for (j
= 0; j
< length
; j
+= 2)
6714 first_vect
= dr_chain
[j
];
6715 second_vect
= dr_chain
[j
+1];
6717 /* data_ref = permute_even (first_data_ref, second_data_ref); */
6718 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_perm_even");
6719 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
6720 first_vect
, second_vect
,
6722 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6723 (*result_chain
)[j
/2] = data_ref
;
6725 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
6726 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_perm_odd");
6727 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
6728 first_vect
, second_vect
,
6730 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6731 (*result_chain
)[j
/2+length
/2] = data_ref
;
6733 memcpy (dr_chain
.address (), result_chain
->address (),
6734 length
* sizeof (tree
));
6739 /* Function vect_shift_permute_load_chain.
6741 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
6742 sequence of stmts to reorder the input data accordingly.
6743 Return the final references for loads in RESULT_CHAIN.
6744 Return true if successful, false otherwise.
6746 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
6747 The input is 3 vectors each containing 8 elements. We assign a
6748 number to each element, the input sequence is:
6750 1st vec: 0 1 2 3 4 5 6 7
6751 2nd vec: 8 9 10 11 12 13 14 15
6752 3rd vec: 16 17 18 19 20 21 22 23
6754 The output sequence should be:
6756 1st vec: 0 3 6 9 12 15 18 21
6757 2nd vec: 1 4 7 10 13 16 19 22
6758 3rd vec: 2 5 8 11 14 17 20 23
6760 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6762 First we shuffle all 3 vectors to get correct elements order:
6764 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
6765 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
6766 3rd vec: (16 19 22) (17 20 23) (18 21)
6768 Next we unite and shift vector 3 times:
6771 shift right by 6 the concatenation of:
6772 "1st vec" and "2nd vec"
6773 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6774 "2nd vec" and "3rd vec"
6775 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6776 "3rd vec" and "1st vec"
6777 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
6780 So that now new vectors are:
6782 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6783 2nd vec: (10 13) (16 19 22) (17 20 23)
6784 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6787 shift right by 5 the concatenation of:
6788 "1st vec" and "3rd vec"
6789 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6790 "2nd vec" and "1st vec"
6791 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6792 "3rd vec" and "2nd vec"
6793 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6796 So that now new vectors are:
6798 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6799 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6800 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6803 shift right by 5 the concatenation of:
6804 "1st vec" and "1st vec"
6805 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6806 shift right by 3 the concatenation of:
6807 "2nd vec" and "2nd vec"
6808 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6811 So that now all vectors are READY:
6812 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6813 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6814 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6816 This algorithm is faster than one in vect_permute_load_chain if:
6817 1. "shift of a concatenation" is faster than general permutation.
6819 2. The TARGET machine can't execute vector instructions in parallel.
6820 This is because each step of the algorithm depends on previous.
6821 The algorithm in vect_permute_load_chain is much more parallel.
6823 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
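/* Illustrative sketch (not GCC code): the "shift of a concatenation" used
   above is a VEC_PERM_EXPR whose selector is {S, S+1, ..., S+NELT-1}, i.e.
   element I of the result is element S+I of the 2*NELT-element concatenation
   of the two inputs.  The demo below reproduces the "shift right by 6" step
   from the worked example for NELT == 8; the selector {6, 7, ..., 13} is the
   shift1_mask built in the LENGTH == 3 branch of this function.  */

#include <cstdio>

static const int NELT = 8;

static void
shift_concat (const int *v1, const int *v2, int s, int *out)
{
  for (int i = 0; i < NELT; i++)
    out[i] = s + i < NELT ? v1[s + i] : v2[s + i - NELT];
}

int
main ()
{
  int v1[NELT], v2[NELT], out[NELT];
  for (int i = 0; i < NELT; i++)
    {
      v1[i] = i;              /* 0 .. 7  */
      v2[i] = NELT + i;       /* 8 .. 15 */
    }
  shift_concat (v1, v2, 6, out);
  for (int i = 0; i < NELT; i++)
    printf (" %d", out[i]);   /* prints 6 7 8 9 10 11 12 13 */
  printf ("\n");
  return 0;
}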
6827 vect_shift_permute_load_chain (vec_info
*vinfo
, vec
<tree
> dr_chain
,
6828 unsigned int length
,
6829 stmt_vec_info stmt_info
,
6830 gimple_stmt_iterator
*gsi
,
6831 vec
<tree
> *result_chain
)
6833 tree vect
[3], vect_shift
[3], data_ref
, first_vect
, second_vect
;
6834 tree perm2_mask1
, perm2_mask2
, perm3_mask
;
6835 tree select_mask
, shift1_mask
, shift2_mask
, shift3_mask
, shift4_mask
;
6838 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
6839 machine_mode vmode
= TYPE_MODE (vectype
);
6841 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
6843 unsigned HOST_WIDE_INT nelt
, vf
;
6844 if (!TYPE_VECTOR_SUBPARTS (vectype
).is_constant (&nelt
)
6845 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo
).is_constant (&vf
))
6846 /* Not supported for variable-length vectors. */
6849 vec_perm_builder
sel (nelt
, nelt
, 1);
6850 sel
.quick_grow (nelt
);
6852 result_chain
->quick_grow (length
);
6853 memcpy (result_chain
->address (), dr_chain
.address (),
6854 length
* sizeof (tree
));
6856 if (pow2p_hwi (length
) && vf
> 4)
6858 unsigned int j
, log_length
= exact_log2 (length
);
6859 for (i
= 0; i
< nelt
/ 2; ++i
)
6861 for (i
= 0; i
< nelt
/ 2; ++i
)
6862 sel
[nelt
/ 2 + i
] = i
* 2 + 1;
6863 vec_perm_indices
indices (sel
, 2, nelt
);
6864 if (!can_vec_perm_const_p (vmode
, vmode
, indices
))
6866 if (dump_enabled_p ())
6867 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6868 "shuffle of 2 fields structure is not \
6869 supported by target\n");
6872 perm2_mask1
= vect_gen_perm_mask_checked (vectype
, indices
);
6874 for (i
= 0; i
< nelt
/ 2; ++i
)
6876 for (i
= 0; i
< nelt
/ 2; ++i
)
6877 sel
[nelt
/ 2 + i
] = i
* 2;
6878 indices
.new_vector (sel
, 2, nelt
);
6879 if (!can_vec_perm_const_p (vmode
, vmode
, indices
))
6881 if (dump_enabled_p ())
6882 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6883 "shuffle of 2 fields structure is not \
6884 supported by target\n");
6887 perm2_mask2
= vect_gen_perm_mask_checked (vectype
, indices
);
6889 /* Generating permutation constant to shift all elements.
6890 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6891 for (i
= 0; i
< nelt
; i
++)
6892 sel
[i
] = nelt
/ 2 + i
;
6893 indices
.new_vector (sel
, 2, nelt
);
6894 if (!can_vec_perm_const_p (vmode
, vmode
, indices
))
6896 if (dump_enabled_p ())
6897 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6898 "shift permutation is not supported by target\n");
6901 shift1_mask
= vect_gen_perm_mask_checked (vectype
, indices
);
6903 /* Generating permutation constant to select vector from 2.
6904 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6905 for (i
= 0; i
< nelt
/ 2; i
++)
6907 for (i
= nelt
/ 2; i
< nelt
; i
++)
6909 indices
.new_vector (sel
, 2, nelt
);
6910 if (!can_vec_perm_const_p (vmode
, vmode
, indices
))
6912 if (dump_enabled_p ())
6913 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6914 "select is not supported by target\n");
6917 select_mask
= vect_gen_perm_mask_checked (vectype
, indices
);
6919 for (i
= 0; i
< log_length
; i
++)
6921 for (j
= 0; j
< length
; j
+= 2)
6923 first_vect
= dr_chain
[j
];
6924 second_vect
= dr_chain
[j
+ 1];
6926 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle2");
6927 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
6928 first_vect
, first_vect
,
6930 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6933 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle2");
6934 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
6935 second_vect
, second_vect
,
6937 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6940 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift");
6941 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
6942 vect
[0], vect
[1], shift1_mask
);
6943 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6944 (*result_chain
)[j
/2 + length
/2] = data_ref
;
6946 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_select");
6947 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
6948 vect
[0], vect
[1], select_mask
);
6949 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6950 (*result_chain
)[j
/2] = data_ref
;
6952 memcpy (dr_chain
.address (), result_chain
->address (),
6953 length
* sizeof (tree
));
6957 if (length
== 3 && vf
> 2)
6959 unsigned int k
= 0, l
= 0;
6961 /* Generating permutation constant to get all elements in right order.
6962 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6963 for (i
= 0; i
< nelt
; i
++)
6965 if (3 * k
+ (l
% 3) >= nelt
)
6968 l
+= (3 - (nelt
% 3));
6970 sel
[i
] = 3 * k
+ (l
% 3);
6973 vec_perm_indices
indices (sel
, 2, nelt
);
6974 if (!can_vec_perm_const_p (vmode
, vmode
, indices
))
6976 if (dump_enabled_p ())
6977 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6978 "shuffle of 3 fields structure is not \
6979 supported by target\n");
6982 perm3_mask
= vect_gen_perm_mask_checked (vectype
, indices
);
6984 /* Generating permutation constant to shift all elements.
6985 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6986 for (i
= 0; i
< nelt
; i
++)
6987 sel
[i
] = 2 * (nelt
/ 3) + (nelt
% 3) + i
;
6988 indices
.new_vector (sel
, 2, nelt
);
6989 if (!can_vec_perm_const_p (vmode
, vmode
, indices
))
6991 if (dump_enabled_p ())
6992 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6993 "shift permutation is not supported by target\n");
6996 shift1_mask
= vect_gen_perm_mask_checked (vectype
, indices
);
6998 /* Generating permutation constant to shift all elements.
6999 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
7000 for (i
= 0; i
< nelt
; i
++)
7001 sel
[i
] = 2 * (nelt
/ 3) + 1 + i
;
7002 indices
.new_vector (sel
, 2, nelt
);
7003 if (!can_vec_perm_const_p (vmode
, vmode
, indices
))
7005 if (dump_enabled_p ())
7006 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7007 "shift permutation is not supported by target\n");
7010 shift2_mask
= vect_gen_perm_mask_checked (vectype
, indices
);
7012 /* Generating permutation constant to shift all elements.
7013 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
7014 for (i
= 0; i
< nelt
; i
++)
7015 sel
[i
] = (nelt
/ 3) + (nelt
% 3) / 2 + i
;
7016 indices
.new_vector (sel
, 2, nelt
);
7017 if (!can_vec_perm_const_p (vmode
, vmode
, indices
))
7019 if (dump_enabled_p ())
7020 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7021 "shift permutation is not supported by target\n");
7024 shift3_mask
= vect_gen_perm_mask_checked (vectype
, indices
);
7026 /* Generating permutation constant to shift all elements.
7027 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
7028 for (i
= 0; i
< nelt
; i
++)
7029 sel
[i
] = 2 * (nelt
/ 3) + (nelt
% 3) / 2 + i
;
7030 indices
.new_vector (sel
, 2, nelt
);
7031 if (!can_vec_perm_const_p (vmode
, vmode
, indices
))
7033 if (dump_enabled_p ())
7034 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7035 "shift permutation is not supported by target\n");
7038 shift4_mask
= vect_gen_perm_mask_checked (vectype
, indices
);
7040 for (k
= 0; k
< 3; k
++)
7042 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3");
7043 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
7044 dr_chain
[k
], dr_chain
[k
],
7046 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
7050 for (k
= 0; k
< 3; k
++)
7052 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift1");
7053 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
7054 vect
[k
% 3], vect
[(k
+ 1) % 3],
7056 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
7057 vect_shift
[k
] = data_ref
;
7060 for (k
= 0; k
< 3; k
++)
7062 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift2");
7063 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
7064 vect_shift
[(4 - k
) % 3],
7065 vect_shift
[(3 - k
) % 3],
7067 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
7071 (*result_chain
)[3 - (nelt
% 3)] = vect
[2];
7073 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift3");
7074 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect
[0],
7075 vect
[0], shift3_mask
);
7076 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
7077 (*result_chain
)[nelt
% 3] = data_ref
;
7079 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift4");
7080 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect
[1],
7081 vect
[1], shift4_mask
);
7082 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
7083 (*result_chain
)[0] = data_ref
;
7089 /* Function vect_transform_grouped_load.
7091 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
7092 to perform their permutation and ascribe the result vectorized statements to
7093 the scalar statements.
7097 vect_transform_grouped_load (vec_info
*vinfo
, stmt_vec_info stmt_info
,
7099 int size
, gimple_stmt_iterator
*gsi
)
7102 vec
<tree
> result_chain
= vNULL
;
7104 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
7105 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
7106 vectors, that are ready for vector computation. */
7107 result_chain
.create (size
);
7109 /* If the reassociation width for the vector type is 2 or greater, the target
7110 machine can execute 2 or more vector instructions in parallel.  Otherwise
7111 try to get the chain for the load group using vect_shift_permute_load_chain.
7112 mode
= TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info
));
7113 if (targetm
.sched
.reassociation_width (VEC_PERM_EXPR
, mode
) > 1
7115 || !vect_shift_permute_load_chain (vinfo
, dr_chain
, size
, stmt_info
,
7116 gsi
, &result_chain
))
7117 vect_permute_load_chain (vinfo
, dr_chain
,
7118 size
, stmt_info
, gsi
, &result_chain
);
7119 vect_record_grouped_load_vectors (vinfo
, stmt_info
, result_chain
);
7120 result_chain
.release ();
7123 /* RESULT_CHAIN contains the output of a group of grouped loads that were
7124 generated as part of the vectorization of STMT_INFO. Assign the statement
7125 for each vector to the associated scalar statement. */
7128 vect_record_grouped_load_vectors (vec_info
*, stmt_vec_info stmt_info
,
7129 vec
<tree
> result_chain
)
7131 stmt_vec_info first_stmt_info
= DR_GROUP_FIRST_ELEMENT (stmt_info
);
7132 unsigned int i
, gap_count
;
7135 /* Put a permuted data-ref in the VECTORIZED_STMT field.
7136 Since we scan the chain starting from its first node, their order
7137 corresponds to the order of data-refs in RESULT_CHAIN. */
7138 stmt_vec_info next_stmt_info
= first_stmt_info
;
7140 FOR_EACH_VEC_ELT (result_chain
, i
, tmp_data_ref
)
7142 if (!next_stmt_info
)
7145 /* Skip the gaps. Loads created for the gaps will be removed by dead
7146 code elimination pass later. No need to check for the first stmt in
7147 the group, since it always exists.
7148 DR_GROUP_GAP is the number of steps in elements from the previous
7149 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
7150 correspond to the gaps. */
7151 if (next_stmt_info
!= first_stmt_info
7152 && gap_count
< DR_GROUP_GAP (next_stmt_info
))
7158 /* ??? The following needs cleanup after the removal of
7159 DR_GROUP_SAME_DR_STMT. */
7162 gimple
*new_stmt
= SSA_NAME_DEF_STMT (tmp_data_ref
);
7163 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
7164 copies, and we put the new vector statement last. */
7165 STMT_VINFO_VEC_STMTS (next_stmt_info
).safe_push (new_stmt
);
7167 next_stmt_info
= DR_GROUP_NEXT_ELEMENT (next_stmt_info
);
7173 /* Function vect_can_force_dr_alignment_p.
7175 Returns whether the alignment of a DECL can be forced to an
7176 ALIGNMENT-bit boundary. */
7179 vect_can_force_dr_alignment_p (const_tree decl
, poly_uint64 alignment
)
7184 if (decl_in_symtab_p (decl
)
7185 && !symtab_node::get (decl
)->can_increase_alignment_p ())
7188 if (TREE_STATIC (decl
))
7189 return (known_le (alignment
,
7190 (unsigned HOST_WIDE_INT
) MAX_OFILE_ALIGNMENT
));
7192 return (known_le (alignment
, (unsigned HOST_WIDE_INT
) MAX_STACK_ALIGNMENT
));
7195 /* Return whether the data reference DR_INFO is supported with respect to its
7197 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
7198 if it is aligned, i.e., check if it is possible to vectorize it with different
7201 enum dr_alignment_support
7202 vect_supportable_dr_alignment (vec_info
*vinfo
, dr_vec_info
*dr_info
,
7203 tree vectype
, int misalignment
)
7205 data_reference
*dr
= dr_info
->dr
;
7206 stmt_vec_info stmt_info
= dr_info
->stmt
;
7207 machine_mode mode
= TYPE_MODE (vectype
);
7208 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
7209 class loop
*vect_loop
= NULL
;
7210 bool nested_in_vect_loop
= false;
7212 if (misalignment
== 0)
7214 else if (dr_info
->need_peeling_for_alignment
)
7215 return dr_unaligned_unsupported
;
7217 /* For now assume all conditional loads/stores support unaligned
7218 access without any special code. */
7219 if (gcall
*stmt
= dyn_cast
<gcall
*> (stmt_info
->stmt
))
7220 if (gimple_call_internal_p (stmt
)
7221 && (gimple_call_internal_fn (stmt
) == IFN_MASK_LOAD
7222 || gimple_call_internal_fn (stmt
) == IFN_MASK_STORE
))
7223 return dr_unaligned_supported
;
7227 vect_loop
= LOOP_VINFO_LOOP (loop_vinfo
);
7228 nested_in_vect_loop
= nested_in_vect_loop_p (vect_loop
, stmt_info
);
7231 /* Possibly unaligned access. */
7233 /* We can choose between using the implicit realignment scheme (generating
7234 a misaligned_move stmt) and the explicit realignment scheme (generating
7235 aligned loads with a REALIGN_LOAD). There are two variants to the
7236 explicit realignment scheme: optimized, and unoptimized.
7237 We can optimize the realignment only if the step between consecutive
7238 vector loads is equal to the vector size. Since the vector memory
7239 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
7240 is guaranteed that the misalignment amount remains the same throughout the
7241 execution of the vectorized loop. Therefore, we can create the
7242 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
7243 at the loop preheader.
7245 However, in the case of outer-loop vectorization, when vectorizing a
7246 memory access in the inner-loop nested within the LOOP that is now being
7247 vectorized, while it is guaranteed that the misalignment of the
7248 vectorized memory access will remain the same in different outer-loop
7249 iterations, it is *not* guaranteed that it will remain the same throughout
7250 the execution of the inner-loop. This is because the inner-loop advances
7251 with the original scalar step (and not in steps of VS). If the inner-loop
7252 step happens to be a multiple of VS, then the misalignment remains fixed
7253 and we can use the optimized realignment scheme. For example:
7259 When vectorizing the i-loop in the above example, the step between
7260 consecutive vector loads is 1, and so the misalignment does not remain
7261 fixed across the execution of the inner-loop, and the realignment cannot
7262 be optimized (as illustrated in the following pseudo vectorized loop):
7264 for (i=0; i<N; i+=4)
7265 for (j=0; j<M; j++){
7266 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
7267 // when j is {0,1,2,3,4,5,6,7,...} respectively.
7268 // (assuming that we start from an aligned address).
7271 We therefore have to use the unoptimized realignment scheme:
7273 for (i=0; i<N; i+=4)
7274 for (j=k; j<M; j+=4)
7275 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
7276 // that the misalignment of the initial address is
7279 The loop can then be vectorized as follows:
7281 for (k=0; k<4; k++){
7282 rt = get_realignment_token (&vp[k]);
7283 for (i=0; i<N; i+=4){
7285 for (j=k; j<M; j+=4){
7287 va = REALIGN_LOAD <v1,v2,rt>;
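/* Illustrative sketch (not GCC code) of the misalignment claim above,
   assuming 4-element vectors and VP starting K elements away from an aligned
   address.  With the scalar inner step (j++) the misalignment of &vp[i+j]
   cycles through all residues, so no single realignment token covers the
   inner loop; with the strip-mined step (j += VF) it stays fixed at K.  */

#include <cstdio>

int
main ()
{
  const int vf = 4;
  const int k = 1;                      /* hypothetical initial misalignment */

  for (int j = 0; j < 8; j++)
    printf (" %d", (k + j) % vf);       /* 1 2 3 0 1 2 3 0: varies per j */
  printf ("\n");

  for (int j = 0; j < 8; j += vf)
    printf (" %d", (k + j) % vf);       /* 1 1: fixed at K */
  printf ("\n");
  return 0;
}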
7294 if (DR_IS_READ (dr
))
7296 if (can_implement_p (vec_realign_load_optab
, mode
)
7297 && (!targetm
.vectorize
.builtin_mask_for_load
7298 || targetm
.vectorize
.builtin_mask_for_load ()))
7300 /* If we are doing SLP then the accesses need not have the
7301 same alignment, instead it depends on the SLP group size. */
7303 && STMT_SLP_TYPE (stmt_info
)
7304 && STMT_VINFO_GROUPED_ACCESS (stmt_info
)
7305 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo
)
7307 (DR_GROUP_FIRST_ELEMENT (stmt_info
))),
7308 TYPE_VECTOR_SUBPARTS (vectype
)))
7310 else if (!loop_vinfo
7311 || (nested_in_vect_loop
7312 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr
)),
7313 GET_MODE_SIZE (TYPE_MODE (vectype
)))))
7314 return dr_explicit_realign
;
7316 return dr_explicit_realign_optimized
;
7320 bool is_packed = false;
7321 tree type = TREE_TYPE (DR_REF (dr));
7322 if (misalignment == DR_MISALIGNMENT_UNKNOWN)
7323 is_packed = not_size_aligned (DR_REF (dr));
7324 if (targetm.vectorize.support_vector_misalignment (mode, type, misalignment, is_packed))
7326 return dr_unaligned_supported;
7329 return dr_unaligned_unsupported;