AVR: Provide built-ins for strlen where the string lives in some AS.
[gcc.git] / gcc / tree-vect-data-refs.cc
blob6eda40267bd1382938a77826d11f20dcc959a166
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2025 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "pretty-print.h"
38 #include "alias.h"
39 #include "fold-const.h"
40 #include "stor-layout.h"
41 #include "tree-eh.h"
42 #include "gimplify.h"
43 #include "gimple-iterator.h"
44 #include "gimplify-me.h"
45 #include "tree-ssa-loop-ivopts.h"
46 #include "tree-ssa-loop-manip.h"
47 #include "tree-ssa-loop.h"
48 #include "cfgloop.h"
49 #include "tree-scalar-evolution.h"
50 #include "tree-vectorizer.h"
51 #include "expr.h"
52 #include "builtins.h"
53 #include "tree-cfg.h"
54 #include "tree-hash-traits.h"
55 #include "vec-perm-indices.h"
56 #include "internal-fn.h"
57 #include "gimple-fold.h"
58 #include "optabs-query.h"
60 /* Return true if load- or store-lanes optab OPTAB is implemented for
61 COUNT vectors of type VECTYPE. NAME is the name of OPTAB.
63 If it is implemented and ELSVALS is nonzero store the possible else
64 values in the vector it points to. */
66 static bool
67 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
68 tree vectype, unsigned HOST_WIDE_INT count,
69 vec<int> *elsvals = nullptr)
71 machine_mode mode, array_mode;
72 bool limit_p;
74 mode = TYPE_MODE (vectype);
75 if (!targetm.array_mode (mode, count).exists (&array_mode))
77 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
78 limit_p = !targetm.array_mode_supported_p (mode, count);
79 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
81 if (dump_enabled_p ())
82 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
83 "no array mode for %s[%wu]\n",
84 GET_MODE_NAME (mode), count);
85 return false;
89 enum insn_code icode;
90 if ((icode = convert_optab_handler (optab, array_mode, mode))
91 == CODE_FOR_nothing)
93 if (dump_enabled_p ())
94 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
95 "cannot use %s<%s><%s>\n", name,
96 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
97 return false;
100 if (dump_enabled_p ())
101 dump_printf_loc (MSG_NOTE, vect_location,
102 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
103 GET_MODE_NAME (mode));
105 if (elsvals)
106 get_supported_else_vals (icode,
107 internal_fn_else_index (IFN_MASK_LEN_LOAD_LANES),
108 *elsvals);
110 return true;
113 /* Helper function to identify a simd clone call. If this is a call to a
114 function with simd clones then return the corresponding cgraph_node,
115 otherwise return NULL. */
117 static cgraph_node*
118 simd_clone_call_p (gimple *stmt)
120 gcall *call = dyn_cast <gcall *> (stmt);
121 if (!call)
122 return NULL;
124 tree fndecl = NULL_TREE;
125 if (gimple_call_internal_p (call, IFN_MASK_CALL))
126 fndecl = TREE_OPERAND (gimple_call_arg (stmt, 0), 0);
127 else
128 fndecl = gimple_call_fndecl (stmt);
130 if (fndecl == NULL_TREE)
131 return NULL;
133 cgraph_node *node = cgraph_node::get (fndecl);
134 if (node && node->simd_clones != NULL)
135 return node;
137 return NULL;
142 /* Return the smallest scalar part of STMT_INFO.
143 This is used to determine the vectype of the stmt. We generally set the
144 vectype according to the type of the result (lhs). For stmts whose
145 result-type is different than the type of the arguments (e.g., demotion,
146 promotion), vectype will be reset appropriately (later). Note that we have
147 to visit the smallest datatype in this function, because that determines the
148 VF. If the smallest datatype in the loop is present only as the rhs of a
149 promotion operation - we'd miss it.
150 Such a case, where a variable of this datatype does not appear in the lhs
151 anywhere in the loop, can only occur if it's an invariant: e.g.:
152 'int_x = (int) short_inv', which we'd expect to have been optimized away by
153 invariant motion. However, we cannot rely on invariant motion to always
154 take invariants out of the loop, and so in the case of promotion we also
155 have to check the rhs.
156 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
157 types. */
159 tree
160 vect_get_smallest_scalar_type (stmt_vec_info stmt_info, tree scalar_type)
162 HOST_WIDE_INT lhs, rhs;
164 /* During the analysis phase, this function is called on arbitrary
165 statements that might not have scalar results. */
166 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
167 return scalar_type;
169 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
171 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
172 if (assign)
174 scalar_type = TREE_TYPE (gimple_assign_lhs (assign));
175 if (gimple_assign_cast_p (assign)
176 || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
177 || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
178 || gimple_assign_rhs_code (assign) == SAD_EXPR
179 || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
180 || gimple_assign_rhs_code (assign) == WIDEN_MULT_PLUS_EXPR
181 || gimple_assign_rhs_code (assign) == WIDEN_MULT_MINUS_EXPR
182 || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
183 || gimple_assign_rhs_code (assign) == FLOAT_EXPR)
185 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
187 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
188 if (rhs < lhs)
189 scalar_type = rhs_type;
192 else if (cgraph_node *node = simd_clone_call_p (stmt_info->stmt))
194 auto clone = node->simd_clones->simdclone;
195 for (unsigned int i = 0; i < clone->nargs; ++i)
197 if (clone->args[i].arg_type == SIMD_CLONE_ARG_TYPE_VECTOR)
199 tree arg_scalar_type = TREE_TYPE (clone->args[i].vector_type);
200 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (arg_scalar_type));
201 if (rhs < lhs)
203 scalar_type = arg_scalar_type;
204 lhs = rhs;
209 else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
211 unsigned int i = 0;
212 if (gimple_call_internal_p (call))
214 internal_fn ifn = gimple_call_internal_fn (call);
215 if (internal_load_fn_p (ifn))
216 /* For loads the LHS type does the trick. */
217 i = ~0U;
218 else if (internal_store_fn_p (ifn))
220 /* For stores use the tyep of the stored value. */
221 i = internal_fn_stored_value_index (ifn);
222 scalar_type = TREE_TYPE (gimple_call_arg (call, i));
223 i = ~0U;
225 else if (internal_fn_mask_index (ifn) == 0)
226 i = 1;
228 if (i < gimple_call_num_args (call))
230 tree rhs_type = TREE_TYPE (gimple_call_arg (call, i));
231 if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type)))
233 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
234 if (rhs < lhs)
235 scalar_type = rhs_type;
240 return scalar_type;
244 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
245 tested at run-time. Return TRUE if DDR was successfully inserted.
246 Return false if versioning is not supported. */
248 static opt_result
249 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
251 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
253 if ((unsigned) param_vect_max_version_for_alias_checks == 0)
254 return opt_result::failure_at (vect_location,
255 "will not create alias checks, as"
256 " --param vect-max-version-for-alias-checks"
257 " == 0\n");
259 opt_result res
260 = runtime_alias_check_p (ddr, loop,
261 optimize_loop_nest_for_speed_p (loop));
262 if (!res)
263 return res;
265 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
266 return opt_result::success ();
269 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
271 static void
272 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
274 const vec<tree> &checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
275 for (unsigned int i = 0; i < checks.length(); ++i)
276 if (checks[i] == value)
277 return;
279 if (dump_enabled_p ())
280 dump_printf_loc (MSG_NOTE, vect_location,
281 "need run-time check that %T is nonzero\n",
282 value);
283 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
286 /* Return true if we know that the order of vectorized DR_INFO_A and
287 vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
288 DR_INFO_B. At least one of the accesses is a write. */
290 static bool
291 vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
293 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
294 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
296 /* Single statements are always kept in their original order. */
297 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
298 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
299 return true;
301 /* If there is a loop invariant read involved we might vectorize it in
302 the prologue, breaking scalar oder with respect to the in-loop store. */
303 if ((DR_IS_READ (dr_info_a->dr) && integer_zerop (DR_STEP (dr_info_a->dr)))
304 || (DR_IS_READ (dr_info_b->dr) && integer_zerop (DR_STEP (dr_info_b->dr))))
305 return false;
307 /* STMT_A and STMT_B belong to overlapping groups. All loads are
308 emitted at the position of the first scalar load.
309 Stores in a group are emitted at the position of the last scalar store.
310 Compute that position and check whether the resulting order matches
311 the current one. */
312 stmt_vec_info il_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a);
313 if (il_a)
315 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a)))
316 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
317 s = DR_GROUP_NEXT_ELEMENT (s))
318 il_a = get_later_stmt (il_a, s);
319 else /* DR_IS_READ */
320 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
321 s = DR_GROUP_NEXT_ELEMENT (s))
322 if (get_later_stmt (il_a, s) == il_a)
323 il_a = s;
325 else
326 il_a = stmtinfo_a;
327 stmt_vec_info il_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b);
328 if (il_b)
330 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b)))
331 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
332 s = DR_GROUP_NEXT_ELEMENT (s))
333 il_b = get_later_stmt (il_b, s);
334 else /* DR_IS_READ */
335 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
336 s = DR_GROUP_NEXT_ELEMENT (s))
337 if (get_later_stmt (il_b, s) == il_b)
338 il_b = s;
340 else
341 il_b = stmtinfo_b;
342 bool a_after_b = (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a);
343 return (get_later_stmt (il_a, il_b) == il_a) == a_after_b;
346 /* A subroutine of vect_analyze_data_ref_dependence. Handle
347 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
348 distances. These distances are conservatively correct but they don't
349 reflect a guaranteed dependence.
351 Return true if this function does all the work necessary to avoid
352 an alias or false if the caller should use the dependence distances
353 to limit the vectorization factor in the usual way. LOOP_DEPTH is
354 the depth of the loop described by LOOP_VINFO and the other arguments
355 are as for vect_analyze_data_ref_dependence. */
357 static bool
358 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
359 loop_vec_info loop_vinfo,
360 int loop_depth, unsigned int *max_vf)
362 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
363 for (lambda_vector &dist_v : DDR_DIST_VECTS (ddr))
365 int dist = dist_v[loop_depth];
366 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
368 /* If the user asserted safelen >= DIST consecutive iterations
369 can be executed concurrently, assume independence.
371 ??? An alternative would be to add the alias check even
372 in this case, and vectorize the fallback loop with the
373 maximum VF set to safelen. However, if the user has
374 explicitly given a length, it's less likely that that
375 would be a win. */
376 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
378 if ((unsigned int) loop->safelen < *max_vf)
379 *max_vf = loop->safelen;
380 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
381 continue;
384 /* For dependence distances of 2 or more, we have the option
385 of limiting VF or checking for an alias at runtime.
386 Prefer to check at runtime if we can, to avoid limiting
387 the VF unnecessarily when the bases are in fact independent.
389 Note that the alias checks will be removed if the VF ends up
390 being small enough. */
391 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
392 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
393 return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
394 && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
395 && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
398 return true;
402 /* Function vect_analyze_data_ref_dependence.
404 FIXME: I needed to change the sense of the returned flag.
406 Return FALSE if there (might) exist a dependence between a memory-reference
407 DRA and a memory-reference DRB. When versioning for alias may check a
408 dependence at run-time, return TRUE. Adjust *MAX_VF according to
409 the data dependence. */
411 static opt_result
412 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
413 loop_vec_info loop_vinfo,
414 unsigned int *max_vf)
416 unsigned int i;
417 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
418 struct data_reference *dra = DDR_A (ddr);
419 struct data_reference *drb = DDR_B (ddr);
420 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
421 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
422 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
423 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
424 lambda_vector dist_v;
425 unsigned int loop_depth;
427 /* If user asserted safelen consecutive iterations can be
428 executed concurrently, assume independence. */
429 auto apply_safelen = [&]()
431 if (loop->safelen >= 2)
433 if ((unsigned int) loop->safelen < *max_vf)
434 *max_vf = loop->safelen;
435 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
436 return true;
438 return false;
441 /* In loop analysis all data references should be vectorizable. */
442 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
443 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
444 gcc_unreachable ();
446 /* Independent data accesses. */
447 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
448 return opt_result::success ();
450 if (dra == drb
451 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
452 return opt_result::success ();
454 /* We do not have to consider dependences between accesses that belong
455 to the same group, unless the stride could be smaller than the
456 group size. */
457 if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
458 && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
459 == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
460 && !STMT_VINFO_STRIDED_P (stmtinfo_a))
461 return opt_result::success ();
463 /* Even if we have an anti-dependence then, as the vectorized loop covers at
464 least two scalar iterations, there is always also a true dependence.
465 As the vectorizer does not re-order loads and stores we can ignore
466 the anti-dependence if TBAA can disambiguate both DRs similar to the
467 case with known negative distance anti-dependences (positive
468 distance anti-dependences would violate TBAA constraints). */
469 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
470 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
471 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
472 get_alias_set (DR_REF (drb))))
473 return opt_result::success ();
475 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
476 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
478 if (apply_safelen ())
479 return opt_result::success ();
481 return opt_result::failure_at
482 (stmtinfo_a->stmt,
483 "possible alias involving gather/scatter between %T and %T\n",
484 DR_REF (dra), DR_REF (drb));
487 /* Unknown data dependence. */
488 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
490 if (apply_safelen ())
491 return opt_result::success ();
493 if (dump_enabled_p ())
494 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
495 "versioning for alias required: "
496 "can't determine dependence between %T and %T\n",
497 DR_REF (dra), DR_REF (drb));
499 /* Add to list of ddrs that need to be tested at run-time. */
500 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
503 /* Known data dependence. */
504 if (DDR_NUM_DIST_VECTS (ddr) == 0)
506 if (apply_safelen ())
507 return opt_result::success ();
509 if (dump_enabled_p ())
510 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
511 "versioning for alias required: "
512 "bad dist vector for %T and %T\n",
513 DR_REF (dra), DR_REF (drb));
514 /* Add to list of ddrs that need to be tested at run-time. */
515 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
518 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
520 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
521 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
522 loop_depth, max_vf))
523 return opt_result::success ();
525 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
527 int dist = dist_v[loop_depth];
529 if (dump_enabled_p ())
530 dump_printf_loc (MSG_NOTE, vect_location,
531 "dependence distance = %d.\n", dist);
533 if (dist == 0)
535 if (dump_enabled_p ())
536 dump_printf_loc (MSG_NOTE, vect_location,
537 "dependence distance == 0 between %T and %T\n",
538 DR_REF (dra), DR_REF (drb));
540 /* When we perform grouped accesses and perform implicit CSE
541 by detecting equal accesses and doing disambiguation with
542 runtime alias tests like for
543 .. = a[i];
544 .. = a[i+1];
545 a[i] = ..;
546 a[i+1] = ..;
547 *p = ..;
548 .. = a[i];
549 .. = a[i+1];
550 where we will end up loading { a[i], a[i+1] } once, make
551 sure that inserting group loads before the first load and
552 stores after the last store will do the right thing.
553 Similar for groups like
554 a[i] = ...;
555 ... = a[i];
556 a[i+1] = ...;
557 where loads from the group interleave with the store. */
558 if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
559 return opt_result::failure_at (stmtinfo_a->stmt,
560 "READ_WRITE dependence"
561 " in interleaving.\n");
563 if (loop->safelen < 2)
565 tree indicator = dr_zero_step_indicator (dra);
566 if (!indicator || integer_zerop (indicator))
567 return opt_result::failure_at (stmtinfo_a->stmt,
568 "access also has a zero step\n");
569 else if (TREE_CODE (indicator) != INTEGER_CST)
570 vect_check_nonzero_value (loop_vinfo, indicator);
572 continue;
575 if (dist > 0 && DDR_REVERSED_P (ddr))
577 /* If DDR_REVERSED_P the order of the data-refs in DDR was
578 reversed (to make distance vector positive), and the actual
579 distance is negative. */
580 if (dump_enabled_p ())
581 dump_printf_loc (MSG_NOTE, vect_location,
582 "dependence distance negative.\n");
583 /* When doing outer loop vectorization, we need to check if there is
584 a backward dependence at the inner loop level if the dependence
585 at the outer loop is reversed. See PR81740. */
586 if (nested_in_vect_loop_p (loop, stmtinfo_a)
587 || nested_in_vect_loop_p (loop, stmtinfo_b))
589 unsigned inner_depth = index_in_loop_nest (loop->inner->num,
590 DDR_LOOP_NEST (ddr));
591 if (dist_v[inner_depth] < 0)
592 return opt_result::failure_at (stmtinfo_a->stmt,
593 "not vectorized, dependence "
594 "between data-refs %T and %T\n",
595 DR_REF (dra), DR_REF (drb));
597 /* Record a negative dependence distance to later limit the
598 amount of stmt copying / unrolling we can perform.
599 Only need to handle read-after-write dependence. */
600 if (DR_IS_READ (drb)
601 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
602 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
603 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
604 continue;
607 unsigned int abs_dist = abs (dist);
608 if (abs_dist >= 2 && abs_dist < *max_vf)
610 /* The dependence distance requires reduction of the maximal
611 vectorization factor. */
612 *max_vf = abs_dist;
613 if (dump_enabled_p ())
614 dump_printf_loc (MSG_NOTE, vect_location,
615 "adjusting maximal vectorization factor to %i\n",
616 *max_vf);
619 if (abs_dist >= *max_vf)
621 /* Dependence distance does not create dependence, as far as
622 vectorization is concerned, in this case. */
623 if (dump_enabled_p ())
624 dump_printf_loc (MSG_NOTE, vect_location,
625 "dependence distance >= VF.\n");
626 continue;
629 return opt_result::failure_at (stmtinfo_a->stmt,
630 "not vectorized, possible dependence "
631 "between data-refs %T and %T\n",
632 DR_REF (dra), DR_REF (drb));
635 return opt_result::success ();
638 /* Function vect_analyze_early_break_dependences.
640 Examine all the data references in the loop and make sure that if we have
641 multiple exits that we are able to safely move stores such that they become
642 safe for vectorization. The function also calculates the place where to move
643 the instructions to and computes what the new vUSE chain should be.
645 This works in tandem with the CFG that will be produced by
646 slpeel_tree_duplicate_loop_to_edge_cfg later on.
648 This function tries to validate whether an early break vectorization
649 is possible for the current instruction sequence. Returns True i
650 possible, otherwise False.
652 Requirements:
653 - Any memory access must be to a fixed size buffer.
654 - There must not be any loads and stores to the same object.
655 - Multiple loads are allowed as long as they don't alias.
657 NOTE:
658 This implementation is very conservative. Any overlapping loads/stores
659 that take place before the early break statement gets rejected aside from
660 WAR dependencies.
662 i.e.:
664 a[i] = 8
665 c = a[i]
666 if (b[i])
669 is not allowed, but
671 c = a[i]
672 a[i] = 8
673 if (b[i])
676 is which is the common case. */
678 static opt_result
679 vect_analyze_early_break_dependences (loop_vec_info loop_vinfo)
681 DUMP_VECT_SCOPE ("vect_analyze_early_break_dependences");
683 /* List of all load data references found during traversal. */
684 auto_vec<data_reference *> bases;
685 basic_block dest_bb = NULL;
687 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
688 class loop *loop_nest = loop_outer (loop);
690 if (dump_enabled_p ())
691 dump_printf_loc (MSG_NOTE, vect_location,
692 "loop contains multiple exits, analyzing"
693 " statement dependencies.\n");
695 if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
696 if (dump_enabled_p ())
697 dump_printf_loc (MSG_NOTE, vect_location,
698 "alternate exit has been chosen as main exit.\n");
700 /* Since we don't support general control flow, the location we'll move the
701 side-effects to is always the latch connected exit. When we support
702 general control flow we can do better but for now this is fine. Move
703 side-effects to the in-loop destination of the last early exit. For the
704 PEELED case we move the side-effects to the latch block as this is
705 guaranteed to be the last block to be executed when a vector iteration
706 finished. */
707 if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
708 dest_bb = loop->latch;
709 else
710 dest_bb = single_pred (loop->latch);
712 /* We start looking from dest_bb, for the non-PEELED case we don't want to
713 move any stores already present, but we do want to read and validate the
714 loads. */
715 basic_block bb = dest_bb;
717 /* We move stores across all loads to the beginning of dest_bb, so
718 the first block processed below doesn't need dependence checking. */
719 bool check_deps = false;
723 gimple_stmt_iterator gsi = gsi_last_bb (bb);
725 /* Now analyze all the remaining statements and try to determine which
726 instructions are allowed/needed to be moved. */
727 while (!gsi_end_p (gsi))
729 gimple *stmt = gsi_stmt (gsi);
730 gsi_prev (&gsi);
731 if (is_gimple_debug (stmt))
732 continue;
734 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (stmt);
735 auto dr_ref = STMT_VINFO_DATA_REF (stmt_vinfo);
736 if (!dr_ref)
737 continue;
739 /* We know everything below dest_bb is safe since we know we
740 had a full vector iteration when reaching it. Either by
741 the loop entry / IV exit test being last or because this
742 is the loop latch itself. */
743 if (!check_deps)
744 continue;
746 /* Check if vector accesses to the object will be within bounds.
747 must be a constant or assume loop will be versioned or niters
748 bounded by VF so accesses are within range. We only need to check
749 the reads since writes are moved to a safe place where if we get
750 there we know they are safe to perform. */
751 if (DR_IS_READ (dr_ref)
752 && !ref_within_array_bound (stmt, DR_REF (dr_ref)))
754 if (STMT_VINFO_GATHER_SCATTER_P (stmt_vinfo)
755 || STMT_VINFO_STRIDED_P (stmt_vinfo))
757 const char *msg
758 = "early break not supported: cannot peel "
759 "for alignment, vectorization would read out of "
760 "bounds at %G";
761 return opt_result::failure_at (stmt, msg, stmt);
764 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_vinfo);
765 dr_info->need_peeling_for_alignment = true;
767 if (dump_enabled_p ())
768 dump_printf_loc (MSG_NOTE, vect_location,
769 "marking DR (read) as needing peeling for "
770 "alignment at %G", stmt);
773 if (DR_IS_READ (dr_ref))
774 bases.safe_push (dr_ref);
775 else if (DR_IS_WRITE (dr_ref))
777 /* We are moving writes down in the CFG. To be sure that this
778 is valid after vectorization we have to check all the loads
779 we are sinking the stores past to see if any of them may
780 alias or are the same object.
782 Same objects will not be an issue because unless the store
783 is marked volatile the value can be forwarded. If the
784 store is marked volatile we don't vectorize the loop
785 anyway.
787 That leaves the check for aliasing. We don't really need
788 to care about the stores aliasing with each other since the
789 stores are moved in order so the effects are still observed
790 correctly. This leaves the check for WAR dependencies
791 which we would be introducing here if the DR can alias.
792 The check is quadratic in loads/stores but I have not found
793 a better API to do this. I believe all loads and stores
794 must be checked. We also must check them when we
795 encountered the store, since we don't care about loads past
796 the store. */
798 for (auto dr_read : bases)
799 if (dr_may_alias_p (dr_ref, dr_read, loop_nest))
801 if (dump_enabled_p ())
802 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
803 vect_location,
804 "early breaks not supported: "
805 "overlapping loads and stores "
806 "found before the break "
807 "statement.\n");
809 return opt_result::failure_at (stmt,
810 "can't safely apply code motion to dependencies"
811 " to vectorize the early exit. %G may alias with"
812 " %G\n", stmt, dr_read->stmt);
816 if (gimple_vdef (stmt))
818 if (dump_enabled_p ())
819 dump_printf_loc (MSG_NOTE, vect_location,
820 "==> recording stmt %G", stmt);
822 LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo).safe_push (stmt);
824 else if (gimple_vuse (stmt))
826 LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo).safe_insert (0, stmt);
827 if (dump_enabled_p ())
828 dump_printf_loc (MSG_NOTE, vect_location,
829 "marked statement for vUSE update: %G", stmt);
833 if (!single_pred_p (bb))
835 gcc_assert (bb == loop->header);
836 break;
839 /* If we possibly sink through a virtual PHI make sure to elide that. */
840 if (gphi *vphi = get_virtual_phi (bb))
841 LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo).safe_push (vphi);
843 /* All earlier blocks need dependence checking. */
844 check_deps = true;
845 bb = single_pred (bb);
847 while (1);
849 /* We don't allow outer -> inner loop transitions which should have been
850 trapped already during loop form analysis. */
851 gcc_assert (dest_bb->loop_father == loop);
853 /* Check that the destination block we picked has only one pred. To relax this we
854 have to take special care when moving the statements. We don't currently support
855 such control flow however this check is there to simplify how we handle
856 labels that may be present anywhere in the IL. This check is to ensure that the
857 labels aren't significant for the CFG. */
858 if (!single_pred (dest_bb))
859 return opt_result::failure_at (vect_location,
860 "chosen loop exit block (BB %d) does not have a "
861 "single predecessor which is currently not "
862 "supported for early break vectorization.\n",
863 dest_bb->index);
865 LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo) = dest_bb;
867 if (!LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo).is_empty ())
869 /* All uses shall be updated to that of the first load. Entries are
870 stored in reverse order. */
871 tree vuse = gimple_vuse (LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo).last ());
872 for (auto g : LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo))
874 if (dump_enabled_p ())
875 dump_printf_loc (MSG_NOTE, vect_location,
876 "will update use: %T, mem_ref: %G", vuse, g);
880 if (dump_enabled_p ())
881 dump_printf_loc (MSG_NOTE, vect_location,
882 "recorded statements to be moved to BB %d\n",
883 LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo)->index);
885 return opt_result::success ();
888 /* Function vect_analyze_data_ref_dependences.
890 Examine all the data references in the loop, and make sure there do not
891 exist any data dependences between them. Set *MAX_VF according to
892 the maximum vectorization factor the data dependences allow. */
894 opt_result
895 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
896 unsigned int *max_vf)
898 unsigned int i;
899 struct data_dependence_relation *ddr;
901 DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
903 if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
905 LOOP_VINFO_DDRS (loop_vinfo)
906 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
907 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
908 /* We do not need read-read dependences. */
909 bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
910 &LOOP_VINFO_DDRS (loop_vinfo),
911 LOOP_VINFO_LOOP_NEST (loop_vinfo),
912 false);
913 gcc_assert (res);
916 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
918 /* For epilogues we either have no aliases or alias versioning
919 was applied to original loop. Therefore we may just get max_vf
920 using VF of original loop. */
921 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
922 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
923 else
924 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
926 opt_result res
927 = vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf);
928 if (!res)
929 return res;
932 /* If we have early break statements in the loop, check to see if they
933 are of a form we can vectorizer. */
934 if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
935 return vect_analyze_early_break_dependences (loop_vinfo);
937 return opt_result::success ();
941 /* Function vect_slp_analyze_data_ref_dependence.
943 Return TRUE if there (might) exist a dependence between a memory-reference
944 DRA and a memory-reference DRB for VINFO. When versioning for alias
945 may check a dependence at run-time, return FALSE. Adjust *MAX_VF
946 according to the data dependence. */
948 static bool
949 vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
950 struct data_dependence_relation *ddr)
952 struct data_reference *dra = DDR_A (ddr);
953 struct data_reference *drb = DDR_B (ddr);
954 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
955 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
957 /* We need to check dependences of statements marked as unvectorizable
958 as well, they still can prohibit vectorization. */
960 /* Independent data accesses. */
961 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
962 return false;
964 if (dra == drb)
965 return false;
967 /* Read-read is OK. */
968 if (DR_IS_READ (dra) && DR_IS_READ (drb))
969 return false;
971 /* If dra and drb are part of the same interleaving chain consider
972 them independent. */
973 if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
974 && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
975 == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
976 return false;
978 /* Unknown data dependence. */
979 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
981 if (dump_enabled_p ())
982 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
983 "can't determine dependence between %T and %T\n",
984 DR_REF (dra), DR_REF (drb));
986 else if (dump_enabled_p ())
987 dump_printf_loc (MSG_NOTE, vect_location,
988 "determined dependence between %T and %T\n",
989 DR_REF (dra), DR_REF (drb));
991 return true;
995 /* Analyze dependences involved in the transform of a store SLP NODE. */
997 static bool
998 vect_slp_analyze_store_dependences (vec_info *vinfo, slp_tree node)
1000 /* This walks over all stmts involved in the SLP store done
1001 in NODE verifying we can sink them up to the last stmt in the
1002 group. */
1003 stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
1004 gcc_assert (DR_IS_WRITE (STMT_VINFO_DATA_REF (last_access_info)));
1006 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
1008 stmt_vec_info access_info
1009 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
1010 if (access_info == last_access_info)
1011 continue;
1012 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
1013 ao_ref ref;
1014 bool ref_initialized_p = false;
1015 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
1016 gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
1018 gimple *stmt = gsi_stmt (gsi);
1019 if (! gimple_vuse (stmt))
1020 continue;
1022 /* If we couldn't record a (single) data reference for this
1023 stmt we have to resort to the alias oracle. */
1024 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
1025 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
1026 if (!dr_b)
1028 /* We are moving a store - this means
1029 we cannot use TBAA for disambiguation. */
1030 if (!ref_initialized_p)
1031 ao_ref_init (&ref, DR_REF (dr_a));
1032 if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
1033 || ref_maybe_used_by_stmt_p (stmt, &ref, false))
1034 return false;
1035 continue;
1038 gcc_assert (!gimple_visited_p (stmt));
1040 ddr_p ddr = initialize_data_dependence_relation (dr_a,
1041 dr_b, vNULL);
1042 bool dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
1043 free_dependence_relation (ddr);
1044 if (dependent)
1045 return false;
1048 return true;
1051 /* Analyze dependences involved in the transform of a load SLP NODE. STORES
1052 contain the vector of scalar stores of this instance if we are
1053 disambiguating the loads. */
1055 static bool
1056 vect_slp_analyze_load_dependences (vec_info *vinfo, slp_tree node,
1057 vec<stmt_vec_info> stores,
1058 stmt_vec_info last_store_info)
1060 /* This walks over all stmts involved in the SLP load done
1061 in NODE verifying we can hoist them up to the first stmt in the
1062 group. */
1063 stmt_vec_info first_access_info = vect_find_first_scalar_stmt_in_slp (node);
1064 gcc_assert (DR_IS_READ (STMT_VINFO_DATA_REF (first_access_info)));
1066 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
1068 if (! SLP_TREE_SCALAR_STMTS (node)[k])
1069 continue;
1070 stmt_vec_info access_info
1071 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
1072 if (access_info == first_access_info)
1073 continue;
1074 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
1075 ao_ref ref;
1076 bool ref_initialized_p = false;
1077 hash_set<stmt_vec_info> grp_visited;
1078 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
1079 gsi_stmt (gsi) != first_access_info->stmt; gsi_prev (&gsi))
1081 gimple *stmt = gsi_stmt (gsi);
1082 if (! gimple_vdef (stmt))
1083 continue;
1085 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
1087 /* If we run into a store of this same instance (we've just
1088 marked those) then delay dependence checking until we run
1089 into the last store because this is where it will have
1090 been sunk to (and we verified that we can do that already). */
1091 if (gimple_visited_p (stmt))
1093 if (stmt_info != last_store_info)
1094 continue;
1096 for (stmt_vec_info &store_info : stores)
1098 data_reference *store_dr = STMT_VINFO_DATA_REF (store_info);
1099 ddr_p ddr = initialize_data_dependence_relation
1100 (dr_a, store_dr, vNULL);
1101 bool dependent
1102 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
1103 free_dependence_relation (ddr);
1104 if (dependent)
1105 return false;
1107 continue;
1110 auto check_hoist = [&] (stmt_vec_info stmt_info) -> bool
1112 /* We are hoisting a load - this means we can use TBAA for
1113 disambiguation. */
1114 if (!ref_initialized_p)
1115 ao_ref_init (&ref, DR_REF (dr_a));
1116 if (stmt_may_clobber_ref_p_1 (stmt_info->stmt, &ref, true))
1118 /* If we couldn't record a (single) data reference for this
1119 stmt we have to give up now. */
1120 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
1121 if (!dr_b)
1122 return false;
1123 ddr_p ddr = initialize_data_dependence_relation (dr_a,
1124 dr_b, vNULL);
1125 bool dependent
1126 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
1127 free_dependence_relation (ddr);
1128 if (dependent)
1129 return false;
1131 /* No dependence. */
1132 return true;
1134 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1136 /* When we run into a store group we have to honor
1137 that earlier stores might be moved here. We don't
1138 know exactly which and where to since we lack a
1139 back-mapping from DR to SLP node, so assume all
1140 earlier stores are sunk here. It's enough to
1141 consider the last stmt of a group for this.
1142 ??? Both this and the fact that we disregard that
1143 the conflicting instance might be removed later
1144 is overly conservative. */
1145 if (!grp_visited.add (DR_GROUP_FIRST_ELEMENT (stmt_info)))
1146 for (auto store_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
1147 store_info != NULL;
1148 store_info = DR_GROUP_NEXT_ELEMENT (store_info))
1149 if ((store_info == stmt_info
1150 || get_later_stmt (store_info, stmt_info) == stmt_info)
1151 && !check_hoist (store_info))
1152 return false;
1154 else
1156 if (!check_hoist (stmt_info))
1157 return false;
1161 return true;
1165 /* Function vect_analyze_data_ref_dependences.
1167 Examine all the data references in the basic-block, and make sure there
1168 do not exist any data dependences between them. Set *MAX_VF according to
1169 the maximum vectorization factor the data dependences allow. */
1171 bool
1172 vect_slp_analyze_instance_dependence (vec_info *vinfo, slp_instance instance)
1174 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
1176 /* The stores of this instance are at the root of the SLP tree. */
1177 slp_tree store = NULL;
1178 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store)
1179 store = SLP_INSTANCE_TREE (instance);
1181 /* Verify we can sink stores to the vectorized stmt insert location. */
1182 stmt_vec_info last_store_info = NULL;
1183 if (store)
1185 if (! vect_slp_analyze_store_dependences (vinfo, store))
1186 return false;
1188 /* Mark stores in this instance and remember the last one. */
1189 last_store_info = vect_find_last_scalar_stmt_in_slp (store);
1190 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
1191 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
1194 bool res = true;
1196 /* Verify we can sink loads to the vectorized stmt insert location,
1197 special-casing stores of this instance. */
1198 for (slp_tree &load : SLP_INSTANCE_LOADS (instance))
1199 if (! vect_slp_analyze_load_dependences (vinfo, load,
1200 store
1201 ? SLP_TREE_SCALAR_STMTS (store)
1202 : vNULL, last_store_info))
1204 res = false;
1205 break;
1208 /* Unset the visited flag. */
1209 if (store)
1210 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
1211 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);
1213 return res;
1216 /* Return the misalignment of DR_INFO accessed in VECTYPE with OFFSET
1217 applied. */
1220 dr_misalignment (dr_vec_info *dr_info, tree vectype, poly_int64 offset)
1222 HOST_WIDE_INT diff = 0;
1223 /* Alignment is only analyzed for the first element of a DR group,
1224 use that but adjust misalignment by the offset of the access. */
1225 if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt))
1227 dr_vec_info *first_dr
1228 = STMT_VINFO_DR_INFO (DR_GROUP_FIRST_ELEMENT (dr_info->stmt));
1229 /* vect_analyze_data_ref_accesses guarantees that DR_INIT are
1230 INTEGER_CSTs and the first element in the group has the lowest
1231 address. */
1232 diff = (TREE_INT_CST_LOW (DR_INIT (dr_info->dr))
1233 - TREE_INT_CST_LOW (DR_INIT (first_dr->dr)));
1234 gcc_assert (diff >= 0);
1235 dr_info = first_dr;
1238 int misalign = dr_info->misalignment;
1239 gcc_assert (misalign != DR_MISALIGNMENT_UNINITIALIZED);
1240 if (misalign == DR_MISALIGNMENT_UNKNOWN)
1241 return misalign;
1243 /* If the access is only aligned for a vector type with smaller alignment
1244 requirement the access has unknown misalignment. */
1245 if (maybe_lt (dr_info->target_alignment * BITS_PER_UNIT,
1246 targetm.vectorize.preferred_vector_alignment (vectype)))
1247 return DR_MISALIGNMENT_UNKNOWN;
1249 /* Apply the offset from the DR group start and the externally supplied
1250 offset which can for example result from a negative stride access. */
1251 poly_int64 misalignment = misalign + diff + offset;
1253 /* Below we reject compile-time non-constant target alignments, but if
1254 our misalignment is zero, then we are known to already be aligned
1255 w.r.t. any such possible target alignment. */
1256 if (known_eq (misalignment, 0))
1257 return 0;
1259 unsigned HOST_WIDE_INT target_alignment_c;
1260 if (!dr_info->target_alignment.is_constant (&target_alignment_c)
1261 || !known_misalignment (misalignment, target_alignment_c, &misalign))
1262 return DR_MISALIGNMENT_UNKNOWN;
1263 return misalign;
1266 /* Record the base alignment guarantee given by DRB, which occurs
1267 in STMT_INFO. */
1269 static void
1270 vect_record_base_alignment (vec_info *vinfo, stmt_vec_info stmt_info,
1271 innermost_loop_behavior *drb)
1273 bool existed;
1274 std::pair<stmt_vec_info, innermost_loop_behavior *> &entry
1275 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
1276 if (!existed || entry.second->base_alignment < drb->base_alignment)
1278 entry = std::make_pair (stmt_info, drb);
1279 if (dump_enabled_p ())
1280 dump_printf_loc (MSG_NOTE, vect_location,
1281 "recording new base alignment for %T\n"
1282 " alignment: %d\n"
1283 " misalignment: %d\n"
1284 " based on: %G",
1285 drb->base_address,
1286 drb->base_alignment,
1287 drb->base_misalignment,
1288 stmt_info->stmt);
1292 /* If the region we're going to vectorize is reached, all unconditional
1293 data references occur at least once. We can therefore pool the base
1294 alignment guarantees from each unconditional reference. Do this by
1295 going through all the data references in VINFO and checking whether
1296 the containing statement makes the reference unconditionally. If so,
1297 record the alignment of the base address in VINFO so that it can be
1298 used for all other references with the same base. */
1300 void
1301 vect_record_base_alignments (vec_info *vinfo)
1303 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1304 class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
1305 for (data_reference *dr : vinfo->shared->datarefs)
1307 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
1308 stmt_vec_info stmt_info = dr_info->stmt;
1309 if (!DR_IS_CONDITIONAL_IN_STMT (dr)
1310 && STMT_VINFO_VECTORIZABLE (stmt_info)
1311 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
1313 vect_record_base_alignment (vinfo, stmt_info, &DR_INNERMOST (dr));
1315 /* If DR is nested in the loop that is being vectorized, we can also
1316 record the alignment of the base wrt the outer loop. */
1317 if (loop && nested_in_vect_loop_p (loop, stmt_info))
1318 vect_record_base_alignment
1319 (vinfo, stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
1324 /* Function vect_compute_data_ref_alignment
1326 Compute the misalignment of the data reference DR_INFO when vectorizing
1327 with VECTYPE.
1329 RESULT is non-NULL iff VINFO is a loop_vec_info. In that case, *RESULT will
1330 be set appropriately on failure (but is otherwise left unchanged).
1332 Output:
1333 1. initialized misalignment info for DR_INFO
1335 FOR NOW: No analysis is actually performed. Misalignment is calculated
1336 only for trivial cases. TODO. */
1338 static void
1339 vect_compute_data_ref_alignment (vec_info *vinfo, dr_vec_info *dr_info,
1340 tree vectype, opt_result *result = nullptr)
1342 stmt_vec_info stmt_info = dr_info->stmt;
1343 vec_base_alignments *base_alignments = &vinfo->base_alignments;
1344 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1345 class loop *loop = NULL;
1346 tree ref = DR_REF (dr_info->dr);
1348 if (dump_enabled_p ())
1349 dump_printf_loc (MSG_NOTE, vect_location,
1350 "vect_compute_data_ref_alignment:\n");
1352 if (loop_vinfo)
1353 loop = LOOP_VINFO_LOOP (loop_vinfo);
1355 /* Initialize misalignment to unknown. */
1356 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1358 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
1359 return;
1361 innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
1362 bool step_preserves_misalignment_p;
1364 poly_uint64 vector_alignment
1365 = exact_div (targetm.vectorize.preferred_vector_alignment (vectype),
1366 BITS_PER_UNIT);
1368 /* If this DR needs peeling for alignment for correctness, we must
1369 ensure the target alignment is a constant power-of-two multiple of the
1370 amount read per vector iteration (overriding the above hook where
1371 necessary). */
1372 if (dr_info->need_peeling_for_alignment)
1374 /* Vector size in bytes. */
1375 poly_uint64 safe_align = tree_to_poly_uint64 (TYPE_SIZE_UNIT (vectype));
1377 /* We can only peel for loops, of course. */
1378 gcc_checking_assert (loop_vinfo);
1380 /* Calculate the number of vectors read per vector iteration. If
1381 it is a power of two, multiply through to get the required
1382 alignment in bytes. Otherwise, fail analysis since alignment
1383 peeling wouldn't work in such a case. */
1384 poly_uint64 num_scalars = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1385 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1386 num_scalars *= DR_GROUP_SIZE (stmt_info);
1388 auto num_vectors = vect_get_num_vectors (num_scalars, vectype);
1389 if (!pow2p_hwi (num_vectors))
1391 *result = opt_result::failure_at (vect_location,
1392 "non-power-of-two num vectors %u "
1393 "for DR needing peeling for "
1394 "alignment at %G",
1395 num_vectors, stmt_info->stmt);
1396 return;
1399 safe_align *= num_vectors;
1400 if (maybe_gt (safe_align, 4096U))
1402 pretty_printer pp;
1403 pp_wide_integer (&pp, safe_align);
1404 *result = opt_result::failure_at (vect_location,
1405 "alignment required for correctness"
1406 " (%s) may exceed page size",
1407 pp_formatted_text (&pp));
1408 return;
1411 unsigned HOST_WIDE_INT multiple;
1412 if (!constant_multiple_p (vector_alignment, safe_align, &multiple)
1413 || !pow2p_hwi (multiple))
1415 if (dump_enabled_p ())
1417 dump_printf_loc (MSG_NOTE, vect_location,
1418 "forcing alignment for DR from preferred (");
1419 dump_dec (MSG_NOTE, vector_alignment);
1420 dump_printf (MSG_NOTE, ") to safe align (");
1421 dump_dec (MSG_NOTE, safe_align);
1422 dump_printf (MSG_NOTE, ") for stmt: %G", stmt_info->stmt);
1424 vector_alignment = safe_align;
1428 SET_DR_TARGET_ALIGNMENT (dr_info, vector_alignment);
1430 /* If the main loop has peeled for alignment we have no way of knowing
1431 whether the data accesses in the epilogues are aligned. We can't at
1432 compile time answer the question whether we have entered the main loop or
1433 not. Fixes PR 92351. */
1434 if (loop_vinfo)
1436 loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
1437 if (orig_loop_vinfo
1438 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo) != 0)
1439 return;
1442 unsigned HOST_WIDE_INT vect_align_c;
1443 if (!vector_alignment.is_constant (&vect_align_c))
1444 return;
1446 /* No step for BB vectorization. */
1447 if (!loop)
1449 gcc_assert (integer_zerop (drb->step));
1450 step_preserves_misalignment_p = true;
1453 else
1455 /* We can only use base and misalignment information relative to
1456 an innermost loop if the misalignment stays the same throughout the
1457 execution of the loop. As above, this is the case if the stride of
1458 the dataref evenly divides by the alignment. */
1459 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1460 step_preserves_misalignment_p
1461 = multiple_p (drb->step_alignment * vf, vect_align_c);
1463 if (!step_preserves_misalignment_p && dump_enabled_p ())
1464 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1465 "step doesn't divide the vector alignment.\n");
1467 /* In case the dataref is in an inner-loop of the loop that is being
1468 vectorized (LOOP), we use the base and misalignment information
1469 relative to the outer-loop (LOOP). This is ok only if the
1470 misalignment stays the same throughout the execution of the
1471 inner-loop, which is why we have to check that the stride of the
1472 dataref in the inner-loop evenly divides by the vector alignment. */
1473 if (step_preserves_misalignment_p
1474 && nested_in_vect_loop_p (loop, stmt_info))
1476 step_preserves_misalignment_p
1477 = (DR_STEP_ALIGNMENT (dr_info->dr) % vect_align_c) == 0;
1479 if (dump_enabled_p ())
1481 if (step_preserves_misalignment_p)
1482 dump_printf_loc (MSG_NOTE, vect_location,
1483 "inner step divides the vector alignment.\n");
1484 else
1485 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1486 "inner step doesn't divide the vector"
1487 " alignment.\n");
1492 unsigned int base_alignment = drb->base_alignment;
1493 unsigned int base_misalignment = drb->base_misalignment;
1495 /* Calculate the maximum of the pooled base address alignment and the
1496 alignment that we can compute for DR itself. */
1497 std::pair<stmt_vec_info, innermost_loop_behavior *> *entry
1498 = base_alignments->get (drb->base_address);
1499 if (entry
1500 && base_alignment < (*entry).second->base_alignment
1501 && (loop_vinfo
1502 || (dominated_by_p (CDI_DOMINATORS, gimple_bb (stmt_info->stmt),
1503 gimple_bb (entry->first->stmt))
1504 && (gimple_bb (stmt_info->stmt) != gimple_bb (entry->first->stmt)
1505 || (entry->first->dr_aux.group <= dr_info->group)))))
1507 base_alignment = entry->second->base_alignment;
1508 base_misalignment = entry->second->base_misalignment;
1511 if (drb->offset_alignment < vect_align_c
1512 || !step_preserves_misalignment_p
1513 /* We need to know whether the step wrt the vectorized loop is
1514 negative when computing the starting misalignment below. */
1515 || TREE_CODE (drb->step) != INTEGER_CST)
1517 if (dump_enabled_p ())
1518 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1519 "Unknown alignment for access: %T\n", ref);
1520 return;
1523 if (base_alignment < vect_align_c)
1525 unsigned int max_alignment;
1526 tree base = get_base_for_alignment (drb->base_address, &max_alignment);
1527 if (max_alignment < vect_align_c
1528 || !vect_can_force_dr_alignment_p (base,
1529 vect_align_c * BITS_PER_UNIT))
1531 if (dump_enabled_p ())
1532 dump_printf_loc (MSG_NOTE, vect_location,
1533 "can't force alignment of ref: %T\n", ref);
1534 return;
1537 /* Force the alignment of the decl.
1538 NOTE: This is the only change to the code we make during
1539 the analysis phase, before deciding to vectorize the loop. */
1540 if (dump_enabled_p ())
1541 dump_printf_loc (MSG_NOTE, vect_location,
1542 "force alignment of %T\n", ref);
1544 dr_info->base_decl = base;
1545 dr_info->base_misaligned = true;
1546 base_misalignment = 0;
1548 poly_int64 misalignment
1549 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1551 unsigned int const_misalignment;
1552 if (!known_misalignment (misalignment, vect_align_c, &const_misalignment))
1554 if (dump_enabled_p ())
1555 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1556 "Non-constant misalignment for access: %T\n", ref);
1557 return;
1560 SET_DR_MISALIGNMENT (dr_info, const_misalignment);
1562 if (dump_enabled_p ())
1563 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1564 "misalign = %d bytes of ref %T\n",
1565 const_misalignment, ref);
1567 return;
1570 /* Return whether DR_INFO, which is related to DR_PEEL_INFO in
1571 that it only differs in DR_INIT, is aligned if DR_PEEL_INFO
1572 is made aligned via peeling. */
1574 static bool
1575 vect_dr_aligned_if_related_peeled_dr_is (dr_vec_info *dr_info,
1576 dr_vec_info *dr_peel_info)
1578 if (multiple_p (DR_TARGET_ALIGNMENT (dr_peel_info),
1579 DR_TARGET_ALIGNMENT (dr_info)))
1581 poly_offset_int diff
1582 = (wi::to_poly_offset (DR_INIT (dr_peel_info->dr))
1583 - wi::to_poly_offset (DR_INIT (dr_info->dr)));
1584 if (known_eq (diff, 0)
1585 || multiple_p (diff, DR_TARGET_ALIGNMENT (dr_info)))
1586 return true;
1588 return false;
1591 /* Return whether DR_INFO is aligned if DR_PEEL_INFO is made
1592 aligned via peeling. */
1594 static bool
1595 vect_dr_aligned_if_peeled_dr_is (dr_vec_info *dr_info,
1596 dr_vec_info *dr_peel_info)
1598 if (!operand_equal_p (DR_BASE_ADDRESS (dr_info->dr),
1599 DR_BASE_ADDRESS (dr_peel_info->dr), 0)
1600 || !operand_equal_p (DR_OFFSET (dr_info->dr),
1601 DR_OFFSET (dr_peel_info->dr), 0)
1602 || !operand_equal_p (DR_STEP (dr_info->dr),
1603 DR_STEP (dr_peel_info->dr), 0))
1604 return false;
1606 return vect_dr_aligned_if_related_peeled_dr_is (dr_info, dr_peel_info);
1609 /* Compute the value for dr_info->misalign so that the access appears
1610 aligned. This is used by peeling to compensate for dr_misalignment
1611 applying the offset for negative step. */
1614 vect_dr_misalign_for_aligned_access (dr_vec_info *dr_info)
1616 if (tree_int_cst_sgn (DR_STEP (dr_info->dr)) >= 0)
1617 return 0;
1619 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1620 poly_int64 misalignment
1621 = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1622 * TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1624 unsigned HOST_WIDE_INT target_alignment_c;
1625 int misalign;
1626 if (!dr_info->target_alignment.is_constant (&target_alignment_c)
1627 || !known_misalignment (misalignment, target_alignment_c, &misalign))
1628 return DR_MISALIGNMENT_UNKNOWN;
1629 return misalign;
1632 /* Function vect_update_misalignment_for_peel.
1633 Sets DR_INFO's misalignment
1634 - to 0 if it has the same alignment as DR_PEEL_INFO,
1635 - to the misalignment computed using NPEEL if DR_INFO's salignment is known,
1636 - to -1 (unknown) otherwise.
1638 DR_INFO - the data reference whose misalignment is to be adjusted.
1639 DR_PEEL_INFO - the data reference whose misalignment is being made
1640 zero in the vector loop by the peel.
1641 NPEEL - the number of iterations in the peel loop if the misalignment
1642 of DR_PEEL_INFO is known at compile time. */
1644 static void
1645 vect_update_misalignment_for_peel (dr_vec_info *dr_info,
1646 dr_vec_info *dr_peel_info, int npeel)
1648 /* If dr_info is aligned of dr_peel_info is, then mark it so. */
1649 if (vect_dr_aligned_if_peeled_dr_is (dr_info, dr_peel_info))
1651 SET_DR_MISALIGNMENT (dr_info,
1652 vect_dr_misalign_for_aligned_access (dr_peel_info));
1653 return;
1656 unsigned HOST_WIDE_INT alignment;
1657 if (DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment)
1658 && known_alignment_for_access_p (dr_info,
1659 STMT_VINFO_VECTYPE (dr_info->stmt))
1660 && known_alignment_for_access_p (dr_peel_info,
1661 STMT_VINFO_VECTYPE (dr_peel_info->stmt)))
1663 int misal = dr_info->misalignment;
1664 misal += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1665 misal &= alignment - 1;
1666 set_dr_misalignment (dr_info, misal);
1667 return;
1670 if (dump_enabled_p ())
1671 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1672 "to unknown (-1).\n");
1673 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1676 /* Return true if alignment is relevant for DR_INFO. */
1678 static bool
1679 vect_relevant_for_alignment_p (dr_vec_info *dr_info)
1681 stmt_vec_info stmt_info = dr_info->stmt;
1683 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1684 return false;
1686 /* For interleaving, only the alignment of the first access matters. */
1687 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1688 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1689 return false;
1691 /* Scatter-gather and invariant accesses continue to address individual
1692 scalars, so vector-level alignment is irrelevant. */
1693 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1694 || integer_zerop (DR_STEP (dr_info->dr)))
1695 return false;
1697 /* Strided accesses perform only component accesses, alignment is
1698 irrelevant for them. */
1699 if (STMT_VINFO_STRIDED_P (stmt_info)
1700 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1701 return false;
1703 return true;
1706 /* Given an memory reference EXP return whether its alignment is less
1707 than its size. */
1709 static bool
1710 not_size_aligned (tree exp)
1712 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1713 return true;
1715 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1716 > get_object_alignment (exp));
1719 /* Function vector_alignment_reachable_p
1721 Return true if vector alignment for DR_INFO is reachable by peeling
1722 a few loop iterations. Return false otherwise. */
1724 static bool
1725 vector_alignment_reachable_p (dr_vec_info *dr_info)
1727 stmt_vec_info stmt_info = dr_info->stmt;
1728 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1730 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1732 /* For interleaved access we peel only if number of iterations in
1733 the prolog loop ({VF - misalignment}), is a multiple of the
1734 number of the interleaved accesses. */
1735 int elem_size, mis_in_elements;
1737 /* FORNOW: handle only known alignment. */
1738 if (!known_alignment_for_access_p (dr_info, vectype))
1739 return false;
1741 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1742 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1743 elem_size = vector_element_size (vector_size, nelements);
1744 mis_in_elements = dr_misalignment (dr_info, vectype) / elem_size;
1746 if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1747 return false;
1750 /* If misalignment is known at the compile time then allow peeling
1751 only if natural alignment is reachable through peeling. */
1752 if (known_alignment_for_access_p (dr_info, vectype)
1753 && !aligned_access_p (dr_info, vectype))
1755 HOST_WIDE_INT elmsize =
1756 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1757 if (dump_enabled_p ())
1759 dump_printf_loc (MSG_NOTE, vect_location,
1760 "data size = %wd. misalignment = %d.\n", elmsize,
1761 dr_misalignment (dr_info, vectype));
1763 if (dr_misalignment (dr_info, vectype) % elmsize)
1765 if (dump_enabled_p ())
1766 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1767 "data size does not divide the misalignment.\n");
1768 return false;
1772 if (!known_alignment_for_access_p (dr_info, vectype))
1774 tree type = TREE_TYPE (DR_REF (dr_info->dr));
1775 bool is_packed = not_size_aligned (DR_REF (dr_info->dr));
1776 if (dump_enabled_p ())
1777 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1778 "Unknown misalignment, %snaturally aligned\n",
1779 is_packed ? "not " : "");
1780 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1783 return true;
1787 /* Calculate the cost of the memory access represented by DR_INFO. */
1789 static void
1790 vect_get_data_access_cost (vec_info *vinfo, dr_vec_info *dr_info,
1791 dr_alignment_support alignment_support_scheme,
1792 int misalignment,
1793 unsigned int *inside_cost,
1794 unsigned int *outside_cost,
1795 stmt_vector_for_cost *body_cost_vec,
1796 stmt_vector_for_cost *prologue_cost_vec)
1798 stmt_vec_info stmt_info = dr_info->stmt;
1799 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1800 int ncopies;
1802 if (PURE_SLP_STMT (stmt_info))
1803 ncopies = 1;
1804 else
1805 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1807 if (DR_IS_READ (dr_info->dr))
1808 vect_get_load_cost (vinfo, stmt_info, NULL, ncopies,
1809 alignment_support_scheme, misalignment, true,
1810 inside_cost, outside_cost, prologue_cost_vec,
1811 body_cost_vec, false);
1812 else
1813 vect_get_store_cost (vinfo,stmt_info, NULL, ncopies,
1814 alignment_support_scheme, misalignment, inside_cost,
1815 body_cost_vec);
1817 if (dump_enabled_p ())
1818 dump_printf_loc (MSG_NOTE, vect_location,
1819 "vect_get_data_access_cost: inside_cost = %d, "
1820 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1824 typedef struct _vect_peel_info
1826 dr_vec_info *dr_info;
1827 int npeel;
1828 unsigned int count;
1829 } *vect_peel_info;
1831 typedef struct _vect_peel_extended_info
1833 vec_info *vinfo;
1834 struct _vect_peel_info peel_info;
1835 unsigned int inside_cost;
1836 unsigned int outside_cost;
1837 } *vect_peel_extended_info;
1840 /* Peeling hashtable helpers. */
1842 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1844 static inline hashval_t hash (const _vect_peel_info *);
1845 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1848 inline hashval_t
1849 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1851 return (hashval_t) peel_info->npeel;
1854 inline bool
1855 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1857 return (a->npeel == b->npeel);
1861 /* Insert DR_INFO into peeling hash table with NPEEL as key. */
1863 static void
1864 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1865 loop_vec_info loop_vinfo, dr_vec_info *dr_info,
1866 int npeel, bool supportable_if_not_aligned)
1868 struct _vect_peel_info elem, *slot;
1869 _vect_peel_info **new_slot;
1871 elem.npeel = npeel;
1872 slot = peeling_htab->find (&elem);
1873 if (slot)
1874 slot->count++;
1875 else
1877 slot = XNEW (struct _vect_peel_info);
1878 slot->npeel = npeel;
1879 slot->dr_info = dr_info;
1880 slot->count = 1;
1881 new_slot = peeling_htab->find_slot (slot, INSERT);
1882 *new_slot = slot;
1885 /* If this DR is not supported with unknown misalignment then bias
1886 this slot when the cost model is disabled. */
1887 if (!supportable_if_not_aligned
1888 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1889 slot->count += VECT_MAX_COST;
1893 /* Traverse peeling hash table to find peeling option that aligns maximum
1894 number of data accesses. */
1897 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1898 _vect_peel_extended_info *max)
1900 vect_peel_info elem = *slot;
1902 if (elem->count > max->peel_info.count
1903 || (elem->count == max->peel_info.count
1904 && max->peel_info.npeel > elem->npeel))
1906 max->peel_info.npeel = elem->npeel;
1907 max->peel_info.count = elem->count;
1908 max->peel_info.dr_info = elem->dr_info;
1911 return 1;
1914 /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
1915 data access costs for all data refs. If UNKNOWN_MISALIGNMENT is true,
1916 npeel is computed at runtime but DR0_INFO's misalignment will be zero
1917 after peeling. */
1919 static void
1920 vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
1921 dr_vec_info *dr0_info,
1922 unsigned int *inside_cost,
1923 unsigned int *outside_cost,
1924 stmt_vector_for_cost *body_cost_vec,
1925 stmt_vector_for_cost *prologue_cost_vec,
1926 unsigned int npeel)
1928 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1930 bool dr0_alignment_known_p
1931 = (dr0_info
1932 && known_alignment_for_access_p (dr0_info,
1933 STMT_VINFO_VECTYPE (dr0_info->stmt)));
1935 for (data_reference *dr : datarefs)
1937 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1938 if (!vect_relevant_for_alignment_p (dr_info))
1939 continue;
1941 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1942 dr_alignment_support alignment_support_scheme;
1943 int misalignment;
1944 unsigned HOST_WIDE_INT alignment;
1946 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1947 size_zero_node) < 0;
1948 poly_int64 off = 0;
1949 if (negative)
1950 off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1951 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1953 if (npeel == 0)
1954 misalignment = dr_misalignment (dr_info, vectype, off);
1955 else if (dr_info == dr0_info
1956 || vect_dr_aligned_if_peeled_dr_is (dr_info, dr0_info))
1957 misalignment = 0;
1958 else if (!dr0_alignment_known_p
1959 || !known_alignment_for_access_p (dr_info, vectype)
1960 || !DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment))
1961 misalignment = DR_MISALIGNMENT_UNKNOWN;
1962 else
1964 misalignment = dr_misalignment (dr_info, vectype, off);
1965 misalignment += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1966 misalignment &= alignment - 1;
1968 alignment_support_scheme
1969 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
1970 misalignment);
1972 vect_get_data_access_cost (loop_vinfo, dr_info,
1973 alignment_support_scheme, misalignment,
1974 inside_cost, outside_cost,
1975 body_cost_vec, prologue_cost_vec);
1979 /* Traverse peeling hash table and calculate cost for each peeling option.
1980 Find the one with the lowest cost. */
1983 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1984 _vect_peel_extended_info *min)
1986 vect_peel_info elem = *slot;
1987 int dummy;
1988 unsigned int inside_cost = 0, outside_cost = 0;
1989 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (min->vinfo);
1990 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1991 epilogue_cost_vec;
1993 prologue_cost_vec.create (2);
1994 body_cost_vec.create (2);
1995 epilogue_cost_vec.create (2);
1997 vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost,
1998 &outside_cost, &body_cost_vec,
1999 &prologue_cost_vec, elem->npeel);
2001 body_cost_vec.release ();
2003 outside_cost += vect_get_known_peeling_cost
2004 (loop_vinfo, elem->npeel, &dummy,
2005 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2006 &prologue_cost_vec, &epilogue_cost_vec);
2008 /* Prologue and epilogue costs are added to the target model later.
2009 These costs depend only on the scalar iteration cost, the
2010 number of peeling iterations finally chosen, and the number of
2011 misaligned statements. So discard the information found here. */
2012 prologue_cost_vec.release ();
2013 epilogue_cost_vec.release ();
2015 if (inside_cost < min->inside_cost
2016 || (inside_cost == min->inside_cost
2017 && outside_cost < min->outside_cost))
2019 min->inside_cost = inside_cost;
2020 min->outside_cost = outside_cost;
2021 min->peel_info.dr_info = elem->dr_info;
2022 min->peel_info.npeel = elem->npeel;
2023 min->peel_info.count = elem->count;
2026 return 1;
2030 /* Choose best peeling option by traversing peeling hash table and either
2031 choosing an option with the lowest cost (if cost model is enabled) or the
2032 option that aligns as many accesses as possible. */
2034 static struct _vect_peel_extended_info
2035 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
2036 loop_vec_info loop_vinfo)
2038 struct _vect_peel_extended_info res;
2040 res.peel_info.dr_info = NULL;
2041 res.vinfo = loop_vinfo;
2043 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
2045 res.inside_cost = INT_MAX;
2046 res.outside_cost = INT_MAX;
2047 peeling_htab->traverse <_vect_peel_extended_info *,
2048 vect_peeling_hash_get_lowest_cost> (&res);
2050 else
2052 res.peel_info.count = 0;
2053 peeling_htab->traverse <_vect_peel_extended_info *,
2054 vect_peeling_hash_get_most_frequent> (&res);
2055 res.inside_cost = 0;
2056 res.outside_cost = 0;
2059 return res;
2062 /* Return true if the new peeling NPEEL is supported. */
2064 static bool
2065 vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
2066 unsigned npeel)
2068 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2069 enum dr_alignment_support supportable_dr_alignment;
2071 bool dr0_alignment_known_p
2072 = known_alignment_for_access_p (dr0_info,
2073 STMT_VINFO_VECTYPE (dr0_info->stmt));
2075 /* Ensure that all data refs can be vectorized after the peel. */
2076 for (data_reference *dr : datarefs)
2078 if (dr == dr0_info->dr)
2079 continue;
2081 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2082 if (!vect_relevant_for_alignment_p (dr_info)
2083 || vect_dr_aligned_if_peeled_dr_is (dr_info, dr0_info))
2084 continue;
2086 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2087 int misalignment;
2088 unsigned HOST_WIDE_INT alignment;
2089 if (!dr0_alignment_known_p
2090 || !known_alignment_for_access_p (dr_info, vectype)
2091 || !DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment))
2092 misalignment = DR_MISALIGNMENT_UNKNOWN;
2093 else
2095 misalignment = dr_misalignment (dr_info, vectype);
2096 misalignment += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
2097 misalignment &= alignment - 1;
2099 supportable_dr_alignment
2100 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
2101 misalignment);
2102 if (supportable_dr_alignment == dr_unaligned_unsupported)
2103 return false;
2106 return true;
2109 /* Compare two data-references DRA and DRB to group them into chunks
2110 with related alignment. */
2112 static int
2113 dr_align_group_sort_cmp (const void *dra_, const void *drb_)
2115 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2116 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2117 int cmp;
2119 /* Stabilize sort. */
2120 if (dra == drb)
2121 return 0;
2123 /* Ordering of DRs according to base. */
2124 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2125 DR_BASE_ADDRESS (drb));
2126 if (cmp != 0)
2127 return cmp;
2129 /* And according to DR_OFFSET. */
2130 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2131 if (cmp != 0)
2132 return cmp;
2134 /* And after step. */
2135 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2136 if (cmp != 0)
2137 return cmp;
2139 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2140 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2141 if (cmp == 0)
2142 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2143 return cmp;
2146 /* Function vect_enhance_data_refs_alignment
2148 This pass will use loop versioning and loop peeling in order to enhance
2149 the alignment of data references in the loop.
2151 FOR NOW: we assume that whatever versioning/peeling takes place, only the
2152 original loop is to be vectorized. Any other loops that are created by
2153 the transformations performed in this pass - are not supposed to be
2154 vectorized. This restriction will be relaxed.
2156 This pass will require a cost model to guide it whether to apply peeling
2157 or versioning or a combination of the two. For example, the scheme that
2158 intel uses when given a loop with several memory accesses, is as follows:
2159 choose one memory access ('p') which alignment you want to force by doing
2160 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
2161 other accesses are not necessarily aligned, or (2) use loop versioning to
2162 generate one loop in which all accesses are aligned, and another loop in
2163 which only 'p' is necessarily aligned.
2165 ("Automatic Intra-Register Vectorization for the Intel Architecture",
2166 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
2167 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
2169 Devising a cost model is the most critical aspect of this work. It will
2170 guide us on which access to peel for, whether to use loop versioning, how
2171 many versions to create, etc. The cost model will probably consist of
2172 generic considerations as well as target specific considerations (on
2173 powerpc for example, misaligned stores are more painful than misaligned
2174 loads).
2176 Here are the general steps involved in alignment enhancements:
2178 -- original loop, before alignment analysis:
2179 for (i=0; i<N; i++){
2180 x = q[i]; # DR_MISALIGNMENT(q) = unknown
2181 p[i] = y; # DR_MISALIGNMENT(p) = unknown
2184 -- After vect_compute_data_refs_alignment:
2185 for (i=0; i<N; i++){
2186 x = q[i]; # DR_MISALIGNMENT(q) = 3
2187 p[i] = y; # DR_MISALIGNMENT(p) = unknown
2190 -- Possibility 1: we do loop versioning:
2191 if (p is aligned) {
2192 for (i=0; i<N; i++){ # loop 1A
2193 x = q[i]; # DR_MISALIGNMENT(q) = 3
2194 p[i] = y; # DR_MISALIGNMENT(p) = 0
2197 else {
2198 for (i=0; i<N; i++){ # loop 1B
2199 x = q[i]; # DR_MISALIGNMENT(q) = 3
2200 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
2204 -- Possibility 2: we do loop peeling:
2205 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
2206 x = q[i];
2207 p[i] = y;
2209 for (i = 3; i < N; i++){ # loop 2A
2210 x = q[i]; # DR_MISALIGNMENT(q) = 0
2211 p[i] = y; # DR_MISALIGNMENT(p) = unknown
2214 -- Possibility 3: combination of loop peeling and versioning:
2215 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
2216 x = q[i];
2217 p[i] = y;
2219 if (p is aligned) {
2220 for (i = 3; i<N; i++){ # loop 3A
2221 x = q[i]; # DR_MISALIGNMENT(q) = 0
2222 p[i] = y; # DR_MISALIGNMENT(p) = 0
2225 else {
2226 for (i = 3; i<N; i++){ # loop 3B
2227 x = q[i]; # DR_MISALIGNMENT(q) = 0
2228 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
2232 These loops are later passed to loop_transform to be vectorized. The
2233 vectorizer will use the alignment information to guide the transformation
2234 (whether to generate regular loads/stores, or with special handling for
2235 misalignment). */
2237 opt_result
2238 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
2240 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2241 dr_vec_info *first_store = NULL;
2242 dr_vec_info *dr0_info = NULL;
2243 struct data_reference *dr;
2244 unsigned int i;
2245 bool do_peeling = false;
2246 bool do_versioning = false;
2247 unsigned int npeel = 0;
2248 bool one_misalignment_known = false;
2249 bool one_misalignment_unknown = false;
2250 bool one_dr_unsupportable = false;
2251 dr_vec_info *unsupportable_dr_info = NULL;
2252 unsigned int dr0_same_align_drs = 0, first_store_same_align_drs = 0;
2253 hash_table<peel_info_hasher> peeling_htab (1);
2255 DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
2257 /* Reset data so we can safely be called multiple times. */
2258 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2259 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
2261 if (LOOP_VINFO_DATAREFS (loop_vinfo).is_empty ())
2262 return opt_result::success ();
2264 /* Sort the vector of datarefs so DRs that have the same or dependent
2265 alignment are next to each other. */
2266 auto_vec<data_reference_p> datarefs
2267 = LOOP_VINFO_DATAREFS (loop_vinfo).copy ();
2268 datarefs.qsort (dr_align_group_sort_cmp);
2270 /* Compute the number of DRs that become aligned when we peel
2271 a dataref so it becomes aligned. */
2272 auto_vec<unsigned> n_same_align_refs (datarefs.length ());
2273 n_same_align_refs.quick_grow_cleared (datarefs.length ());
2274 unsigned i0;
2275 for (i0 = 0; i0 < datarefs.length (); ++i0)
2276 if (DR_BASE_ADDRESS (datarefs[i0]))
2277 break;
2278 for (i = i0 + 1; i <= datarefs.length (); ++i)
2280 if (i == datarefs.length ()
2281 || !operand_equal_p (DR_BASE_ADDRESS (datarefs[i0]),
2282 DR_BASE_ADDRESS (datarefs[i]), 0)
2283 || !operand_equal_p (DR_OFFSET (datarefs[i0]),
2284 DR_OFFSET (datarefs[i]), 0)
2285 || !operand_equal_p (DR_STEP (datarefs[i0]),
2286 DR_STEP (datarefs[i]), 0))
2288 /* The subgroup [i0, i-1] now only differs in DR_INIT and
2289 possibly DR_TARGET_ALIGNMENT. Still the whole subgroup
2290 will get known misalignment if we align one of the refs
2291 with the largest DR_TARGET_ALIGNMENT. */
2292 for (unsigned j = i0; j < i; ++j)
2294 dr_vec_info *dr_infoj = loop_vinfo->lookup_dr (datarefs[j]);
2295 for (unsigned k = i0; k < i; ++k)
2297 if (k == j)
2298 continue;
2299 dr_vec_info *dr_infok = loop_vinfo->lookup_dr (datarefs[k]);
2300 if (vect_dr_aligned_if_related_peeled_dr_is (dr_infok,
2301 dr_infoj))
2302 n_same_align_refs[j]++;
2305 i0 = i;
2309 /* While cost model enhancements are expected in the future, the high level
2310 view of the code at this time is as follows:
2312 A) If there is a misaligned access then see if peeling to align
2313 this access can make all data references satisfy
2314 vect_supportable_dr_alignment. If so, update data structures
2315 as needed and return true.
2317 B) If peeling wasn't possible and there is a data reference with an
2318 unknown misalignment that does not satisfy vect_supportable_dr_alignment
2319 then see if loop versioning checks can be used to make all data
2320 references satisfy vect_supportable_dr_alignment. If so, update
2321 data structures as needed and return true.
2323 C) If neither peeling nor versioning were successful then return false if
2324 any data reference does not satisfy vect_supportable_dr_alignment.
2326 D) Return true (all data references satisfy vect_supportable_dr_alignment).
2328 Note, Possibility 3 above (which is peeling and versioning together) is not
2329 being done at this time. */
2331 /* (1) Peeling to force alignment. */
2333 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
2334 Considerations:
2335 + How many accesses will become aligned due to the peeling
2336 - How many accesses will become unaligned due to the peeling,
2337 and the cost of misaligned accesses.
2338 - The cost of peeling (the extra runtime checks, the increase
2339 in code size). */
2341 FOR_EACH_VEC_ELT (datarefs, i, dr)
2343 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2344 if (!vect_relevant_for_alignment_p (dr_info))
2345 continue;
2347 stmt_vec_info stmt_info = dr_info->stmt;
2348 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2349 do_peeling = vector_alignment_reachable_p (dr_info);
2350 if (do_peeling)
2352 if (known_alignment_for_access_p (dr_info, vectype))
2354 unsigned int npeel_tmp = 0;
2355 bool negative = tree_int_cst_compare (DR_STEP (dr),
2356 size_zero_node) < 0;
2358 /* If known_alignment_for_access_p then we have set
2359 DR_MISALIGNMENT which is only done if we know it at compiler
2360 time, so it is safe to assume target alignment is constant.
2362 unsigned int target_align =
2363 DR_TARGET_ALIGNMENT (dr_info).to_constant ();
2364 unsigned HOST_WIDE_INT dr_size = vect_get_scalar_dr_size (dr_info);
2365 poly_int64 off = 0;
2366 if (negative)
2367 off = (TYPE_VECTOR_SUBPARTS (vectype) - 1) * -dr_size;
2368 unsigned int mis = dr_misalignment (dr_info, vectype, off);
2369 mis = negative ? mis : -mis;
2370 if (mis != 0)
2371 npeel_tmp = (mis & (target_align - 1)) / dr_size;
2373 /* For multiple types, it is possible that the bigger type access
2374 will have more than one peeling option. E.g., a loop with two
2375 types: one of size (vector size / 4), and the other one of
2376 size (vector size / 8). Vectorization factor will 8. If both
2377 accesses are misaligned by 3, the first one needs one scalar
2378 iteration to be aligned, and the second one needs 5. But the
2379 first one will be aligned also by peeling 5 scalar
2380 iterations, and in that case both accesses will be aligned.
2381 Hence, except for the immediate peeling amount, we also want
2382 to try to add full vector size, while we don't exceed
2383 vectorization factor.
2384 We do this automatically for cost model, since we calculate
2385 cost for every peeling option. */
2386 poly_uint64 nscalars = npeel_tmp;
2387 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
2389 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2390 unsigned group_size = 1;
2391 if (STMT_SLP_TYPE (stmt_info)
2392 && STMT_VINFO_GROUPED_ACCESS (stmt_info))
2393 group_size = DR_GROUP_SIZE (stmt_info);
2394 nscalars = vf * group_size;
2397 /* Save info about DR in the hash table. Also include peeling
2398 amounts according to the explanation above. Indicate
2399 the alignment status when the ref is not aligned.
2400 ??? Rather than using unknown alignment here we should
2401 prune all entries from the peeling hashtable which cause
2402 DRs to be not supported. */
2403 bool supportable_if_not_aligned
2404 = vect_supportable_dr_alignment
2405 (loop_vinfo, dr_info, vectype, DR_MISALIGNMENT_UNKNOWN);
2406 while (known_le (npeel_tmp, nscalars))
2408 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
2409 dr_info, npeel_tmp,
2410 supportable_if_not_aligned);
2411 npeel_tmp += MAX (1, target_align / dr_size);
2414 one_misalignment_known = true;
2416 else
2418 /* If we don't know any misalignment values, we prefer
2419 peeling for data-ref that has the maximum number of data-refs
2420 with the same alignment, unless the target prefers to align
2421 stores over load. */
2422 unsigned same_align_drs = n_same_align_refs[i];
2423 if (!dr0_info
2424 || dr0_same_align_drs < same_align_drs)
2426 dr0_same_align_drs = same_align_drs;
2427 dr0_info = dr_info;
2429 /* For data-refs with the same number of related
2430 accesses prefer the one where the misalign
2431 computation will be invariant in the outermost loop. */
2432 else if (dr0_same_align_drs == same_align_drs)
2434 class loop *ivloop0, *ivloop;
2435 ivloop0 = outermost_invariant_loop_for_expr
2436 (loop, DR_BASE_ADDRESS (dr0_info->dr));
2437 ivloop = outermost_invariant_loop_for_expr
2438 (loop, DR_BASE_ADDRESS (dr));
2439 if ((ivloop && !ivloop0)
2440 || (ivloop && ivloop0
2441 && flow_loop_nested_p (ivloop, ivloop0)))
2442 dr0_info = dr_info;
2445 one_misalignment_unknown = true;
2447 /* Check for data refs with unsupportable alignment that
2448 can be peeled. */
2449 enum dr_alignment_support supportable_dr_alignment
2450 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
2451 DR_MISALIGNMENT_UNKNOWN);
2452 if (supportable_dr_alignment == dr_unaligned_unsupported)
2454 one_dr_unsupportable = true;
2455 unsupportable_dr_info = dr_info;
2458 if (!first_store && DR_IS_WRITE (dr))
2460 first_store = dr_info;
2461 first_store_same_align_drs = same_align_drs;
2465 else
2467 if (!aligned_access_p (dr_info, vectype))
2469 if (dump_enabled_p ())
2470 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2471 "vector alignment may not be reachable\n");
2472 break;
2477 /* Check if we can possibly peel the loop. */
2478 if (!vect_can_advance_ivs_p (loop_vinfo)
2479 || !slpeel_can_duplicate_loop_p (loop, LOOP_VINFO_IV_EXIT (loop_vinfo),
2480 loop_preheader_edge (loop))
2481 || loop->inner
2482 || LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
2483 do_peeling = false;
2485 struct _vect_peel_extended_info peel_for_known_alignment;
2486 struct _vect_peel_extended_info peel_for_unknown_alignment;
2487 struct _vect_peel_extended_info best_peel;
2489 peel_for_unknown_alignment.inside_cost = INT_MAX;
2490 peel_for_unknown_alignment.outside_cost = INT_MAX;
2491 peel_for_unknown_alignment.peel_info.count = 0;
2493 if (do_peeling
2494 && one_misalignment_unknown)
2496 /* Check if the target requires to prefer stores over loads, i.e., if
2497 misaligned stores are more expensive than misaligned loads (taking
2498 drs with same alignment into account). */
2499 unsigned int load_inside_cost = 0;
2500 unsigned int load_outside_cost = 0;
2501 unsigned int store_inside_cost = 0;
2502 unsigned int store_outside_cost = 0;
2503 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
2505 stmt_vector_for_cost dummy;
2506 dummy.create (2);
2507 vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info,
2508 &load_inside_cost,
2509 &load_outside_cost,
2510 &dummy, &dummy, estimated_npeels);
2511 dummy.release ();
2513 if (first_store)
2515 dummy.create (2);
2516 vect_get_peeling_costs_all_drs (loop_vinfo, first_store,
2517 &store_inside_cost,
2518 &store_outside_cost,
2519 &dummy, &dummy,
2520 estimated_npeels);
2521 dummy.release ();
2523 else
2525 store_inside_cost = INT_MAX;
2526 store_outside_cost = INT_MAX;
2529 if (load_inside_cost > store_inside_cost
2530 || (load_inside_cost == store_inside_cost
2531 && load_outside_cost > store_outside_cost))
2533 dr0_info = first_store;
2534 dr0_same_align_drs = first_store_same_align_drs;
2535 peel_for_unknown_alignment.inside_cost = store_inside_cost;
2536 peel_for_unknown_alignment.outside_cost = store_outside_cost;
2538 else
2540 peel_for_unknown_alignment.inside_cost = load_inside_cost;
2541 peel_for_unknown_alignment.outside_cost = load_outside_cost;
2544 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2545 prologue_cost_vec.create (2);
2546 epilogue_cost_vec.create (2);
2548 int dummy2;
2549 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
2550 (loop_vinfo, estimated_npeels, &dummy2,
2551 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2552 &prologue_cost_vec, &epilogue_cost_vec);
2554 prologue_cost_vec.release ();
2555 epilogue_cost_vec.release ();
2557 peel_for_unknown_alignment.peel_info.count = dr0_same_align_drs + 1;
2560 peel_for_unknown_alignment.peel_info.npeel = 0;
2561 peel_for_unknown_alignment.peel_info.dr_info = dr0_info;
2563 best_peel = peel_for_unknown_alignment;
2565 peel_for_known_alignment.inside_cost = INT_MAX;
2566 peel_for_known_alignment.outside_cost = INT_MAX;
2567 peel_for_known_alignment.peel_info.count = 0;
2568 peel_for_known_alignment.peel_info.dr_info = NULL;
2570 if (do_peeling && one_misalignment_known)
2572 /* Peeling is possible, but there is no data access that is not supported
2573 unless aligned. So we try to choose the best possible peeling from
2574 the hash table. */
2575 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
2576 (&peeling_htab, loop_vinfo);
2579 /* Compare costs of peeling for known and unknown alignment. */
2580 if (peel_for_known_alignment.peel_info.dr_info != NULL
2581 && peel_for_unknown_alignment.inside_cost
2582 >= peel_for_known_alignment.inside_cost)
2584 best_peel = peel_for_known_alignment;
2586 /* If the best peeling for known alignment has NPEEL == 0, perform no
2587 peeling at all except if there is an unsupportable dr that we can
2588 align. */
2589 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
2590 do_peeling = false;
2593 /* If there is an unsupportable data ref, prefer this over all choices so far
2594 since we'd have to discard a chosen peeling except when it accidentally
2595 aligned the unsupportable data ref. */
2596 if (one_dr_unsupportable)
2597 dr0_info = unsupportable_dr_info;
2598 else if (do_peeling)
2600 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
2601 TODO: Use nopeel_outside_cost or get rid of it? */
2602 unsigned nopeel_inside_cost = 0;
2603 unsigned nopeel_outside_cost = 0;
2605 stmt_vector_for_cost dummy;
2606 dummy.create (2);
2607 vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost,
2608 &nopeel_outside_cost, &dummy, &dummy, 0);
2609 dummy.release ();
2611 /* Add epilogue costs. As we do not peel for alignment here, no prologue
2612 costs will be recorded. */
2613 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2614 prologue_cost_vec.create (2);
2615 epilogue_cost_vec.create (2);
2617 int dummy2;
2618 nopeel_outside_cost += vect_get_known_peeling_cost
2619 (loop_vinfo, 0, &dummy2,
2620 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2621 &prologue_cost_vec, &epilogue_cost_vec);
2623 prologue_cost_vec.release ();
2624 epilogue_cost_vec.release ();
2626 npeel = best_peel.peel_info.npeel;
2627 dr0_info = best_peel.peel_info.dr_info;
2629 /* If no peeling is not more expensive than the best peeling we
2630 have so far, don't perform any peeling. */
2631 if (nopeel_inside_cost <= best_peel.inside_cost)
2632 do_peeling = false;
2635 if (do_peeling)
2637 stmt_vec_info stmt_info = dr0_info->stmt;
2638 if (known_alignment_for_access_p (dr0_info,
2639 STMT_VINFO_VECTYPE (stmt_info)))
2641 bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr),
2642 size_zero_node) < 0;
2643 if (!npeel)
2645 /* Since it's known at compile time, compute the number of
2646 iterations in the peeled loop (the peeling factor) for use in
2647 updating DR_MISALIGNMENT values. The peeling factor is the
2648 vectorization factor minus the misalignment as an element
2649 count. */
2650 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2651 poly_int64 off = 0;
2652 if (negative)
2653 off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
2654 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
2655 unsigned int mis
2656 = dr_misalignment (dr0_info, vectype, off);
2657 mis = negative ? mis : -mis;
2658 /* If known_alignment_for_access_p then we have set
2659 DR_MISALIGNMENT which is only done if we know it at compiler
2660 time, so it is safe to assume target alignment is constant.
2662 unsigned int target_align =
2663 DR_TARGET_ALIGNMENT (dr0_info).to_constant ();
2664 npeel = ((mis & (target_align - 1))
2665 / vect_get_scalar_dr_size (dr0_info));
2668 /* For interleaved data access every iteration accesses all the
2669 members of the group, therefore we divide the number of iterations
2670 by the group size. */
2671 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2672 npeel /= DR_GROUP_SIZE (stmt_info);
2674 if (dump_enabled_p ())
2675 dump_printf_loc (MSG_NOTE, vect_location,
2676 "Try peeling by %d\n", npeel);
2679 /* Ensure that all datarefs can be vectorized after the peel. */
2680 if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel))
2681 do_peeling = false;
2683 /* Check if all datarefs are supportable and log. */
2684 if (do_peeling
2685 && npeel == 0
2686 && known_alignment_for_access_p (dr0_info,
2687 STMT_VINFO_VECTYPE (stmt_info)))
2688 return opt_result::success ();
2690 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2691 if (do_peeling)
2693 unsigned max_allowed_peel
2694 = param_vect_max_peeling_for_alignment;
2695 if (loop_cost_model (loop) <= VECT_COST_MODEL_CHEAP)
2696 max_allowed_peel = 0;
2697 if (max_allowed_peel != (unsigned)-1)
2699 unsigned max_peel = npeel;
2700 if (max_peel == 0)
2702 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr0_info);
2703 unsigned HOST_WIDE_INT target_align_c;
2704 if (target_align.is_constant (&target_align_c))
2705 max_peel =
2706 target_align_c / vect_get_scalar_dr_size (dr0_info) - 1;
2707 else
2709 do_peeling = false;
2710 if (dump_enabled_p ())
2711 dump_printf_loc (MSG_NOTE, vect_location,
2712 "Disable peeling, max peels set and vector"
2713 " alignment unknown\n");
2716 if (max_peel > max_allowed_peel)
2718 do_peeling = false;
2719 if (dump_enabled_p ())
2720 dump_printf_loc (MSG_NOTE, vect_location,
2721 "Disable peeling, max peels reached: %d\n", max_peel);
2726 /* Cost model #2 - if peeling may result in a remaining loop not
2727 iterating enough to be vectorized then do not peel. Since this
2728 is a cost heuristic rather than a correctness decision, use the
2729 most likely runtime value for variable vectorization factors. */
2730 if (do_peeling
2731 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2733 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2734 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2735 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2736 < assumed_vf + max_peel)
2737 do_peeling = false;
2740 if (do_peeling)
2742 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2743 If the misalignment of DR_i is identical to that of dr0 then set
2744 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2745 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2746 by the peeling factor times the element size of DR_i (MOD the
2747 vectorization factor times the size). Otherwise, the
2748 misalignment of DR_i must be set to unknown. */
2749 FOR_EACH_VEC_ELT (datarefs, i, dr)
2750 if (dr != dr0_info->dr)
2752 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2753 if (!vect_relevant_for_alignment_p (dr_info))
2754 continue;
2756 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
2759 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
2760 if (npeel)
2761 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2762 else
2763 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = -1;
2764 SET_DR_MISALIGNMENT (dr0_info,
2765 vect_dr_misalign_for_aligned_access (dr0_info));
2766 if (dump_enabled_p ())
2768 dump_printf_loc (MSG_NOTE, vect_location,
2769 "Alignment of access forced using peeling.\n");
2770 dump_printf_loc (MSG_NOTE, vect_location,
2771 "Peeling for alignment will be applied.\n");
2774 /* The inside-loop cost will be accounted for in vectorizable_load
2775 and vectorizable_store correctly with adjusted alignments.
2776 Drop the body_cst_vec on the floor here. */
2777 return opt_result::success ();
2781 /* (2) Versioning to force alignment. */
2783 /* Try versioning if:
2784 1) optimize loop for speed and the cost-model is not cheap
2785 2) there is at least one unsupported misaligned data ref with an unknown
2786 misalignment, and
2787 3) all misaligned data refs with a known misalignment are supported, and
2788 4) the number of runtime alignment checks is within reason. */
2790 do_versioning
2791 = (optimize_loop_nest_for_speed_p (loop)
2792 && !loop->inner /* FORNOW */
2793 && loop_cost_model (loop) > VECT_COST_MODEL_CHEAP);
2795 if (do_versioning)
2797 FOR_EACH_VEC_ELT (datarefs, i, dr)
2799 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2800 if (!vect_relevant_for_alignment_p (dr_info))
2801 continue;
2803 stmt_vec_info stmt_info = dr_info->stmt;
2804 if (STMT_VINFO_STRIDED_P (stmt_info))
2806 do_versioning = false;
2807 break;
2810 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2811 bool negative = tree_int_cst_compare (DR_STEP (dr),
2812 size_zero_node) < 0;
2813 poly_int64 off = 0;
2814 if (negative)
2815 off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
2816 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
2817 int misalignment;
2818 if ((misalignment = dr_misalignment (dr_info, vectype, off)) == 0)
2819 continue;
2821 enum dr_alignment_support supportable_dr_alignment
2822 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
2823 misalignment);
2824 if (supportable_dr_alignment == dr_unaligned_unsupported)
2826 if (misalignment != DR_MISALIGNMENT_UNKNOWN
2827 || (LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2828 >= (unsigned) param_vect_max_version_for_alignment_checks))
2830 do_versioning = false;
2831 break;
2834 /* At present we don't support versioning for alignment
2835 with variable VF, since there's no guarantee that the
2836 VF is a power of two. We could relax this if we added
2837 a way of enforcing a power-of-two size. */
2838 unsigned HOST_WIDE_INT size;
2839 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2841 do_versioning = false;
2842 break;
2845 /* Forcing alignment in the first iteration is no good if
2846 we don't keep it across iterations. For now, just disable
2847 versioning in this case.
2848 ?? We could actually unroll the loop to achieve the required
2849 overall step alignment, and forcing the alignment could be
2850 done by doing some iterations of the non-vectorized loop. */
2851 if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2852 * DR_STEP_ALIGNMENT (dr),
2853 DR_TARGET_ALIGNMENT (dr_info)))
2855 do_versioning = false;
2856 break;
2859 /* The rightmost bits of an aligned address must be zeros.
2860 Construct the mask needed for this test. For example,
2861 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2862 mask must be 15 = 0xf. */
2863 int mask = size - 1;
2865 /* FORNOW: use the same mask to test all potentially unaligned
2866 references in the loop. */
2867 if (LOOP_VINFO_PTR_MASK (loop_vinfo)
2868 && LOOP_VINFO_PTR_MASK (loop_vinfo) != mask)
2870 do_versioning = false;
2871 break;
2874 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2875 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
2879 /* Versioning requires at least one misaligned data reference. */
2880 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2881 do_versioning = false;
2882 else if (!do_versioning)
2883 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2886 if (do_versioning)
2888 const vec<stmt_vec_info> &may_misalign_stmts
2889 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2890 stmt_vec_info stmt_info;
2892 /* It can now be assumed that the data references in the statements
2893 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2894 of the loop being vectorized. */
2895 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2897 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
2898 SET_DR_MISALIGNMENT (dr_info,
2899 vect_dr_misalign_for_aligned_access (dr_info));
2900 if (dump_enabled_p ())
2901 dump_printf_loc (MSG_NOTE, vect_location,
2902 "Alignment of access forced using versioning.\n");
2905 if (dump_enabled_p ())
2906 dump_printf_loc (MSG_NOTE, vect_location,
2907 "Versioning for alignment will be applied.\n");
2909 /* Peeling and versioning can't be done together at this time. */
2910 gcc_assert (! (do_peeling && do_versioning));
2912 return opt_result::success ();
2915 /* This point is reached if neither peeling nor versioning is being done. */
2916 gcc_assert (! (do_peeling || do_versioning));
2918 return opt_result::success ();
2922 /* Function vect_analyze_data_refs_alignment
2924 Analyze the alignment of the data-references in the loop.
2925 Return FALSE if a data reference is found that cannot be vectorized. */
2927 opt_result
2928 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo)
2930 DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2932 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2933 struct data_reference *dr;
2934 unsigned int i;
2936 vect_record_base_alignments (loop_vinfo);
2937 FOR_EACH_VEC_ELT (datarefs, i, dr)
2939 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2940 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt))
2942 if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt)
2943 && DR_GROUP_FIRST_ELEMENT (dr_info->stmt) != dr_info->stmt)
2944 continue;
2945 opt_result res = opt_result::success ();
2946 vect_compute_data_ref_alignment (loop_vinfo, dr_info,
2947 STMT_VINFO_VECTYPE (dr_info->stmt),
2948 &res);
2949 if (!res)
2950 return res;
2954 return opt_result::success ();
2958 /* Analyze alignment of DRs of stmts in NODE. */
2960 static bool
2961 vect_slp_analyze_node_alignment (vec_info *vinfo, slp_tree node)
2963 /* Alignment is maintained in the first element of the group. */
2964 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2965 first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
2966 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2967 tree vectype = SLP_TREE_VECTYPE (node);
2968 poly_uint64 vector_alignment
2969 = exact_div (targetm.vectorize.preferred_vector_alignment (vectype),
2970 BITS_PER_UNIT);
2971 if (dr_info->misalignment == DR_MISALIGNMENT_UNINITIALIZED)
2972 vect_compute_data_ref_alignment (vinfo, dr_info, SLP_TREE_VECTYPE (node));
2973 /* Re-analyze alignment when we're facing a vectorization with a bigger
2974 alignment requirement. */
2975 else if (known_lt (dr_info->target_alignment, vector_alignment))
2977 poly_uint64 old_target_alignment = dr_info->target_alignment;
2978 int old_misalignment = dr_info->misalignment;
2979 vect_compute_data_ref_alignment (vinfo, dr_info, SLP_TREE_VECTYPE (node));
2980 /* But keep knowledge about a smaller alignment. */
2981 if (old_misalignment != DR_MISALIGNMENT_UNKNOWN
2982 && dr_info->misalignment == DR_MISALIGNMENT_UNKNOWN)
2984 dr_info->target_alignment = old_target_alignment;
2985 dr_info->misalignment = old_misalignment;
2988 /* When we ever face unordered target alignments the first one wins in terms
2989 of analyzing and the other will become unknown in dr_misalignment. */
2990 return true;
2993 /* Function vect_slp_analyze_instance_alignment
2995 Analyze the alignment of the data-references in the SLP instance.
2996 Return FALSE if a data reference is found that cannot be vectorized. */
2998 bool
2999 vect_slp_analyze_instance_alignment (vec_info *vinfo,
3000 slp_instance instance)
3002 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_alignment");
3004 slp_tree node;
3005 unsigned i;
3006 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
3007 if (! vect_slp_analyze_node_alignment (vinfo, node))
3008 return false;
3010 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
3011 && ! vect_slp_analyze_node_alignment
3012 (vinfo, SLP_INSTANCE_TREE (instance)))
3013 return false;
3015 return true;
3019 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
3020 accesses of legal size, step, etc. Detect gaps, single element
3021 interleaving, and other special cases. Set grouped access info.
3022 Collect groups of strided stores for further use in SLP analysis.
3023 Worker for vect_analyze_group_access. */
3025 static bool
3026 vect_analyze_group_access_1 (vec_info *vinfo, dr_vec_info *dr_info)
3028 data_reference *dr = dr_info->dr;
3029 tree step = DR_STEP (dr);
3030 tree scalar_type = TREE_TYPE (DR_REF (dr));
3031 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
3032 stmt_vec_info stmt_info = dr_info->stmt;
3033 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
3034 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
3035 HOST_WIDE_INT dr_step = -1;
3036 HOST_WIDE_INT groupsize, last_accessed_element = 1;
3037 bool slp_impossible = false;
3039 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
3040 size of the interleaving group (including gaps). */
3041 if (tree_fits_shwi_p (step))
3043 dr_step = tree_to_shwi (step);
3044 /* Check that STEP is a multiple of type size. Otherwise there is
3045 a non-element-sized gap at the end of the group which we
3046 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
3047 ??? As we can handle non-constant step fine here we should
3048 simply remove uses of DR_GROUP_GAP between the last and first
3049 element and instead rely on DR_STEP. DR_GROUP_SIZE then would
3050 simply not include that gap. */
3051 if ((dr_step % type_size) != 0)
3053 if (dump_enabled_p ())
3054 dump_printf_loc (MSG_NOTE, vect_location,
3055 "Step %T is not a multiple of the element size"
3056 " for %T\n",
3057 step, DR_REF (dr));
3058 return false;
3060 groupsize = absu_hwi (dr_step) / type_size;
3062 else
3063 groupsize = 0;
3065 /* Not consecutive access is possible only if it is a part of interleaving. */
3066 if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
3068 /* Check if it this DR is a part of interleaving, and is a single
3069 element of the group that is accessed in the loop. */
3071 /* Gaps are supported only for loads. STEP must be a multiple of the type
3072 size. */
3073 if (DR_IS_READ (dr)
3074 && (dr_step % type_size) == 0
3075 && groupsize > 0
3076 /* This could be UINT_MAX but as we are generating code in a very
3077 inefficient way we have to cap earlier.
3078 See PR91403 for example. */
3079 && groupsize <= 4096)
3081 DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
3082 DR_GROUP_SIZE (stmt_info) = groupsize;
3083 DR_GROUP_GAP (stmt_info) = groupsize - 1;
3084 if (dump_enabled_p ())
3085 dump_printf_loc (MSG_NOTE, vect_location,
3086 "Detected single element interleaving %T"
3087 " step %T\n",
3088 DR_REF (dr), step);
3090 return true;
3093 if (dump_enabled_p ())
3094 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3095 "not consecutive access %G", stmt_info->stmt);
3097 if (bb_vinfo)
3099 /* Mark the statement as unvectorizable. */
3100 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3101 return true;
3104 if (dump_enabled_p ())
3105 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
3106 STMT_VINFO_STRIDED_P (stmt_info) = true;
3107 return true;
3110 if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
3112 /* First stmt in the interleaving chain. Check the chain. */
3113 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
3114 struct data_reference *data_ref = dr;
3115 unsigned int count = 1;
3116 tree prev_init = DR_INIT (data_ref);
3117 HOST_WIDE_INT diff, gaps = 0;
3119 /* By construction, all group members have INTEGER_CST DR_INITs. */
3120 while (next)
3122 /* We never have the same DR multiple times. */
3123 gcc_assert (tree_int_cst_compare (DR_INIT (data_ref),
3124 DR_INIT (STMT_VINFO_DATA_REF (next))) != 0);
3126 data_ref = STMT_VINFO_DATA_REF (next);
3128 /* All group members have the same STEP by construction. */
3129 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
3131 /* Check that the distance between two accesses is equal to the type
3132 size. Otherwise, we have gaps. */
3133 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
3134 - TREE_INT_CST_LOW (prev_init)) / type_size;
3135 if (diff < 1 || diff > UINT_MAX)
3137 /* For artificial testcases with array accesses with large
3138 constant indices we can run into overflow issues which
3139 can end up fooling the groupsize constraint below so
3140 check the individual gaps (which are represented as
3141 unsigned int) as well. */
3142 if (dump_enabled_p ())
3143 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3144 "interleaved access with gap larger "
3145 "than representable\n");
3146 return false;
3148 if (diff != 1)
3150 /* FORNOW: SLP of accesses with gaps is not supported. */
3151 slp_impossible = true;
3152 if (DR_IS_WRITE (data_ref))
3154 if (dump_enabled_p ())
3155 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3156 "interleaved store with gaps\n");
3157 return false;
3160 gaps += diff - 1;
3163 last_accessed_element += diff;
3165 /* Store the gap from the previous member of the group. If there is no
3166 gap in the access, DR_GROUP_GAP is always 1. */
3167 DR_GROUP_GAP (next) = diff;
3169 prev_init = DR_INIT (data_ref);
3170 next = DR_GROUP_NEXT_ELEMENT (next);
3171 /* Count the number of data-refs in the chain. */
3172 count++;
3175 if (groupsize == 0)
3176 groupsize = count + gaps;
3178 /* This could be UINT_MAX but as we are generating code in a very
3179 inefficient way we have to cap earlier. See PR78699 for example. */
3180 if (groupsize > 4096)
3182 if (dump_enabled_p ())
3183 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3184 "group is too large\n");
3185 return false;
3188 /* Check that the size of the interleaving is equal to count for stores,
3189 i.e., that there are no gaps. */
3190 if (groupsize != count
3191 && !DR_IS_READ (dr))
3193 groupsize = count;
3194 STMT_VINFO_STRIDED_P (stmt_info) = true;
3197 /* If there is a gap after the last load in the group it is the
3198 difference between the groupsize and the last accessed
3199 element.
3200 When there is no gap, this difference should be 0. */
3201 DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
3203 DR_GROUP_SIZE (stmt_info) = groupsize;
3204 if (dump_enabled_p ())
3206 dump_printf_loc (MSG_NOTE, vect_location,
3207 "Detected interleaving ");
3208 if (DR_IS_READ (dr))
3209 dump_printf (MSG_NOTE, "load ");
3210 else if (STMT_VINFO_STRIDED_P (stmt_info))
3211 dump_printf (MSG_NOTE, "strided store ");
3212 else
3213 dump_printf (MSG_NOTE, "store ");
3214 dump_printf (MSG_NOTE, "of size %u\n",
3215 (unsigned)groupsize);
3216 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", stmt_info->stmt);
3217 next = DR_GROUP_NEXT_ELEMENT (stmt_info);
3218 while (next)
3220 if (DR_GROUP_GAP (next) != 1)
3221 dump_printf_loc (MSG_NOTE, vect_location,
3222 "\t<gap of %d elements>\n",
3223 DR_GROUP_GAP (next) - 1);
3224 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", next->stmt);
3225 next = DR_GROUP_NEXT_ELEMENT (next);
3227 if (DR_GROUP_GAP (stmt_info) != 0)
3228 dump_printf_loc (MSG_NOTE, vect_location,
3229 "\t<gap of %d elements>\n",
3230 DR_GROUP_GAP (stmt_info));
3233 /* SLP: create an SLP data structure for every interleaving group of
3234 stores for further analysis in vect_analyse_slp. */
3235 if (DR_IS_WRITE (dr) && !slp_impossible)
3237 if (loop_vinfo)
3238 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
3239 if (bb_vinfo)
3240 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
3244 return true;
3247 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
3248 accesses of legal size, step, etc. Detect gaps, single element
3249 interleaving, and other special cases. Set grouped access info.
3250 Collect groups of strided stores for further use in SLP analysis. */
3252 static bool
3253 vect_analyze_group_access (vec_info *vinfo, dr_vec_info *dr_info)
3255 if (!vect_analyze_group_access_1 (vinfo, dr_info))
3257 /* Dissolve the group if present. */
3258 stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt);
3259 while (stmt_info)
3261 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
3262 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
3263 DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
3264 stmt_info = next;
3266 return false;
3268 return true;
3271 /* Analyze the access pattern of the data-reference DR_INFO.
3272 In case of non-consecutive accesses call vect_analyze_group_access() to
3273 analyze groups of accesses. */
3275 static bool
3276 vect_analyze_data_ref_access (vec_info *vinfo, dr_vec_info *dr_info)
3278 data_reference *dr = dr_info->dr;
3279 tree step = DR_STEP (dr);
3280 tree scalar_type = TREE_TYPE (DR_REF (dr));
3281 stmt_vec_info stmt_info = dr_info->stmt;
3282 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
3283 class loop *loop = NULL;
3285 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
3286 return true;
3288 if (loop_vinfo)
3289 loop = LOOP_VINFO_LOOP (loop_vinfo);
3291 if (loop_vinfo && !step)
3293 if (dump_enabled_p ())
3294 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3295 "bad data-ref access in loop\n");
3296 return false;
3299 /* Allow loads with zero step in inner-loop vectorization. */
3300 if (loop_vinfo && integer_zerop (step))
3302 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
3303 DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
3304 if (!nested_in_vect_loop_p (loop, stmt_info))
3305 return DR_IS_READ (dr);
3306 /* Allow references with zero step for outer loops marked
3307 with pragma omp simd only - it guarantees absence of
3308 loop-carried dependencies between inner loop iterations. */
3309 if (loop->safelen < 2)
3311 if (dump_enabled_p ())
3312 dump_printf_loc (MSG_NOTE, vect_location,
3313 "zero step in inner loop of nest\n");
3314 return false;
3318 if (loop && nested_in_vect_loop_p (loop, stmt_info))
3320 /* Interleaved accesses are not yet supported within outer-loop
3321 vectorization for references in the inner-loop. */
3322 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
3323 DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
3325 /* For the rest of the analysis we use the outer-loop step. */
3326 step = STMT_VINFO_DR_STEP (stmt_info);
3327 if (integer_zerop (step))
3329 if (dump_enabled_p ())
3330 dump_printf_loc (MSG_NOTE, vect_location,
3331 "zero step in outer loop.\n");
3332 return DR_IS_READ (dr);
3336 /* Consecutive? */
3337 if (TREE_CODE (step) == INTEGER_CST)
3339 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
3340 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
3341 || (dr_step < 0
3342 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
3344 /* Mark that it is not interleaving. */
3345 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
3346 DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
3347 return true;
3351 if (loop && nested_in_vect_loop_p (loop, stmt_info))
3353 if (dump_enabled_p ())
3354 dump_printf_loc (MSG_NOTE, vect_location,
3355 "grouped access in outer loop.\n");
3356 return false;
3360 /* Assume this is a DR handled by non-constant strided load case. */
3361 if (TREE_CODE (step) != INTEGER_CST)
3362 return (STMT_VINFO_STRIDED_P (stmt_info)
3363 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
3364 || vect_analyze_group_access (vinfo, dr_info)));
3366 /* Not consecutive access - check if it's a part of interleaving group. */
3367 return vect_analyze_group_access (vinfo, dr_info);
3370 /* Compare two data-references DRA and DRB to group them into chunks
3371 suitable for grouping. */
3373 static int
3374 dr_group_sort_cmp (const void *dra_, const void *drb_)
3376 dr_vec_info *dra_info = *(dr_vec_info **)const_cast<void *>(dra_);
3377 dr_vec_info *drb_info = *(dr_vec_info **)const_cast<void *>(drb_);
3378 data_reference_p dra = dra_info->dr;
3379 data_reference_p drb = drb_info->dr;
3380 int cmp;
3382 /* Stabilize sort. */
3383 if (dra == drb)
3384 return 0;
3386 /* Different group IDs lead never belong to the same group. */
3387 if (dra_info->group != drb_info->group)
3388 return dra_info->group < drb_info->group ? -1 : 1;
3390 /* Ordering of DRs according to base. */
3391 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
3392 DR_BASE_ADDRESS (drb));
3393 if (cmp != 0)
3394 return cmp;
3396 /* And according to DR_OFFSET. */
3397 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
3398 if (cmp != 0)
3399 return cmp;
3401 /* Put reads before writes. */
3402 if (DR_IS_READ (dra) != DR_IS_READ (drb))
3403 return DR_IS_READ (dra) ? -1 : 1;
3405 /* Then sort after access size. */
3406 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
3407 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
3408 if (cmp != 0)
3409 return cmp;
3411 /* And after step. */
3412 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
3413 if (cmp != 0)
3414 return cmp;
3416 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
3417 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
3418 if (cmp == 0)
3419 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
3420 return cmp;
3423 /* If OP is the result of a conversion, return the unconverted value,
3424 otherwise return null. */
3426 static tree
3427 strip_conversion (tree op)
3429 if (TREE_CODE (op) != SSA_NAME)
3430 return NULL_TREE;
3431 gimple *stmt = SSA_NAME_DEF_STMT (op);
3432 if (!is_gimple_assign (stmt)
3433 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
3434 return NULL_TREE;
3435 return gimple_assign_rhs1 (stmt);
3438 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
3439 and STMT2_INFO being in a single group. When ALLOW_SLP_P, masked loads can
3440 be grouped in SLP mode. */
3442 static bool
3443 can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info,
3444 bool allow_slp_p)
3446 if (gimple_assign_single_p (stmt1_info->stmt))
3447 return gimple_assign_single_p (stmt2_info->stmt);
3449 gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
3450 if (call1 && gimple_call_internal_p (call1))
3452 /* Check for two masked loads or two masked stores. */
3453 gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
3454 if (!call2 || !gimple_call_internal_p (call2))
3455 return false;
3456 internal_fn ifn = gimple_call_internal_fn (call1);
3457 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
3458 return false;
3459 if (ifn != gimple_call_internal_fn (call2))
3460 return false;
3462 /* Check that the masks are the same. Cope with casts of masks,
3463 like those created by build_mask_conversion. */
3464 tree mask1 = gimple_call_arg (call1, 2);
3465 tree mask2 = gimple_call_arg (call2, 2);
3466 if (!operand_equal_p (mask1, mask2, 0) && !allow_slp_p)
3468 mask1 = strip_conversion (mask1);
3469 if (!mask1)
3470 return false;
3471 mask2 = strip_conversion (mask2);
3472 if (!mask2)
3473 return false;
3474 if (!operand_equal_p (mask1, mask2, 0))
3475 return false;
3477 return true;
3480 return false;
3483 /* Function vect_analyze_data_ref_accesses.
3485 Analyze the access pattern of all the data references in the loop.
3487 FORNOW: the only access pattern that is considered vectorizable is a
3488 simple step 1 (consecutive) access.
3490 FORNOW: handle only arrays and pointer accesses. */
3492 opt_result
3493 vect_analyze_data_ref_accesses (vec_info *vinfo,
3494 vec<int> *dataref_groups)
3496 unsigned int i;
3497 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
3499 DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
3501 if (datarefs.is_empty ())
3502 return opt_result::success ();
3504 /* Sort the array of datarefs to make building the interleaving chains
3505 linear. Don't modify the original vector's order, it is needed for
3506 determining what dependencies are reversed. */
3507 vec<dr_vec_info *> datarefs_copy;
3508 datarefs_copy.create (datarefs.length ());
3509 for (unsigned i = 0; i < datarefs.length (); i++)
3511 dr_vec_info *dr_info = vinfo->lookup_dr (datarefs[i]);
3512 /* If the caller computed DR grouping use that, otherwise group by
3513 basic blocks. */
3514 if (dataref_groups)
3515 dr_info->group = (*dataref_groups)[i];
3516 else
3517 dr_info->group = gimple_bb (DR_STMT (datarefs[i]))->index;
3518 datarefs_copy.quick_push (dr_info);
3520 datarefs_copy.qsort (dr_group_sort_cmp);
3521 hash_set<stmt_vec_info> to_fixup;
3523 /* Build the interleaving chains. */
3524 for (i = 0; i < datarefs_copy.length () - 1;)
3526 dr_vec_info *dr_info_a = datarefs_copy[i];
3527 data_reference_p dra = dr_info_a->dr;
3528 int dra_group_id = dr_info_a->group;
3529 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
3530 stmt_vec_info lastinfo = NULL;
3531 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
3532 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
3534 ++i;
3535 continue;
3537 for (i = i + 1; i < datarefs_copy.length (); ++i)
3539 dr_vec_info *dr_info_b = datarefs_copy[i];
3540 data_reference_p drb = dr_info_b->dr;
3541 int drb_group_id = dr_info_b->group;
3542 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
3543 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
3544 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
3545 break;
3547 /* ??? Imperfect sorting (non-compatible types, non-modulo
3548 accesses, same accesses) can lead to a group to be artificially
3549 split here as we don't just skip over those. If it really
3550 matters we can push those to a worklist and re-iterate
3551 over them. The we can just skip ahead to the next DR here. */
3553 /* DRs in a different DR group should not be put into the same
3554 interleaving group. */
3555 if (dra_group_id != drb_group_id)
3556 break;
3558 /* Check that the data-refs have same first location (except init)
3559 and they are both either store or load (not load and store,
3560 not masked loads or stores). */
3561 if (DR_IS_READ (dra) != DR_IS_READ (drb)
3562 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
3563 DR_BASE_ADDRESS (drb)) != 0
3564 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
3565 || !can_group_stmts_p (stmtinfo_a, stmtinfo_b, true))
3566 break;
3568 /* Check that the data-refs have the same constant size. */
3569 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
3570 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
3571 if (!tree_fits_uhwi_p (sza)
3572 || !tree_fits_uhwi_p (szb)
3573 || !tree_int_cst_equal (sza, szb))
3574 break;
3576 /* Check that the data-refs have the same step. */
3577 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
3578 break;
3580 /* Check the types are compatible.
3581 ??? We don't distinguish this during sorting. */
3582 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
3583 TREE_TYPE (DR_REF (drb))))
3584 break;
3586 /* Check that the DR_INITs are compile-time constants. */
3587 if (!tree_fits_shwi_p (DR_INIT (dra))
3588 || !tree_fits_shwi_p (DR_INIT (drb)))
3589 break;
3591 /* Different .GOMP_SIMD_LANE calls still give the same lane,
3592 just hold extra information. */
3593 if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_a)
3594 && STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_b)
3595 && data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb)) == 0)
3596 break;
3598 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
3599 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3600 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3601 HOST_WIDE_INT init_prev
3602 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]->dr));
3603 gcc_assert (init_a <= init_b
3604 && init_a <= init_prev
3605 && init_prev <= init_b);
3607 /* Do not place the same access in the interleaving chain twice. */
3608 if (init_b == init_prev)
3610 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]->dr))
3611 < gimple_uid (DR_STMT (drb)));
3612 /* Simply link in duplicates and fix up the chain below. */
3614 else
3616 /* If init_b == init_a + the size of the type * k, we have an
3617 interleaving, and DRA is accessed before DRB. */
3618 unsigned HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3619 if (type_size_a == 0
3620 || (((unsigned HOST_WIDE_INT)init_b - init_a)
3621 % type_size_a != 0))
3622 break;
3624 /* If we have a store, the accesses are adjacent. This splits
3625 groups into chunks we support (we don't support vectorization
3626 of stores with gaps). */
3627 if (!DR_IS_READ (dra)
3628 && (((unsigned HOST_WIDE_INT)init_b - init_prev)
3629 != type_size_a))
3630 break;
3632 /* If the step (if not zero or non-constant) is smaller than the
3633 difference between data-refs' inits this splits groups into
3634 suitable sizes. */
3635 if (tree_fits_shwi_p (DR_STEP (dra)))
3637 unsigned HOST_WIDE_INT step
3638 = absu_hwi (tree_to_shwi (DR_STEP (dra)));
3639 if (step != 0
3640 && step <= ((unsigned HOST_WIDE_INT)init_b - init_a))
3641 break;
3645 if (dump_enabled_p ())
3646 dump_printf_loc (MSG_NOTE, vect_location,
3647 DR_IS_READ (dra)
3648 ? "Detected interleaving load %T and %T\n"
3649 : "Detected interleaving store %T and %T\n",
3650 DR_REF (dra), DR_REF (drb));
3652 /* Link the found element into the group list. */
3653 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3655 DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
3656 lastinfo = stmtinfo_a;
3658 DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
3659 DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
3660 lastinfo = stmtinfo_b;
3662 if (! STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a))
3664 STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a)
3665 = !can_group_stmts_p (stmtinfo_a, stmtinfo_b, false);
3667 if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a))
3668 dump_printf_loc (MSG_NOTE, vect_location,
3669 "Load suitable for SLP vectorization only.\n");
3672 if (init_b == init_prev
3673 && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3674 && dump_enabled_p ())
3675 dump_printf_loc (MSG_NOTE, vect_location,
3676 "Queuing group with duplicate access for fixup\n");
3680 /* Fixup groups with duplicate entries by splitting it. */
3681 while (1)
3683 hash_set<stmt_vec_info>::iterator it = to_fixup.begin ();
3684 if (!(it != to_fixup.end ()))
3685 break;
3686 stmt_vec_info grp = *it;
3687 to_fixup.remove (grp);
3689 /* Find the earliest duplicate group member. */
3690 unsigned first_duplicate = -1u;
3691 stmt_vec_info next, g = grp;
3692 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3694 if (tree_int_cst_equal (DR_INIT (STMT_VINFO_DR_INFO (next)->dr),
3695 DR_INIT (STMT_VINFO_DR_INFO (g)->dr))
3696 && gimple_uid (STMT_VINFO_STMT (next)) < first_duplicate)
3697 first_duplicate = gimple_uid (STMT_VINFO_STMT (next));
3698 g = next;
3700 if (first_duplicate == -1U)
3701 continue;
3703 /* Then move all stmts after the first duplicate to a new group.
3704 Note this is a heuristic but one with the property that *it
3705 is fixed up completely. */
3706 g = grp;
3707 stmt_vec_info newgroup = NULL, ng = grp;
3708 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3710 if (gimple_uid (STMT_VINFO_STMT (next)) >= first_duplicate)
3712 DR_GROUP_NEXT_ELEMENT (g) = DR_GROUP_NEXT_ELEMENT (next);
3713 if (!newgroup)
3715 newgroup = next;
3716 STMT_VINFO_SLP_VECT_ONLY (newgroup)
3717 = STMT_VINFO_SLP_VECT_ONLY (grp);
3719 else
3720 DR_GROUP_NEXT_ELEMENT (ng) = next;
3721 ng = next;
3722 DR_GROUP_FIRST_ELEMENT (ng) = newgroup;
3724 else
3725 g = DR_GROUP_NEXT_ELEMENT (g);
3727 DR_GROUP_NEXT_ELEMENT (ng) = NULL;
3729 /* Fixup the new group which still may contain duplicates. */
3730 to_fixup.add (newgroup);
3733 dr_vec_info *dr_info;
3734 FOR_EACH_VEC_ELT (datarefs_copy, i, dr_info)
3736 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)
3737 && !vect_analyze_data_ref_access (vinfo, dr_info))
3739 if (dump_enabled_p ())
3740 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3741 "not vectorized: complicated access pattern.\n");
3743 if (is_a <bb_vec_info> (vinfo))
3745 /* Mark the statement as not vectorizable. */
3746 STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false;
3747 continue;
3749 else
3751 datarefs_copy.release ();
3752 return opt_result::failure_at (dr_info->stmt->stmt,
3753 "not vectorized:"
3754 " complicated access pattern.\n");
3759 datarefs_copy.release ();
3760 return opt_result::success ();
3763 /* Function vect_vfa_segment_size.
3765 Input:
3766 DR_INFO: The data reference.
3767 LENGTH_FACTOR: segment length to consider.
3769 Return a value suitable for the dr_with_seg_len::seg_len field.
3770 This is the "distance travelled" by the pointer from the first
3771 iteration in the segment to the last. Note that it does not include
3772 the size of the access; in effect it only describes the first byte. */
3774 static tree
3775 vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor)
3777 length_factor = size_binop (MINUS_EXPR,
3778 fold_convert (sizetype, length_factor),
3779 size_one_node);
3780 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)),
3781 length_factor);
3784 /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3785 gives the worst-case number of bytes covered by the segment. */
3787 static unsigned HOST_WIDE_INT
3788 vect_vfa_access_size (vec_info *vinfo, dr_vec_info *dr_info)
3790 stmt_vec_info stmt_vinfo = dr_info->stmt;
3791 tree ref_type = TREE_TYPE (DR_REF (dr_info->dr));
3792 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3793 unsigned HOST_WIDE_INT access_size = ref_size;
3794 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3796 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo);
3797 access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3799 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3800 int misalignment;
3801 if (STMT_VINFO_VEC_STMTS (stmt_vinfo).exists ()
3802 && ((misalignment = dr_misalignment (dr_info, vectype)), true)
3803 && (vect_supportable_dr_alignment (vinfo, dr_info, vectype, misalignment)
3804 == dr_explicit_realign_optimized))
3806 /* We might access a full vector's worth. */
3807 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3809 return access_size;
3812 /* Get the minimum alignment for all the scalar accesses that DR_INFO
3813 describes. */
3815 static unsigned int
3816 vect_vfa_align (dr_vec_info *dr_info)
3818 return dr_alignment (dr_info->dr);
3821 /* Function vect_no_alias_p.
3823 Given data references A and B with equal base and offset, see whether
3824 the alias relation can be decided at compilation time. Return 1 if
3825 it can and the references alias, 0 if it can and the references do
3826 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3827 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3828 of dr_with_seg_len::{seg_len,access_size} for A and B. */
3830 static int
3831 vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
3832 tree segment_length_a, tree segment_length_b,
3833 unsigned HOST_WIDE_INT access_size_a,
3834 unsigned HOST_WIDE_INT access_size_b)
3836 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr));
3837 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr));
3838 poly_uint64 const_length_a;
3839 poly_uint64 const_length_b;
3841 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3842 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3843 [a, a+12) */
3844 if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0)
3846 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3847 offset_a -= const_length_a;
3849 else
3850 const_length_a = tree_to_poly_uint64 (segment_length_a);
3851 if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0)
3853 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3854 offset_b -= const_length_b;
3856 else
3857 const_length_b = tree_to_poly_uint64 (segment_length_b);
3859 const_length_a += access_size_a;
3860 const_length_b += access_size_b;
3862 if (ranges_known_overlap_p (offset_a, const_length_a,
3863 offset_b, const_length_b))
3864 return 1;
3866 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3867 offset_b, const_length_b))
3868 return 0;
3870 return -1;
3873 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3874 in DDR is >= VF. */
3876 static bool
3877 dependence_distance_ge_vf (data_dependence_relation *ddr,
3878 unsigned int loop_depth, poly_uint64 vf)
3880 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3881 || DDR_NUM_DIST_VECTS (ddr) == 0)
3882 return false;
3884 /* If the dependence is exact, we should have limited the VF instead. */
3885 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3887 unsigned int i;
3888 lambda_vector dist_v;
3889 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3891 HOST_WIDE_INT dist = dist_v[loop_depth];
3892 if (dist != 0
3893 && !(dist > 0 && DDR_REVERSED_P (ddr))
3894 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3895 return false;
3898 if (dump_enabled_p ())
3899 dump_printf_loc (MSG_NOTE, vect_location,
3900 "dependence distance between %T and %T is >= VF\n",
3901 DR_REF (DDR_A (ddr)), DR_REF (DDR_B (ddr)));
3903 return true;
3906 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3908 static void
3909 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3911 dump_printf (dump_kind, "%s (%T) >= ",
3912 lower_bound.unsigned_p ? "unsigned" : "abs",
3913 lower_bound.expr);
3914 dump_dec (dump_kind, lower_bound.min_value);
3917 /* Record that the vectorized loop requires the vec_lower_bound described
3918 by EXPR, UNSIGNED_P and MIN_VALUE. */
3920 static void
3921 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3922 poly_uint64 min_value)
3924 vec<vec_lower_bound> &lower_bounds
3925 = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3926 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3927 if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3929 unsigned_p &= lower_bounds[i].unsigned_p;
3930 min_value = upper_bound (lower_bounds[i].min_value, min_value);
3931 if (lower_bounds[i].unsigned_p != unsigned_p
3932 || maybe_lt (lower_bounds[i].min_value, min_value))
3934 lower_bounds[i].unsigned_p = unsigned_p;
3935 lower_bounds[i].min_value = min_value;
3936 if (dump_enabled_p ())
3938 dump_printf_loc (MSG_NOTE, vect_location,
3939 "updating run-time check to ");
3940 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3941 dump_printf (MSG_NOTE, "\n");
3944 return;
3947 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3948 if (dump_enabled_p ())
3950 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3951 dump_lower_bound (MSG_NOTE, lower_bound);
3952 dump_printf (MSG_NOTE, "\n");
3954 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3957 /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3958 will span fewer than GAP bytes. */
3960 static bool
3961 vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info,
3962 poly_int64 gap)
3964 stmt_vec_info stmt_info = dr_info->stmt;
3965 HOST_WIDE_INT count
3966 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3967 if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3968 count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
3969 return (estimated_poly_value (gap)
3970 <= count * vect_get_scalar_dr_size (dr_info));
3973 /* Return true if we know that there is no alias between DR_INFO_A and
3974 DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3975 When returning true, set *LOWER_BOUND_OUT to this N. */
3977 static bool
3978 vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
3979 poly_uint64 *lower_bound_out)
3981 /* Check that there is a constant gap of known sign between DR_A
3982 and DR_B. */
3983 data_reference *dr_a = dr_info_a->dr;
3984 data_reference *dr_b = dr_info_b->dr;
3985 poly_int64 init_a, init_b;
3986 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3987 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3988 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3989 || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3990 || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3991 || !ordered_p (init_a, init_b))
3992 return false;
3994 /* Sort DR_A and DR_B by the address they access. */
3995 if (maybe_lt (init_b, init_a))
3997 std::swap (init_a, init_b);
3998 std::swap (dr_info_a, dr_info_b);
3999 std::swap (dr_a, dr_b);
4002 /* If the two accesses could be dependent within a scalar iteration,
4003 make sure that we'd retain their order. */
4004 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
4005 && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
4006 return false;
4008 /* There is no alias if abs (DR_STEP) is greater than or equal to
4009 the bytes spanned by the combination of the two accesses. */
4010 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a;
4011 return true;
4014 /* Function vect_prune_runtime_alias_test_list.
4016 Prune a list of ddrs to be tested at run-time by versioning for alias.
4017 Merge several alias checks into one if possible.
4018 Return FALSE if resulting list of ddrs is longer then allowed by
4019 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
4021 opt_result
4022 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
4024 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
4025 hash_set <tree_pair_hash> compared_objects;
4027 const vec<ddr_p> &may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
4028 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
4029 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
4030 const vec<vec_object_pair> &check_unequal_addrs
4031 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
4032 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4033 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
4035 ddr_p ddr;
4036 unsigned int i;
4037 tree length_factor;
4039 DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
4041 /* Step values are irrelevant for aliasing if the number of vector
4042 iterations is equal to the number of scalar iterations (which can
4043 happen for fully-SLP loops). */
4044 bool vf_one_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
4046 if (!vf_one_p)
4048 /* Convert the checks for nonzero steps into bound tests. */
4049 tree value;
4050 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
4051 vect_check_lower_bound (loop_vinfo, value, true, 1);
4054 if (may_alias_ddrs.is_empty ())
4055 return opt_result::success ();
4057 comp_alias_ddrs.create (may_alias_ddrs.length ());
4059 unsigned int loop_depth
4060 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
4061 LOOP_VINFO_LOOP_NEST (loop_vinfo));
4063 /* First, we collect all data ref pairs for aliasing checks. */
4064 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
4066 poly_uint64 lower_bound;
4067 tree segment_length_a, segment_length_b;
4068 unsigned HOST_WIDE_INT access_size_a, access_size_b;
4069 unsigned int align_a, align_b;
4071 /* Ignore the alias if the VF we chose ended up being no greater
4072 than the dependence distance. */
4073 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
4074 continue;
4076 if (DDR_OBJECT_A (ddr))
4078 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
4079 if (!compared_objects.add (new_pair))
4081 if (dump_enabled_p ())
4082 dump_printf_loc (MSG_NOTE, vect_location,
4083 "checking that %T and %T"
4084 " have different addresses\n",
4085 new_pair.first, new_pair.second);
4086 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
4088 continue;
4091 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
4092 stmt_vec_info stmt_info_a = dr_info_a->stmt;
4094 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
4095 stmt_vec_info stmt_info_b = dr_info_b->stmt;
4097 bool preserves_scalar_order_p
4098 = vect_preserves_scalar_order_p (dr_info_a, dr_info_b);
4099 bool ignore_step_p
4100 = (vf_one_p
4101 && (preserves_scalar_order_p
4102 || operand_equal_p (DR_STEP (dr_info_a->dr),
4103 DR_STEP (dr_info_b->dr))));
4105 /* Skip the pair if inter-iteration dependencies are irrelevant
4106 and intra-iteration dependencies are guaranteed to be honored. */
4107 if (ignore_step_p
4108 && (preserves_scalar_order_p
4109 || vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
4110 &lower_bound)))
4112 if (dump_enabled_p ())
4113 dump_printf_loc (MSG_NOTE, vect_location,
4114 "no need for alias check between "
4115 "%T and %T when VF is 1\n",
4116 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
4117 continue;
4120 /* See whether we can handle the alias using a bounds check on
4121 the step, and whether that's likely to be the best approach.
4122 (It might not be, for example, if the minimum step is much larger
4123 than the number of bytes handled by one vector iteration.) */
4124 if (!ignore_step_p
4125 && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST
4126 && vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
4127 &lower_bound)
4128 && (vect_small_gap_p (loop_vinfo, dr_info_a, lower_bound)
4129 || vect_small_gap_p (loop_vinfo, dr_info_b, lower_bound)))
4131 bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr);
4132 if (dump_enabled_p ())
4134 dump_printf_loc (MSG_NOTE, vect_location, "no alias between "
4135 "%T and %T when the step %T is outside ",
4136 DR_REF (dr_info_a->dr),
4137 DR_REF (dr_info_b->dr),
4138 DR_STEP (dr_info_a->dr));
4139 if (unsigned_p)
4140 dump_printf (MSG_NOTE, "[0");
4141 else
4143 dump_printf (MSG_NOTE, "(");
4144 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
4146 dump_printf (MSG_NOTE, ", ");
4147 dump_dec (MSG_NOTE, lower_bound);
4148 dump_printf (MSG_NOTE, ")\n");
4150 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr),
4151 unsigned_p, lower_bound);
4152 continue;
4155 stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a);
4156 if (dr_group_first_a)
4158 stmt_info_a = dr_group_first_a;
4159 dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a);
4162 stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b);
4163 if (dr_group_first_b)
4165 stmt_info_b = dr_group_first_b;
4166 dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b);
4169 if (ignore_step_p)
4171 segment_length_a = size_zero_node;
4172 segment_length_b = size_zero_node;
4174 else
4176 if (!operand_equal_p (DR_STEP (dr_info_a->dr),
4177 DR_STEP (dr_info_b->dr), 0))
4178 length_factor = scalar_loop_iters;
4179 else
4180 length_factor = size_int (vect_factor);
4181 segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
4182 segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
4184 access_size_a = vect_vfa_access_size (loop_vinfo, dr_info_a);
4185 access_size_b = vect_vfa_access_size (loop_vinfo, dr_info_b);
4186 align_a = vect_vfa_align (dr_info_a);
4187 align_b = vect_vfa_align (dr_info_b);
4189 /* See whether the alias is known at compilation time. */
4190 if (operand_equal_p (DR_BASE_ADDRESS (dr_info_a->dr),
4191 DR_BASE_ADDRESS (dr_info_b->dr), 0)
4192 && operand_equal_p (DR_OFFSET (dr_info_a->dr),
4193 DR_OFFSET (dr_info_b->dr), 0)
4194 && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST
4195 && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST
4196 && poly_int_tree_p (segment_length_a)
4197 && poly_int_tree_p (segment_length_b))
4199 int res = vect_compile_time_alias (dr_info_a, dr_info_b,
4200 segment_length_a,
4201 segment_length_b,
4202 access_size_a,
4203 access_size_b);
4204 if (res >= 0 && dump_enabled_p ())
4206 dump_printf_loc (MSG_NOTE, vect_location,
4207 "can tell at compile time that %T and %T",
4208 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
4209 if (res == 0)
4210 dump_printf (MSG_NOTE, " do not alias\n");
4211 else
4212 dump_printf (MSG_NOTE, " alias\n");
4215 if (res == 0)
4216 continue;
4218 if (res == 1)
4219 return opt_result::failure_at (stmt_info_b->stmt,
4220 "not vectorized:"
4221 " compilation time alias: %G%G",
4222 stmt_info_a->stmt,
4223 stmt_info_b->stmt);
4226 dr_with_seg_len dr_a (dr_info_a->dr, segment_length_a,
4227 access_size_a, align_a);
4228 dr_with_seg_len dr_b (dr_info_b->dr, segment_length_b,
4229 access_size_b, align_b);
4230 /* Canonicalize the order to be the one that's needed for accurate
4231 RAW, WAR and WAW flags, in cases where the data references are
4232 well-ordered. The order doesn't really matter otherwise,
4233 but we might as well be consistent. */
4234 if (get_later_stmt (stmt_info_a, stmt_info_b) == stmt_info_a)
4235 std::swap (dr_a, dr_b);
4237 dr_with_seg_len_pair_t dr_with_seg_len_pair
4238 (dr_a, dr_b, (preserves_scalar_order_p
4239 ? dr_with_seg_len_pair_t::WELL_ORDERED
4240 : dr_with_seg_len_pair_t::REORDERED));
4242 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
4245 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
4247 unsigned int count = (comp_alias_ddrs.length ()
4248 + check_unequal_addrs.length ());
4250 if (count
4251 && (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo))
4252 == VECT_COST_MODEL_VERY_CHEAP))
4253 return opt_result::failure_at
4254 (vect_location, "would need a runtime alias check\n");
4256 if (dump_enabled_p ())
4257 dump_printf_loc (MSG_NOTE, vect_location,
4258 "improved number of alias checks from %d to %d\n",
4259 may_alias_ddrs.length (), count);
4260 unsigned limit = param_vect_max_version_for_alias_checks;
4261 if (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo)) == VECT_COST_MODEL_CHEAP)
4262 limit = param_vect_max_version_for_alias_checks * 6 / 10;
4263 if (count > limit)
4264 return opt_result::failure_at
4265 (vect_location,
4266 "number of versioning for alias run-time tests exceeds %d "
4267 "(--param vect-max-version-for-alias-checks)\n", limit);
4269 return opt_result::success ();
4272 /* Check whether we can use an internal function for a gather load
4273 or scatter store. READ_P is true for loads and false for stores.
4274 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
4275 the type of the memory elements being loaded or stored. OFFSET_TYPE
4276 is the type of the offset that is being applied to the invariant
4277 base address. SCALE is the amount by which the offset should
4278 be multiplied *after* it has been converted to address width.
4280 Return true if the function is supported, storing the function id in
4281 *IFN_OUT and the vector type for the offset in *OFFSET_VECTYPE_OUT.
4283 If we can use gather and store the possible else values in ELSVALS. */
4285 bool
4286 vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
4287 tree vectype, tree memory_type, tree offset_type,
4288 int scale, internal_fn *ifn_out,
4289 tree *offset_vectype_out, vec<int> *elsvals)
4291 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
4292 unsigned int element_bits = vector_element_bits (vectype);
4293 if (element_bits != memory_bits)
4294 /* For now the vector elements must be the same width as the
4295 memory elements. */
4296 return false;
4298 /* Work out which function we need. */
4299 internal_fn ifn, alt_ifn, alt_ifn2;
4300 if (read_p)
4302 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
4303 alt_ifn = IFN_MASK_GATHER_LOAD;
4304 /* When target supports MASK_LEN_GATHER_LOAD, we always
4305 use MASK_LEN_GATHER_LOAD regardless whether len and
4306 mask are valid or not. */
4307 alt_ifn2 = IFN_MASK_LEN_GATHER_LOAD;
4309 else
4311 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
4312 alt_ifn = IFN_MASK_SCATTER_STORE;
4313 /* When target supports MASK_LEN_SCATTER_STORE, we always
4314 use MASK_LEN_SCATTER_STORE regardless whether len and
4315 mask are valid or not. */
4316 alt_ifn2 = IFN_MASK_LEN_SCATTER_STORE;
4319 for (;;)
4321 tree offset_vectype = get_vectype_for_scalar_type (vinfo, offset_type);
4322 if (!offset_vectype)
4323 return false;
4325 /* Test whether the target supports this combination. */
4326 if (internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
4327 offset_vectype, scale,
4328 elsvals))
4330 *ifn_out = ifn;
4331 *offset_vectype_out = offset_vectype;
4332 return true;
4334 else if (!masked_p
4335 && internal_gather_scatter_fn_supported_p (alt_ifn, vectype,
4336 memory_type,
4337 offset_vectype,
4338 scale, elsvals))
4340 *ifn_out = alt_ifn;
4341 *offset_vectype_out = offset_vectype;
4342 return true;
4344 else if (internal_gather_scatter_fn_supported_p (alt_ifn2, vectype,
4345 memory_type,
4346 offset_vectype, scale,
4347 elsvals))
4349 *ifn_out = alt_ifn2;
4350 *offset_vectype_out = offset_vectype;
4351 return true;
4354 if (TYPE_PRECISION (offset_type) >= POINTER_SIZE
4355 && TYPE_PRECISION (offset_type) >= element_bits)
4356 return false;
4358 offset_type = build_nonstandard_integer_type
4359 (TYPE_PRECISION (offset_type) * 2, TYPE_UNSIGNED (offset_type));
4363 /* STMT_INFO is a call to an internal gather load or scatter store function.
4364 Describe the operation in INFO. */
4366 static void
4367 vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
4368 gather_scatter_info *info)
4370 gcall *call = as_a <gcall *> (stmt_info->stmt);
4371 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4372 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4374 info->ifn = gimple_call_internal_fn (call);
4375 info->decl = NULL_TREE;
4376 info->base = gimple_call_arg (call, 0);
4377 info->offset = gimple_call_arg (call, 1);
4378 info->offset_dt = vect_unknown_def_type;
4379 info->offset_vectype = NULL_TREE;
4380 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
4381 info->element_type = TREE_TYPE (vectype);
4382 info->memory_type = TREE_TYPE (DR_REF (dr));
4385 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
4386 gather load or scatter store. Describe the operation in *INFO if so.
4387 If it is suitable and ELSVALS is nonzero store the supported else values
4388 in the vector it points to. */
4390 bool
4391 vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
4392 gather_scatter_info *info, vec<int> *elsvals)
4394 HOST_WIDE_INT scale = 1;
4395 poly_int64 pbitpos, pbitsize;
4396 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4397 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4398 tree offtype = NULL_TREE;
4399 tree decl = NULL_TREE, base, off;
4400 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4401 tree memory_type = TREE_TYPE (DR_REF (dr));
4402 machine_mode pmode;
4403 int punsignedp, reversep, pvolatilep = 0;
4404 internal_fn ifn;
4405 tree offset_vectype;
4406 bool masked_p = false;
4408 /* See whether this is already a call to a gather/scatter internal function.
4409 If not, see whether it's a masked load or store. */
4410 gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
4411 if (call && gimple_call_internal_p (call))
4413 ifn = gimple_call_internal_fn (call);
4414 if (internal_gather_scatter_fn_p (ifn))
4416 vect_describe_gather_scatter_call (stmt_info, info);
4418 /* In pattern recog we simply used a ZERO else value that
4419 we need to correct here. To that end just re-use the
4420 (already succesful) check if we support a gather IFN
4421 and have it populate the else values. */
4422 if (DR_IS_READ (dr) && internal_fn_mask_index (ifn) >= 0 && elsvals)
4423 supports_vec_gather_load_p (TYPE_MODE (vectype), elsvals);
4424 return true;
4426 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
4429 /* ??? For epilogues we adjust DR_REF to make the following stmt-based
4430 analysis work, but this adjustment doesn't work for epilogues of
4431 epilogues during transform, so disable gather/scatter in that case. */
4432 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4433 && LOOP_VINFO_EPILOGUE_P (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)))
4434 return false;
4436 /* True if we should aim to use internal functions rather than
4437 built-in functions. */
4438 bool use_ifn_p = (DR_IS_READ (dr)
4439 ? supports_vec_gather_load_p (TYPE_MODE (vectype),
4440 elsvals)
4441 : supports_vec_scatter_store_p (TYPE_MODE (vectype)));
4443 base = DR_REF (dr);
4444 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
4445 see if we can use the def stmt of the address. */
4446 if (masked_p
4447 && TREE_CODE (base) == MEM_REF
4448 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
4449 && integer_zerop (TREE_OPERAND (base, 1))
4450 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
4452 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
4453 if (is_gimple_assign (def_stmt)
4454 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
4455 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
4458 /* The gather and scatter builtins need address of the form
4459 loop_invariant + vector * {1, 2, 4, 8}
4461 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
4462 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
4463 of loop invariants/SSA_NAMEs defined in the loop, with casts,
4464 multiplications and additions in it. To get a vector, we need
4465 a single SSA_NAME that will be defined in the loop and will
4466 contain everything that is not loop invariant and that can be
4467 vectorized. The following code attempts to find such a preexistng
4468 SSA_NAME OFF and put the loop invariants into a tree BASE
4469 that can be gimplified before the loop. */
4470 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
4471 &punsignedp, &reversep, &pvolatilep);
4472 if (reversep)
4473 return false;
4475 /* PR 107346. Packed structs can have fields at offsets that are not
4476 multiples of BITS_PER_UNIT. Do not use gather/scatters in such cases. */
4477 if (!multiple_p (pbitpos, BITS_PER_UNIT))
4478 return false;
4480 /* We need to be able to form an address to the base which for example
4481 isn't possible for hard registers. */
4482 if (may_be_nonaddressable_p (base))
4483 return false;
4485 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
4487 if (TREE_CODE (base) == MEM_REF)
4489 if (!integer_zerop (TREE_OPERAND (base, 1)))
4491 if (off == NULL_TREE)
4492 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
4493 else
4494 off = size_binop (PLUS_EXPR, off,
4495 fold_convert (sizetype, TREE_OPERAND (base, 1)));
4497 base = TREE_OPERAND (base, 0);
4499 else
4500 base = build_fold_addr_expr (base);
4502 if (off == NULL_TREE)
4503 off = size_zero_node;
4505 /* If base is not loop invariant, either off is 0, then we start with just
4506 the constant offset in the loop invariant BASE and continue with base
4507 as OFF, otherwise give up.
4508 We could handle that case by gimplifying the addition of base + off
4509 into some SSA_NAME and use that as off, but for now punt. */
4510 if (!expr_invariant_in_loop_p (loop, base))
4512 if (!integer_zerop (off))
4513 return false;
4514 off = base;
4515 base = size_int (pbytepos);
4517 /* Otherwise put base + constant offset into the loop invariant BASE
4518 and continue with OFF. */
4519 else
4521 base = fold_convert (sizetype, base);
4522 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
4525 /* OFF at this point may be either a SSA_NAME or some tree expression
4526 from get_inner_reference. Try to peel off loop invariants from it
4527 into BASE as long as possible. */
4528 STRIP_NOPS (off);
4529 while (offtype == NULL_TREE)
4531 enum tree_code code;
4532 tree op0, op1, add = NULL_TREE;
4534 if (TREE_CODE (off) == SSA_NAME)
4536 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
4538 if (expr_invariant_in_loop_p (loop, off))
4539 return false;
4541 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
4542 break;
4544 op0 = gimple_assign_rhs1 (def_stmt);
4545 code = gimple_assign_rhs_code (def_stmt);
4546 op1 = gimple_assign_rhs2 (def_stmt);
4548 else
4550 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
4551 return false;
4552 code = TREE_CODE (off);
4553 extract_ops_from_tree (off, &code, &op0, &op1);
4555 switch (code)
4557 case POINTER_PLUS_EXPR:
4558 case PLUS_EXPR:
4559 if (expr_invariant_in_loop_p (loop, op0))
4561 add = op0;
4562 off = op1;
4563 do_add:
4564 add = fold_convert (sizetype, add);
4565 if (scale != 1)
4566 add = size_binop (MULT_EXPR, add, size_int (scale));
4567 base = size_binop (PLUS_EXPR, base, add);
4568 continue;
4570 if (expr_invariant_in_loop_p (loop, op1))
4572 add = op1;
4573 off = op0;
4574 goto do_add;
4576 break;
4577 case MINUS_EXPR:
4578 if (expr_invariant_in_loop_p (loop, op1))
4580 add = fold_convert (sizetype, op1);
4581 add = size_binop (MINUS_EXPR, size_zero_node, add);
4582 off = op0;
4583 goto do_add;
4585 break;
4586 case MULT_EXPR:
4587 if (scale == 1 && tree_fits_shwi_p (op1))
4589 int new_scale = tree_to_shwi (op1);
4590 /* Only treat this as a scaling operation if the target
4591 supports it for at least some offset type. */
4592 if (use_ifn_p
4593 && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4594 masked_p, vectype, memory_type,
4595 signed_char_type_node,
4596 new_scale, &ifn,
4597 &offset_vectype,
4598 elsvals)
4599 && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4600 masked_p, vectype, memory_type,
4601 unsigned_char_type_node,
4602 new_scale, &ifn,
4603 &offset_vectype,
4604 elsvals))
4605 break;
4606 scale = new_scale;
4607 off = op0;
4608 continue;
4610 break;
4611 case SSA_NAME:
4612 off = op0;
4613 continue;
4614 CASE_CONVERT:
4615 if (!POINTER_TYPE_P (TREE_TYPE (op0))
4616 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4617 break;
4619 /* Don't include the conversion if the target is happy with
4620 the current offset type. */
4621 if (use_ifn_p
4622 && TREE_CODE (off) == SSA_NAME
4623 && !POINTER_TYPE_P (TREE_TYPE (off))
4624 && vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4625 masked_p, vectype, memory_type,
4626 TREE_TYPE (off), scale, &ifn,
4627 &offset_vectype, elsvals))
4628 break;
4630 if (TYPE_PRECISION (TREE_TYPE (op0))
4631 == TYPE_PRECISION (TREE_TYPE (off)))
4633 off = op0;
4634 continue;
4637 /* Include the conversion if it is widening and we're using
4638 the IFN path or the target can handle the converted from
4639 offset or the current size is not already the same as the
4640 data vector element size. */
4641 if ((TYPE_PRECISION (TREE_TYPE (op0))
4642 < TYPE_PRECISION (TREE_TYPE (off)))
4643 && (use_ifn_p
4644 || (DR_IS_READ (dr)
4645 ? (targetm.vectorize.builtin_gather
4646 && targetm.vectorize.builtin_gather (vectype,
4647 TREE_TYPE (op0),
4648 scale))
4649 : (targetm.vectorize.builtin_scatter
4650 && targetm.vectorize.builtin_scatter (vectype,
4651 TREE_TYPE (op0),
4652 scale)))
4653 || !operand_equal_p (TYPE_SIZE (TREE_TYPE (off)),
4654 TYPE_SIZE (TREE_TYPE (vectype)), 0)))
4656 off = op0;
4657 offtype = TREE_TYPE (off);
4658 STRIP_NOPS (off);
4659 continue;
4661 break;
4662 default:
4663 break;
4665 break;
4668 /* If at the end OFF still isn't a SSA_NAME or isn't
4669 defined in the loop, punt. */
4670 if (TREE_CODE (off) != SSA_NAME
4671 || expr_invariant_in_loop_p (loop, off))
4672 return false;
4674 if (offtype == NULL_TREE)
4675 offtype = TREE_TYPE (off);
4677 if (use_ifn_p)
4679 if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
4680 vectype, memory_type, offtype, scale,
4681 &ifn, &offset_vectype, elsvals))
4682 ifn = IFN_LAST;
4683 decl = NULL_TREE;
4685 else
4687 if (DR_IS_READ (dr))
4689 if (targetm.vectorize.builtin_gather)
4690 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
4692 else
4694 if (targetm.vectorize.builtin_scatter)
4695 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
4697 ifn = IFN_LAST;
4698 /* The offset vector type will be read from DECL when needed. */
4699 offset_vectype = NULL_TREE;
4702 info->ifn = ifn;
4703 info->decl = decl;
4704 info->base = base;
4705 info->offset = off;
4706 info->offset_dt = vect_unknown_def_type;
4707 info->offset_vectype = offset_vectype;
4708 info->scale = scale;
4709 info->element_type = TREE_TYPE (vectype);
4710 info->memory_type = memory_type;
4711 return true;
4714 /* Find the data references in STMT, analyze them with respect to LOOP and
4715 append them to DATAREFS. Return false if datarefs in this stmt cannot
4716 be handled. */
4718 opt_result
4719 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
4720 vec<data_reference_p> *datarefs,
4721 vec<int> *dataref_groups, int group_id)
4723 /* We can ignore clobbers for dataref analysis - they are removed during
4724 loop vectorization and BB vectorization checks dependences with a
4725 stmt walk. */
4726 if (gimple_clobber_p (stmt))
4727 return opt_result::success ();
4729 if (gimple_has_volatile_ops (stmt))
4730 return opt_result::failure_at (stmt, "not vectorized: volatile type: %G",
4731 stmt);
4733 if (stmt_can_throw_internal (cfun, stmt))
4734 return opt_result::failure_at (stmt,
4735 "not vectorized:"
4736 " statement can throw an exception: %G",
4737 stmt);
4739 auto_vec<data_reference_p, 2> refs;
4740 opt_result res = find_data_references_in_stmt (loop, stmt, &refs);
4741 if (!res)
4742 return res;
4744 if (refs.is_empty ())
4745 return opt_result::success ();
4747 if (refs.length () > 1)
4749 while (!refs.is_empty ())
4750 free_data_ref (refs.pop ());
4751 return opt_result::failure_at (stmt,
4752 "not vectorized: more than one "
4753 "data ref in stmt: %G", stmt);
4756 data_reference_p dr = refs.pop ();
4757 if (gcall *call = dyn_cast <gcall *> (stmt))
4758 if (!gimple_call_internal_p (call)
4759 || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
4760 && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4762 free_data_ref (dr);
4763 return opt_result::failure_at (stmt,
4764 "not vectorized: dr in a call %G", stmt);
4767 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4768 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4770 free_data_ref (dr);
4771 return opt_result::failure_at (stmt,
4772 "not vectorized:"
4773 " statement is an unsupported"
4774 " bitfield access %G", stmt);
4777 if (DR_BASE_ADDRESS (dr)
4778 && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4780 free_data_ref (dr);
4781 return opt_result::failure_at (stmt,
4782 "not vectorized:"
4783 " base addr of dr is a constant\n");
4786 /* Check whether this may be a SIMD lane access and adjust the
4787 DR to make it easier for us to handle it. */
4788 if (loop
4789 && loop->simduid
4790 && (!DR_BASE_ADDRESS (dr)
4791 || !DR_OFFSET (dr)
4792 || !DR_INIT (dr)
4793 || !DR_STEP (dr)))
4795 struct data_reference *newdr
4796 = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
4797 DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
4798 if (DR_BASE_ADDRESS (newdr)
4799 && DR_OFFSET (newdr)
4800 && DR_INIT (newdr)
4801 && DR_STEP (newdr)
4802 && TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4803 && integer_zerop (DR_STEP (newdr)))
4805 tree base_address = DR_BASE_ADDRESS (newdr);
4806 tree off = DR_OFFSET (newdr);
4807 tree step = ssize_int (1);
4808 if (integer_zerop (off)
4809 && TREE_CODE (base_address) == POINTER_PLUS_EXPR)
4811 off = TREE_OPERAND (base_address, 1);
4812 base_address = TREE_OPERAND (base_address, 0);
4814 STRIP_NOPS (off);
4815 if (TREE_CODE (off) == MULT_EXPR
4816 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4818 step = TREE_OPERAND (off, 1);
4819 off = TREE_OPERAND (off, 0);
4820 STRIP_NOPS (off);
4822 if (CONVERT_EXPR_P (off)
4823 && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
4824 < TYPE_PRECISION (TREE_TYPE (off))))
4825 off = TREE_OPERAND (off, 0);
4826 if (TREE_CODE (off) == SSA_NAME)
4828 gimple *def = SSA_NAME_DEF_STMT (off);
4829 /* Look through widening conversion. */
4830 if (is_gimple_assign (def)
4831 && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def)))
4833 tree rhs1 = gimple_assign_rhs1 (def);
4834 if (TREE_CODE (rhs1) == SSA_NAME
4835 && INTEGRAL_TYPE_P (TREE_TYPE (rhs1))
4836 && (TYPE_PRECISION (TREE_TYPE (off))
4837 > TYPE_PRECISION (TREE_TYPE (rhs1))))
4838 def = SSA_NAME_DEF_STMT (rhs1);
4840 if (is_gimple_call (def)
4841 && gimple_call_internal_p (def)
4842 && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
4844 tree arg = gimple_call_arg (def, 0);
4845 tree reft = TREE_TYPE (DR_REF (newdr));
4846 gcc_assert (TREE_CODE (arg) == SSA_NAME);
4847 arg = SSA_NAME_VAR (arg);
4848 if (arg == loop->simduid
4849 /* For now. */
4850 && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
4852 DR_BASE_ADDRESS (newdr) = base_address;
4853 DR_OFFSET (newdr) = ssize_int (0);
4854 DR_STEP (newdr) = step;
4855 DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
4856 DR_STEP_ALIGNMENT (newdr) = highest_pow2_factor (step);
4857 /* Mark as simd-lane access. */
4858 tree arg2 = gimple_call_arg (def, 1);
4859 newdr->aux = (void *) (-1 - tree_to_uhwi (arg2));
4860 free_data_ref (dr);
4861 datarefs->safe_push (newdr);
4862 if (dataref_groups)
4863 dataref_groups->safe_push (group_id);
4864 return opt_result::success ();
4869 free_data_ref (newdr);
4872 datarefs->safe_push (dr);
4873 if (dataref_groups)
4874 dataref_groups->safe_push (group_id);
4875 return opt_result::success ();
4878 /* Function vect_analyze_data_refs.
4880 Find all the data references in the loop or basic block.
4882 The general structure of the analysis of data refs in the vectorizer is as
4883 follows:
4884 1- vect_analyze_data_refs(loop/bb): call
4885 compute_data_dependences_for_loop/bb to find and analyze all data-refs
4886 in the loop/bb and their dependences.
4887 2- vect_analyze_dependences(): apply dependence testing using ddrs.
4888 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4889 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4893 opt_result
4894 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
4896 class loop *loop = NULL;
4897 unsigned int i;
4898 struct data_reference *dr;
4899 tree scalar_type;
4901 DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4903 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4904 loop = LOOP_VINFO_LOOP (loop_vinfo);
4906 /* Go through the data-refs, check that the analysis succeeded. Update
4907 pointer from stmt_vec_info struct to DR and vectype. */
4909 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
4910 FOR_EACH_VEC_ELT (datarefs, i, dr)
4912 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4913 poly_uint64 vf;
4915 gcc_assert (DR_REF (dr));
4916 stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr));
4917 gcc_assert (!stmt_info->dr_aux.dr);
4918 stmt_info->dr_aux.dr = dr;
4919 stmt_info->dr_aux.stmt = stmt_info;
4921 /* Check that analysis of the data-ref succeeded. */
4922 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4923 || !DR_STEP (dr))
4925 bool maybe_gather
4926 = DR_IS_READ (dr)
4927 && !TREE_THIS_VOLATILE (DR_REF (dr));
4928 bool maybe_scatter
4929 = DR_IS_WRITE (dr)
4930 && !TREE_THIS_VOLATILE (DR_REF (dr));
4932 /* If target supports vector gather loads or scatter stores,
4933 see if they can't be used. */
4934 if (is_a <loop_vec_info> (vinfo)
4935 && !nested_in_vect_loop_p (loop, stmt_info))
4937 if (maybe_gather || maybe_scatter)
4939 if (maybe_gather)
4940 gatherscatter = GATHER;
4941 else
4942 gatherscatter = SCATTER;
4946 if (gatherscatter == SG_NONE)
4948 if (dump_enabled_p ())
4949 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4950 "not vectorized: data ref analysis "
4951 "failed %G", stmt_info->stmt);
4952 if (is_a <bb_vec_info> (vinfo))
4954 /* In BB vectorization the ref can still participate
4955 in dependence analysis, we just can't vectorize it. */
4956 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4957 continue;
4959 return opt_result::failure_at (stmt_info->stmt,
4960 "not vectorized:"
4961 " data ref analysis failed: %G",
4962 stmt_info->stmt);
4966 /* See if this was detected as SIMD lane access. */
4967 if (dr->aux == (void *)-1
4968 || dr->aux == (void *)-2
4969 || dr->aux == (void *)-3
4970 || dr->aux == (void *)-4)
4972 if (nested_in_vect_loop_p (loop, stmt_info))
4973 return opt_result::failure_at (stmt_info->stmt,
4974 "not vectorized:"
4975 " data ref analysis failed: %G",
4976 stmt_info->stmt);
4977 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info)
4978 = -(uintptr_t) dr->aux;
4981 tree base = get_base_address (DR_REF (dr));
4982 if (base && VAR_P (base) && DECL_NONALIASED (base))
4984 if (dump_enabled_p ())
4985 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4986 "not vectorized: base object not addressable "
4987 "for stmt: %G", stmt_info->stmt);
4988 if (is_a <bb_vec_info> (vinfo))
4990 /* In BB vectorization the ref can still participate
4991 in dependence analysis, we just can't vectorize it. */
4992 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4993 continue;
4995 return opt_result::failure_at (stmt_info->stmt,
4996 "not vectorized: base object not"
4997 " addressable for stmt: %G",
4998 stmt_info->stmt);
5001 if (is_a <loop_vec_info> (vinfo)
5002 && DR_STEP (dr)
5003 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
5005 if (nested_in_vect_loop_p (loop, stmt_info))
5006 return opt_result::failure_at (stmt_info->stmt,
5007 "not vectorized: "
5008 "not suitable for strided load %G",
5009 stmt_info->stmt);
5010 STMT_VINFO_STRIDED_P (stmt_info) = true;
5013 /* Update DR field in stmt_vec_info struct. */
5015 /* If the dataref is in an inner-loop of the loop that is considered for
5016 for vectorization, we also want to analyze the access relative to
5017 the outer-loop (DR contains information only relative to the
5018 inner-most enclosing loop). We do that by building a reference to the
5019 first location accessed by the inner-loop, and analyze it relative to
5020 the outer-loop. */
5021 if (loop && nested_in_vect_loop_p (loop, stmt_info))
5023 /* Build a reference to the first location accessed by the
5024 inner loop: *(BASE + INIT + OFFSET). By construction,
5025 this address must be invariant in the inner loop, so we
5026 can consider it as being used in the outer loop. */
5027 tree base = unshare_expr (DR_BASE_ADDRESS (dr));
5028 tree offset = unshare_expr (DR_OFFSET (dr));
5029 tree init = unshare_expr (DR_INIT (dr));
5030 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
5031 init, offset);
5032 tree init_addr = fold_build_pointer_plus (base, init_offset);
5033 tree init_ref = build_fold_indirect_ref (init_addr);
5035 if (dump_enabled_p ())
5036 dump_printf_loc (MSG_NOTE, vect_location,
5037 "analyze in outer loop: %T\n", init_ref);
5039 opt_result res
5040 = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
5041 init_ref, loop, stmt_info->stmt);
5042 if (!res)
5043 /* dr_analyze_innermost already explained the failure. */
5044 return res;
5046 if (dump_enabled_p ())
5047 dump_printf_loc (MSG_NOTE, vect_location,
5048 "\touter base_address: %T\n"
5049 "\touter offset from base address: %T\n"
5050 "\touter constant offset from base address: %T\n"
5051 "\touter step: %T\n"
5052 "\touter base alignment: %d\n\n"
5053 "\touter base misalignment: %d\n"
5054 "\touter offset alignment: %d\n"
5055 "\touter step alignment: %d\n",
5056 STMT_VINFO_DR_BASE_ADDRESS (stmt_info),
5057 STMT_VINFO_DR_OFFSET (stmt_info),
5058 STMT_VINFO_DR_INIT (stmt_info),
5059 STMT_VINFO_DR_STEP (stmt_info),
5060 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info),
5061 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info),
5062 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info),
5063 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
5066 /* Set vectype for STMT. */
5067 scalar_type = TREE_TYPE (DR_REF (dr));
5068 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
5069 if (!vectype)
5071 if (dump_enabled_p ())
5073 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5074 "not vectorized: no vectype for stmt: %G",
5075 stmt_info->stmt);
5076 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
5077 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
5078 scalar_type);
5079 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
5082 if (is_a <bb_vec_info> (vinfo))
5084 /* No vector type is fine, the ref can still participate
5085 in dependence analysis, we just can't vectorize it. */
5086 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
5087 continue;
5089 if (fatal)
5090 *fatal = false;
5091 return opt_result::failure_at (stmt_info->stmt,
5092 "not vectorized:"
5093 " no vectype for stmt: %G"
5094 " scalar_type: %T\n",
5095 stmt_info->stmt, scalar_type);
5097 else
5099 if (dump_enabled_p ())
5100 dump_printf_loc (MSG_NOTE, vect_location,
5101 "got vectype for stmt: %G%T\n",
5102 stmt_info->stmt, vectype);
5105 /* Adjust the minimal vectorization factor according to the
5106 vector type. */
5107 vf = TYPE_VECTOR_SUBPARTS (vectype);
5108 *min_vf = upper_bound (*min_vf, vf);
5110 /* Leave the BB vectorizer to pick the vector type later, based on
5111 the final dataref group size and SLP node size. */
5112 if (is_a <loop_vec_info> (vinfo))
5113 STMT_VINFO_VECTYPE (stmt_info) = vectype;
5115 if (gatherscatter != SG_NONE)
5117 gather_scatter_info gs_info;
5118 if (!vect_check_gather_scatter (stmt_info,
5119 as_a <loop_vec_info> (vinfo),
5120 &gs_info)
5121 || !get_vectype_for_scalar_type (vinfo,
5122 TREE_TYPE (gs_info.offset)))
5124 if (fatal)
5125 *fatal = false;
5126 return opt_result::failure_at
5127 (stmt_info->stmt,
5128 (gatherscatter == GATHER)
5129 ? "not vectorized: not suitable for gather load %G"
5130 : "not vectorized: not suitable for scatter store %G",
5131 stmt_info->stmt);
5133 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
5137 /* We used to stop processing and prune the list here. Verify we no
5138 longer need to. */
5139 gcc_assert (i == datarefs.length ());
5141 return opt_result::success ();
5145 /* Function vect_get_new_vect_var.
5147 Returns a name for a new variable. The current naming scheme appends the
5148 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
5149 the name of vectorizer generated variables, and appends that to NAME if
5150 provided. */
5152 tree
5153 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
5155 const char *prefix;
5156 tree new_vect_var;
5158 switch (var_kind)
5160 case vect_simple_var:
5161 prefix = "vect";
5162 break;
5163 case vect_scalar_var:
5164 prefix = "stmp";
5165 break;
5166 case vect_mask_var:
5167 prefix = "mask";
5168 break;
5169 case vect_pointer_var:
5170 prefix = "vectp";
5171 break;
5172 default:
5173 gcc_unreachable ();
5176 if (name)
5178 char* tmp = concat (prefix, "_", name, NULL);
5179 new_vect_var = create_tmp_reg (type, tmp);
5180 free (tmp);
5182 else
5183 new_vect_var = create_tmp_reg (type, prefix);
5185 return new_vect_var;
5188 /* Like vect_get_new_vect_var but return an SSA name. */
5190 tree
5191 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
5193 const char *prefix;
5194 tree new_vect_var;
5196 switch (var_kind)
5198 case vect_simple_var:
5199 prefix = "vect";
5200 break;
5201 case vect_scalar_var:
5202 prefix = "stmp";
5203 break;
5204 case vect_pointer_var:
5205 prefix = "vectp";
5206 break;
5207 default:
5208 gcc_unreachable ();
5211 if (name)
5213 char* tmp = concat (prefix, "_", name, NULL);
5214 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
5215 free (tmp);
5217 else
5218 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
5220 return new_vect_var;
5223 /* Duplicate points-to info on NAME from DR_INFO. */
5225 static void
5226 vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info)
5228 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr));
5229 /* DR_PTR_INFO is for a base SSA name, not including constant or
5230 variable offsets in the ref so its alignment info does not apply. */
5231 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
5234 /* Function vect_create_addr_base_for_vector_ref.
5236 Create an expression that computes the address of the first memory location
5237 that will be accessed for a data reference.
5239 Input:
5240 STMT_INFO: The statement containing the data reference.
5241 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
5242 OFFSET: Optional. If supplied, it is be added to the initial address.
5243 LOOP: Specify relative to which loop-nest should the address be computed.
5244 For example, when the dataref is in an inner-loop nested in an
5245 outer-loop that is now being vectorized, LOOP can be either the
5246 outer-loop, or the inner-loop. The first memory location accessed
5247 by the following dataref ('in' points to short):
5249 for (i=0; i<N; i++)
5250 for (j=0; j<M; j++)
5251 s += in[i+j]
5253 is as follows:
5254 if LOOP=i_loop: &in (relative to i_loop)
5255 if LOOP=j_loop: &in+i*2B (relative to j_loop)
5257 Output:
5258 1. Return an SSA_NAME whose value is the address of the memory location of
5259 the first vector of the data reference.
5260 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
5261 these statement(s) which define the returned SSA_NAME.
5263 FORNOW: We are only handling array accesses with step 1. */
5265 tree
5266 vect_create_addr_base_for_vector_ref (vec_info *vinfo, stmt_vec_info stmt_info,
5267 gimple_seq *new_stmt_list,
5268 tree offset)
5270 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5271 struct data_reference *dr = dr_info->dr;
5272 const char *base_name;
5273 tree addr_base;
5274 tree dest;
5275 gimple_seq seq = NULL;
5276 tree vect_ptr_type;
5277 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
5278 innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
5280 tree data_ref_base = unshare_expr (drb->base_address);
5281 tree base_offset = unshare_expr (get_dr_vinfo_offset (vinfo, dr_info, true));
5282 tree init = unshare_expr (drb->init);
5284 if (loop_vinfo)
5285 base_name = get_name (data_ref_base);
5286 else
5288 base_offset = ssize_int (0);
5289 init = ssize_int (0);
5290 base_name = get_name (DR_REF (dr));
5293 /* Create base_offset */
5294 base_offset = size_binop (PLUS_EXPR,
5295 fold_convert (sizetype, base_offset),
5296 fold_convert (sizetype, init));
5298 if (offset)
5300 offset = fold_convert (sizetype, offset);
5301 base_offset = fold_build2 (PLUS_EXPR, sizetype,
5302 base_offset, offset);
5305 /* base + base_offset */
5306 if (loop_vinfo)
5307 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
5308 else
5309 addr_base = build1 (ADDR_EXPR,
5310 build_pointer_type (TREE_TYPE (DR_REF (dr))),
5311 /* Strip zero offset components since we don't need
5312 them and they can confuse late diagnostics if
5313 we CSE them wrongly. See PR106904 for example. */
5314 unshare_expr (strip_zero_offset_components
5315 (DR_REF (dr))));
5317 vect_ptr_type = build_pointer_type (TREE_TYPE (DR_REF (dr)));
5318 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
5319 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
5320 gimple_seq_add_seq (new_stmt_list, seq);
5322 if (DR_PTR_INFO (dr)
5323 && TREE_CODE (addr_base) == SSA_NAME
5324 /* We should only duplicate pointer info to newly created SSA names. */
5325 && SSA_NAME_VAR (addr_base) == dest)
5327 gcc_assert (!SSA_NAME_PTR_INFO (addr_base));
5328 vect_duplicate_ssa_name_ptr_info (addr_base, dr_info);
5331 if (dump_enabled_p ())
5332 dump_printf_loc (MSG_NOTE, vect_location, "created %T\n", addr_base);
5334 return addr_base;
5338 /* Function vect_create_data_ref_ptr.
5340 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
5341 location accessed in the loop by STMT_INFO, along with the def-use update
5342 chain to appropriately advance the pointer through the loop iterations.
5343 Also set aliasing information for the pointer. This pointer is used by
5344 the callers to this function to create a memory reference expression for
5345 vector load/store access.
5347 Input:
5348 1. STMT_INFO: a stmt that references memory. Expected to be of the form
5349 GIMPLE_ASSIGN <name, data-ref> or
5350 GIMPLE_ASSIGN <data-ref, name>.
5351 2. AGGR_TYPE: the type of the reference, which should be either a vector
5352 or an array.
5353 3. AT_LOOP: the loop where the vector memref is to be created.
5354 4. OFFSET (optional): a byte offset to be added to the initial address
5355 accessed by the data-ref in STMT_INFO.
5356 5. BSI: location where the new stmts are to be placed if there is no loop
5357 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
5358 pointing to the initial address.
5359 8. IV_STEP (optional, defaults to NULL): the amount that should be added
5360 to the IV during each iteration of the loop. NULL says to move
5361 by one copy of AGGR_TYPE up or down, depending on the step of the
5362 data reference.
5364 Output:
5365 1. Declare a new ptr to vector_type, and have it point to the base of the
5366 data reference (initial addressed accessed by the data reference).
5367 For example, for vector of type V8HI, the following code is generated:
5369 v8hi *ap;
5370 ap = (v8hi *)initial_address;
5372 if OFFSET is not supplied:
5373 initial_address = &a[init];
5374 if OFFSET is supplied:
5375 initial_address = &a[init] + OFFSET;
5376 if BYTE_OFFSET is supplied:
5377 initial_address = &a[init] + BYTE_OFFSET;
5379 Return the initial_address in INITIAL_ADDRESS.
5381 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
5382 update the pointer in each iteration of the loop.
5384 Return the increment stmt that updates the pointer in PTR_INCR.
5386 3. Return the pointer. */
5388 tree
5389 vect_create_data_ref_ptr (vec_info *vinfo, stmt_vec_info stmt_info,
5390 tree aggr_type, class loop *at_loop, tree offset,
5391 tree *initial_address, gimple_stmt_iterator *gsi,
5392 gimple **ptr_incr, bool only_init,
5393 tree iv_step)
5395 const char *base_name;
5396 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
5397 class loop *loop = NULL;
5398 bool nested_in_vect_loop = false;
5399 class loop *containing_loop = NULL;
5400 tree aggr_ptr_type;
5401 tree aggr_ptr;
5402 tree new_temp;
5403 gimple_seq new_stmt_list = NULL;
5404 edge pe = NULL;
5405 basic_block new_bb;
5406 tree aggr_ptr_init;
5407 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5408 struct data_reference *dr = dr_info->dr;
5409 tree aptr;
5410 gimple_stmt_iterator incr_gsi;
5411 bool insert_after;
5412 tree indx_before_incr, indx_after_incr;
5413 gimple *incr;
5414 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
5416 gcc_assert (iv_step != NULL_TREE
5417 || TREE_CODE (aggr_type) == ARRAY_TYPE
5418 || TREE_CODE (aggr_type) == VECTOR_TYPE);
5420 if (loop_vinfo)
5422 loop = LOOP_VINFO_LOOP (loop_vinfo);
5423 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
5424 containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
5425 pe = loop_preheader_edge (loop);
5427 else
5429 gcc_assert (bb_vinfo);
5430 only_init = true;
5431 *ptr_incr = NULL;
5434 /* Create an expression for the first address accessed by this load
5435 in LOOP. */
5436 base_name = get_name (DR_BASE_ADDRESS (dr));
5438 if (dump_enabled_p ())
5440 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
5441 dump_printf_loc (MSG_NOTE, vect_location,
5442 "create %s-pointer variable to type: %T",
5443 get_tree_code_name (TREE_CODE (aggr_type)),
5444 aggr_type);
5445 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
5446 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
5447 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
5448 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
5449 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
5450 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
5451 else
5452 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
5453 dump_printf (MSG_NOTE, "%T\n", DR_BASE_OBJECT (dr));
5456 /* (1) Create the new aggregate-pointer variable.
5457 Vector and array types inherit the alias set of their component
5458 type by default so we need to use a ref-all pointer if the data
5459 reference does not conflict with the created aggregated data
5460 reference because it is not addressable. */
5461 bool need_ref_all = false;
5462 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
5463 get_alias_set (DR_REF (dr))))
5464 need_ref_all = true;
5465 /* Likewise for any of the data references in the stmt group. */
5466 else if (DR_GROUP_SIZE (stmt_info) > 1)
5468 stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
5471 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
5472 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
5473 get_alias_set (DR_REF (sdr))))
5475 need_ref_all = true;
5476 break;
5478 sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);
5480 while (sinfo);
5482 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, VOIDmode,
5483 need_ref_all);
5484 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
5487 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
5488 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
5489 def-use update cycles for the pointer: one relative to the outer-loop
5490 (LOOP), which is what steps (3) and (4) below do. The other is relative
5491 to the inner-loop (which is the inner-most loop containing the dataref),
5492 and this is done be step (5) below.
5494 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
5495 inner-most loop, and so steps (3),(4) work the same, and step (5) is
5496 redundant. Steps (3),(4) create the following:
5498 vp0 = &base_addr;
5499 LOOP: vp1 = phi(vp0,vp2)
5502 vp2 = vp1 + step
5503 goto LOOP
5505 If there is an inner-loop nested in loop, then step (5) will also be
5506 applied, and an additional update in the inner-loop will be created:
5508 vp0 = &base_addr;
5509 LOOP: vp1 = phi(vp0,vp2)
5511 inner: vp3 = phi(vp1,vp4)
5512 vp4 = vp3 + inner_step
5513 if () goto inner
5515 vp2 = vp1 + step
5516 if () goto LOOP */
5518 /* (2) Calculate the initial address of the aggregate-pointer, and set
5519 the aggregate-pointer to point to it before the loop. */
5521 /* Create: (&(base[init_val]+offset) in the loop preheader. */
5523 new_temp = vect_create_addr_base_for_vector_ref (vinfo,
5524 stmt_info, &new_stmt_list,
5525 offset);
5526 if (new_stmt_list)
5528 if (pe)
5530 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
5531 gcc_assert (!new_bb);
5533 else
5534 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
5537 *initial_address = new_temp;
5538 aggr_ptr_init = new_temp;
5540 /* (3) Handle the updating of the aggregate-pointer inside the loop.
5541 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
5542 inner-loop nested in LOOP (during outer-loop vectorization). */
5544 /* No update in loop is required. */
5545 if (only_init && (!loop_vinfo || at_loop == loop))
5546 aptr = aggr_ptr_init;
5547 else
5549 /* Accesses to invariant addresses should be handled specially
5550 by the caller. */
5551 tree step = vect_dr_behavior (vinfo, dr_info)->step;
5552 gcc_assert (!integer_zerop (step));
5554 if (iv_step == NULL_TREE)
5556 /* The step of the aggregate pointer is the type size,
5557 negated for downward accesses. */
5558 iv_step = TYPE_SIZE_UNIT (aggr_type);
5559 if (tree_int_cst_sgn (step) == -1)
5560 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
5563 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5565 create_iv (aggr_ptr_init, PLUS_EXPR,
5566 fold_convert (aggr_ptr_type, iv_step),
5567 aggr_ptr, loop, &incr_gsi, insert_after,
5568 &indx_before_incr, &indx_after_incr);
5569 incr = gsi_stmt (incr_gsi);
5571 /* Copy the points-to information if it exists. */
5572 if (DR_PTR_INFO (dr))
5574 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
5575 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
5577 if (ptr_incr)
5578 *ptr_incr = incr;
5580 aptr = indx_before_incr;
5583 if (!nested_in_vect_loop || only_init)
5584 return aptr;
5587 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
5588 nested in LOOP, if exists. */
5590 gcc_assert (nested_in_vect_loop);
5591 if (!only_init)
5593 standard_iv_increment_position (containing_loop, &incr_gsi,
5594 &insert_after);
5595 create_iv (aptr, PLUS_EXPR, fold_convert (aggr_ptr_type, DR_STEP (dr)),
5596 aggr_ptr, containing_loop, &incr_gsi, insert_after,
5597 &indx_before_incr, &indx_after_incr);
5598 incr = gsi_stmt (incr_gsi);
5600 /* Copy the points-to information if it exists. */
5601 if (DR_PTR_INFO (dr))
5603 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
5604 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
5606 if (ptr_incr)
5607 *ptr_incr = incr;
5609 return indx_before_incr;
5611 else
5612 gcc_unreachable ();
5616 /* Function bump_vector_ptr
5618 Increment a pointer (to a vector type) by vector-size. If requested,
5619 i.e. if PTR-INCR is given, then also connect the new increment stmt
5620 to the existing def-use update-chain of the pointer, by modifying
5621 the PTR_INCR as illustrated below:
5623 The pointer def-use update-chain before this function:
5624 DATAREF_PTR = phi (p_0, p_2)
5625 ....
5626 PTR_INCR: p_2 = DATAREF_PTR + step
5628 The pointer def-use update-chain after this function:
5629 DATAREF_PTR = phi (p_0, p_2)
5630 ....
5631 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
5632 ....
5633 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
5635 Input:
5636 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
5637 in the loop.
5638 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
5639 the loop. The increment amount across iterations is expected
5640 to be vector_size.
5641 BSI - location where the new update stmt is to be placed.
5642 STMT_INFO - the original scalar memory-access stmt that is being vectorized.
5643 BUMP - optional. The offset by which to bump the pointer. If not given,
5644 the offset is assumed to be vector_size.
5646 Output: Return NEW_DATAREF_PTR as illustrated above.
5650 tree
5651 bump_vector_ptr (vec_info *vinfo,
5652 tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
5653 stmt_vec_info stmt_info, tree bump)
5655 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5656 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5657 tree update = TYPE_SIZE_UNIT (vectype);
5658 gimple *incr_stmt;
5659 ssa_op_iter iter;
5660 use_operand_p use_p;
5661 tree new_dataref_ptr;
5663 if (bump)
5664 update = bump;
5666 if (TREE_CODE (dataref_ptr) == SSA_NAME)
5667 new_dataref_ptr = copy_ssa_name (dataref_ptr);
5668 else if (is_gimple_min_invariant (dataref_ptr))
5669 /* When possible avoid emitting a separate increment stmt that will
5670 force the addressed object addressable. */
5671 return build1 (ADDR_EXPR, TREE_TYPE (dataref_ptr),
5672 fold_build2 (MEM_REF,
5673 TREE_TYPE (TREE_TYPE (dataref_ptr)),
5674 dataref_ptr,
5675 fold_convert (ptr_type_node, update)));
5676 else
5677 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
5678 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
5679 dataref_ptr, update);
5680 vect_finish_stmt_generation (vinfo, stmt_info, incr_stmt, gsi);
5681 /* Fold the increment, avoiding excessive chains use-def chains of
5682 those, leading to compile-time issues for passes until the next
5683 forwprop pass which would do this as well. */
5684 gimple_stmt_iterator fold_gsi = gsi_for_stmt (incr_stmt);
5685 if (fold_stmt (&fold_gsi, follow_all_ssa_edges))
5687 incr_stmt = gsi_stmt (fold_gsi);
5688 update_stmt (incr_stmt);
5691 /* Copy the points-to information if it exists. */
5692 if (DR_PTR_INFO (dr))
5694 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
5695 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
5698 if (!ptr_incr)
5699 return new_dataref_ptr;
5701 /* Update the vector-pointer's cross-iteration increment. */
5702 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
5704 tree use = USE_FROM_PTR (use_p);
5706 if (use == dataref_ptr)
5707 SET_USE (use_p, new_dataref_ptr);
5708 else
5709 gcc_assert (operand_equal_p (use, update, 0));
5712 return new_dataref_ptr;
5716 /* Copy memory reference info such as base/clique from the SRC reference
5717 to the DEST MEM_REF. */
5719 void
5720 vect_copy_ref_info (tree dest, tree src)
5722 if (TREE_CODE (dest) != MEM_REF)
5723 return;
5725 tree src_base = src;
5726 while (handled_component_p (src_base))
5727 src_base = TREE_OPERAND (src_base, 0);
5728 if (TREE_CODE (src_base) != MEM_REF
5729 && TREE_CODE (src_base) != TARGET_MEM_REF)
5730 return;
5732 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5733 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5737 /* Function vect_create_destination_var.
5739 Create a new temporary of type VECTYPE. */
5741 tree
5742 vect_create_destination_var (tree scalar_dest, tree vectype)
5744 tree vec_dest;
5745 const char *name;
5746 char *new_name;
5747 tree type;
5748 enum vect_var_kind kind;
5750 kind = vectype
5751 ? VECTOR_BOOLEAN_TYPE_P (vectype)
5752 ? vect_mask_var
5753 : vect_simple_var
5754 : vect_scalar_var;
5755 type = vectype ? vectype : TREE_TYPE (scalar_dest);
5757 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5759 name = get_name (scalar_dest);
5760 if (name)
5761 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5762 else
5763 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5764 vec_dest = vect_get_new_vect_var (type, kind, new_name);
5765 free (new_name);
5767 return vec_dest;
5770 /* Function vect_grouped_store_supported.
5772 Returns TRUE if interleave high and interleave low permutations
5773 are supported, and FALSE otherwise. */
5775 bool
5776 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5778 machine_mode mode = TYPE_MODE (vectype);
5780 /* vect_permute_store_chain requires the group size to be equal to 3 or
5781 be a power of two. */
5782 if (count != 3 && exact_log2 (count) == -1)
5784 if (dump_enabled_p ())
5785 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5786 "the size of the group of accesses"
5787 " is not a power of 2 or not eqaul to 3\n");
5788 return false;
5791 /* Check that the permutation is supported. */
5792 if (VECTOR_MODE_P (mode))
5794 unsigned int i;
5795 if (count == 3)
5797 unsigned int j0 = 0, j1 = 0, j2 = 0;
5798 unsigned int i, j;
5800 unsigned int nelt;
5801 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5803 if (dump_enabled_p ())
5804 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5805 "cannot handle groups of 3 stores for"
5806 " variable-length vectors\n");
5807 return false;
5810 vec_perm_builder sel (nelt, nelt, 1);
5811 sel.quick_grow (nelt);
5812 vec_perm_indices indices;
5813 for (j = 0; j < 3; j++)
5815 int nelt0 = ((3 - j) * nelt) % 3;
5816 int nelt1 = ((3 - j) * nelt + 1) % 3;
5817 int nelt2 = ((3 - j) * nelt + 2) % 3;
5818 for (i = 0; i < nelt; i++)
5820 if (3 * i + nelt0 < nelt)
5821 sel[3 * i + nelt0] = j0++;
5822 if (3 * i + nelt1 < nelt)
5823 sel[3 * i + nelt1] = nelt + j1++;
5824 if (3 * i + nelt2 < nelt)
5825 sel[3 * i + nelt2] = 0;
5827 indices.new_vector (sel, 2, nelt);
5828 if (!can_vec_perm_const_p (mode, mode, indices))
5830 if (dump_enabled_p ())
5831 dump_printf (MSG_MISSED_OPTIMIZATION,
5832 "permutation op not supported by target.\n");
5833 return false;
5836 for (i = 0; i < nelt; i++)
5838 if (3 * i + nelt0 < nelt)
5839 sel[3 * i + nelt0] = 3 * i + nelt0;
5840 if (3 * i + nelt1 < nelt)
5841 sel[3 * i + nelt1] = 3 * i + nelt1;
5842 if (3 * i + nelt2 < nelt)
5843 sel[3 * i + nelt2] = nelt + j2++;
5845 indices.new_vector (sel, 2, nelt);
5846 if (!can_vec_perm_const_p (mode, mode, indices))
5848 if (dump_enabled_p ())
5849 dump_printf (MSG_MISSED_OPTIMIZATION,
5850 "permutation op not supported by target.\n");
5851 return false;
5854 return true;
5856 else
5858 /* If length is not equal to 3 then only power of 2 is supported. */
5859 gcc_assert (pow2p_hwi (count));
5860 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5862 /* The encoding has 2 interleaved stepped patterns. */
5863 if(!multiple_p (nelt, 2))
5864 return false;
5865 vec_perm_builder sel (nelt, 2, 3);
5866 sel.quick_grow (6);
5867 for (i = 0; i < 3; i++)
5869 sel[i * 2] = i;
5870 sel[i * 2 + 1] = i + nelt;
5872 vec_perm_indices indices (sel, 2, nelt);
5873 if (can_vec_perm_const_p (mode, mode, indices))
5875 for (i = 0; i < 6; i++)
5876 sel[i] += exact_div (nelt, 2);
5877 indices.new_vector (sel, 2, nelt);
5878 if (can_vec_perm_const_p (mode, mode, indices))
5879 return true;
5884 if (dump_enabled_p ())
5885 dump_printf (MSG_MISSED_OPTIMIZATION,
5886 "permutation op not supported by target.\n");
5887 return false;
5890 /* Return FN if vec_{mask_,mask_len_}store_lanes is available for COUNT vectors
5891 of type VECTYPE. MASKED_P says whether the masked form is needed. */
5893 internal_fn
5894 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5895 bool masked_p)
5897 if (vect_lanes_optab_supported_p ("vec_mask_len_store_lanes",
5898 vec_mask_len_store_lanes_optab, vectype,
5899 count))
5900 return IFN_MASK_LEN_STORE_LANES;
5901 else if (masked_p)
5903 if (vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5904 vec_mask_store_lanes_optab, vectype,
5905 count))
5906 return IFN_MASK_STORE_LANES;
5908 else
5910 if (vect_lanes_optab_supported_p ("vec_store_lanes",
5911 vec_store_lanes_optab, vectype, count))
5912 return IFN_STORE_LANES;
5914 return IFN_LAST;
5918 /* Function vect_permute_store_chain.
5920 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5921 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5922 the data correctly for the stores. Return the final references for stores
5923 in RESULT_CHAIN.
5925 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5926 The input is 4 vectors each containing 8 elements. We assign a number to
5927 each element, the input sequence is:
5929 1st vec: 0 1 2 3 4 5 6 7
5930 2nd vec: 8 9 10 11 12 13 14 15
5931 3rd vec: 16 17 18 19 20 21 22 23
5932 4th vec: 24 25 26 27 28 29 30 31
5934 The output sequence should be:
5936 1st vec: 0 8 16 24 1 9 17 25
5937 2nd vec: 2 10 18 26 3 11 19 27
5938 3rd vec: 4 12 20 28 5 13 21 30
5939 4th vec: 6 14 22 30 7 15 23 31
5941 i.e., we interleave the contents of the four vectors in their order.
5943 We use interleave_high/low instructions to create such output. The input of
5944 each interleave_high/low operation is two vectors:
5945 1st vec 2nd vec
5946 0 1 2 3 4 5 6 7
5947 the even elements of the result vector are obtained left-to-right from the
5948 high/low elements of the first vector. The odd elements of the result are
5949 obtained left-to-right from the high/low elements of the second vector.
5950 The output of interleave_high will be: 0 4 1 5
5951 and of interleave_low: 2 6 3 7
5954 The permutation is done in log LENGTH stages. In each stage interleave_high
5955 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5956 where the first argument is taken from the first half of DR_CHAIN and the
5957 second argument from it's second half.
5958 In our example,
5960 I1: interleave_high (1st vec, 3rd vec)
5961 I2: interleave_low (1st vec, 3rd vec)
5962 I3: interleave_high (2nd vec, 4th vec)
5963 I4: interleave_low (2nd vec, 4th vec)
5965 The output for the first stage is:
5967 I1: 0 16 1 17 2 18 3 19
5968 I2: 4 20 5 21 6 22 7 23
5969 I3: 8 24 9 25 10 26 11 27
5970 I4: 12 28 13 29 14 30 15 31
5972 The output of the second stage, i.e. the final result is:
5974 I1: 0 8 16 24 1 9 17 25
5975 I2: 2 10 18 26 3 11 19 27
5976 I3: 4 12 20 28 5 13 21 30
5977 I4: 6 14 22 30 7 15 23 31. */
5979 void
5980 vect_permute_store_chain (vec_info *vinfo, vec<tree> &dr_chain,
5981 unsigned int length,
5982 stmt_vec_info stmt_info,
5983 gimple_stmt_iterator *gsi,
5984 vec<tree> *result_chain)
5986 tree vect1, vect2, high, low;
5987 gimple *perm_stmt;
5988 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5989 tree perm_mask_low, perm_mask_high;
5990 tree data_ref;
5991 tree perm3_mask_low, perm3_mask_high;
5992 unsigned int i, j, n, log_length = exact_log2 (length);
5994 result_chain->quick_grow (length);
5995 memcpy (result_chain->address (), dr_chain.address (),
5996 length * sizeof (tree));
5998 if (length == 3)
6000 /* vect_grouped_store_supported ensures that this is constant. */
6001 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6002 unsigned int j0 = 0, j1 = 0, j2 = 0;
6004 vec_perm_builder sel (nelt, nelt, 1);
6005 sel.quick_grow (nelt);
6006 vec_perm_indices indices;
6007 for (j = 0; j < 3; j++)
6009 int nelt0 = ((3 - j) * nelt) % 3;
6010 int nelt1 = ((3 - j) * nelt + 1) % 3;
6011 int nelt2 = ((3 - j) * nelt + 2) % 3;
6013 for (i = 0; i < nelt; i++)
6015 if (3 * i + nelt0 < nelt)
6016 sel[3 * i + nelt0] = j0++;
6017 if (3 * i + nelt1 < nelt)
6018 sel[3 * i + nelt1] = nelt + j1++;
6019 if (3 * i + nelt2 < nelt)
6020 sel[3 * i + nelt2] = 0;
6022 indices.new_vector (sel, 2, nelt);
6023 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
6025 for (i = 0; i < nelt; i++)
6027 if (3 * i + nelt0 < nelt)
6028 sel[3 * i + nelt0] = 3 * i + nelt0;
6029 if (3 * i + nelt1 < nelt)
6030 sel[3 * i + nelt1] = 3 * i + nelt1;
6031 if (3 * i + nelt2 < nelt)
6032 sel[3 * i + nelt2] = nelt + j2++;
6034 indices.new_vector (sel, 2, nelt);
6035 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
6037 vect1 = dr_chain[0];
6038 vect2 = dr_chain[1];
6040 /* Create interleaving stmt:
6041 low = VEC_PERM_EXPR <vect1, vect2,
6042 {j, nelt, *, j + 1, nelt + j + 1, *,
6043 j + 2, nelt + j + 2, *, ...}> */
6044 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
6045 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
6046 vect2, perm3_mask_low);
6047 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6049 vect1 = data_ref;
6050 vect2 = dr_chain[2];
6051 /* Create interleaving stmt:
6052 low = VEC_PERM_EXPR <vect1, vect2,
6053 {0, 1, nelt + j, 3, 4, nelt + j + 1,
6054 6, 7, nelt + j + 2, ...}> */
6055 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
6056 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
6057 vect2, perm3_mask_high);
6058 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6059 (*result_chain)[j] = data_ref;
6062 else
6064 /* If length is not equal to 3 then only power of 2 is supported. */
6065 gcc_assert (pow2p_hwi (length));
6067 /* The encoding has 2 interleaved stepped patterns. */
6068 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
6069 vec_perm_builder sel (nelt, 2, 3);
6070 sel.quick_grow (6);
6071 for (i = 0; i < 3; i++)
6073 sel[i * 2] = i;
6074 sel[i * 2 + 1] = i + nelt;
6076 vec_perm_indices indices (sel, 2, nelt);
6077 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
6079 for (i = 0; i < 6; i++)
6080 sel[i] += exact_div (nelt, 2);
6081 indices.new_vector (sel, 2, nelt);
6082 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
6084 for (i = 0, n = log_length; i < n; i++)
6086 for (j = 0; j < length/2; j++)
6088 vect1 = dr_chain[j];
6089 vect2 = dr_chain[j+length/2];
6091 /* Create interleaving stmt:
6092 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
6093 ...}> */
6094 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
6095 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
6096 vect2, perm_mask_high);
6097 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6098 (*result_chain)[2*j] = high;
6100 /* Create interleaving stmt:
6101 low = VEC_PERM_EXPR <vect1, vect2,
6102 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
6103 ...}> */
6104 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
6105 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
6106 vect2, perm_mask_low);
6107 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6108 (*result_chain)[2*j+1] = low;
6110 memcpy (dr_chain.address (), result_chain->address (),
6111 length * sizeof (tree));
6116 /* Function vect_setup_realignment
6118 This function is called when vectorizing an unaligned load using
6119 the dr_explicit_realign[_optimized] scheme.
6120 This function generates the following code at the loop prolog:
6122 p = initial_addr;
6123 x msq_init = *(floor(p)); # prolog load
6124 realignment_token = call target_builtin;
6125 loop:
6126 x msq = phi (msq_init, ---)
6128 The stmts marked with x are generated only for the case of
6129 dr_explicit_realign_optimized.
6131 The code above sets up a new (vector) pointer, pointing to the first
6132 location accessed by STMT_INFO, and a "floor-aligned" load using that
6133 pointer. It also generates code to compute the "realignment-token"
6134 (if the relevant target hook was defined), and creates a phi-node at the
6135 loop-header bb whose arguments are the result of the prolog-load (created
6136 by this function) and the result of a load that takes place in the loop
6137 (to be created by the caller to this function).
6139 For the case of dr_explicit_realign_optimized:
6140 The caller to this function uses the phi-result (msq) to create the
6141 realignment code inside the loop, and sets up the missing phi argument,
6142 as follows:
6143 loop:
6144 msq = phi (msq_init, lsq)
6145 lsq = *(floor(p')); # load in loop
6146 result = realign_load (msq, lsq, realignment_token);
6148 For the case of dr_explicit_realign:
6149 loop:
6150 msq = *(floor(p)); # load in loop
6151 p' = p + (VS-1);
6152 lsq = *(floor(p')); # load in loop
6153 result = realign_load (msq, lsq, realignment_token);
6155 Input:
6156 STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
6157 a memory location that may be unaligned.
6158 BSI - place where new code is to be inserted.
6159 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
6160 is used.
6162 Output:
6163 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
6164 target hook, if defined.
6165 Return value - the result of the loop-header phi node. */
6167 tree
6168 vect_setup_realignment (vec_info *vinfo, stmt_vec_info stmt_info,
6169 gimple_stmt_iterator *gsi, tree *realignment_token,
6170 enum dr_alignment_support alignment_support_scheme,
6171 tree init_addr,
6172 class loop **at_loop)
6174 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6175 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6176 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
6177 struct data_reference *dr = dr_info->dr;
6178 class loop *loop = NULL;
6179 edge pe = NULL;
6180 tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
6181 tree vec_dest;
6182 gimple *inc;
6183 tree ptr;
6184 tree data_ref;
6185 basic_block new_bb;
6186 tree msq_init = NULL_TREE;
6187 tree new_temp;
6188 gphi *phi_stmt;
6189 tree msq = NULL_TREE;
6190 gimple_seq stmts = NULL;
6191 bool compute_in_loop = false;
6192 bool nested_in_vect_loop = false;
6193 class loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
6194 class loop *loop_for_initial_load = NULL;
6196 if (loop_vinfo)
6198 loop = LOOP_VINFO_LOOP (loop_vinfo);
6199 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
6202 gcc_assert (alignment_support_scheme == dr_explicit_realign
6203 || alignment_support_scheme == dr_explicit_realign_optimized);
6205 /* We need to generate three things:
6206 1. the misalignment computation
6207 2. the extra vector load (for the optimized realignment scheme).
6208 3. the phi node for the two vectors from which the realignment is
6209 done (for the optimized realignment scheme). */
6211 /* 1. Determine where to generate the misalignment computation.
6213 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
6214 calculation will be generated by this function, outside the loop (in the
6215 preheader). Otherwise, INIT_ADDR had already been computed for us by the
6216 caller, inside the loop.
6218 Background: If the misalignment remains fixed throughout the iterations of
6219 the loop, then both realignment schemes are applicable, and also the
6220 misalignment computation can be done outside LOOP. This is because we are
6221 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
6222 are a multiple of VS (the Vector Size), and therefore the misalignment in
6223 different vectorized LOOP iterations is always the same.
6224 The problem arises only if the memory access is in an inner-loop nested
6225 inside LOOP, which is now being vectorized using outer-loop vectorization.
6226 This is the only case when the misalignment of the memory access may not
6227 remain fixed throughout the iterations of the inner-loop (as explained in
6228 detail in vect_supportable_dr_alignment). In this case, not only is the
6229 optimized realignment scheme not applicable, but also the misalignment
6230 computation (and generation of the realignment token that is passed to
6231 REALIGN_LOAD) have to be done inside the loop.
6233 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
6234 or not, which in turn determines if the misalignment is computed inside
6235 the inner-loop, or outside LOOP. */
6237 if (init_addr != NULL_TREE || !loop_vinfo)
6239 compute_in_loop = true;
6240 gcc_assert (alignment_support_scheme == dr_explicit_realign);
6244 /* 2. Determine where to generate the extra vector load.
6246 For the optimized realignment scheme, instead of generating two vector
6247 loads in each iteration, we generate a single extra vector load in the
6248 preheader of the loop, and in each iteration reuse the result of the
6249 vector load from the previous iteration. In case the memory access is in
6250 an inner-loop nested inside LOOP, which is now being vectorized using
6251 outer-loop vectorization, we need to determine whether this initial vector
6252 load should be generated at the preheader of the inner-loop, or can be
6253 generated at the preheader of LOOP. If the memory access has no evolution
6254 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
6255 to be generated inside LOOP (in the preheader of the inner-loop). */
6257 if (nested_in_vect_loop)
6259 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
6260 bool invariant_in_outerloop =
6261 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
6262 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
6264 else
6265 loop_for_initial_load = loop;
6266 if (at_loop)
6267 *at_loop = loop_for_initial_load;
6269 tree vuse = NULL_TREE;
6270 if (loop_for_initial_load)
6272 pe = loop_preheader_edge (loop_for_initial_load);
6273 if (gphi *vphi = get_virtual_phi (loop_for_initial_load->header))
6274 vuse = PHI_ARG_DEF_FROM_EDGE (vphi, pe);
6276 if (!vuse)
6277 vuse = gimple_vuse (gsi_stmt (*gsi));
6279 /* 3. For the case of the optimized realignment, create the first vector
6280 load at the loop preheader. */
6282 if (alignment_support_scheme == dr_explicit_realign_optimized)
6284 /* Create msq_init = *(floor(p1)) in the loop preheader */
6285 gassign *new_stmt;
6287 gcc_assert (!compute_in_loop);
6288 vec_dest = vect_create_destination_var (scalar_dest, vectype);
6289 ptr = vect_create_data_ref_ptr (vinfo, stmt_info, vectype,
6290 loop_for_initial_load, NULL_TREE,
6291 &init_addr, NULL, &inc, true);
6292 if (TREE_CODE (ptr) == SSA_NAME)
6293 new_temp = copy_ssa_name (ptr);
6294 else
6295 new_temp = make_ssa_name (TREE_TYPE (ptr));
6296 poly_uint64 align = DR_TARGET_ALIGNMENT (dr_info);
6297 tree type = TREE_TYPE (ptr);
6298 new_stmt = gimple_build_assign
6299 (new_temp, BIT_AND_EXPR, ptr,
6300 fold_build2 (MINUS_EXPR, type,
6301 build_int_cst (type, 0),
6302 build_int_cst (type, align)));
6303 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
6304 gcc_assert (!new_bb);
6305 data_ref
6306 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
6307 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
6308 vect_copy_ref_info (data_ref, DR_REF (dr));
6309 new_stmt = gimple_build_assign (vec_dest, data_ref);
6310 new_temp = make_ssa_name (vec_dest, new_stmt);
6311 gimple_assign_set_lhs (new_stmt, new_temp);
6312 gimple_set_vuse (new_stmt, vuse);
6313 if (pe)
6315 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
6316 gcc_assert (!new_bb);
6318 else
6319 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6321 msq_init = gimple_assign_lhs (new_stmt);
6324 /* 4. Create realignment token using a target builtin, if available.
6325 It is done either inside the containing loop, or before LOOP (as
6326 determined above). */
6328 if (targetm.vectorize.builtin_mask_for_load)
6330 gcall *new_stmt;
6331 tree builtin_decl;
6333 /* Compute INIT_ADDR - the initial addressed accessed by this memref. */
6334 if (!init_addr)
6336 /* Generate the INIT_ADDR computation outside LOOP. */
6337 init_addr = vect_create_addr_base_for_vector_ref (vinfo,
6338 stmt_info, &stmts,
6339 NULL_TREE);
6340 if (loop)
6342 pe = loop_preheader_edge (loop);
6343 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6344 gcc_assert (!new_bb);
6346 else
6347 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
6350 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
6351 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
6352 vec_dest =
6353 vect_create_destination_var (scalar_dest,
6354 gimple_call_return_type (new_stmt));
6355 new_temp = make_ssa_name (vec_dest, new_stmt);
6356 gimple_call_set_lhs (new_stmt, new_temp);
6358 if (compute_in_loop)
6359 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6360 else
6362 /* Generate the misalignment computation outside LOOP. */
6363 pe = loop_preheader_edge (loop);
6364 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
6365 gcc_assert (!new_bb);
6368 *realignment_token = gimple_call_lhs (new_stmt);
6370 /* The result of the CALL_EXPR to this builtin is determined from
6371 the value of the parameter and no global variables are touched
6372 which makes the builtin a "const" function. Requiring the
6373 builtin to have the "const" attribute makes it unnecessary
6374 to call mark_call_clobbered. */
6375 gcc_assert (TREE_READONLY (builtin_decl));
6378 if (alignment_support_scheme == dr_explicit_realign)
6379 return msq;
6381 gcc_assert (!compute_in_loop);
6382 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
6385 /* 5. Create msq = phi <msq_init, lsq> in loop */
6387 pe = loop_preheader_edge (containing_loop);
6388 vec_dest = vect_create_destination_var (scalar_dest, vectype);
6389 msq = make_ssa_name (vec_dest);
6390 phi_stmt = create_phi_node (msq, containing_loop->header);
6391 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
6393 return msq;
6397 /* Function vect_grouped_load_supported.
6399 COUNT is the size of the load group (the number of statements plus the
6400 number of gaps). SINGLE_ELEMENT_P is true if there is actually
6401 only one statement, with a gap of COUNT - 1.
6403 Returns true if a suitable permute exists. */
6405 bool
6406 vect_grouped_load_supported (tree vectype, bool single_element_p,
6407 unsigned HOST_WIDE_INT count)
6409 machine_mode mode = TYPE_MODE (vectype);
6411 /* If this is single-element interleaving with an element distance
6412 that leaves unused vector loads around punt - we at least create
6413 very sub-optimal code in that case (and blow up memory,
6414 see PR65518). */
6415 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
6417 if (dump_enabled_p ())
6418 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6419 "single-element interleaving not supported "
6420 "for not adjacent vector loads\n");
6421 return false;
6424 /* vect_permute_load_chain requires the group size to be equal to 3 or
6425 be a power of two. */
6426 if (count != 3 && exact_log2 (count) == -1)
6428 if (dump_enabled_p ())
6429 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6430 "the size of the group of accesses"
6431 " is not a power of 2 or not equal to 3\n");
6432 return false;
6435 /* Check that the permutation is supported. */
6436 if (VECTOR_MODE_P (mode))
6438 unsigned int i, j;
6439 if (count == 3)
6441 unsigned int nelt;
6442 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
6444 if (dump_enabled_p ())
6445 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6446 "cannot handle groups of 3 loads for"
6447 " variable-length vectors\n");
6448 return false;
6451 vec_perm_builder sel (nelt, nelt, 1);
6452 sel.quick_grow (nelt);
6453 vec_perm_indices indices;
6454 unsigned int k;
6455 for (k = 0; k < 3; k++)
6457 for (i = 0; i < nelt; i++)
6458 if (3 * i + k < 2 * nelt)
6459 sel[i] = 3 * i + k;
6460 else
6461 sel[i] = 0;
6462 indices.new_vector (sel, 2, nelt);
6463 if (!can_vec_perm_const_p (mode, mode, indices))
6465 if (dump_enabled_p ())
6466 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6467 "shuffle of 3 loads is not supported by"
6468 " target\n");
6469 return false;
6471 for (i = 0, j = 0; i < nelt; i++)
6472 if (3 * i + k < 2 * nelt)
6473 sel[i] = i;
6474 else
6475 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
6476 indices.new_vector (sel, 2, nelt);
6477 if (!can_vec_perm_const_p (mode, mode, indices))
6479 if (dump_enabled_p ())
6480 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6481 "shuffle of 3 loads is not supported by"
6482 " target\n");
6483 return false;
6486 return true;
6488 else
6490 /* If length is not equal to 3 then only power of 2 is supported. */
6491 gcc_assert (pow2p_hwi (count));
6492 poly_uint64 nelt = GET_MODE_NUNITS (mode);
6494 /* The encoding has a single stepped pattern. */
6495 vec_perm_builder sel (nelt, 1, 3);
6496 sel.quick_grow (3);
6497 for (i = 0; i < 3; i++)
6498 sel[i] = i * 2;
6499 vec_perm_indices indices (sel, 2, nelt);
6500 if (can_vec_perm_const_p (mode, mode, indices))
6502 for (i = 0; i < 3; i++)
6503 sel[i] = i * 2 + 1;
6504 indices.new_vector (sel, 2, nelt);
6505 if (can_vec_perm_const_p (mode, mode, indices))
6506 return true;
6511 if (dump_enabled_p ())
6512 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6513 "extract even/odd not supported by target\n");
6514 return false;
6517 /* Return FN if vec_{masked_,mask_len_}load_lanes is available for COUNT vectors
6518 of type VECTYPE. MASKED_P says whether the masked form is needed.
6519 If it is available and ELSVALS is nonzero store the possible else values
6520 in the vector it points to. */
6522 internal_fn
6523 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
6524 bool masked_p, vec<int> *elsvals)
6526 if (vect_lanes_optab_supported_p ("vec_mask_len_load_lanes",
6527 vec_mask_len_load_lanes_optab, vectype,
6528 count, elsvals))
6529 return IFN_MASK_LEN_LOAD_LANES;
6530 else if (masked_p)
6532 if (vect_lanes_optab_supported_p ("vec_mask_load_lanes",
6533 vec_mask_load_lanes_optab, vectype,
6534 count, elsvals))
6535 return IFN_MASK_LOAD_LANES;
6537 else
6539 if (vect_lanes_optab_supported_p ("vec_load_lanes", vec_load_lanes_optab,
6540 vectype, count, elsvals))
6541 return IFN_LOAD_LANES;
6543 return IFN_LAST;
6546 /* Function vect_permute_load_chain.
6548 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
6549 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
6550 the input data correctly. Return the final references for loads in
6551 RESULT_CHAIN.
6553 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
6554 The input is 4 vectors each containing 8 elements. We assign a number to each
6555 element, the input sequence is:
6557 1st vec: 0 1 2 3 4 5 6 7
6558 2nd vec: 8 9 10 11 12 13 14 15
6559 3rd vec: 16 17 18 19 20 21 22 23
6560 4th vec: 24 25 26 27 28 29 30 31
6562 The output sequence should be:
6564 1st vec: 0 4 8 12 16 20 24 28
6565 2nd vec: 1 5 9 13 17 21 25 29
6566 3rd vec: 2 6 10 14 18 22 26 30
6567 4th vec: 3 7 11 15 19 23 27 31
6569 i.e., the first output vector should contain the first elements of each
6570 interleaving group, etc.
6572 We use extract_even/odd instructions to create such output. The input of
6573 each extract_even/odd operation is two vectors
6574 1st vec 2nd vec
6575 0 1 2 3 4 5 6 7
6577 and the output is the vector of extracted even/odd elements. The output of
6578 extract_even will be: 0 2 4 6
6579 and of extract_odd: 1 3 5 7
6582 The permutation is done in log LENGTH stages. In each stage extract_even
6583 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
6584 their order. In our example,
6586 E1: extract_even (1st vec, 2nd vec)
6587 E2: extract_odd (1st vec, 2nd vec)
6588 E3: extract_even (3rd vec, 4th vec)
6589 E4: extract_odd (3rd vec, 4th vec)
6591 The output for the first stage will be:
6593 E1: 0 2 4 6 8 10 12 14
6594 E2: 1 3 5 7 9 11 13 15
6595 E3: 16 18 20 22 24 26 28 30
6596 E4: 17 19 21 23 25 27 29 31
6598 In order to proceed and create the correct sequence for the next stage (or
6599 for the correct output, if the second stage is the last one, as in our
6600 example), we first put the output of extract_even operation and then the
6601 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
6602 The input for the second stage is:
6604 1st vec (E1): 0 2 4 6 8 10 12 14
6605 2nd vec (E3): 16 18 20 22 24 26 28 30
6606 3rd vec (E2): 1 3 5 7 9 11 13 15
6607 4th vec (E4): 17 19 21 23 25 27 29 31
6609 The output of the second stage:
6611 E1: 0 4 8 12 16 20 24 28
6612 E2: 2 6 10 14 18 22 26 30
6613 E3: 1 5 9 13 17 21 25 29
6614 E4: 3 7 11 15 19 23 27 31
6616 And RESULT_CHAIN after reordering:
6618 1st vec (E1): 0 4 8 12 16 20 24 28
6619 2nd vec (E3): 1 5 9 13 17 21 25 29
6620 3rd vec (E2): 2 6 10 14 18 22 26 30
6621 4th vec (E4): 3 7 11 15 19 23 27 31. */
6623 static void
6624 vect_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6625 unsigned int length,
6626 stmt_vec_info stmt_info,
6627 gimple_stmt_iterator *gsi,
6628 vec<tree> *result_chain)
6630 tree data_ref, first_vect, second_vect;
6631 tree perm_mask_even, perm_mask_odd;
6632 tree perm3_mask_low, perm3_mask_high;
6633 gimple *perm_stmt;
6634 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6635 unsigned int i, j, log_length = exact_log2 (length);
6637 result_chain->quick_grow (length);
6638 memcpy (result_chain->address (), dr_chain.address (),
6639 length * sizeof (tree));
6641 if (length == 3)
6643 /* vect_grouped_load_supported ensures that this is constant. */
6644 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6645 unsigned int k;
6647 vec_perm_builder sel (nelt, nelt, 1);
6648 sel.quick_grow (nelt);
6649 vec_perm_indices indices;
6650 for (k = 0; k < 3; k++)
6652 for (i = 0; i < nelt; i++)
6653 if (3 * i + k < 2 * nelt)
6654 sel[i] = 3 * i + k;
6655 else
6656 sel[i] = 0;
6657 indices.new_vector (sel, 2, nelt);
6658 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
6660 for (i = 0, j = 0; i < nelt; i++)
6661 if (3 * i + k < 2 * nelt)
6662 sel[i] = i;
6663 else
6664 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
6665 indices.new_vector (sel, 2, nelt);
6666 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
6668 first_vect = dr_chain[0];
6669 second_vect = dr_chain[1];
6671 /* Create interleaving stmt (low part of):
6672 low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
6673 ...}> */
6674 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
6675 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6676 second_vect, perm3_mask_low);
6677 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6679 /* Create interleaving stmt (high part of):
6680 high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
6681 ...}> */
6682 first_vect = data_ref;
6683 second_vect = dr_chain[2];
6684 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
6685 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6686 second_vect, perm3_mask_high);
6687 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6688 (*result_chain)[k] = data_ref;
6691 else
6693 /* If length is not equal to 3 then only power of 2 is supported. */
6694 gcc_assert (pow2p_hwi (length));
6696 /* The encoding has a single stepped pattern. */
6697 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
6698 vec_perm_builder sel (nelt, 1, 3);
6699 sel.quick_grow (3);
6700 for (i = 0; i < 3; ++i)
6701 sel[i] = i * 2;
6702 vec_perm_indices indices (sel, 2, nelt);
6703 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
6705 for (i = 0; i < 3; ++i)
6706 sel[i] = i * 2 + 1;
6707 indices.new_vector (sel, 2, nelt);
6708 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
6710 for (i = 0; i < log_length; i++)
6712 for (j = 0; j < length; j += 2)
6714 first_vect = dr_chain[j];
6715 second_vect = dr_chain[j+1];
6717 /* data_ref = permute_even (first_data_ref, second_data_ref); */
6718 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
6719 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6720 first_vect, second_vect,
6721 perm_mask_even);
6722 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6723 (*result_chain)[j/2] = data_ref;
6725 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
6726 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
6727 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6728 first_vect, second_vect,
6729 perm_mask_odd);
6730 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6731 (*result_chain)[j/2+length/2] = data_ref;
6733 memcpy (dr_chain.address (), result_chain->address (),
6734 length * sizeof (tree));
6739 /* Function vect_shift_permute_load_chain.
6741 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
6742 sequence of stmts to reorder the input data accordingly.
6743 Return the final references for loads in RESULT_CHAIN.
6744 Return true if successed, false otherwise.
6746 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
6747 The input is 3 vectors each containing 8 elements. We assign a
6748 number to each element, the input sequence is:
6750 1st vec: 0 1 2 3 4 5 6 7
6751 2nd vec: 8 9 10 11 12 13 14 15
6752 3rd vec: 16 17 18 19 20 21 22 23
6754 The output sequence should be:
6756 1st vec: 0 3 6 9 12 15 18 21
6757 2nd vec: 1 4 7 10 13 16 19 22
6758 3rd vec: 2 5 8 11 14 17 20 23
6760 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6762 First we shuffle all 3 vectors to get correct elements order:
6764 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
6765 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
6766 3rd vec: (16 19 22) (17 20 23) (18 21)
6768 Next we unite and shift vector 3 times:
6770 1st step:
6771 shift right by 6 the concatenation of:
6772 "1st vec" and "2nd vec"
6773 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6774 "2nd vec" and "3rd vec"
6775 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6776 "3rd vec" and "1st vec"
6777 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
6778 | New vectors |
6780 So that now new vectors are:
6782 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6783 2nd vec: (10 13) (16 19 22) (17 20 23)
6784 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6786 2nd step:
6787 shift right by 5 the concatenation of:
6788 "1st vec" and "3rd vec"
6789 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6790 "2nd vec" and "1st vec"
6791 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6792 "3rd vec" and "2nd vec"
6793 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6794 | New vectors |
6796 So that now new vectors are:
6798 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6799 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6800 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6802 3rd step:
6803 shift right by 5 the concatenation of:
6804 "1st vec" and "1st vec"
6805 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6806 shift right by 3 the concatenation of:
6807 "2nd vec" and "2nd vec"
6808 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6809 | New vectors |
6811 So that now all vectors are READY:
6812 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6813 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6814 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6816 This algorithm is faster than one in vect_permute_load_chain if:
6817 1. "shift of a concatination" is faster than general permutation.
6818 This is usually so.
6819 2. The TARGET machine can't execute vector instructions in parallel.
6820 This is because each step of the algorithm depends on previous.
6821 The algorithm in vect_permute_load_chain is much more parallel.
6823 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6826 static bool
6827 vect_shift_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6828 unsigned int length,
6829 stmt_vec_info stmt_info,
6830 gimple_stmt_iterator *gsi,
6831 vec<tree> *result_chain)
6833 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6834 tree perm2_mask1, perm2_mask2, perm3_mask;
6835 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6836 gimple *perm_stmt;
6838 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6839 machine_mode vmode = TYPE_MODE (vectype);
6840 unsigned int i;
6841 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6843 unsigned HOST_WIDE_INT nelt, vf;
6844 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6845 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6846 /* Not supported for variable-length vectors. */
6847 return false;
6849 vec_perm_builder sel (nelt, nelt, 1);
6850 sel.quick_grow (nelt);
6852 result_chain->quick_grow (length);
6853 memcpy (result_chain->address (), dr_chain.address (),
6854 length * sizeof (tree));
6856 if (pow2p_hwi (length) && vf > 4)
6858 unsigned int j, log_length = exact_log2 (length);
6859 for (i = 0; i < nelt / 2; ++i)
6860 sel[i] = i * 2;
6861 for (i = 0; i < nelt / 2; ++i)
6862 sel[nelt / 2 + i] = i * 2 + 1;
6863 vec_perm_indices indices (sel, 2, nelt);
6864 if (!can_vec_perm_const_p (vmode, vmode, indices))
6866 if (dump_enabled_p ())
6867 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6868 "shuffle of 2 fields structure is not \
6869 supported by target\n");
6870 return false;
6872 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6874 for (i = 0; i < nelt / 2; ++i)
6875 sel[i] = i * 2 + 1;
6876 for (i = 0; i < nelt / 2; ++i)
6877 sel[nelt / 2 + i] = i * 2;
6878 indices.new_vector (sel, 2, nelt);
6879 if (!can_vec_perm_const_p (vmode, vmode, indices))
6881 if (dump_enabled_p ())
6882 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6883 "shuffle of 2 fields structure is not \
6884 supported by target\n");
6885 return false;
6887 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6889 /* Generating permutation constant to shift all elements.
6890 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6891 for (i = 0; i < nelt; i++)
6892 sel[i] = nelt / 2 + i;
6893 indices.new_vector (sel, 2, nelt);
6894 if (!can_vec_perm_const_p (vmode, vmode, indices))
6896 if (dump_enabled_p ())
6897 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6898 "shift permutation is not supported by target\n");
6899 return false;
6901 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6903 /* Generating permutation constant to select vector from 2.
6904 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6905 for (i = 0; i < nelt / 2; i++)
6906 sel[i] = i;
6907 for (i = nelt / 2; i < nelt; i++)
6908 sel[i] = nelt + i;
6909 indices.new_vector (sel, 2, nelt);
6910 if (!can_vec_perm_const_p (vmode, vmode, indices))
6912 if (dump_enabled_p ())
6913 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6914 "select is not supported by target\n");
6915 return false;
6917 select_mask = vect_gen_perm_mask_checked (vectype, indices);
6919 for (i = 0; i < log_length; i++)
6921 for (j = 0; j < length; j += 2)
6923 first_vect = dr_chain[j];
6924 second_vect = dr_chain[j + 1];
6926 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6927 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6928 first_vect, first_vect,
6929 perm2_mask1);
6930 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6931 vect[0] = data_ref;
6933 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6934 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6935 second_vect, second_vect,
6936 perm2_mask2);
6937 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6938 vect[1] = data_ref;
6940 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6941 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6942 vect[0], vect[1], shift1_mask);
6943 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6944 (*result_chain)[j/2 + length/2] = data_ref;
6946 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6947 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6948 vect[0], vect[1], select_mask);
6949 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6950 (*result_chain)[j/2] = data_ref;
6952 memcpy (dr_chain.address (), result_chain->address (),
6953 length * sizeof (tree));
6955 return true;
6957 if (length == 3 && vf > 2)
6959 unsigned int k = 0, l = 0;
6961 /* Generating permutation constant to get all elements in rigth order.
6962 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6963 for (i = 0; i < nelt; i++)
6965 if (3 * k + (l % 3) >= nelt)
6967 k = 0;
6968 l += (3 - (nelt % 3));
6970 sel[i] = 3 * k + (l % 3);
6971 k++;
6973 vec_perm_indices indices (sel, 2, nelt);
6974 if (!can_vec_perm_const_p (vmode, vmode, indices))
6976 if (dump_enabled_p ())
6977 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6978 "shuffle of 3 fields structure is not \
6979 supported by target\n");
6980 return false;
6982 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6984 /* Generating permutation constant to shift all elements.
6985 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6986 for (i = 0; i < nelt; i++)
6987 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6988 indices.new_vector (sel, 2, nelt);
6989 if (!can_vec_perm_const_p (vmode, vmode, indices))
6991 if (dump_enabled_p ())
6992 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6993 "shift permutation is not supported by target\n");
6994 return false;
6996 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6998 /* Generating permutation constant to shift all elements.
6999 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
7000 for (i = 0; i < nelt; i++)
7001 sel[i] = 2 * (nelt / 3) + 1 + i;
7002 indices.new_vector (sel, 2, nelt);
7003 if (!can_vec_perm_const_p (vmode, vmode, indices))
7005 if (dump_enabled_p ())
7006 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7007 "shift permutation is not supported by target\n");
7008 return false;
7010 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
7012 /* Generating permutation constant to shift all elements.
7013 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
7014 for (i = 0; i < nelt; i++)
7015 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
7016 indices.new_vector (sel, 2, nelt);
7017 if (!can_vec_perm_const_p (vmode, vmode, indices))
7019 if (dump_enabled_p ())
7020 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7021 "shift permutation is not supported by target\n");
7022 return false;
7024 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
7026 /* Generating permutation constant to shift all elements.
7027 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
7028 for (i = 0; i < nelt; i++)
7029 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
7030 indices.new_vector (sel, 2, nelt);
7031 if (!can_vec_perm_const_p (vmode, vmode, indices))
7033 if (dump_enabled_p ())
7034 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7035 "shift permutation is not supported by target\n");
7036 return false;
7038 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
7040 for (k = 0; k < 3; k++)
7042 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
7043 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
7044 dr_chain[k], dr_chain[k],
7045 perm3_mask);
7046 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
7047 vect[k] = data_ref;
7050 for (k = 0; k < 3; k++)
7052 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
7053 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
7054 vect[k % 3], vect[(k + 1) % 3],
7055 shift1_mask);
7056 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
7057 vect_shift[k] = data_ref;
7060 for (k = 0; k < 3; k++)
7062 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
7063 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
7064 vect_shift[(4 - k) % 3],
7065 vect_shift[(3 - k) % 3],
7066 shift2_mask);
7067 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
7068 vect[k] = data_ref;
7071 (*result_chain)[3 - (nelt % 3)] = vect[2];
7073 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
7074 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
7075 vect[0], shift3_mask);
7076 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
7077 (*result_chain)[nelt % 3] = data_ref;
7079 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
7080 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
7081 vect[1], shift4_mask);
7082 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
7083 (*result_chain)[0] = data_ref;
7084 return true;
7086 return false;
7089 /* Function vect_transform_grouped_load.
7091 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
7092 to perform their permutation and ascribe the result vectorized statements to
7093 the scalar statements.
7096 void
7097 vect_transform_grouped_load (vec_info *vinfo, stmt_vec_info stmt_info,
7098 vec<tree> dr_chain,
7099 int size, gimple_stmt_iterator *gsi)
7101 machine_mode mode;
7102 vec<tree> result_chain = vNULL;
7104 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
7105 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
7106 vectors, that are ready for vector computation. */
7107 result_chain.create (size);
7109 /* If reassociation width for vector type is 2 or greater target machine can
7110 execute 2 or more vector instructions in parallel. Otherwise try to
7111 get chain for loads group using vect_shift_permute_load_chain. */
7112 mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
7113 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
7114 || pow2p_hwi (size)
7115 || !vect_shift_permute_load_chain (vinfo, dr_chain, size, stmt_info,
7116 gsi, &result_chain))
7117 vect_permute_load_chain (vinfo, dr_chain,
7118 size, stmt_info, gsi, &result_chain);
7119 vect_record_grouped_load_vectors (vinfo, stmt_info, result_chain);
7120 result_chain.release ();
7123 /* RESULT_CHAIN contains the output of a group of grouped loads that were
7124 generated as part of the vectorization of STMT_INFO. Assign the statement
7125 for each vector to the associated scalar statement. */
7127 void
7128 vect_record_grouped_load_vectors (vec_info *, stmt_vec_info stmt_info,
7129 vec<tree> result_chain)
7131 stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
7132 unsigned int i, gap_count;
7133 tree tmp_data_ref;
7135 /* Put a permuted data-ref in the VECTORIZED_STMT field.
7136 Since we scan the chain starting from it's first node, their order
7137 corresponds the order of data-refs in RESULT_CHAIN. */
7138 stmt_vec_info next_stmt_info = first_stmt_info;
7139 gap_count = 1;
7140 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
7142 if (!next_stmt_info)
7143 break;
7145 /* Skip the gaps. Loads created for the gaps will be removed by dead
7146 code elimination pass later. No need to check for the first stmt in
7147 the group, since it always exists.
7148 DR_GROUP_GAP is the number of steps in elements from the previous
7149 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
7150 correspond to the gaps. */
7151 if (next_stmt_info != first_stmt_info
7152 && gap_count < DR_GROUP_GAP (next_stmt_info))
7154 gap_count++;
7155 continue;
7158 /* ??? The following needs cleanup after the removal of
7159 DR_GROUP_SAME_DR_STMT. */
7160 if (next_stmt_info)
7162 gimple *new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
7163 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
7164 copies, and we put the new vector statement last. */
7165 STMT_VINFO_VEC_STMTS (next_stmt_info).safe_push (new_stmt);
7167 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
7168 gap_count = 1;
7173 /* Function vect_force_dr_alignment_p.
7175 Returns whether the alignment of a DECL can be forced to be aligned
7176 on ALIGNMENT bit boundary. */
7178 bool
7179 vect_can_force_dr_alignment_p (const_tree decl, poly_uint64 alignment)
7181 if (!VAR_P (decl))
7182 return false;
7184 if (decl_in_symtab_p (decl)
7185 && !symtab_node::get (decl)->can_increase_alignment_p ())
7186 return false;
7188 if (TREE_STATIC (decl))
7189 return (known_le (alignment,
7190 (unsigned HOST_WIDE_INT) MAX_OFILE_ALIGNMENT));
7191 else
7192 return (known_le (alignment, (unsigned HOST_WIDE_INT) MAX_STACK_ALIGNMENT));
7195 /* Return whether the data reference DR_INFO is supported with respect to its
7196 alignment.
7197 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
7198 it is aligned, i.e., check if it is possible to vectorize it with different
7199 alignment. */
7201 enum dr_alignment_support
7202 vect_supportable_dr_alignment (vec_info *vinfo, dr_vec_info *dr_info,
7203 tree vectype, int misalignment)
7205 data_reference *dr = dr_info->dr;
7206 stmt_vec_info stmt_info = dr_info->stmt;
7207 machine_mode mode = TYPE_MODE (vectype);
7208 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7209 class loop *vect_loop = NULL;
7210 bool nested_in_vect_loop = false;
7212 if (misalignment == 0)
7213 return dr_aligned;
7214 else if (dr_info->need_peeling_for_alignment)
7215 return dr_unaligned_unsupported;
7217 /* For now assume all conditional loads/stores support unaligned
7218 access without any special code. */
7219 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
7220 if (gimple_call_internal_p (stmt)
7221 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
7222 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
7223 return dr_unaligned_supported;
7225 if (loop_vinfo)
7227 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
7228 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
7231 /* Possibly unaligned access. */
7233 /* We can choose between using the implicit realignment scheme (generating
7234 a misaligned_move stmt) and the explicit realignment scheme (generating
7235 aligned loads with a REALIGN_LOAD). There are two variants to the
7236 explicit realignment scheme: optimized, and unoptimized.
7237 We can optimize the realignment only if the step between consecutive
7238 vector loads is equal to the vector size. Since the vector memory
7239 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
7240 is guaranteed that the misalignment amount remains the same throughout the
7241 execution of the vectorized loop. Therefore, we can create the
7242 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
7243 at the loop preheader.
7245 However, in the case of outer-loop vectorization, when vectorizing a
7246 memory access in the inner-loop nested within the LOOP that is now being
7247 vectorized, while it is guaranteed that the misalignment of the
7248 vectorized memory access will remain the same in different outer-loop
7249 iterations, it is *not* guaranteed that is will remain the same throughout
7250 the execution of the inner-loop. This is because the inner-loop advances
7251 with the original scalar step (and not in steps of VS). If the inner-loop
7252 step happens to be a multiple of VS, then the misalignment remains fixed
7253 and we can use the optimized realignment scheme. For example:
7255 for (i=0; i<N; i++)
7256 for (j=0; j<M; j++)
7257 s += a[i+j];
7259 When vectorizing the i-loop in the above example, the step between
7260 consecutive vector loads is 1, and so the misalignment does not remain
7261 fixed across the execution of the inner-loop, and the realignment cannot
7262 be optimized (as illustrated in the following pseudo vectorized loop):
7264 for (i=0; i<N; i+=4)
7265 for (j=0; j<M; j++){
7266 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
7267 // when j is {0,1,2,3,4,5,6,7,...} respectively.
7268 // (assuming that we start from an aligned address).
7271 We therefore have to use the unoptimized realignment scheme:
7273 for (i=0; i<N; i+=4)
7274 for (j=k; j<M; j+=4)
7275 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
7276 // that the misalignment of the initial address is
7277 // 0).
7279 The loop can then be vectorized as follows:
7281 for (k=0; k<4; k++){
7282 rt = get_realignment_token (&vp[k]);
7283 for (i=0; i<N; i+=4){
7284 v1 = vp[i+k];
7285 for (j=k; j<M; j+=4){
7286 v2 = vp[i+j+VS-1];
7287 va = REALIGN_LOAD <v1,v2,rt>;
7288 vs += va;
7289 v1 = v2;
7292 } */
7294 if (DR_IS_READ (dr))
7296 if (can_implement_p (vec_realign_load_optab, mode)
7297 && (!targetm.vectorize.builtin_mask_for_load
7298 || targetm.vectorize.builtin_mask_for_load ()))
7300 /* If we are doing SLP then the accesses need not have the
7301 same alignment, instead it depends on the SLP group size. */
7302 if (loop_vinfo
7303 && STMT_SLP_TYPE (stmt_info)
7304 && STMT_VINFO_GROUPED_ACCESS (stmt_info)
7305 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7306 * (DR_GROUP_SIZE
7307 (DR_GROUP_FIRST_ELEMENT (stmt_info))),
7308 TYPE_VECTOR_SUBPARTS (vectype)))
7310 else if (!loop_vinfo
7311 || (nested_in_vect_loop
7312 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
7313 GET_MODE_SIZE (TYPE_MODE (vectype)))))
7314 return dr_explicit_realign;
7315 else
7316 return dr_explicit_realign_optimized;
7320 bool is_packed = false;
7321 tree type = TREE_TYPE (DR_REF (dr));
7322 if (misalignment == DR_MISALIGNMENT_UNKNOWN)
7323 is_packed = not_size_aligned (DR_REF (dr));
7324 if (targetm.vectorize.support_vector_misalignment (mode, type, misalignment,
7325 is_packed))
7326 return dr_unaligned_supported;
7328 /* Unsupported. */
7329 return dr_unaligned_unsupported;