gcc/tree-vect-data-refs.cc

   1 /* Data References Analysis and Manipulation Utilities for Vectorization.
   2    Copyright (C) 2003-2025 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
   4    and Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "target.h"
  27 #include "rtl.h"
  28 #include "tree.h"
  29 #include "gimple.h"
  30 #include "predict.h"
  31 #include "memmodel.h"
  32 #include "tm_p.h"
  33 #include "ssa.h"
  34 #include "optabs-tree.h"
  35 #include "cgraph.h"
  36 #include "dumpfile.h"
  37 #include "pretty-print.h"
  38 #include "alias.h"
  39 #include "fold-const.h"
  40 #include "stor-layout.h"
  41 #include "tree-eh.h"
  42 #include "gimplify.h"
  43 #include "gimple-iterator.h"
  44 #include "gimplify-me.h"
  45 #include "tree-ssa-loop-ivopts.h"
  46 #include "tree-ssa-loop-manip.h"
  47 #include "tree-ssa-loop.h"
  48 #include "cfgloop.h"
  49 #include "tree-scalar-evolution.h"
  50 #include "tree-vectorizer.h"
  51 #include "expr.h"
  52 #include "builtins.h"
  53 #include "tree-cfg.h"
  54 #include "tree-hash-traits.h"
  55 #include "vec-perm-indices.h"
  56 #include "internal-fn.h"
  57 #include "gimple-fold.h"
  58 #include "optabs-query.h"
  59
  60 /* Return true if load- or store-lanes optab OPTAB is implemented for
  61    COUNT vectors of type VECTYPE.  NAME is the name of OPTAB.
  62
  63    If it is implemented and ELSVALS is nonzero store the possible else
  64    values in the vector it points to.  */
  65
  66 static bool
  67 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
  68                               tree vectype, unsigned HOST_WIDE_INT count,
  69                               vec<int> *elsvals = nullptr)
  70 {
  71   machine_mode mode, array_mode;
  72   bool limit_p;
  73
  74   mode = TYPE_MODE (vectype);
  75   if (!targetm.array_mode (mode, count).exists (&array_mode))
  76     {
  77       poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
  78       limit_p = !targetm.array_mode_supported_p (mode, count);
  79       if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
  80         {
  81           if (dump_enabled_p ())
  82             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  83                              "no array mode for %s[%wu]\n",
  84                              GET_MODE_NAME (mode), count);
  85           return false;
  86         }
  87     }
  88
  89   enum insn_code icode;
  90   if ((icode = convert_optab_handler (optab, array_mode, mode))
  91       == CODE_FOR_nothing)
  92     {
  93       if (dump_enabled_p ())
  94         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  95                          "cannot use %s<%s><%s>\n", name,
  96                          GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
  97       return false;
  98     }
  99
 100   if (dump_enabled_p ())
 101     dump_printf_loc (MSG_NOTE, vect_location,
 102                      "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
 103                      GET_MODE_NAME (mode));
 104
 105   if (elsvals)
 106     get_supported_else_vals (icode,
 107                              internal_fn_else_index (IFN_MASK_LEN_LOAD_LANES),
 108                              *elsvals);
 109
 110   return true;
 111 }
 112
 113 /* Helper function to identify a simd clone call.  If this is a call to a
 114    function with simd clones then return the corresponding cgraph_node,
 115    otherwise return NULL.  */
 116
 117 static cgraph_node*
 118 simd_clone_call_p (gimple *stmt)
 119 {
 120   gcall *call = dyn_cast <gcall *> (stmt);
 121   if (!call)
 122     return NULL;
 123
 124   tree fndecl = NULL_TREE;
 125   if (gimple_call_internal_p (call, IFN_MASK_CALL))
 126     fndecl = TREE_OPERAND (gimple_call_arg (stmt, 0), 0);
 127   else
 128     fndecl = gimple_call_fndecl (stmt);
 129
 130   if (fndecl == NULL_TREE)
 131     return NULL;
 132
 133   cgraph_node *node = cgraph_node::get (fndecl);
 134   if (node && node->simd_clones != NULL)
 135     return node;
 136
 137   return NULL;
 138 }
 139
 140
 141
 142 /* Return the smallest scalar part of STMT_INFO.
 143    This is used to determine the vectype of the stmt.  We generally set the
 144    vectype according to the type of the result (lhs).  For stmts whose
 145    result-type is different than the type of the arguments (e.g., demotion,
 146    promotion), vectype will be reset appropriately (later).  Note that we have
 147    to visit the smallest datatype in this function, because that determines the
 148    VF.  If the smallest datatype in the loop is present only as the rhs of a
 149    promotion operation - we'd miss it.
 150    Such a case, where a variable of this datatype does not appear in the lhs
 151    anywhere in the loop, can only occur if it's an invariant: e.g.:
 152    'int_x = (int) short_inv', which we'd expect to have been optimized away by
 153    invariant motion.  However, we cannot rely on invariant motion to always
 154    take invariants out of the loop, and so in the case of promotion we also
 155    have to check the rhs.
 156    LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
 157    types.  */
 158
 159 tree
 160 vect_get_smallest_scalar_type (stmt_vec_info stmt_info, tree scalar_type)
 161 {
 162   HOST_WIDE_INT lhs, rhs;
 163
 164   /* During the analysis phase, this function is called on arbitrary
 165      statements that might not have scalar results.  */
 166   if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
 167     return scalar_type;
 168
 169   lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
 170
 171   gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
 172   if (assign)
 173     {
 174       scalar_type = TREE_TYPE (gimple_assign_lhs (assign));
 175       if (gimple_assign_cast_p (assign)
 176           || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
 177           || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
 178           || gimple_assign_rhs_code (assign) == SAD_EXPR
 179           || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
 180           || gimple_assign_rhs_code (assign) == WIDEN_MULT_PLUS_EXPR
 181           || gimple_assign_rhs_code (assign) == WIDEN_MULT_MINUS_EXPR
 182           || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
 183           || gimple_assign_rhs_code (assign) == FLOAT_EXPR)
 184         {
 185           tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
 186
 187           rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
 188           if (rhs < lhs)
 189             scalar_type = rhs_type;
 190         }
 191     }
 192   else if (cgraph_node *node = simd_clone_call_p (stmt_info->stmt))
 193     {
 194       auto clone = node->simd_clones->simdclone;
 195       for (unsigned int i = 0; i < clone->nargs; ++i)
 196         {
 197           if (clone->args[i].arg_type == SIMD_CLONE_ARG_TYPE_VECTOR)
 198             {
 199               tree arg_scalar_type = TREE_TYPE (clone->args[i].vector_type);
 200               rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (arg_scalar_type));
 201               if (rhs < lhs)
 202                 {
 203                   scalar_type = arg_scalar_type;
 204                   lhs = rhs;
 205                 }
 206             }
 207         }
 208     }
 209   else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
 210     {
 211       unsigned int i = 0;
 212       if (gimple_call_internal_p (call))
 213         {
 214           internal_fn ifn = gimple_call_internal_fn (call);
 215           if (internal_load_fn_p (ifn))
 216             /* For loads the LHS type does the trick.  */
 217             i = ~0U;
 218           else if (internal_store_fn_p (ifn))
 219             {
 220               /* For stores use the tyep of the stored value.  */
 221               i = internal_fn_stored_value_index (ifn);
 222               scalar_type = TREE_TYPE (gimple_call_arg (call, i));
 223               i = ~0U;
 224             }
 225           else if (internal_fn_mask_index (ifn) == 0)
 226             i = 1;
 227         }
 228       if (i < gimple_call_num_args (call))
 229         {
 230           tree rhs_type = TREE_TYPE (gimple_call_arg (call, i));
 231           if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type)))
 232             {
 233               rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
 234               if (rhs < lhs)
 235                 scalar_type = rhs_type;
 236             }
 237         }
 238     }
 239
 240   return scalar_type;
 241 }
 242
 243
 244 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
 245    tested at run-time.  Return TRUE if DDR was successfully inserted.
 246    Return false if versioning is not supported.  */
 247
 248 static opt_result
 249 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
 250 {
 251   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 252
 253   if ((unsigned) param_vect_max_version_for_alias_checks == 0)
 254     return opt_result::failure_at (vect_location,
 255                                    "will not create alias checks, as"
 256                                    " --param vect-max-version-for-alias-checks"
 257                                    " == 0\n");
 258
 259   opt_result res
 260     = runtime_alias_check_p (ddr, loop,
 261                              optimize_loop_nest_for_speed_p (loop));
 262   if (!res)
 263     return res;
 264
 265   LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
 266   return opt_result::success ();
 267 }
 268
 269 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero.  */
 270
 271 static void
 272 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
 273 {
 274   const vec<tree> &checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
 275   for (unsigned int i = 0; i < checks.length(); ++i)
 276     if (checks[i] == value)
 277       return;
 278
 279   if (dump_enabled_p ())
 280     dump_printf_loc (MSG_NOTE, vect_location,
 281                      "need run-time check that %T is nonzero\n",
 282                      value);
 283   LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
 284 }
 285
 286 /* Return true if we know that the order of vectorized DR_INFO_A and
 287    vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
 288    DR_INFO_B.  At least one of the accesses is a write.  */
 289
 290 static bool
 291 vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
 292 {
 293   stmt_vec_info stmtinfo_a = dr_info_a->stmt;
 294   stmt_vec_info stmtinfo_b = dr_info_b->stmt;
 295
 296   /* Single statements are always kept in their original order.  */
 297   if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
 298       && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
 299     return true;
 300
 301   /* If there is a loop invariant read involved we might vectorize it in
 302      the prologue, breaking scalar oder with respect to the in-loop store.  */
 303   if ((DR_IS_READ (dr_info_a->dr) && integer_zerop (DR_STEP (dr_info_a->dr)))
 304       || (DR_IS_READ (dr_info_b->dr) && integer_zerop (DR_STEP (dr_info_b->dr))))
 305     return false;
 306
 307   /* STMT_A and STMT_B belong to overlapping groups.  All loads are
 308      emitted at the position of the first scalar load.
 309      Stores in a group are emitted at the position of the last scalar store.
 310      Compute that position and check whether the resulting order matches
 311      the current one.  */
 312   stmt_vec_info il_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a);
 313   if (il_a)
 314     {
 315       if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a)))
 316         for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
 317              s = DR_GROUP_NEXT_ELEMENT (s))
 318           il_a = get_later_stmt (il_a, s);
 319       else /* DR_IS_READ */
 320         for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
 321              s = DR_GROUP_NEXT_ELEMENT (s))
 322           if (get_later_stmt (il_a, s) == il_a)
 323             il_a = s;
 324     }
 325   else
 326     il_a = stmtinfo_a;
 327   stmt_vec_info il_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b);
 328   if (il_b)
 329     {
 330       if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b)))
 331         for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
 332              s = DR_GROUP_NEXT_ELEMENT (s))
 333           il_b = get_later_stmt (il_b, s);
 334       else /* DR_IS_READ */
 335         for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
 336              s = DR_GROUP_NEXT_ELEMENT (s))
 337           if (get_later_stmt (il_b, s) == il_b)
 338             il_b = s;
 339     }
 340   else
 341     il_b = stmtinfo_b;
 342   bool a_after_b = (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a);
 343   return (get_later_stmt (il_a, il_b) == il_a) == a_after_b;
 344 }
 345
 346 /* A subroutine of vect_analyze_data_ref_dependence.  Handle
 347    DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
 348    distances.  These distances are conservatively correct but they don't
 349    reflect a guaranteed dependence.
 350
 351    Return true if this function does all the work necessary to avoid
 352    an alias or false if the caller should use the dependence distances
 353    to limit the vectorization factor in the usual way.  LOOP_DEPTH is
 354    the depth of the loop described by LOOP_VINFO and the other arguments
 355    are as for vect_analyze_data_ref_dependence.  */
 356
 357 static bool
 358 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
 359                                        loop_vec_info loop_vinfo,
 360                                        int loop_depth, unsigned int *max_vf)
 361 {
 362   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 363   for (lambda_vector &dist_v : DDR_DIST_VECTS (ddr))
 364     {
 365       int dist = dist_v[loop_depth];
 366       if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
 367         {
 368           /* If the user asserted safelen >= DIST consecutive iterations
 369              can be executed concurrently, assume independence.
 370
 371              ??? An alternative would be to add the alias check even
 372              in this case, and vectorize the fallback loop with the
 373              maximum VF set to safelen.  However, if the user has
 374              explicitly given a length, it's less likely that that
 375              would be a win.  */
 376           if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
 377             {
 378               if ((unsigned int) loop->safelen < *max_vf)
 379                 *max_vf = loop->safelen;
 380               LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 381               continue;
 382             }
 383
 384           /* For dependence distances of 2 or more, we have the option
 385              of limiting VF or checking for an alias at runtime.
 386              Prefer to check at runtime if we can, to avoid limiting
 387              the VF unnecessarily when the bases are in fact independent.
 388
 389              Note that the alias checks will be removed if the VF ends up
 390              being small enough.  */
 391           dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
 392           dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
 393           return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
 394                   && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
 395                   && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
 396         }
 397     }
 398   return true;
 399 }
 400
 401
 402 /* Function vect_analyze_data_ref_dependence.
 403
 404    FIXME: I needed to change the sense of the returned flag.
 405
 406    Return FALSE if there (might) exist a dependence between a memory-reference
 407    DRA and a memory-reference DRB.  When versioning for alias may check a
 408    dependence at run-time, return TRUE.  Adjust *MAX_VF according to
 409    the data dependence.  */
 410
 411 static opt_result
 412 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
 413                                   loop_vec_info loop_vinfo,
 414                                   unsigned int *max_vf)
 415 {
 416   unsigned int i;
 417   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 418   struct data_reference *dra = DDR_A (ddr);
 419   struct data_reference *drb = DDR_B (ddr);
 420   dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
 421   dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
 422   stmt_vec_info stmtinfo_a = dr_info_a->stmt;
 423   stmt_vec_info stmtinfo_b = dr_info_b->stmt;
 424   lambda_vector dist_v;
 425   unsigned int loop_depth;
 426
 427   /* If user asserted safelen consecutive iterations can be
 428      executed concurrently, assume independence.  */
 429   auto apply_safelen = [&]()
 430     {
 431       if (loop->safelen >= 2)
 432         {
 433           if ((unsigned int) loop->safelen < *max_vf)
 434             *max_vf = loop->safelen;
 435           LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 436           return true;
 437         }
 438       return false;
 439     };
 440
 441   /* In loop analysis all data references should be vectorizable.  */
 442   if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
 443       || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
 444     gcc_unreachable ();
 445
 446   /* Independent data accesses.  */
 447   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
 448     return opt_result::success ();
 449
 450   if (dra == drb
 451       || (DR_IS_READ (dra) && DR_IS_READ (drb)))
 452     return opt_result::success ();
 453
 454   /* We do not have to consider dependences between accesses that belong
 455      to the same group, unless the stride could be smaller than the
 456      group size.  */
 457   if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
 458       && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
 459           == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
 460       && !STMT_VINFO_STRIDED_P (stmtinfo_a))
 461     return opt_result::success ();
 462
 463   /* Even if we have an anti-dependence then, as the vectorized loop covers at
 464      least two scalar iterations, there is always also a true dependence.
 465      As the vectorizer does not re-order loads and stores we can ignore
 466      the anti-dependence if TBAA can disambiguate both DRs similar to the
 467      case with known negative distance anti-dependences (positive
 468      distance anti-dependences would violate TBAA constraints).  */
 469   if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
 470        || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
 471       && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
 472                                  get_alias_set (DR_REF (drb))))
 473     return opt_result::success ();
 474
 475   if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
 476       || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
 477     {
 478       if (apply_safelen ())
 479         return opt_result::success ();
 480
 481       return opt_result::failure_at
 482         (stmtinfo_a->stmt,
 483          "possible alias involving gather/scatter between %T and %T\n",
 484          DR_REF (dra), DR_REF (drb));
 485     }
 486
 487   /* Unknown data dependence.  */
 488   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
 489     {
 490       if (apply_safelen ())
 491         return opt_result::success ();
 492
 493       if (dump_enabled_p ())
 494         dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
 495                          "versioning for alias required: "
 496                          "can't determine dependence between %T and %T\n",
 497                          DR_REF (dra), DR_REF (drb));
 498
 499       /* Add to list of ddrs that need to be tested at run-time.  */
 500       return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 501     }
 502
 503   /* Known data dependence.  */
 504   if (DDR_NUM_DIST_VECTS (ddr) == 0)
 505     {
 506       if (apply_safelen ())
 507         return opt_result::success ();
 508
 509       if (dump_enabled_p ())
 510         dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
 511                          "versioning for alias required: "
 512                          "bad dist vector for %T and %T\n",
 513                          DR_REF (dra), DR_REF (drb));
 514       /* Add to list of ddrs that need to be tested at run-time.  */
 515       return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 516     }
 517
 518   loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
 519
 520   if (DDR_COULD_BE_INDEPENDENT_P (ddr)
 521       && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
 522                                                 loop_depth, max_vf))
 523     return opt_result::success ();
 524
 525   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
 526     {
 527       int dist = dist_v[loop_depth];
 528
 529       if (dump_enabled_p ())
 530         dump_printf_loc (MSG_NOTE, vect_location,
 531                          "dependence distance  = %d.\n", dist);
 532
 533       if (dist == 0)
 534         {
 535           if (dump_enabled_p ())
 536             dump_printf_loc (MSG_NOTE, vect_location,
 537                              "dependence distance == 0 between %T and %T\n",
 538                              DR_REF (dra), DR_REF (drb));
 539
 540           /* When we perform grouped accesses and perform implicit CSE
 541              by detecting equal accesses and doing disambiguation with
 542              runtime alias tests like for
 543                 .. = a[i];
 544                 .. = a[i+1];
 545                 a[i] = ..;
 546                 a[i+1] = ..;
 547                 *p = ..;
 548                 .. = a[i];
 549                 .. = a[i+1];
 550              where we will end up loading { a[i], a[i+1] } once, make
 551              sure that inserting group loads before the first load and
 552              stores after the last store will do the right thing.
 553              Similar for groups like
 554                 a[i] = ...;
 555                 ... = a[i];
 556                 a[i+1] = ...;
 557              where loads from the group interleave with the store.  */
 558           if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
 559             return opt_result::failure_at (stmtinfo_a->stmt,
 560                                            "READ_WRITE dependence"
 561                                            " in interleaving.\n");
 562
 563           if (loop->safelen < 2)
 564             {
 565               tree indicator = dr_zero_step_indicator (dra);
 566               if (!indicator || integer_zerop (indicator))
 567                 return opt_result::failure_at (stmtinfo_a->stmt,
 568                                                "access also has a zero step\n");
 569               else if (TREE_CODE (indicator) != INTEGER_CST)
 570                 vect_check_nonzero_value (loop_vinfo, indicator);
 571             }
 572           continue;
 573         }
 574
 575       if (dist > 0 && DDR_REVERSED_P (ddr))
 576         {
 577           /* If DDR_REVERSED_P the order of the data-refs in DDR was
 578              reversed (to make distance vector positive), and the actual
 579              distance is negative.  */
 580           if (dump_enabled_p ())
 581             dump_printf_loc (MSG_NOTE, vect_location,
 582                              "dependence distance negative.\n");
 583           /* When doing outer loop vectorization, we need to check if there is
 584              a backward dependence at the inner loop level if the dependence
 585              at the outer loop is reversed.  See PR81740.  */
 586           if (nested_in_vect_loop_p (loop, stmtinfo_a)
 587               || nested_in_vect_loop_p (loop, stmtinfo_b))
 588             {
 589               unsigned inner_depth = index_in_loop_nest (loop->inner->num,
 590                                                          DDR_LOOP_NEST (ddr));
 591               if (dist_v[inner_depth] < 0)
 592                 return opt_result::failure_at (stmtinfo_a->stmt,
 593                                                "not vectorized, dependence "
 594                                                "between data-refs %T and %T\n",
 595                                                DR_REF (dra), DR_REF (drb));
 596             }
 597           /* Record a negative dependence distance to later limit the
 598              amount of stmt copying / unrolling we can perform.
 599              Only need to handle read-after-write dependence.  */
 600           if (DR_IS_READ (drb)
 601               && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
 602                   || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
 603             STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
 604           continue;
 605         }
 606
 607       unsigned int abs_dist = abs (dist);
 608       if (abs_dist >= 2 && abs_dist < *max_vf)
 609         {
 610           /* The dependence distance requires reduction of the maximal
 611              vectorization factor.  */
 612           *max_vf = abs_dist;
 613           if (dump_enabled_p ())
 614             dump_printf_loc (MSG_NOTE, vect_location,
 615                              "adjusting maximal vectorization factor to %i\n",
 616                              *max_vf);
 617         }
 618
 619       if (abs_dist >= *max_vf)
 620         {
 621           /* Dependence distance does not create dependence, as far as
 622              vectorization is concerned, in this case.  */
 623           if (dump_enabled_p ())
 624             dump_printf_loc (MSG_NOTE, vect_location,
 625                              "dependence distance >= VF.\n");
 626           continue;
 627         }
 628
 629       return opt_result::failure_at (stmtinfo_a->stmt,
 630                                      "not vectorized, possible dependence "
 631                                      "between data-refs %T and %T\n",
 632                                      DR_REF (dra), DR_REF (drb));
 633     }
 634
 635   return opt_result::success ();
 636 }
 637
 638 /* Function vect_analyze_early_break_dependences.
 639
 640    Examine all the data references in the loop and make sure that if we have
 641    multiple exits that we are able to safely move stores such that they become
 642    safe for vectorization.  The function also calculates the place where to move
 643    the instructions to and computes what the new vUSE chain should be.
 644
 645    This works in tandem with the CFG that will be produced by
 646    slpeel_tree_duplicate_loop_to_edge_cfg later on.
 647
 648    This function tries to validate whether an early break vectorization
 649    is possible for the current instruction sequence. Returns True i
 650    possible, otherwise False.
 651
 652    Requirements:
 653      - Any memory access must be to a fixed size buffer.
 654      - There must not be any loads and stores to the same object.
 655      - Multiple loads are allowed as long as they don't alias.
 656
 657    NOTE:
 658      This implementation is very conservative. Any overlapping loads/stores
 659      that take place before the early break statement gets rejected aside from
 660      WAR dependencies.
 661
 662      i.e.:
 663
 664         a[i] = 8
 665         c = a[i]
 666         if (b[i])
 667           ...
 668
 669         is not allowed, but
 670
 671         c = a[i]
 672         a[i] = 8
 673         if (b[i])
 674           ...
 675
 676         is which is the common case.  */
 677
 678 static opt_result
 679 vect_analyze_early_break_dependences (loop_vec_info loop_vinfo)
 680 {
 681   DUMP_VECT_SCOPE ("vect_analyze_early_break_dependences");
 682
 683   /* List of all load data references found during traversal.  */
 684   auto_vec<data_reference *> bases;
 685   basic_block dest_bb = NULL;
 686
 687   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 688   class loop *loop_nest = loop_outer (loop);
 689
 690   if (dump_enabled_p ())
 691     dump_printf_loc (MSG_NOTE, vect_location,
 692                      "loop contains multiple exits, analyzing"
 693                      " statement dependencies.\n");
 694
 695   if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
 696     if (dump_enabled_p ())
 697       dump_printf_loc (MSG_NOTE, vect_location,
 698                        "alternate exit has been chosen as main exit.\n");
 699
 700   /* Since we don't support general control flow, the location we'll move the
 701      side-effects to is always the latch connected exit.  When we support
 702      general control flow we can do better but for now this is fine.  Move
 703      side-effects to the in-loop destination of the last early exit.  For the
 704      PEELED case we move the side-effects to the latch block as this is
 705      guaranteed to be the last block to be executed when a vector iteration
 706      finished.  */
 707   if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
 708     dest_bb = loop->latch;
 709   else
 710     dest_bb = single_pred (loop->latch);
 711
 712   /* We start looking from dest_bb, for the non-PEELED case we don't want to
 713      move any stores already present, but we do want to read and validate the
 714      loads.  */
 715   basic_block bb = dest_bb;
 716
 717   /* We move stores across all loads to the beginning of dest_bb, so
 718      the first block processed below doesn't need dependence checking.  */
 719   bool check_deps = false;
 720
 721   do
 722     {
 723       gimple_stmt_iterator gsi = gsi_last_bb (bb);
 724
 725       /* Now analyze all the remaining statements and try to determine which
 726          instructions are allowed/needed to be moved.  */
 727       while (!gsi_end_p (gsi))
 728         {
 729           gimple *stmt = gsi_stmt (gsi);
 730           gsi_prev (&gsi);
 731           if (is_gimple_debug (stmt))
 732             continue;
 733
 734           stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (stmt);
 735           auto dr_ref = STMT_VINFO_DATA_REF (stmt_vinfo);
 736           if (!dr_ref)
 737             continue;
 738
 739           /* We know everything below dest_bb is safe since we know we
 740              had a full vector iteration when reaching it.  Either by
 741              the loop entry / IV exit test being last or because this
 742              is the loop latch itself.  */
 743           if (!check_deps)
 744             continue;
 745
 746           /* Check if vector accesses to the object will be within bounds.
 747              must be a constant or assume loop will be versioned or niters
 748              bounded by VF so accesses are within range.  We only need to check
 749              the reads since writes are moved to a safe place where if we get
 750              there we know they are safe to perform.  */
 751           if (DR_IS_READ (dr_ref)
 752               && !ref_within_array_bound (stmt, DR_REF (dr_ref)))
 753             {
 754               if (STMT_VINFO_GATHER_SCATTER_P (stmt_vinfo)
 755                   || STMT_VINFO_STRIDED_P (stmt_vinfo))
 756                 {
 757                   const char *msg
 758                     = "early break not supported: cannot peel "
 759                       "for alignment, vectorization would read out of "
 760                       "bounds at %G";
 761                   return opt_result::failure_at (stmt, msg, stmt);
 762                 }
 763
 764               dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_vinfo);
 765               dr_info->need_peeling_for_alignment = true;
 766
 767               if (dump_enabled_p ())
 768                 dump_printf_loc (MSG_NOTE, vect_location,
 769                                  "marking DR (read) as needing peeling for "
 770                                  "alignment at %G", stmt);
 771             }
 772
 773           if (DR_IS_READ (dr_ref))
 774             bases.safe_push (dr_ref);
 775           else if (DR_IS_WRITE (dr_ref))
 776             {
 777               /* We are moving writes down in the CFG.  To be sure that this
 778                  is valid after vectorization we have to check all the loads
 779                  we are sinking the stores past to see if any of them may
 780                  alias or are the same object.
 781
 782                  Same objects will not be an issue because unless the store
 783                  is marked volatile the value can be forwarded.  If the
 784                  store is marked volatile we don't vectorize the loop
 785                  anyway.
 786
 787                  That leaves the check for aliasing.  We don't really need
 788                  to care about the stores aliasing with each other since the
 789                  stores are moved in order so the effects are still observed
 790                  correctly.  This leaves the check for WAR dependencies
 791                  which we would be introducing here if the DR can alias.
 792                  The check is quadratic in loads/stores but I have not found
 793                  a better API to do this.  I believe all loads and stores
 794                  must be checked.  We also must check them when we
 795                  encountered the store, since we don't care about loads past
 796                  the store.  */
 797
 798               for (auto dr_read : bases)
 799                 if (dr_may_alias_p (dr_ref, dr_read, loop_nest))
 800                   {
 801                     if (dump_enabled_p ())
 802                       dump_printf_loc (MSG_MISSED_OPTIMIZATION,
 803                                        vect_location,
 804                                        "early breaks not supported: "
 805                                        "overlapping loads and stores "
 806                                        "found before the break "
 807                                        "statement.\n");
 808
 809                     return opt_result::failure_at (stmt,
 810                              "can't safely apply code motion to dependencies"
 811                              " to vectorize the early exit. %G may alias with"
 812                              " %G\n", stmt, dr_read->stmt);
 813                   }
 814             }
 815
 816           if (gimple_vdef (stmt))
 817             {
 818               if (dump_enabled_p ())
 819                 dump_printf_loc (MSG_NOTE, vect_location,
 820                                  "==> recording stmt %G", stmt);
 821
 822               LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo).safe_push (stmt);
 823             }
 824           else if (gimple_vuse (stmt))
 825             {
 826               LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo).safe_insert (0, stmt);
 827               if (dump_enabled_p ())
 828                 dump_printf_loc (MSG_NOTE, vect_location,
 829                                  "marked statement for vUSE update: %G", stmt);
 830             }
 831         }
 832
 833       if (!single_pred_p (bb))
 834         {
 835           gcc_assert (bb == loop->header);
 836           break;
 837         }
 838
 839       /* If we possibly sink through a virtual PHI make sure to elide that.  */
 840       if (gphi *vphi = get_virtual_phi (bb))
 841         LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo).safe_push (vphi);
 842
 843       /* All earlier blocks need dependence checking.  */
 844       check_deps = true;
 845       bb = single_pred (bb);
 846     }
 847   while (1);
 848
 849   /* We don't allow outer -> inner loop transitions which should have been
 850      trapped already during loop form analysis.  */
 851   gcc_assert (dest_bb->loop_father == loop);
 852
 853   /* Check that the destination block we picked has only one pred.  To relax this we
 854      have to take special care when moving the statements.  We don't currently support
 855      such control flow however this check is there to simplify how we handle
 856      labels that may be present anywhere in the IL.  This check is to ensure that the
 857      labels aren't significant for the CFG.  */
 858   if (!single_pred (dest_bb))
 859     return opt_result::failure_at (vect_location,
 860                              "chosen loop exit block (BB %d) does not have a "
 861                              "single predecessor which is currently not "
 862                              "supported for early break vectorization.\n",
 863                              dest_bb->index);
 864
 865   LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo) = dest_bb;
 866
 867   if (!LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo).is_empty ())
 868     {
 869       /* All uses shall be updated to that of the first load.  Entries are
 870          stored in reverse order.  */
 871       tree vuse = gimple_vuse (LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo).last ());
 872       for (auto g : LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo))
 873         {
 874           if (dump_enabled_p ())
 875           dump_printf_loc (MSG_NOTE, vect_location,
 876                            "will update use: %T, mem_ref: %G", vuse, g);
 877         }
 878     }
 879
 880   if (dump_enabled_p ())
 881     dump_printf_loc (MSG_NOTE, vect_location,
 882                      "recorded statements to be moved to BB %d\n",
 883                      LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo)->index);
 884
 885   return opt_result::success ();
 886 }
 887
 888 /* Function vect_analyze_data_ref_dependences.
 889
 890    Examine all the data references in the loop, and make sure there do not
 891    exist any data dependences between them.  Set *MAX_VF according to
 892    the maximum vectorization factor the data dependences allow.  */
 893
 894 opt_result
 895 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
 896                                    unsigned int *max_vf)
 897 {
 898   unsigned int i;
 899   struct data_dependence_relation *ddr;
 900
 901   DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
 902
 903   if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
 904     {
 905       LOOP_VINFO_DDRS (loop_vinfo)
 906         .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
 907                  * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
 908       /* We do not need read-read dependences.  */
 909       bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
 910                                           &LOOP_VINFO_DDRS (loop_vinfo),
 911                                           LOOP_VINFO_LOOP_NEST (loop_vinfo),
 912                                           false);
 913       gcc_assert (res);
 914     }
 915
 916   LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
 917
 918   /* For epilogues we either have no aliases or alias versioning
 919      was applied to original loop.  Therefore we may just get max_vf
 920      using VF of original loop.  */
 921   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
 922     *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
 923   else
 924     FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
 925       {
 926         opt_result res
 927           = vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf);
 928         if (!res)
 929           return res;
 930       }
 931
 932   /* If we have early break statements in the loop, check to see if they
 933      are of a form we can vectorizer.  */
 934   if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
 935     return vect_analyze_early_break_dependences (loop_vinfo);
 936
 937   return opt_result::success ();
 938 }
 939
 940
 941 /* Function vect_slp_analyze_data_ref_dependence.
 942
 943    Return TRUE if there (might) exist a dependence between a memory-reference
 944    DRA and a memory-reference DRB for VINFO.  When versioning for alias
 945    may check a dependence at run-time, return FALSE.  Adjust *MAX_VF
 946    according to the data dependence.  */
 947
 948 static bool
 949 vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
 950                                       struct data_dependence_relation *ddr)
 951 {
 952   struct data_reference *dra = DDR_A (ddr);
 953   struct data_reference *drb = DDR_B (ddr);
 954   dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
 955   dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
 956
 957   /* We need to check dependences of statements marked as unvectorizable
 958      as well, they still can prohibit vectorization.  */
 959
 960   /* Independent data accesses.  */
 961   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
 962     return false;
 963
 964   if (dra == drb)
 965     return false;
 966
 967   /* Read-read is OK.  */
 968   if (DR_IS_READ (dra) && DR_IS_READ (drb))
 969     return false;
 970
 971   /* If dra and drb are part of the same interleaving chain consider
 972      them independent.  */
 973   if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
 974       && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
 975           == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
 976     return false;
 977
 978   /* Unknown data dependence.  */
 979   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
 980     {
 981       if  (dump_enabled_p ())
 982         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 983                          "can't determine dependence between %T and %T\n",
 984                          DR_REF (dra), DR_REF (drb));
 985     }
 986   else if (dump_enabled_p ())
 987     dump_printf_loc (MSG_NOTE, vect_location,
 988                      "determined dependence between %T and %T\n",
 989                      DR_REF (dra), DR_REF (drb));
 990
 991   return true;
 992 }
 993
 994
 995 /* Analyze dependences involved in the transform of a store SLP NODE.  */
 996
 997 static bool
 998 vect_slp_analyze_store_dependences (vec_info *vinfo, slp_tree node)
 999 {
1000   /* This walks over all stmts involved in the SLP store done
1001      in NODE verifying we can sink them up to the last stmt in the
1002      group.  */
1003   stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
1004   gcc_assert (DR_IS_WRITE (STMT_VINFO_DATA_REF (last_access_info)));
1005
1006   for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
1007     {
1008       stmt_vec_info access_info
1009         = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
1010       if (access_info == last_access_info)
1011         continue;
1012       data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
1013       ao_ref ref;
1014       bool ref_initialized_p = false;
1015       for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
1016            gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
1017         {
1018           gimple *stmt = gsi_stmt (gsi);
1019           if (! gimple_vuse (stmt))
1020             continue;
1021
1022           /* If we couldn't record a (single) data reference for this
1023              stmt we have to resort to the alias oracle.  */
1024           stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
1025           data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
1026           if (!dr_b)
1027             {
1028               /* We are moving a store - this means
1029                  we cannot use TBAA for disambiguation.  */
1030               if (!ref_initialized_p)
1031                 ao_ref_init (&ref, DR_REF (dr_a));
1032               if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
1033                   || ref_maybe_used_by_stmt_p (stmt, &ref, false))
1034                 return false;
1035               continue;
1036             }
1037
1038           gcc_assert (!gimple_visited_p (stmt));
1039
1040           ddr_p ddr = initialize_data_dependence_relation (dr_a,
1041                                                            dr_b, vNULL);
1042           bool dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
1043           free_dependence_relation (ddr);
1044           if (dependent)
1045             return false;
1046         }
1047     }
1048   return true;
1049 }
1050
1051 /* Analyze dependences involved in the transform of a load SLP NODE.  STORES
1052    contain the vector of scalar stores of this instance if we are
1053    disambiguating the loads.  */
1054
1055 static bool
1056 vect_slp_analyze_load_dependences (vec_info *vinfo, slp_tree node,
1057                                    vec<stmt_vec_info> stores,
1058                                    stmt_vec_info last_store_info)
1059 {
1060   /* This walks over all stmts involved in the SLP load done
1061      in NODE verifying we can hoist them up to the first stmt in the
1062      group.  */
1063   stmt_vec_info first_access_info = vect_find_first_scalar_stmt_in_slp (node);
1064   gcc_assert (DR_IS_READ (STMT_VINFO_DATA_REF (first_access_info)));
1065
1066   for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
1067     {
1068       if (! SLP_TREE_SCALAR_STMTS (node)[k])
1069         continue;
1070       stmt_vec_info access_info
1071         = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
1072       if (access_info == first_access_info)
1073         continue;
1074       data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
1075       ao_ref ref;
1076       bool ref_initialized_p = false;
1077       hash_set<stmt_vec_info> grp_visited;
1078       for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
1079            gsi_stmt (gsi) != first_access_info->stmt; gsi_prev (&gsi))
1080         {
1081           gimple *stmt = gsi_stmt (gsi);
1082           if (! gimple_vdef (stmt))
1083             continue;
1084
1085           stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
1086
1087           /* If we run into a store of this same instance (we've just
1088              marked those) then delay dependence checking until we run
1089              into the last store because this is where it will have
1090              been sunk to (and we verified that we can do that already).  */
1091           if (gimple_visited_p (stmt))
1092             {
1093               if (stmt_info != last_store_info)
1094                 continue;
1095
1096               for (stmt_vec_info &store_info : stores)
1097                 {
1098                   data_reference *store_dr = STMT_VINFO_DATA_REF (store_info);
1099                   ddr_p ddr = initialize_data_dependence_relation
1100                                 (dr_a, store_dr, vNULL);
1101                   bool dependent
1102                     = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
1103                   free_dependence_relation (ddr);
1104                   if (dependent)
1105                     return false;
1106                 }
1107               continue;
1108             }
1109
1110           auto check_hoist = [&] (stmt_vec_info stmt_info) -> bool
1111             {
1112               /* We are hoisting a load - this means we can use TBAA for
1113                  disambiguation.  */
1114               if (!ref_initialized_p)
1115                 ao_ref_init (&ref, DR_REF (dr_a));
1116               if (stmt_may_clobber_ref_p_1 (stmt_info->stmt, &ref, true))
1117                 {
1118                   /* If we couldn't record a (single) data reference for this
1119                      stmt we have to give up now.  */
1120                   data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
1121                   if (!dr_b)
1122                     return false;
1123                   ddr_p ddr = initialize_data_dependence_relation (dr_a,
1124                                                                    dr_b, vNULL);
1125                   bool dependent
1126                     = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
1127                   free_dependence_relation (ddr);
1128                   if (dependent)
1129                     return false;
1130                 }
1131               /* No dependence.  */
1132               return true;
1133             };
1134           if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1135             {
1136               /* When we run into a store group we have to honor
1137                  that earlier stores might be moved here.  We don't
1138                  know exactly which and where to since we lack a
1139                  back-mapping from DR to SLP node, so assume all
1140                  earlier stores are sunk here.  It's enough to
1141                  consider the last stmt of a group for this.
1142                  ???  Both this and the fact that we disregard that
1143                  the conflicting instance might be removed later
1144                  is overly conservative.  */
1145               if (!grp_visited.add (DR_GROUP_FIRST_ELEMENT (stmt_info)))
1146                 for (auto store_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
1147                      store_info != NULL;
1148                      store_info = DR_GROUP_NEXT_ELEMENT (store_info))
1149                   if ((store_info == stmt_info
1150                        || get_later_stmt (store_info, stmt_info) == stmt_info)
1151                       && !check_hoist (store_info))
1152                     return false;
1153             }
1154           else
1155             {
1156               if (!check_hoist (stmt_info))
1157                 return false;
1158             }
1159         }
1160     }
1161   return true;
1162 }
1163
1164
1165 /* Function vect_analyze_data_ref_dependences.
1166
1167    Examine all the data references in the basic-block, and make sure there
1168    do not exist any data dependences between them.  Set *MAX_VF according to
1169    the maximum vectorization factor the data dependences allow.  */
1170
1171 bool
1172 vect_slp_analyze_instance_dependence (vec_info *vinfo, slp_instance instance)
1173 {
1174   DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
1175
1176   /* The stores of this instance are at the root of the SLP tree.  */
1177   slp_tree store = NULL;
1178   if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store)
1179     store = SLP_INSTANCE_TREE (instance);
1180
1181   /* Verify we can sink stores to the vectorized stmt insert location.  */
1182   stmt_vec_info last_store_info = NULL;
1183   if (store)
1184     {
1185       if (! vect_slp_analyze_store_dependences (vinfo, store))
1186         return false;
1187
1188       /* Mark stores in this instance and remember the last one.  */
1189       last_store_info = vect_find_last_scalar_stmt_in_slp (store);
1190       for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
1191         gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
1192     }
1193
1194   bool res = true;
1195
1196   /* Verify we can sink loads to the vectorized stmt insert location,
1197      special-casing stores of this instance.  */
1198   for (slp_tree &load : SLP_INSTANCE_LOADS (instance))
1199     if (! vect_slp_analyze_load_dependences (vinfo, load,
1200                                              store
1201                                              ? SLP_TREE_SCALAR_STMTS (store)
1202                                              : vNULL, last_store_info))
1203       {
1204         res = false;
1205         break;
1206       }
1207
1208   /* Unset the visited flag.  */
1209   if (store)
1210     for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
1211       gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);
1212
1213   return res;
1214 }
1215
1216 /* Return the misalignment of DR_INFO accessed in VECTYPE with OFFSET
1217    applied.  */
1218
1219 int
1220 dr_misalignment (dr_vec_info *dr_info, tree vectype, poly_int64 offset)
1221 {
1222   HOST_WIDE_INT diff = 0;
1223   /* Alignment is only analyzed for the first element of a DR group,
1224      use that but adjust misalignment by the offset of the access.  */
1225   if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt))
1226     {
1227       dr_vec_info *first_dr
1228         = STMT_VINFO_DR_INFO (DR_GROUP_FIRST_ELEMENT (dr_info->stmt));
1229       /* vect_analyze_data_ref_accesses guarantees that DR_INIT are
1230          INTEGER_CSTs and the first element in the group has the lowest
1231          address.  */
1232       diff = (TREE_INT_CST_LOW (DR_INIT (dr_info->dr))
1233               - TREE_INT_CST_LOW (DR_INIT (first_dr->dr)));
1234       gcc_assert (diff >= 0);
1235       dr_info = first_dr;
1236     }
1237
1238   int misalign = dr_info->misalignment;
1239   gcc_assert (misalign != DR_MISALIGNMENT_UNINITIALIZED);
1240   if (misalign == DR_MISALIGNMENT_UNKNOWN)
1241     return misalign;
1242
1243   /* If the access is only aligned for a vector type with smaller alignment
1244      requirement the access has unknown misalignment.  */
1245   if (maybe_lt (dr_info->target_alignment * BITS_PER_UNIT,
1246                 targetm.vectorize.preferred_vector_alignment (vectype)))
1247     return DR_MISALIGNMENT_UNKNOWN;
1248
1249   /* Apply the offset from the DR group start and the externally supplied
1250      offset which can for example result from a negative stride access.  */
1251   poly_int64 misalignment = misalign + diff + offset;
1252
1253   /* Below we reject compile-time non-constant target alignments, but if
1254      our misalignment is zero, then we are known to already be aligned
1255      w.r.t. any such possible target alignment.  */
1256   if (known_eq (misalignment, 0))
1257     return 0;
1258
1259   unsigned HOST_WIDE_INT target_alignment_c;
1260   if (!dr_info->target_alignment.is_constant (&target_alignment_c)
1261       || !known_misalignment (misalignment, target_alignment_c, &misalign))
1262     return DR_MISALIGNMENT_UNKNOWN;
1263   return misalign;
1264 }
1265
1266 /* Record the base alignment guarantee given by DRB, which occurs
1267    in STMT_INFO.  */
1268
1269 static void
1270 vect_record_base_alignment (vec_info *vinfo, stmt_vec_info stmt_info,
1271                             innermost_loop_behavior *drb)
1272 {
1273   bool existed;
1274   std::pair<stmt_vec_info, innermost_loop_behavior *> &entry
1275     = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
1276   if (!existed || entry.second->base_alignment < drb->base_alignment)
1277     {
1278       entry = std::make_pair (stmt_info, drb);
1279       if (dump_enabled_p ())
1280         dump_printf_loc (MSG_NOTE, vect_location,
1281                          "recording new base alignment for %T\n"
1282                          "  alignment:    %d\n"
1283                          "  misalignment: %d\n"
1284                          "  based on:     %G",
1285                          drb->base_address,
1286                          drb->base_alignment,
1287                          drb->base_misalignment,
1288                          stmt_info->stmt);
1289     }
1290 }
1291
1292 /* If the region we're going to vectorize is reached, all unconditional
1293    data references occur at least once.  We can therefore pool the base
1294    alignment guarantees from each unconditional reference.  Do this by
1295    going through all the data references in VINFO and checking whether
1296    the containing statement makes the reference unconditionally.  If so,
1297    record the alignment of the base address in VINFO so that it can be
1298    used for all other references with the same base.  */
1299
1300 void
1301 vect_record_base_alignments (vec_info *vinfo)
1302 {
1303   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1304   class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
1305   for (data_reference *dr : vinfo->shared->datarefs)
1306     {
1307       dr_vec_info *dr_info = vinfo->lookup_dr (dr);
1308       stmt_vec_info stmt_info = dr_info->stmt;
1309       if (!DR_IS_CONDITIONAL_IN_STMT (dr)
1310           && STMT_VINFO_VECTORIZABLE (stmt_info)
1311           && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
1312         {
1313           vect_record_base_alignment (vinfo, stmt_info, &DR_INNERMOST (dr));
1314
1315           /* If DR is nested in the loop that is being vectorized, we can also
1316              record the alignment of the base wrt the outer loop.  */
1317           if (loop && nested_in_vect_loop_p (loop, stmt_info))
1318             vect_record_base_alignment
1319               (vinfo, stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
1320         }
1321     }
1322 }
1323
1324 /* Function vect_compute_data_ref_alignment
1325
1326    Compute the misalignment of the data reference DR_INFO when vectorizing
1327    with VECTYPE.
1328
1329    RESULT is non-NULL iff VINFO is a loop_vec_info.  In that case, *RESULT will
1330    be set appropriately on failure (but is otherwise left unchanged).
1331
1332    Output:
1333    1. initialized misalignment info for DR_INFO
1334
1335    FOR NOW: No analysis is actually performed. Misalignment is calculated
1336    only for trivial cases. TODO.  */
1337
1338 static void
1339 vect_compute_data_ref_alignment (vec_info *vinfo, dr_vec_info *dr_info,
1340                                  tree vectype, opt_result *result = nullptr)
1341 {
1342   stmt_vec_info stmt_info = dr_info->stmt;
1343   vec_base_alignments *base_alignments = &vinfo->base_alignments;
1344   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1345   class loop *loop = NULL;
1346   tree ref = DR_REF (dr_info->dr);
1347
1348   if (dump_enabled_p ())
1349     dump_printf_loc (MSG_NOTE, vect_location,
1350                      "vect_compute_data_ref_alignment:\n");
1351
1352   if (loop_vinfo)
1353     loop = LOOP_VINFO_LOOP (loop_vinfo);
1354
1355   /* Initialize misalignment to unknown.  */
1356   SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1357
1358   if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
1359     return;
1360
1361   innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
1362   bool step_preserves_misalignment_p;
1363
1364   poly_uint64 vector_alignment
1365     = exact_div (targetm.vectorize.preferred_vector_alignment (vectype),
1366                  BITS_PER_UNIT);
1367
1368   /* If this DR needs peeling for alignment for correctness, we must
1369      ensure the target alignment is a constant power-of-two multiple of the
1370      amount read per vector iteration (overriding the above hook where
1371      necessary).  */
1372   if (dr_info->need_peeling_for_alignment)
1373     {
1374       /* Vector size in bytes.  */
1375       poly_uint64 safe_align = tree_to_poly_uint64 (TYPE_SIZE_UNIT (vectype));
1376
1377       /* We can only peel for loops, of course.  */
1378       gcc_checking_assert (loop_vinfo);
1379
1380       /* Calculate the number of vectors read per vector iteration.  If
1381          it is a power of two, multiply through to get the required
1382          alignment in bytes.  Otherwise, fail analysis since alignment
1383          peeling wouldn't work in such a case.  */
1384       poly_uint64 num_scalars = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1385       if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1386         num_scalars *= DR_GROUP_SIZE (stmt_info);
1387
1388       auto num_vectors = vect_get_num_vectors (num_scalars, vectype);
1389       if (!pow2p_hwi (num_vectors))
1390         {
1391           *result = opt_result::failure_at (vect_location,
1392                                             "non-power-of-two num vectors %u "
1393                                             "for DR needing peeling for "
1394                                             "alignment at %G",
1395                                             num_vectors, stmt_info->stmt);
1396           return;
1397         }
1398
1399       safe_align *= num_vectors;
1400       if (maybe_gt (safe_align, 4096U))
1401         {
1402           pretty_printer pp;
1403           pp_wide_integer (&pp, safe_align);
1404           *result = opt_result::failure_at (vect_location,
1405                                             "alignment required for correctness"
1406                                             " (%s) may exceed page size",
1407                                             pp_formatted_text (&pp));
1408           return;
1409         }
1410
1411       unsigned HOST_WIDE_INT multiple;
1412       if (!constant_multiple_p (vector_alignment, safe_align, &multiple)
1413           || !pow2p_hwi (multiple))
1414         {
1415           if (dump_enabled_p ())
1416             {
1417               dump_printf_loc (MSG_NOTE, vect_location,
1418                                "forcing alignment for DR from preferred (");
1419               dump_dec (MSG_NOTE, vector_alignment);
1420               dump_printf (MSG_NOTE, ") to safe align (");
1421               dump_dec (MSG_NOTE, safe_align);
1422               dump_printf (MSG_NOTE, ") for stmt: %G", stmt_info->stmt);
1423             }
1424           vector_alignment = safe_align;
1425         }
1426     }
1427
1428   SET_DR_TARGET_ALIGNMENT (dr_info, vector_alignment);
1429
1430   /* If the main loop has peeled for alignment we have no way of knowing
1431      whether the data accesses in the epilogues are aligned.  We can't at
1432      compile time answer the question whether we have entered the main loop or
1433      not.  Fixes PR 92351.  */
1434   if (loop_vinfo)
1435     {
1436       loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
1437       if (orig_loop_vinfo
1438           && LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo) != 0)
1439         return;
1440     }
1441
1442   unsigned HOST_WIDE_INT vect_align_c;
1443   if (!vector_alignment.is_constant (&vect_align_c))
1444     return;
1445
1446   /* No step for BB vectorization.  */
1447   if (!loop)
1448     {
1449       gcc_assert (integer_zerop (drb->step));
1450       step_preserves_misalignment_p = true;
1451     }
1452
1453   else
1454     {
1455       /* We can only use base and misalignment information relative to
1456          an innermost loop if the misalignment stays the same throughout the
1457          execution of the loop.  As above, this is the case if the stride of
1458          the dataref evenly divides by the alignment.  */
1459       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1460       step_preserves_misalignment_p
1461         = multiple_p (drb->step_alignment * vf, vect_align_c);
1462
1463       if (!step_preserves_misalignment_p && dump_enabled_p ())
1464         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1465                          "step doesn't divide the vector alignment.\n");
1466
1467       /* In case the dataref is in an inner-loop of the loop that is being
1468          vectorized (LOOP), we use the base and misalignment information
1469          relative to the outer-loop (LOOP).  This is ok only if the
1470          misalignment stays the same throughout the execution of the
1471          inner-loop, which is why we have to check that the stride of the
1472          dataref in the inner-loop evenly divides by the vector alignment.  */
1473       if (step_preserves_misalignment_p
1474           && nested_in_vect_loop_p (loop, stmt_info))
1475         {
1476           step_preserves_misalignment_p
1477             = (DR_STEP_ALIGNMENT (dr_info->dr) % vect_align_c) == 0;
1478
1479           if (dump_enabled_p ())
1480             {
1481               if (step_preserves_misalignment_p)
1482                 dump_printf_loc (MSG_NOTE, vect_location,
1483                                  "inner step divides the vector alignment.\n");
1484               else
1485                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1486                                  "inner step doesn't divide the vector"
1487                                  " alignment.\n");
1488             }
1489         }
1490     }
1491
1492   unsigned int base_alignment = drb->base_alignment;
1493   unsigned int base_misalignment = drb->base_misalignment;
1494
1495   /* Calculate the maximum of the pooled base address alignment and the
1496      alignment that we can compute for DR itself.  */
1497   std::pair<stmt_vec_info, innermost_loop_behavior *> *entry
1498     = base_alignments->get (drb->base_address);
1499   if (entry
1500       && base_alignment < (*entry).second->base_alignment
1501       && (loop_vinfo
1502           || (dominated_by_p (CDI_DOMINATORS, gimple_bb (stmt_info->stmt),
1503                               gimple_bb (entry->first->stmt))
1504               && (gimple_bb (stmt_info->stmt) != gimple_bb (entry->first->stmt)
1505                   || (entry->first->dr_aux.group <= dr_info->group)))))
1506     {
1507       base_alignment = entry->second->base_alignment;
1508       base_misalignment = entry->second->base_misalignment;
1509     }
1510
1511   if (drb->offset_alignment < vect_align_c
1512       || !step_preserves_misalignment_p
1513       /* We need to know whether the step wrt the vectorized loop is
1514          negative when computing the starting misalignment below.  */
1515       || TREE_CODE (drb->step) != INTEGER_CST)
1516     {
1517       if (dump_enabled_p ())
1518         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1519                          "Unknown alignment for access: %T\n", ref);
1520       return;
1521     }
1522
1523   if (base_alignment < vect_align_c)
1524     {
1525       unsigned int max_alignment;
1526       tree base = get_base_for_alignment (drb->base_address, &max_alignment);
1527       if (max_alignment < vect_align_c
1528           || !vect_can_force_dr_alignment_p (base,
1529                                              vect_align_c * BITS_PER_UNIT))
1530         {
1531           if (dump_enabled_p ())
1532             dump_printf_loc (MSG_NOTE, vect_location,
1533                              "can't force alignment of ref: %T\n", ref);
1534           return;
1535         }
1536
1537       /* Force the alignment of the decl.
1538          NOTE: This is the only change to the code we make during
1539          the analysis phase, before deciding to vectorize the loop.  */
1540       if (dump_enabled_p ())
1541         dump_printf_loc (MSG_NOTE, vect_location,
1542                          "force alignment of %T\n", ref);
1543
1544       dr_info->base_decl = base;
1545       dr_info->base_misaligned = true;
1546       base_misalignment = 0;
1547     }
1548   poly_int64 misalignment
1549     = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1550
1551   unsigned int const_misalignment;
1552   if (!known_misalignment (misalignment, vect_align_c, &const_misalignment))
1553     {
1554       if (dump_enabled_p ())
1555         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1556                          "Non-constant misalignment for access: %T\n", ref);
1557       return;
1558     }
1559
1560   SET_DR_MISALIGNMENT (dr_info, const_misalignment);
1561
1562   if (dump_enabled_p ())
1563     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1564                      "misalign = %d bytes of ref %T\n",
1565                      const_misalignment, ref);
1566
1567   return;
1568 }
1569
1570 /* Return whether DR_INFO, which is related to DR_PEEL_INFO in
1571    that it only differs in DR_INIT, is aligned if DR_PEEL_INFO
1572    is made aligned via peeling.  */
1573
1574 static bool
1575 vect_dr_aligned_if_related_peeled_dr_is (dr_vec_info *dr_info,
1576                                          dr_vec_info *dr_peel_info)
1577 {
1578   if (multiple_p (DR_TARGET_ALIGNMENT (dr_peel_info),
1579                   DR_TARGET_ALIGNMENT (dr_info)))
1580     {
1581       poly_offset_int diff
1582         = (wi::to_poly_offset (DR_INIT (dr_peel_info->dr))
1583            - wi::to_poly_offset (DR_INIT (dr_info->dr)));
1584       if (known_eq (diff, 0)
1585           || multiple_p (diff, DR_TARGET_ALIGNMENT (dr_info)))
1586         return true;
1587     }
1588   return false;
1589 }
1590
1591 /* Return whether DR_INFO is aligned if DR_PEEL_INFO is made
1592    aligned via peeling.  */
1593
1594 static bool
1595 vect_dr_aligned_if_peeled_dr_is (dr_vec_info *dr_info,
1596                                  dr_vec_info *dr_peel_info)
1597 {
1598   if (!operand_equal_p (DR_BASE_ADDRESS (dr_info->dr),
1599                         DR_BASE_ADDRESS (dr_peel_info->dr), 0)
1600       || !operand_equal_p (DR_OFFSET (dr_info->dr),
1601                            DR_OFFSET (dr_peel_info->dr), 0)
1602       || !operand_equal_p (DR_STEP (dr_info->dr),
1603                            DR_STEP (dr_peel_info->dr), 0))
1604     return false;
1605
1606   return vect_dr_aligned_if_related_peeled_dr_is (dr_info, dr_peel_info);
1607 }
1608
1609 /* Compute the value for dr_info->misalign so that the access appears
1610    aligned.  This is used by peeling to compensate for dr_misalignment
1611    applying the offset for negative step.  */
1612
1613 int
1614 vect_dr_misalign_for_aligned_access (dr_vec_info *dr_info)
1615 {
1616   if (tree_int_cst_sgn (DR_STEP (dr_info->dr)) >= 0)
1617     return 0;
1618
1619   tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1620   poly_int64 misalignment
1621     = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1622        * TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1623
1624   unsigned HOST_WIDE_INT target_alignment_c;
1625   int misalign;
1626   if (!dr_info->target_alignment.is_constant (&target_alignment_c)
1627       || !known_misalignment (misalignment, target_alignment_c, &misalign))
1628     return DR_MISALIGNMENT_UNKNOWN;
1629   return misalign;
1630 }
1631
1632 /* Function vect_update_misalignment_for_peel.
1633    Sets DR_INFO's misalignment
1634    - to 0 if it has the same alignment as DR_PEEL_INFO,
1635    - to the misalignment computed using NPEEL if DR_INFO's salignment is known,
1636    - to -1 (unknown) otherwise.
1637
1638    DR_INFO - the data reference whose misalignment is to be adjusted.
1639    DR_PEEL_INFO - the data reference whose misalignment is being made
1640                   zero in the vector loop by the peel.
1641    NPEEL - the number of iterations in the peel loop if the misalignment
1642            of DR_PEEL_INFO is known at compile time.  */
1643
1644 static void
1645 vect_update_misalignment_for_peel (dr_vec_info *dr_info,
1646                                    dr_vec_info *dr_peel_info, int npeel)
1647 {
1648   /* If dr_info is aligned of dr_peel_info is, then mark it so.  */
1649   if (vect_dr_aligned_if_peeled_dr_is (dr_info, dr_peel_info))
1650     {
1651       SET_DR_MISALIGNMENT (dr_info,
1652                            vect_dr_misalign_for_aligned_access (dr_peel_info));
1653       return;
1654     }
1655
1656   unsigned HOST_WIDE_INT alignment;
1657   if (DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment)
1658       && known_alignment_for_access_p (dr_info,
1659                                        STMT_VINFO_VECTYPE (dr_info->stmt))
1660       && known_alignment_for_access_p (dr_peel_info,
1661                                        STMT_VINFO_VECTYPE (dr_peel_info->stmt)))
1662     {
1663       int misal = dr_info->misalignment;
1664       misal += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1665       misal &= alignment - 1;
1666       set_dr_misalignment (dr_info, misal);
1667       return;
1668     }
1669
1670   if (dump_enabled_p ())
1671     dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1672                      "to unknown (-1).\n");
1673   SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1674 }
1675
1676 /* Return true if alignment is relevant for DR_INFO.  */
1677
1678 static bool
1679 vect_relevant_for_alignment_p (dr_vec_info *dr_info)
1680 {
1681   stmt_vec_info stmt_info = dr_info->stmt;
1682
1683   if (!STMT_VINFO_RELEVANT_P (stmt_info))
1684     return false;
1685
1686   /* For interleaving, only the alignment of the first access matters.  */
1687   if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1688       && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1689     return false;
1690
1691   /* Scatter-gather and invariant accesses continue to address individual
1692      scalars, so vector-level alignment is irrelevant.  */
1693   if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1694       || integer_zerop (DR_STEP (dr_info->dr)))
1695     return false;
1696
1697   /* Strided accesses perform only component accesses, alignment is
1698      irrelevant for them.  */
1699   if (STMT_VINFO_STRIDED_P (stmt_info)
1700       && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1701     return false;
1702
1703   return true;
1704 }
1705
1706 /* Given an memory reference EXP return whether its alignment is less
1707    than its size.  */
1708
1709 static bool
1710 not_size_aligned (tree exp)
1711 {
1712   if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1713     return true;
1714
1715   return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1716           > get_object_alignment (exp));
1717 }
1718
1719 /* Function vector_alignment_reachable_p
1720
1721    Return true if vector alignment for DR_INFO is reachable by peeling
1722    a few loop iterations.  Return false otherwise.  */
1723
1724 static bool
1725 vector_alignment_reachable_p (dr_vec_info *dr_info)
1726 {
1727   stmt_vec_info stmt_info = dr_info->stmt;
1728   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1729
1730   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1731     {
1732       /* For interleaved access we peel only if number of iterations in
1733          the prolog loop ({VF - misalignment}), is a multiple of the
1734          number of the interleaved accesses.  */
1735       int elem_size, mis_in_elements;
1736
1737       /* FORNOW: handle only known alignment.  */
1738       if (!known_alignment_for_access_p (dr_info, vectype))
1739         return false;
1740
1741       poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1742       poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1743       elem_size = vector_element_size (vector_size, nelements);
1744       mis_in_elements = dr_misalignment (dr_info, vectype) / elem_size;
1745
1746       if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1747         return false;
1748     }
1749
1750   /* If misalignment is known at the compile time then allow peeling
1751      only if natural alignment is reachable through peeling.  */
1752   if (known_alignment_for_access_p (dr_info, vectype)
1753       && !aligned_access_p (dr_info, vectype))
1754     {
1755       HOST_WIDE_INT elmsize =
1756                 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1757       if (dump_enabled_p ())
1758         {
1759           dump_printf_loc (MSG_NOTE, vect_location,
1760                            "data size = %wd. misalignment = %d.\n", elmsize,
1761                            dr_misalignment (dr_info, vectype));
1762         }
1763       if (dr_misalignment (dr_info, vectype) % elmsize)
1764         {
1765           if (dump_enabled_p ())
1766             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1767                              "data size does not divide the misalignment.\n");
1768           return false;
1769         }
1770     }
1771
1772   if (!known_alignment_for_access_p (dr_info, vectype))
1773     {
1774       tree type = TREE_TYPE (DR_REF (dr_info->dr));
1775       bool is_packed = not_size_aligned (DR_REF (dr_info->dr));
1776       if (dump_enabled_p ())
1777         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1778                          "Unknown misalignment, %snaturally aligned\n",
1779                          is_packed ? "not " : "");
1780       return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1781     }
1782
1783   return true;
1784 }
1785
1786
1787 /* Calculate the cost of the memory access represented by DR_INFO.  */
1788
1789 static void
1790 vect_get_data_access_cost (vec_info *vinfo, dr_vec_info *dr_info,
1791                            dr_alignment_support alignment_support_scheme,
1792                            int misalignment,
1793                            unsigned int *inside_cost,
1794                            unsigned int *outside_cost,
1795                            stmt_vector_for_cost *body_cost_vec,
1796                            stmt_vector_for_cost *prologue_cost_vec)
1797 {
1798   stmt_vec_info stmt_info = dr_info->stmt;
1799   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1800   int ncopies;
1801
1802   if (PURE_SLP_STMT (stmt_info))
1803     ncopies = 1;
1804   else
1805     ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1806
1807   if (DR_IS_READ (dr_info->dr))
1808     vect_get_load_cost (vinfo, stmt_info, NULL, ncopies,
1809                         alignment_support_scheme, misalignment, true,
1810                         inside_cost, outside_cost, prologue_cost_vec,
1811                         body_cost_vec, false);
1812   else
1813     vect_get_store_cost (vinfo,stmt_info, NULL, ncopies,
1814                          alignment_support_scheme, misalignment, inside_cost,
1815                          body_cost_vec);
1816
1817   if (dump_enabled_p ())
1818     dump_printf_loc (MSG_NOTE, vect_location,
1819                      "vect_get_data_access_cost: inside_cost = %d, "
1820                      "outside_cost = %d.\n", *inside_cost, *outside_cost);
1821 }
1822
1823
1824 typedef struct _vect_peel_info
1825 {
1826   dr_vec_info *dr_info;
1827   int npeel;
1828   unsigned int count;
1829 } *vect_peel_info;
1830
1831 typedef struct _vect_peel_extended_info
1832 {
1833   vec_info *vinfo;
1834   struct _vect_peel_info peel_info;
1835   unsigned int inside_cost;
1836   unsigned int outside_cost;
1837 } *vect_peel_extended_info;
1838
1839
1840 /* Peeling hashtable helpers.  */
1841
1842 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1843 {
1844   static inline hashval_t hash (const _vect_peel_info *);
1845   static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1846 };
1847
1848 inline hashval_t
1849 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1850 {
1851   return (hashval_t) peel_info->npeel;
1852 }
1853
1854 inline bool
1855 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1856 {
1857   return (a->npeel == b->npeel);
1858 }
1859
1860
1861 /* Insert DR_INFO into peeling hash table with NPEEL as key.  */
1862
1863 static void
1864 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1865                           loop_vec_info loop_vinfo, dr_vec_info *dr_info,
1866                           int npeel, bool supportable_if_not_aligned)
1867 {
1868   struct _vect_peel_info elem, *slot;
1869   _vect_peel_info **new_slot;
1870
1871   elem.npeel = npeel;
1872   slot = peeling_htab->find (&elem);
1873   if (slot)
1874     slot->count++;
1875   else
1876     {
1877       slot = XNEW (struct _vect_peel_info);
1878       slot->npeel = npeel;
1879       slot->dr_info = dr_info;
1880       slot->count = 1;
1881       new_slot = peeling_htab->find_slot (slot, INSERT);
1882       *new_slot = slot;
1883     }
1884
1885   /* If this DR is not supported with unknown misalignment then bias
1886      this slot when the cost model is disabled.  */
1887   if (!supportable_if_not_aligned
1888       && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1889     slot->count += VECT_MAX_COST;
1890 }
1891
1892
1893 /* Traverse peeling hash table to find peeling option that aligns maximum
1894    number of data accesses.  */
1895
1896 int
1897 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1898                                      _vect_peel_extended_info *max)
1899 {
1900   vect_peel_info elem = *slot;
1901
1902   if (elem->count > max->peel_info.count
1903       || (elem->count == max->peel_info.count
1904           && max->peel_info.npeel > elem->npeel))
1905     {
1906       max->peel_info.npeel = elem->npeel;
1907       max->peel_info.count = elem->count;
1908       max->peel_info.dr_info = elem->dr_info;
1909     }
1910
1911   return 1;
1912 }
1913
1914 /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
1915    data access costs for all data refs.  If UNKNOWN_MISALIGNMENT is true,
1916    npeel is computed at runtime but DR0_INFO's misalignment will be zero
1917    after peeling.  */
1918
1919 static void
1920 vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
1921                                 dr_vec_info *dr0_info,
1922                                 unsigned int *inside_cost,
1923                                 unsigned int *outside_cost,
1924                                 stmt_vector_for_cost *body_cost_vec,
1925                                 stmt_vector_for_cost *prologue_cost_vec,
1926                                 unsigned int npeel)
1927 {
1928   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1929
1930   bool dr0_alignment_known_p
1931     = (dr0_info
1932        && known_alignment_for_access_p (dr0_info,
1933                                         STMT_VINFO_VECTYPE (dr0_info->stmt)));
1934
1935   for (data_reference *dr : datarefs)
1936     {
1937       dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1938       if (!vect_relevant_for_alignment_p (dr_info))
1939         continue;
1940
1941       tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1942       dr_alignment_support alignment_support_scheme;
1943       int misalignment;
1944       unsigned HOST_WIDE_INT alignment;
1945
1946       bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1947                                             size_zero_node) < 0;
1948       poly_int64 off = 0;
1949       if (negative)
1950         off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1951                * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1952
1953       if (npeel == 0)
1954         misalignment = dr_misalignment (dr_info, vectype, off);
1955       else if (dr_info == dr0_info
1956                || vect_dr_aligned_if_peeled_dr_is (dr_info, dr0_info))
1957         misalignment = 0;
1958       else if (!dr0_alignment_known_p
1959                || !known_alignment_for_access_p (dr_info, vectype)
1960                || !DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment))
1961         misalignment = DR_MISALIGNMENT_UNKNOWN;
1962       else
1963         {
1964           misalignment = dr_misalignment (dr_info, vectype, off);
1965           misalignment += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1966           misalignment &= alignment - 1;
1967         }
1968       alignment_support_scheme
1969         = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
1970                                          misalignment);
1971
1972       vect_get_data_access_cost (loop_vinfo, dr_info,
1973                                  alignment_support_scheme, misalignment,
1974                                  inside_cost, outside_cost,
1975                                  body_cost_vec, prologue_cost_vec);
1976     }
1977 }
1978
1979 /* Traverse peeling hash table and calculate cost for each peeling option.
1980    Find the one with the lowest cost.  */
1981
1982 int
1983 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1984                                    _vect_peel_extended_info *min)
1985 {
1986   vect_peel_info elem = *slot;
1987   int dummy;
1988   unsigned int inside_cost = 0, outside_cost = 0;
1989   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (min->vinfo);
1990   stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1991                        epilogue_cost_vec;
1992
1993   prologue_cost_vec.create (2);
1994   body_cost_vec.create (2);
1995   epilogue_cost_vec.create (2);
1996
1997   vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost,
1998                                   &outside_cost, &body_cost_vec,
1999                                   &prologue_cost_vec, elem->npeel);
2000
2001   body_cost_vec.release ();
2002
2003   outside_cost += vect_get_known_peeling_cost
2004     (loop_vinfo, elem->npeel, &dummy,
2005      &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2006      &prologue_cost_vec, &epilogue_cost_vec);
2007
2008   /* Prologue and epilogue costs are added to the target model later.
2009      These costs depend only on the scalar iteration cost, the
2010      number of peeling iterations finally chosen, and the number of
2011      misaligned statements.  So discard the information found here.  */
2012   prologue_cost_vec.release ();
2013   epilogue_cost_vec.release ();
2014
2015   if (inside_cost < min->inside_cost
2016       || (inside_cost == min->inside_cost
2017           && outside_cost < min->outside_cost))
2018     {
2019       min->inside_cost = inside_cost;
2020       min->outside_cost = outside_cost;
2021       min->peel_info.dr_info = elem->dr_info;
2022       min->peel_info.npeel = elem->npeel;
2023       min->peel_info.count = elem->count;
2024     }
2025
2026   return 1;
2027 }
2028
2029
2030 /* Choose best peeling option by traversing peeling hash table and either
2031    choosing an option with the lowest cost (if cost model is enabled) or the
2032    option that aligns as many accesses as possible.  */
2033
2034 static struct _vect_peel_extended_info
2035 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
2036                                        loop_vec_info loop_vinfo)
2037 {
2038    struct _vect_peel_extended_info res;
2039
2040    res.peel_info.dr_info = NULL;
2041    res.vinfo = loop_vinfo;
2042
2043    if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
2044      {
2045        res.inside_cost = INT_MAX;
2046        res.outside_cost = INT_MAX;
2047        peeling_htab->traverse <_vect_peel_extended_info *,
2048                                vect_peeling_hash_get_lowest_cost> (&res);
2049      }
2050    else
2051      {
2052        res.peel_info.count = 0;
2053        peeling_htab->traverse <_vect_peel_extended_info *,
2054                                vect_peeling_hash_get_most_frequent> (&res);
2055        res.inside_cost = 0;
2056        res.outside_cost = 0;
2057      }
2058
2059    return res;
2060 }
2061
2062 /* Return true if the new peeling NPEEL is supported.  */
2063
2064 static bool
2065 vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
2066                           unsigned npeel)
2067 {
2068   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2069   enum dr_alignment_support supportable_dr_alignment;
2070
2071   bool dr0_alignment_known_p
2072     = known_alignment_for_access_p (dr0_info,
2073                                     STMT_VINFO_VECTYPE (dr0_info->stmt));
2074
2075   /* Ensure that all data refs can be vectorized after the peel.  */
2076   for (data_reference *dr : datarefs)
2077     {
2078       if (dr == dr0_info->dr)
2079         continue;
2080
2081       dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2082       if (!vect_relevant_for_alignment_p (dr_info)
2083           || vect_dr_aligned_if_peeled_dr_is (dr_info, dr0_info))
2084         continue;
2085
2086       tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2087       int misalignment;
2088       unsigned HOST_WIDE_INT alignment;
2089       if (!dr0_alignment_known_p
2090           || !known_alignment_for_access_p (dr_info, vectype)
2091           || !DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment))
2092         misalignment = DR_MISALIGNMENT_UNKNOWN;
2093       else
2094         {
2095           misalignment = dr_misalignment (dr_info, vectype);
2096           misalignment += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
2097           misalignment &= alignment - 1;
2098         }
2099       supportable_dr_alignment
2100         = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
2101                                          misalignment);
2102       if (supportable_dr_alignment == dr_unaligned_unsupported)
2103         return false;
2104     }
2105
2106   return true;
2107 }
2108
2109 /* Compare two data-references DRA and DRB to group them into chunks
2110    with related alignment.  */
2111
2112 static int
2113 dr_align_group_sort_cmp (const void *dra_, const void *drb_)
2114 {
2115   data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2116   data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2117   int cmp;
2118
2119   /* Stabilize sort.  */
2120   if (dra == drb)
2121     return 0;
2122
2123   /* Ordering of DRs according to base.  */
2124   cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2125                                DR_BASE_ADDRESS (drb));
2126   if (cmp != 0)
2127     return cmp;
2128
2129   /* And according to DR_OFFSET.  */
2130   cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2131   if (cmp != 0)
2132     return cmp;
2133
2134   /* And after step.  */
2135   cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2136   if (cmp != 0)
2137     return cmp;
2138
2139   /* Then sort after DR_INIT.  In case of identical DRs sort after stmt UID.  */
2140   cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2141   if (cmp == 0)
2142     return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2143   return cmp;
2144 }
2145
2146 /* Function vect_enhance_data_refs_alignment
2147
2148    This pass will use loop versioning and loop peeling in order to enhance
2149    the alignment of data references in the loop.
2150
2151    FOR NOW: we assume that whatever versioning/peeling takes place, only the
2152    original loop is to be vectorized.  Any other loops that are created by
2153    the transformations performed in this pass - are not supposed to be
2154    vectorized.  This restriction will be relaxed.
2155
2156    This pass will require a cost model to guide it whether to apply peeling
2157    or versioning or a combination of the two.  For example, the scheme that
2158    intel uses when given a loop with several memory accesses, is as follows:
2159    choose one memory access ('p') which alignment you want to force by doing
2160    peeling.  Then, either (1) generate a loop in which 'p' is aligned and all
2161    other accesses are not necessarily aligned, or (2) use loop versioning to
2162    generate one loop in which all accesses are aligned, and another loop in
2163    which only 'p' is necessarily aligned.
2164
2165    ("Automatic Intra-Register Vectorization for the Intel Architecture",
2166    Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
2167    Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
2168
2169    Devising a cost model is the most critical aspect of this work.  It will
2170    guide us on which access to peel for, whether to use loop versioning, how
2171    many versions to create, etc.  The cost model will probably consist of
2172    generic considerations as well as target specific considerations (on
2173    powerpc for example, misaligned stores are more painful than misaligned
2174    loads).
2175
2176    Here are the general steps involved in alignment enhancements:
2177
2178      -- original loop, before alignment analysis:
2179         for (i=0; i<N; i++){
2180           x = q[i];                     # DR_MISALIGNMENT(q) = unknown
2181           p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
2182         }
2183
2184      -- After vect_compute_data_refs_alignment:
2185         for (i=0; i<N; i++){
2186           x = q[i];                     # DR_MISALIGNMENT(q) = 3
2187           p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
2188         }
2189
2190      -- Possibility 1: we do loop versioning:
2191      if (p is aligned) {
2192         for (i=0; i<N; i++){    # loop 1A
2193           x = q[i];                     # DR_MISALIGNMENT(q) = 3
2194           p[i] = y;                     # DR_MISALIGNMENT(p) = 0
2195         }
2196      }
2197      else {
2198         for (i=0; i<N; i++){    # loop 1B
2199           x = q[i];                     # DR_MISALIGNMENT(q) = 3
2200           p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
2201         }
2202      }
2203
2204      -- Possibility 2: we do loop peeling:
2205      for (i = 0; i < 3; i++){   # (scalar loop, not to be vectorized).
2206         x = q[i];
2207         p[i] = y;
2208      }
2209      for (i = 3; i < N; i++){   # loop 2A
2210         x = q[i];                       # DR_MISALIGNMENT(q) = 0
2211         p[i] = y;                       # DR_MISALIGNMENT(p) = unknown
2212      }
2213
2214      -- Possibility 3: combination of loop peeling and versioning:
2215      for (i = 0; i < 3; i++){   # (scalar loop, not to be vectorized).
2216         x = q[i];
2217         p[i] = y;
2218      }
2219      if (p is aligned) {
2220         for (i = 3; i<N; i++){  # loop 3A
2221           x = q[i];                     # DR_MISALIGNMENT(q) = 0
2222           p[i] = y;                     # DR_MISALIGNMENT(p) = 0
2223         }
2224      }
2225      else {
2226         for (i = 3; i<N; i++){  # loop 3B
2227           x = q[i];                     # DR_MISALIGNMENT(q) = 0
2228           p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
2229         }
2230      }
2231
2232      These loops are later passed to loop_transform to be vectorized.  The
2233      vectorizer will use the alignment information to guide the transformation
2234      (whether to generate regular loads/stores, or with special handling for
2235      misalignment).  */
2236
2237 opt_result
2238 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
2239 {
2240   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2241   dr_vec_info *first_store = NULL;
2242   dr_vec_info *dr0_info = NULL;
2243   struct data_reference *dr;
2244   unsigned int i;
2245   bool do_peeling = false;
2246   bool do_versioning = false;
2247   unsigned int npeel = 0;
2248   bool one_misalignment_known = false;
2249   bool one_misalignment_unknown = false;
2250   bool one_dr_unsupportable = false;
2251   dr_vec_info *unsupportable_dr_info = NULL;
2252   unsigned int dr0_same_align_drs = 0, first_store_same_align_drs = 0;
2253   hash_table<peel_info_hasher> peeling_htab (1);
2254
2255   DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
2256
2257   /* Reset data so we can safely be called multiple times.  */
2258   LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2259   LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
2260
2261   if (LOOP_VINFO_DATAREFS (loop_vinfo).is_empty ())
2262     return opt_result::success ();
2263
2264   /* Sort the vector of datarefs so DRs that have the same or dependent
2265      alignment are next to each other.  */
2266   auto_vec<data_reference_p> datarefs
2267     = LOOP_VINFO_DATAREFS (loop_vinfo).copy ();
2268   datarefs.qsort (dr_align_group_sort_cmp);
2269
2270   /* Compute the number of DRs that become aligned when we peel
2271      a dataref so it becomes aligned.  */
2272   auto_vec<unsigned> n_same_align_refs (datarefs.length ());
2273   n_same_align_refs.quick_grow_cleared (datarefs.length ());
2274   unsigned i0;
2275   for (i0 = 0; i0 < datarefs.length (); ++i0)
2276     if (DR_BASE_ADDRESS (datarefs[i0]))
2277       break;
2278   for (i = i0 + 1; i <= datarefs.length (); ++i)
2279     {
2280       if (i == datarefs.length ()
2281           || !operand_equal_p (DR_BASE_ADDRESS (datarefs[i0]),
2282                                DR_BASE_ADDRESS (datarefs[i]), 0)
2283           || !operand_equal_p (DR_OFFSET (datarefs[i0]),
2284                                DR_OFFSET (datarefs[i]), 0)
2285           || !operand_equal_p (DR_STEP (datarefs[i0]),
2286                                DR_STEP (datarefs[i]), 0))
2287         {
2288           /* The subgroup [i0, i-1] now only differs in DR_INIT and
2289              possibly DR_TARGET_ALIGNMENT.  Still the whole subgroup
2290              will get known misalignment if we align one of the refs
2291              with the largest DR_TARGET_ALIGNMENT.  */
2292           for (unsigned j = i0; j < i; ++j)
2293             {
2294               dr_vec_info *dr_infoj = loop_vinfo->lookup_dr (datarefs[j]);
2295               for (unsigned k = i0; k < i; ++k)
2296                 {
2297                   if (k == j)
2298                     continue;
2299                   dr_vec_info *dr_infok = loop_vinfo->lookup_dr (datarefs[k]);
2300                   if (vect_dr_aligned_if_related_peeled_dr_is (dr_infok,
2301                                                                dr_infoj))
2302                     n_same_align_refs[j]++;
2303                 }
2304             }
2305           i0 = i;
2306         }
2307     }
2308
2309   /* While cost model enhancements are expected in the future, the high level
2310      view of the code at this time is as follows:
2311
2312      A) If there is a misaligned access then see if peeling to align
2313         this access can make all data references satisfy
2314         vect_supportable_dr_alignment.  If so, update data structures
2315         as needed and return true.
2316
2317      B) If peeling wasn't possible and there is a data reference with an
2318         unknown misalignment that does not satisfy vect_supportable_dr_alignment
2319         then see if loop versioning checks can be used to make all data
2320         references satisfy vect_supportable_dr_alignment.  If so, update
2321         data structures as needed and return true.
2322
2323      C) If neither peeling nor versioning were successful then return false if
2324         any data reference does not satisfy vect_supportable_dr_alignment.
2325
2326      D) Return true (all data references satisfy vect_supportable_dr_alignment).
2327
2328      Note, Possibility 3 above (which is peeling and versioning together) is not
2329      being done at this time.  */
2330
2331   /* (1) Peeling to force alignment.  */
2332
2333   /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
2334      Considerations:
2335      + How many accesses will become aligned due to the peeling
2336      - How many accesses will become unaligned due to the peeling,
2337        and the cost of misaligned accesses.
2338      - The cost of peeling (the extra runtime checks, the increase
2339        in code size).  */
2340
2341   FOR_EACH_VEC_ELT (datarefs, i, dr)
2342     {
2343       dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2344       if (!vect_relevant_for_alignment_p (dr_info))
2345         continue;
2346
2347       stmt_vec_info stmt_info = dr_info->stmt;
2348       tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2349       do_peeling = vector_alignment_reachable_p (dr_info);
2350       if (do_peeling)
2351         {
2352           if (known_alignment_for_access_p (dr_info, vectype))
2353             {
2354               unsigned int npeel_tmp = 0;
2355               bool negative = tree_int_cst_compare (DR_STEP (dr),
2356                                                     size_zero_node) < 0;
2357
2358               /* If known_alignment_for_access_p then we have set
2359                  DR_MISALIGNMENT which is only done if we know it at compiler
2360                  time, so it is safe to assume target alignment is constant.
2361                */
2362               unsigned int target_align =
2363                 DR_TARGET_ALIGNMENT (dr_info).to_constant ();
2364               unsigned HOST_WIDE_INT dr_size = vect_get_scalar_dr_size (dr_info);
2365               poly_int64 off = 0;
2366               if (negative)
2367                 off = (TYPE_VECTOR_SUBPARTS (vectype) - 1) * -dr_size;
2368               unsigned int mis = dr_misalignment (dr_info, vectype, off);
2369               mis = negative ? mis : -mis;
2370               if (mis != 0)
2371                 npeel_tmp = (mis & (target_align - 1)) / dr_size;
2372
2373               /* For multiple types, it is possible that the bigger type access
2374                  will have more than one peeling option.  E.g., a loop with two
2375                  types: one of size (vector size / 4), and the other one of
2376                  size (vector size / 8).  Vectorization factor will 8.  If both
2377                  accesses are misaligned by 3, the first one needs one scalar
2378                  iteration to be aligned, and the second one needs 5.  But the
2379                  first one will be aligned also by peeling 5 scalar
2380                  iterations, and in that case both accesses will be aligned.
2381                  Hence, except for the immediate peeling amount, we also want
2382                  to try to add full vector size, while we don't exceed
2383                  vectorization factor.
2384                  We do this automatically for cost model, since we calculate
2385                  cost for every peeling option.  */
2386               poly_uint64 nscalars = npeel_tmp;
2387               if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
2388                 {
2389                   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2390                   unsigned group_size = 1;
2391                   if (STMT_SLP_TYPE (stmt_info)
2392                       && STMT_VINFO_GROUPED_ACCESS (stmt_info))
2393                     group_size = DR_GROUP_SIZE (stmt_info);
2394                   nscalars = vf * group_size;
2395                 }
2396
2397               /* Save info about DR in the hash table.  Also include peeling
2398                  amounts according to the explanation above.  Indicate
2399                  the alignment status when the ref is not aligned.
2400                  ???  Rather than using unknown alignment here we should
2401                  prune all entries from the peeling hashtable which cause
2402                  DRs to be not supported.  */
2403               bool supportable_if_not_aligned
2404                 = vect_supportable_dr_alignment
2405                     (loop_vinfo, dr_info, vectype, DR_MISALIGNMENT_UNKNOWN);
2406               while (known_le (npeel_tmp, nscalars))
2407                 {
2408                   vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
2409                                             dr_info, npeel_tmp,
2410                                             supportable_if_not_aligned);
2411                   npeel_tmp += MAX (1, target_align / dr_size);
2412                 }
2413
2414               one_misalignment_known = true;
2415             }
2416           else
2417             {
2418               /* If we don't know any misalignment values, we prefer
2419                  peeling for data-ref that has the maximum number of data-refs
2420                  with the same alignment, unless the target prefers to align
2421                  stores over load.  */
2422               unsigned same_align_drs = n_same_align_refs[i];
2423               if (!dr0_info
2424                   || dr0_same_align_drs < same_align_drs)
2425                 {
2426                   dr0_same_align_drs = same_align_drs;
2427                   dr0_info = dr_info;
2428                 }
2429               /* For data-refs with the same number of related
2430                  accesses prefer the one where the misalign
2431                  computation will be invariant in the outermost loop.  */
2432               else if (dr0_same_align_drs == same_align_drs)
2433                 {
2434                   class loop *ivloop0, *ivloop;
2435                   ivloop0 = outermost_invariant_loop_for_expr
2436                     (loop, DR_BASE_ADDRESS (dr0_info->dr));
2437                   ivloop = outermost_invariant_loop_for_expr
2438                     (loop, DR_BASE_ADDRESS (dr));
2439                   if ((ivloop && !ivloop0)
2440                       || (ivloop && ivloop0
2441                           && flow_loop_nested_p (ivloop, ivloop0)))
2442                     dr0_info = dr_info;
2443                 }
2444
2445               one_misalignment_unknown = true;
2446
2447               /* Check for data refs with unsupportable alignment that
2448                  can be peeled.  */
2449               enum dr_alignment_support supportable_dr_alignment
2450                 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
2451                                                  DR_MISALIGNMENT_UNKNOWN);
2452               if (supportable_dr_alignment == dr_unaligned_unsupported)
2453                 {
2454                   one_dr_unsupportable = true;
2455                   unsupportable_dr_info = dr_info;
2456                 }
2457
2458               if (!first_store && DR_IS_WRITE (dr))
2459                 {
2460                   first_store = dr_info;
2461                   first_store_same_align_drs = same_align_drs;
2462                 }
2463             }
2464         }
2465       else
2466         {
2467           if (!aligned_access_p (dr_info, vectype))
2468             {
2469               if (dump_enabled_p ())
2470                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2471                                  "vector alignment may not be reachable\n");
2472               break;
2473             }
2474         }
2475     }
2476
2477   /* Check if we can possibly peel the loop.  */
2478   if (!vect_can_advance_ivs_p (loop_vinfo)
2479       || !slpeel_can_duplicate_loop_p (loop, LOOP_VINFO_IV_EXIT (loop_vinfo),
2480                                        loop_preheader_edge (loop))
2481       || loop->inner
2482       || LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
2483     do_peeling = false;
2484
2485   struct _vect_peel_extended_info peel_for_known_alignment;
2486   struct _vect_peel_extended_info peel_for_unknown_alignment;
2487   struct _vect_peel_extended_info best_peel;
2488
2489   peel_for_unknown_alignment.inside_cost = INT_MAX;
2490   peel_for_unknown_alignment.outside_cost = INT_MAX;
2491   peel_for_unknown_alignment.peel_info.count = 0;
2492
2493   if (do_peeling
2494       && one_misalignment_unknown)
2495     {
2496       /* Check if the target requires to prefer stores over loads, i.e., if
2497          misaligned stores are more expensive than misaligned loads (taking
2498          drs with same alignment into account).  */
2499       unsigned int load_inside_cost = 0;
2500       unsigned int load_outside_cost = 0;
2501       unsigned int store_inside_cost = 0;
2502       unsigned int store_outside_cost = 0;
2503       unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
2504
2505       stmt_vector_for_cost dummy;
2506       dummy.create (2);
2507       vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info,
2508                                       &load_inside_cost,
2509                                       &load_outside_cost,
2510                                       &dummy, &dummy, estimated_npeels);
2511       dummy.release ();
2512
2513       if (first_store)
2514         {
2515           dummy.create (2);
2516           vect_get_peeling_costs_all_drs (loop_vinfo, first_store,
2517                                           &store_inside_cost,
2518                                           &store_outside_cost,
2519                                           &dummy, &dummy,
2520                                           estimated_npeels);
2521           dummy.release ();
2522         }
2523       else
2524         {
2525           store_inside_cost = INT_MAX;
2526           store_outside_cost = INT_MAX;
2527         }
2528
2529       if (load_inside_cost > store_inside_cost
2530           || (load_inside_cost == store_inside_cost
2531               && load_outside_cost > store_outside_cost))
2532         {
2533           dr0_info = first_store;
2534           dr0_same_align_drs = first_store_same_align_drs;
2535           peel_for_unknown_alignment.inside_cost = store_inside_cost;
2536           peel_for_unknown_alignment.outside_cost = store_outside_cost;
2537         }
2538       else
2539         {
2540           peel_for_unknown_alignment.inside_cost = load_inside_cost;
2541           peel_for_unknown_alignment.outside_cost = load_outside_cost;
2542         }
2543
2544       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2545       prologue_cost_vec.create (2);
2546       epilogue_cost_vec.create (2);
2547
2548       int dummy2;
2549       peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
2550         (loop_vinfo, estimated_npeels, &dummy2,
2551          &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2552          &prologue_cost_vec, &epilogue_cost_vec);
2553
2554       prologue_cost_vec.release ();
2555       epilogue_cost_vec.release ();
2556
2557       peel_for_unknown_alignment.peel_info.count = dr0_same_align_drs + 1;
2558     }
2559
2560   peel_for_unknown_alignment.peel_info.npeel = 0;
2561   peel_for_unknown_alignment.peel_info.dr_info = dr0_info;
2562
2563   best_peel = peel_for_unknown_alignment;
2564
2565   peel_for_known_alignment.inside_cost = INT_MAX;
2566   peel_for_known_alignment.outside_cost = INT_MAX;
2567   peel_for_known_alignment.peel_info.count = 0;
2568   peel_for_known_alignment.peel_info.dr_info = NULL;
2569
2570   if (do_peeling && one_misalignment_known)
2571     {
2572       /* Peeling is possible, but there is no data access that is not supported
2573          unless aligned.  So we try to choose the best possible peeling from
2574          the hash table.  */
2575       peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
2576         (&peeling_htab, loop_vinfo);
2577     }
2578
2579   /* Compare costs of peeling for known and unknown alignment. */
2580   if (peel_for_known_alignment.peel_info.dr_info != NULL
2581       && peel_for_unknown_alignment.inside_cost
2582       >= peel_for_known_alignment.inside_cost)
2583     {
2584       best_peel = peel_for_known_alignment;
2585
2586       /* If the best peeling for known alignment has NPEEL == 0, perform no
2587          peeling at all except if there is an unsupportable dr that we can
2588          align.  */
2589       if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
2590         do_peeling = false;
2591     }
2592
2593   /* If there is an unsupportable data ref, prefer this over all choices so far
2594      since we'd have to discard a chosen peeling except when it accidentally
2595      aligned the unsupportable data ref.  */
2596   if (one_dr_unsupportable)
2597     dr0_info = unsupportable_dr_info;
2598   else if (do_peeling)
2599     {
2600       /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
2601          TODO: Use nopeel_outside_cost or get rid of it?  */
2602       unsigned nopeel_inside_cost = 0;
2603       unsigned nopeel_outside_cost = 0;
2604
2605       stmt_vector_for_cost dummy;
2606       dummy.create (2);
2607       vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost,
2608                                       &nopeel_outside_cost, &dummy, &dummy, 0);
2609       dummy.release ();
2610
2611       /* Add epilogue costs.  As we do not peel for alignment here, no prologue
2612          costs will be recorded.  */
2613       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2614       prologue_cost_vec.create (2);
2615       epilogue_cost_vec.create (2);
2616
2617       int dummy2;
2618       nopeel_outside_cost += vect_get_known_peeling_cost
2619         (loop_vinfo, 0, &dummy2,
2620          &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2621          &prologue_cost_vec, &epilogue_cost_vec);
2622
2623       prologue_cost_vec.release ();
2624       epilogue_cost_vec.release ();
2625
2626       npeel = best_peel.peel_info.npeel;
2627       dr0_info = best_peel.peel_info.dr_info;
2628
2629       /* If no peeling is not more expensive than the best peeling we
2630          have so far, don't perform any peeling.  */
2631       if (nopeel_inside_cost <= best_peel.inside_cost)
2632         do_peeling = false;
2633     }
2634
2635   if (do_peeling)
2636     {
2637       stmt_vec_info stmt_info = dr0_info->stmt;
2638       if (known_alignment_for_access_p (dr0_info,
2639                                         STMT_VINFO_VECTYPE (stmt_info)))
2640         {
2641           bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr),
2642                                                 size_zero_node) < 0;
2643           if (!npeel)
2644             {
2645               /* Since it's known at compile time, compute the number of
2646                  iterations in the peeled loop (the peeling factor) for use in
2647                  updating DR_MISALIGNMENT values.  The peeling factor is the
2648                  vectorization factor minus the misalignment as an element
2649                  count.  */
2650               tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2651               poly_int64 off = 0;
2652               if (negative)
2653                 off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
2654                        * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
2655               unsigned int mis
2656                 = dr_misalignment (dr0_info, vectype, off);
2657               mis = negative ? mis : -mis;
2658               /* If known_alignment_for_access_p then we have set
2659                  DR_MISALIGNMENT which is only done if we know it at compiler
2660                  time, so it is safe to assume target alignment is constant.
2661                */
2662               unsigned int target_align =
2663                 DR_TARGET_ALIGNMENT (dr0_info).to_constant ();
2664               npeel = ((mis & (target_align - 1))
2665                        / vect_get_scalar_dr_size (dr0_info));
2666             }
2667
2668           /* For interleaved data access every iteration accesses all the
2669              members of the group, therefore we divide the number of iterations
2670              by the group size.  */
2671           if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2672             npeel /= DR_GROUP_SIZE (stmt_info);
2673
2674           if (dump_enabled_p ())
2675             dump_printf_loc (MSG_NOTE, vect_location,
2676                              "Try peeling by %d\n", npeel);
2677         }
2678
2679       /* Ensure that all datarefs can be vectorized after the peel.  */
2680       if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel))
2681         do_peeling = false;
2682
2683       /* Check if all datarefs are supportable and log.  */
2684       if (do_peeling
2685           && npeel == 0
2686           && known_alignment_for_access_p (dr0_info,
2687                                            STMT_VINFO_VECTYPE (stmt_info)))
2688         return opt_result::success ();
2689
2690       /* Cost model #1 - honor --param vect-max-peeling-for-alignment.  */
2691       if (do_peeling)
2692         {
2693           unsigned max_allowed_peel
2694             = param_vect_max_peeling_for_alignment;
2695           if (loop_cost_model (loop) <= VECT_COST_MODEL_CHEAP)
2696             max_allowed_peel = 0;
2697           if (max_allowed_peel != (unsigned)-1)
2698             {
2699               unsigned max_peel = npeel;
2700               if (max_peel == 0)
2701                 {
2702                   poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr0_info);
2703                   unsigned HOST_WIDE_INT target_align_c;
2704                   if (target_align.is_constant (&target_align_c))
2705                     max_peel =
2706                       target_align_c / vect_get_scalar_dr_size (dr0_info) - 1;
2707                   else
2708                     {
2709                       do_peeling = false;
2710                       if (dump_enabled_p ())
2711                         dump_printf_loc (MSG_NOTE, vect_location,
2712                           "Disable peeling, max peels set and vector"
2713                           " alignment unknown\n");
2714                     }
2715                 }
2716               if (max_peel > max_allowed_peel)
2717                 {
2718                   do_peeling = false;
2719                   if (dump_enabled_p ())
2720                     dump_printf_loc (MSG_NOTE, vect_location,
2721                         "Disable peeling, max peels reached: %d\n", max_peel);
2722                 }
2723             }
2724         }
2725
2726       /* Cost model #2 - if peeling may result in a remaining loop not
2727          iterating enough to be vectorized then do not peel.  Since this
2728          is a cost heuristic rather than a correctness decision, use the
2729          most likely runtime value for variable vectorization factors.  */
2730       if (do_peeling
2731           && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2732         {
2733           unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2734           unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2735           if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2736               < assumed_vf + max_peel)
2737             do_peeling = false;
2738         }
2739
2740       if (do_peeling)
2741         {
2742           /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2743              If the misalignment of DR_i is identical to that of dr0 then set
2744              DR_MISALIGNMENT (DR_i) to zero.  If the misalignment of DR_i and
2745              dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2746              by the peeling factor times the element size of DR_i (MOD the
2747              vectorization factor times the size).  Otherwise, the
2748              misalignment of DR_i must be set to unknown.  */
2749           FOR_EACH_VEC_ELT (datarefs, i, dr)
2750             if (dr != dr0_info->dr)
2751               {
2752                 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2753                 if (!vect_relevant_for_alignment_p (dr_info))
2754                   continue;
2755
2756                 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
2757               }
2758
2759           LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
2760           if (npeel)
2761             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2762           else
2763             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = -1;
2764           SET_DR_MISALIGNMENT (dr0_info,
2765                                vect_dr_misalign_for_aligned_access (dr0_info));
2766           if (dump_enabled_p ())
2767             {
2768               dump_printf_loc (MSG_NOTE, vect_location,
2769                                "Alignment of access forced using peeling.\n");
2770               dump_printf_loc (MSG_NOTE, vect_location,
2771                                "Peeling for alignment will be applied.\n");
2772             }
2773
2774           /* The inside-loop cost will be accounted for in vectorizable_load
2775              and vectorizable_store correctly with adjusted alignments.
2776              Drop the body_cst_vec on the floor here.  */
2777           return opt_result::success ();
2778         }
2779     }
2780
2781   /* (2) Versioning to force alignment.  */
2782
2783   /* Try versioning if:
2784      1) optimize loop for speed and the cost-model is not cheap
2785      2) there is at least one unsupported misaligned data ref with an unknown
2786         misalignment, and
2787      3) all misaligned data refs with a known misalignment are supported, and
2788      4) the number of runtime alignment checks is within reason.  */
2789
2790   do_versioning
2791     = (optimize_loop_nest_for_speed_p (loop)
2792        && !loop->inner /* FORNOW */
2793        && loop_cost_model (loop) > VECT_COST_MODEL_CHEAP);
2794
2795   if (do_versioning)
2796     {
2797       FOR_EACH_VEC_ELT (datarefs, i, dr)
2798         {
2799           dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2800           if (!vect_relevant_for_alignment_p (dr_info))
2801             continue;
2802
2803           stmt_vec_info stmt_info = dr_info->stmt;
2804           if (STMT_VINFO_STRIDED_P (stmt_info))
2805             {
2806               do_versioning = false;
2807               break;
2808             }
2809
2810           tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2811           bool negative = tree_int_cst_compare (DR_STEP (dr),
2812                                                 size_zero_node) < 0;
2813           poly_int64 off = 0;
2814           if (negative)
2815             off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
2816                    * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
2817           int misalignment;
2818           if ((misalignment = dr_misalignment (dr_info, vectype, off)) == 0)
2819             continue;
2820
2821           enum dr_alignment_support supportable_dr_alignment
2822             = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
2823                                              misalignment);
2824           if (supportable_dr_alignment == dr_unaligned_unsupported)
2825             {
2826               if (misalignment != DR_MISALIGNMENT_UNKNOWN
2827                   || (LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2828                       >= (unsigned) param_vect_max_version_for_alignment_checks))
2829                 {
2830                   do_versioning = false;
2831                   break;
2832                 }
2833
2834               /* At present we don't support versioning for alignment
2835                  with variable VF, since there's no guarantee that the
2836                  VF is a power of two.  We could relax this if we added
2837                  a way of enforcing a power-of-two size.  */
2838               unsigned HOST_WIDE_INT size;
2839               if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2840                 {
2841                   do_versioning = false;
2842                   break;
2843                 }
2844
2845               /* Forcing alignment in the first iteration is no good if
2846                  we don't keep it across iterations.  For now, just disable
2847                  versioning in this case.
2848                  ?? We could actually unroll the loop to achieve the required
2849                  overall step alignment, and forcing the alignment could be
2850                  done by doing some iterations of the non-vectorized loop.  */
2851               if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2852                                * DR_STEP_ALIGNMENT (dr),
2853                                DR_TARGET_ALIGNMENT (dr_info)))
2854                 {
2855                   do_versioning = false;
2856                   break;
2857                 }
2858
2859               /* The rightmost bits of an aligned address must be zeros.
2860                  Construct the mask needed for this test.  For example,
2861                  GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2862                  mask must be 15 = 0xf. */
2863               int mask = size - 1;
2864
2865               /* FORNOW: use the same mask to test all potentially unaligned
2866                  references in the loop.  */
2867               if (LOOP_VINFO_PTR_MASK (loop_vinfo)
2868                   && LOOP_VINFO_PTR_MASK (loop_vinfo) != mask)
2869                 {
2870                   do_versioning = false;
2871                   break;
2872                 }
2873
2874               LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2875               LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
2876             }
2877         }
2878
2879       /* Versioning requires at least one misaligned data reference.  */
2880       if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2881         do_versioning = false;
2882       else if (!do_versioning)
2883         LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2884     }
2885
2886   if (do_versioning)
2887     {
2888       const vec<stmt_vec_info> &may_misalign_stmts
2889         = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2890       stmt_vec_info stmt_info;
2891
2892       /* It can now be assumed that the data references in the statements
2893          in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2894          of the loop being vectorized.  */
2895       FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2896         {
2897           dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
2898           SET_DR_MISALIGNMENT (dr_info,
2899                                vect_dr_misalign_for_aligned_access (dr_info));
2900           if (dump_enabled_p ())
2901             dump_printf_loc (MSG_NOTE, vect_location,
2902                              "Alignment of access forced using versioning.\n");
2903         }
2904
2905       if (dump_enabled_p ())
2906         dump_printf_loc (MSG_NOTE, vect_location,
2907                          "Versioning for alignment will be applied.\n");
2908
2909       /* Peeling and versioning can't be done together at this time.  */
2910       gcc_assert (! (do_peeling && do_versioning));
2911
2912       return opt_result::success ();
2913     }
2914
2915   /* This point is reached if neither peeling nor versioning is being done.  */
2916   gcc_assert (! (do_peeling || do_versioning));
2917
2918   return opt_result::success ();
2919 }
2920
2921
2922 /* Function vect_analyze_data_refs_alignment
2923
2924    Analyze the alignment of the data-references in the loop.
2925    Return FALSE if a data reference is found that cannot be vectorized.  */
2926
2927 opt_result
2928 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo)
2929 {
2930   DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2931
2932   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2933   struct data_reference *dr;
2934   unsigned int i;
2935
2936   vect_record_base_alignments (loop_vinfo);
2937   FOR_EACH_VEC_ELT (datarefs, i, dr)
2938     {
2939       dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2940       if (STMT_VINFO_VECTORIZABLE (dr_info->stmt))
2941         {
2942           if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt)
2943               && DR_GROUP_FIRST_ELEMENT (dr_info->stmt) != dr_info->stmt)
2944             continue;
2945           opt_result res = opt_result::success ();
2946           vect_compute_data_ref_alignment (loop_vinfo, dr_info,
2947                                            STMT_VINFO_VECTYPE (dr_info->stmt),
2948                                            &res);
2949           if (!res)
2950             return res;
2951         }
2952     }
2953
2954   return opt_result::success ();
2955 }
2956
2957
2958 /* Analyze alignment of DRs of stmts in NODE.  */
2959
2960 static bool
2961 vect_slp_analyze_node_alignment (vec_info *vinfo, slp_tree node)
2962 {
2963   /* Alignment is maintained in the first element of the group.  */
2964   stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2965   first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
2966   dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2967   tree vectype = SLP_TREE_VECTYPE (node);
2968   poly_uint64 vector_alignment
2969     = exact_div (targetm.vectorize.preferred_vector_alignment (vectype),
2970                  BITS_PER_UNIT);
2971   if (dr_info->misalignment == DR_MISALIGNMENT_UNINITIALIZED)
2972     vect_compute_data_ref_alignment (vinfo, dr_info, SLP_TREE_VECTYPE (node));
2973   /* Re-analyze alignment when we're facing a vectorization with a bigger
2974      alignment requirement.  */
2975   else if (known_lt (dr_info->target_alignment, vector_alignment))
2976     {
2977       poly_uint64 old_target_alignment = dr_info->target_alignment;
2978       int old_misalignment = dr_info->misalignment;
2979       vect_compute_data_ref_alignment (vinfo, dr_info, SLP_TREE_VECTYPE (node));
2980       /* But keep knowledge about a smaller alignment.  */
2981       if (old_misalignment != DR_MISALIGNMENT_UNKNOWN
2982           && dr_info->misalignment == DR_MISALIGNMENT_UNKNOWN)
2983         {
2984           dr_info->target_alignment = old_target_alignment;
2985           dr_info->misalignment = old_misalignment;
2986         }
2987     }
2988   /* When we ever face unordered target alignments the first one wins in terms
2989      of analyzing and the other will become unknown in dr_misalignment.  */
2990   return true;
2991 }
2992
2993 /* Function vect_slp_analyze_instance_alignment
2994
2995    Analyze the alignment of the data-references in the SLP instance.
2996    Return FALSE if a data reference is found that cannot be vectorized.  */
2997
2998 bool
2999 vect_slp_analyze_instance_alignment (vec_info *vinfo,
3000                                                 slp_instance instance)
3001 {
3002   DUMP_VECT_SCOPE ("vect_slp_analyze_instance_alignment");
3003
3004   slp_tree node;
3005   unsigned i;
3006   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
3007     if (! vect_slp_analyze_node_alignment (vinfo, node))
3008       return false;
3009
3010   if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
3011       && ! vect_slp_analyze_node_alignment
3012              (vinfo, SLP_INSTANCE_TREE (instance)))
3013     return false;
3014
3015   return true;
3016 }
3017
3018
3019 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
3020    accesses of legal size, step, etc.  Detect gaps, single element
3021    interleaving, and other special cases. Set grouped access info.
3022    Collect groups of strided stores for further use in SLP analysis.
3023    Worker for vect_analyze_group_access.  */
3024
3025 static bool
3026 vect_analyze_group_access_1 (vec_info *vinfo, dr_vec_info *dr_info)
3027 {
3028   data_reference *dr = dr_info->dr;
3029   tree step = DR_STEP (dr);
3030   tree scalar_type = TREE_TYPE (DR_REF (dr));
3031   HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
3032   stmt_vec_info stmt_info = dr_info->stmt;
3033   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
3034   bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
3035   HOST_WIDE_INT dr_step = -1;
3036   HOST_WIDE_INT groupsize, last_accessed_element = 1;
3037   bool slp_impossible = false;
3038
3039   /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
3040      size of the interleaving group (including gaps).  */
3041   if (tree_fits_shwi_p (step))
3042     {
3043       dr_step = tree_to_shwi (step);
3044       /* Check that STEP is a multiple of type size.  Otherwise there is
3045          a non-element-sized gap at the end of the group which we
3046          cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
3047          ???  As we can handle non-constant step fine here we should
3048          simply remove uses of DR_GROUP_GAP between the last and first
3049          element and instead rely on DR_STEP.  DR_GROUP_SIZE then would
3050          simply not include that gap.  */
3051       if ((dr_step % type_size) != 0)
3052         {
3053           if (dump_enabled_p ())
3054             dump_printf_loc (MSG_NOTE, vect_location,
3055                              "Step %T is not a multiple of the element size"
3056                              " for %T\n",
3057                              step, DR_REF (dr));
3058           return false;
3059         }
3060       groupsize = absu_hwi (dr_step) / type_size;
3061     }
3062   else
3063     groupsize = 0;
3064
3065   /* Not consecutive access is possible only if it is a part of interleaving.  */
3066   if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
3067     {
3068       /* Check if it this DR is a part of interleaving, and is a single
3069          element of the group that is accessed in the loop.  */
3070
3071       /* Gaps are supported only for loads. STEP must be a multiple of the type
3072          size.  */
3073       if (DR_IS_READ (dr)
3074           && (dr_step % type_size) == 0
3075           && groupsize > 0
3076           /* This could be UINT_MAX but as we are generating code in a very
3077              inefficient way we have to cap earlier.
3078              See PR91403 for example.  */
3079           && groupsize <= 4096)
3080         {
3081           DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
3082           DR_GROUP_SIZE (stmt_info) = groupsize;
3083           DR_GROUP_GAP (stmt_info) = groupsize - 1;
3084           if (dump_enabled_p ())
3085             dump_printf_loc (MSG_NOTE, vect_location,
3086                              "Detected single element interleaving %T"
3087                              " step %T\n",
3088                              DR_REF (dr), step);
3089
3090           return true;
3091         }
3092
3093       if (dump_enabled_p ())
3094         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3095                          "not consecutive access %G", stmt_info->stmt);
3096
3097       if (bb_vinfo)
3098         {
3099           /* Mark the statement as unvectorizable.  */
3100           STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3101           return true;
3102         }
3103
3104       if (dump_enabled_p ())
3105         dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
3106       STMT_VINFO_STRIDED_P (stmt_info) = true;
3107       return true;
3108     }
3109
3110   if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
3111     {
3112       /* First stmt in the interleaving chain. Check the chain.  */
3113       stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
3114       struct data_reference *data_ref = dr;
3115       unsigned int count = 1;
3116       tree prev_init = DR_INIT (data_ref);
3117       HOST_WIDE_INT diff, gaps = 0;
3118
3119       /* By construction, all group members have INTEGER_CST DR_INITs.  */
3120       while (next)
3121         {
3122           /* We never have the same DR multiple times.  */
3123           gcc_assert (tree_int_cst_compare (DR_INIT (data_ref),
3124                                 DR_INIT (STMT_VINFO_DATA_REF (next))) != 0);
3125
3126           data_ref = STMT_VINFO_DATA_REF (next);
3127
3128           /* All group members have the same STEP by construction.  */
3129           gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
3130
3131           /* Check that the distance between two accesses is equal to the type
3132              size. Otherwise, we have gaps.  */
3133           diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
3134                   - TREE_INT_CST_LOW (prev_init)) / type_size;
3135           if (diff < 1 || diff > UINT_MAX)
3136             {
3137               /* For artificial testcases with array accesses with large
3138                  constant indices we can run into overflow issues which
3139                  can end up fooling the groupsize constraint below so
3140                  check the individual gaps (which are represented as
3141                  unsigned int) as well.  */
3142               if (dump_enabled_p ())
3143                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3144                                  "interleaved access with gap larger "
3145                                  "than representable\n");
3146               return false;
3147             }
3148           if (diff != 1)
3149             {
3150               /* FORNOW: SLP of accesses with gaps is not supported.  */
3151               slp_impossible = true;
3152               if (DR_IS_WRITE (data_ref))
3153                 {
3154                   if (dump_enabled_p ())
3155                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3156                                      "interleaved store with gaps\n");
3157                   return false;
3158                 }
3159
3160               gaps += diff - 1;
3161             }
3162
3163           last_accessed_element += diff;
3164
3165           /* Store the gap from the previous member of the group. If there is no
3166              gap in the access, DR_GROUP_GAP is always 1.  */
3167           DR_GROUP_GAP (next) = diff;
3168
3169           prev_init = DR_INIT (data_ref);
3170           next = DR_GROUP_NEXT_ELEMENT (next);
3171           /* Count the number of data-refs in the chain.  */
3172           count++;
3173         }
3174
3175       if (groupsize == 0)
3176         groupsize = count + gaps;
3177
3178       /* This could be UINT_MAX but as we are generating code in a very
3179          inefficient way we have to cap earlier.  See PR78699 for example.  */
3180       if (groupsize > 4096)
3181         {
3182           if (dump_enabled_p ())
3183             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3184                              "group is too large\n");
3185           return false;
3186         }
3187
3188       /* Check that the size of the interleaving is equal to count for stores,
3189          i.e., that there are no gaps.  */
3190       if (groupsize != count
3191           && !DR_IS_READ (dr))
3192         {
3193           groupsize = count;
3194           STMT_VINFO_STRIDED_P (stmt_info) = true;
3195         }
3196
3197       /* If there is a gap after the last load in the group it is the
3198          difference between the groupsize and the last accessed
3199          element.
3200          When there is no gap, this difference should be 0.  */
3201       DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
3202
3203       DR_GROUP_SIZE (stmt_info) = groupsize;
3204       if (dump_enabled_p ())
3205         {
3206           dump_printf_loc (MSG_NOTE, vect_location,
3207                            "Detected interleaving ");
3208           if (DR_IS_READ (dr))
3209             dump_printf (MSG_NOTE, "load ");
3210           else if (STMT_VINFO_STRIDED_P (stmt_info))
3211             dump_printf (MSG_NOTE, "strided store ");
3212           else
3213             dump_printf (MSG_NOTE, "store ");
3214           dump_printf (MSG_NOTE, "of size %u\n",
3215                        (unsigned)groupsize);
3216           dump_printf_loc (MSG_NOTE, vect_location, "\t%G", stmt_info->stmt);
3217           next = DR_GROUP_NEXT_ELEMENT (stmt_info);
3218           while (next)
3219             {
3220               if (DR_GROUP_GAP (next) != 1)
3221                 dump_printf_loc (MSG_NOTE, vect_location,
3222                                  "\t<gap of %d elements>\n",
3223                                  DR_GROUP_GAP (next) - 1);
3224               dump_printf_loc (MSG_NOTE, vect_location, "\t%G", next->stmt);
3225               next = DR_GROUP_NEXT_ELEMENT (next);
3226             }
3227           if (DR_GROUP_GAP (stmt_info) != 0)
3228             dump_printf_loc (MSG_NOTE, vect_location,
3229                              "\t<gap of %d elements>\n",
3230                              DR_GROUP_GAP (stmt_info));
3231         }
3232
3233       /* SLP: create an SLP data structure for every interleaving group of
3234          stores for further analysis in vect_analyse_slp.  */
3235       if (DR_IS_WRITE (dr) && !slp_impossible)
3236         {
3237           if (loop_vinfo)
3238             LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
3239           if (bb_vinfo)
3240             BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
3241         }
3242     }
3243
3244   return true;
3245 }
3246
3247 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
3248    accesses of legal size, step, etc.  Detect gaps, single element
3249    interleaving, and other special cases. Set grouped access info.
3250    Collect groups of strided stores for further use in SLP analysis.  */
3251
3252 static bool
3253 vect_analyze_group_access (vec_info *vinfo, dr_vec_info *dr_info)
3254 {
3255   if (!vect_analyze_group_access_1 (vinfo, dr_info))
3256     {
3257       /* Dissolve the group if present.  */
3258       stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt);
3259       while (stmt_info)
3260         {
3261           stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
3262           DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
3263           DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
3264           stmt_info = next;
3265         }
3266       return false;
3267     }
3268   return true;
3269 }
3270
3271 /* Analyze the access pattern of the data-reference DR_INFO.
3272    In case of non-consecutive accesses call vect_analyze_group_access() to
3273    analyze groups of accesses.  */
3274
3275 static bool
3276 vect_analyze_data_ref_access (vec_info *vinfo, dr_vec_info *dr_info)
3277 {
3278   data_reference *dr = dr_info->dr;
3279   tree step = DR_STEP (dr);
3280   tree scalar_type = TREE_TYPE (DR_REF (dr));
3281   stmt_vec_info stmt_info = dr_info->stmt;
3282   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
3283   class loop *loop = NULL;
3284
3285   if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
3286     return true;
3287
3288   if (loop_vinfo)
3289     loop = LOOP_VINFO_LOOP (loop_vinfo);
3290
3291   if (loop_vinfo && !step)
3292     {
3293       if (dump_enabled_p ())
3294         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3295                          "bad data-ref access in loop\n");
3296       return false;
3297     }
3298
3299   /* Allow loads with zero step in inner-loop vectorization.  */
3300   if (loop_vinfo && integer_zerop (step))
3301     {
3302       DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
3303       DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
3304       if (!nested_in_vect_loop_p (loop, stmt_info))
3305         return DR_IS_READ (dr);
3306       /* Allow references with zero step for outer loops marked
3307          with pragma omp simd only - it guarantees absence of
3308          loop-carried dependencies between inner loop iterations.  */
3309       if (loop->safelen < 2)
3310         {
3311           if (dump_enabled_p ())
3312             dump_printf_loc (MSG_NOTE, vect_location,
3313                              "zero step in inner loop of nest\n");
3314           return false;
3315         }
3316     }
3317
3318   if (loop && nested_in_vect_loop_p (loop, stmt_info))
3319     {
3320       /* Interleaved accesses are not yet supported within outer-loop
3321         vectorization for references in the inner-loop.  */
3322       DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
3323       DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
3324
3325       /* For the rest of the analysis we use the outer-loop step.  */
3326       step = STMT_VINFO_DR_STEP (stmt_info);
3327       if (integer_zerop (step))
3328         {
3329           if (dump_enabled_p ())
3330             dump_printf_loc (MSG_NOTE, vect_location,
3331                              "zero step in outer loop.\n");
3332           return DR_IS_READ (dr);
3333         }
3334     }
3335
3336   /* Consecutive?  */
3337   if (TREE_CODE (step) == INTEGER_CST)
3338     {
3339       HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
3340       if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
3341           || (dr_step < 0
3342               && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
3343         {
3344           /* Mark that it is not interleaving.  */
3345           DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
3346           DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
3347           return true;
3348         }
3349     }
3350
3351   if (loop && nested_in_vect_loop_p (loop, stmt_info))
3352     {
3353       if (dump_enabled_p ())
3354         dump_printf_loc (MSG_NOTE, vect_location,
3355                          "grouped access in outer loop.\n");
3356       return false;
3357     }
3358
3359
3360   /* Assume this is a DR handled by non-constant strided load case.  */
3361   if (TREE_CODE (step) != INTEGER_CST)
3362     return (STMT_VINFO_STRIDED_P (stmt_info)
3363             && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
3364                 || vect_analyze_group_access (vinfo, dr_info)));
3365
3366   /* Not consecutive access - check if it's a part of interleaving group.  */
3367   return vect_analyze_group_access (vinfo, dr_info);
3368 }
3369
3370 /* Compare two data-references DRA and DRB to group them into chunks
3371    suitable for grouping.  */
3372
3373 static int
3374 dr_group_sort_cmp (const void *dra_, const void *drb_)
3375 {
3376   dr_vec_info *dra_info = *(dr_vec_info **)const_cast<void *>(dra_);
3377   dr_vec_info *drb_info = *(dr_vec_info **)const_cast<void *>(drb_);
3378   data_reference_p dra = dra_info->dr;
3379   data_reference_p drb = drb_info->dr;
3380   int cmp;
3381
3382   /* Stabilize sort.  */
3383   if (dra == drb)
3384     return 0;
3385
3386   /* Different group IDs lead never belong to the same group.  */
3387   if (dra_info->group != drb_info->group)
3388     return dra_info->group < drb_info->group ? -1 : 1;
3389
3390   /* Ordering of DRs according to base.  */
3391   cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
3392                                DR_BASE_ADDRESS (drb));
3393   if (cmp != 0)
3394     return cmp;
3395
3396   /* And according to DR_OFFSET.  */
3397   cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
3398   if (cmp != 0)
3399     return cmp;
3400
3401   /* Put reads before writes.  */
3402   if (DR_IS_READ (dra) != DR_IS_READ (drb))
3403     return DR_IS_READ (dra) ? -1 : 1;
3404
3405   /* Then sort after access size.  */
3406   cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
3407                                TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
3408   if (cmp != 0)
3409     return cmp;
3410
3411   /* And after step.  */
3412   cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
3413   if (cmp != 0)
3414     return cmp;
3415
3416   /* Then sort after DR_INIT.  In case of identical DRs sort after stmt UID.  */
3417   cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
3418   if (cmp == 0)
3419     return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
3420   return cmp;
3421 }
3422
3423 /* If OP is the result of a conversion, return the unconverted value,
3424    otherwise return null.  */
3425
3426 static tree
3427 strip_conversion (tree op)
3428 {
3429   if (TREE_CODE (op) != SSA_NAME)
3430     return NULL_TREE;
3431   gimple *stmt = SSA_NAME_DEF_STMT (op);
3432   if (!is_gimple_assign (stmt)
3433       || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
3434     return NULL_TREE;
3435   return gimple_assign_rhs1 (stmt);
3436 }
3437
3438 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
3439    and STMT2_INFO being in a single group.  When ALLOW_SLP_P, masked loads can
3440    be grouped in SLP mode.  */
3441
3442 static bool
3443 can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info,
3444                    bool allow_slp_p)
3445 {
3446   if (gimple_assign_single_p (stmt1_info->stmt))
3447     return gimple_assign_single_p (stmt2_info->stmt);
3448
3449   gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
3450   if (call1 && gimple_call_internal_p (call1))
3451     {
3452       /* Check for two masked loads or two masked stores.  */
3453       gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
3454       if (!call2 || !gimple_call_internal_p (call2))
3455         return false;
3456       internal_fn ifn = gimple_call_internal_fn (call1);
3457       if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
3458         return false;
3459       if (ifn != gimple_call_internal_fn (call2))
3460         return false;
3461
3462       /* Check that the masks are the same.  Cope with casts of masks,
3463          like those created by build_mask_conversion.  */
3464       tree mask1 = gimple_call_arg (call1, 2);
3465       tree mask2 = gimple_call_arg (call2, 2);
3466       if (!operand_equal_p (mask1, mask2, 0) && !allow_slp_p)
3467         {
3468           mask1 = strip_conversion (mask1);
3469           if (!mask1)
3470             return false;
3471           mask2 = strip_conversion (mask2);
3472           if (!mask2)
3473             return false;
3474           if (!operand_equal_p (mask1, mask2, 0))
3475             return false;
3476         }
3477       return true;
3478     }
3479
3480   return false;
3481 }
3482
3483 /* Function vect_analyze_data_ref_accesses.
3484
3485    Analyze the access pattern of all the data references in the loop.
3486
3487    FORNOW: the only access pattern that is considered vectorizable is a
3488            simple step 1 (consecutive) access.
3489
3490    FORNOW: handle only arrays and pointer accesses.  */
3491
3492 opt_result
3493 vect_analyze_data_ref_accesses (vec_info *vinfo,
3494                                 vec<int> *dataref_groups)
3495 {
3496   unsigned int i;
3497   vec<data_reference_p> datarefs = vinfo->shared->datarefs;
3498
3499   DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
3500
3501   if (datarefs.is_empty ())
3502     return opt_result::success ();
3503
3504   /* Sort the array of datarefs to make building the interleaving chains
3505      linear.  Don't modify the original vector's order, it is needed for
3506      determining what dependencies are reversed.  */
3507   vec<dr_vec_info *> datarefs_copy;
3508   datarefs_copy.create (datarefs.length ());
3509   for (unsigned i = 0; i < datarefs.length (); i++)
3510     {
3511       dr_vec_info *dr_info = vinfo->lookup_dr (datarefs[i]);
3512       /* If the caller computed DR grouping use that, otherwise group by
3513          basic blocks.  */
3514       if (dataref_groups)
3515         dr_info->group = (*dataref_groups)[i];
3516       else
3517         dr_info->group = gimple_bb (DR_STMT (datarefs[i]))->index;
3518       datarefs_copy.quick_push (dr_info);
3519     }
3520   datarefs_copy.qsort (dr_group_sort_cmp);
3521   hash_set<stmt_vec_info> to_fixup;
3522
3523   /* Build the interleaving chains.  */
3524   for (i = 0; i < datarefs_copy.length () - 1;)
3525     {
3526       dr_vec_info *dr_info_a = datarefs_copy[i];
3527       data_reference_p dra = dr_info_a->dr;
3528       int dra_group_id = dr_info_a->group;
3529       stmt_vec_info stmtinfo_a = dr_info_a->stmt;
3530       stmt_vec_info lastinfo = NULL;
3531       if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
3532           || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
3533         {
3534           ++i;
3535           continue;
3536         }
3537       for (i = i + 1; i < datarefs_copy.length (); ++i)
3538         {
3539           dr_vec_info *dr_info_b = datarefs_copy[i];
3540           data_reference_p drb = dr_info_b->dr;
3541           int drb_group_id = dr_info_b->group;
3542           stmt_vec_info stmtinfo_b = dr_info_b->stmt;
3543           if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
3544               || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
3545             break;
3546
3547           /* ???  Imperfect sorting (non-compatible types, non-modulo
3548              accesses, same accesses) can lead to a group to be artificially
3549              split here as we don't just skip over those.  If it really
3550              matters we can push those to a worklist and re-iterate
3551              over them.  The we can just skip ahead to the next DR here.  */
3552
3553           /* DRs in a different DR group should not be put into the same
3554              interleaving group.  */
3555           if (dra_group_id != drb_group_id)
3556             break;
3557
3558           /* Check that the data-refs have same first location (except init)
3559              and they are both either store or load (not load and store,
3560              not masked loads or stores).  */
3561           if (DR_IS_READ (dra) != DR_IS_READ (drb)
3562               || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
3563                                         DR_BASE_ADDRESS (drb)) != 0
3564               || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
3565               || !can_group_stmts_p (stmtinfo_a, stmtinfo_b, true))
3566             break;
3567
3568           /* Check that the data-refs have the same constant size.  */
3569           tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
3570           tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
3571           if (!tree_fits_uhwi_p (sza)
3572               || !tree_fits_uhwi_p (szb)
3573               || !tree_int_cst_equal (sza, szb))
3574             break;
3575
3576           /* Check that the data-refs have the same step.  */
3577           if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
3578             break;
3579
3580           /* Check the types are compatible.
3581              ???  We don't distinguish this during sorting.  */
3582           if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
3583                                    TREE_TYPE (DR_REF (drb))))
3584             break;
3585
3586           /* Check that the DR_INITs are compile-time constants.  */
3587           if (!tree_fits_shwi_p (DR_INIT (dra))
3588               || !tree_fits_shwi_p (DR_INIT (drb)))
3589             break;
3590
3591           /* Different .GOMP_SIMD_LANE calls still give the same lane,
3592              just hold extra information.  */
3593           if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_a)
3594               && STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_b)
3595               && data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb)) == 0)
3596             break;
3597
3598           /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb).  */
3599           HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3600           HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3601           HOST_WIDE_INT init_prev
3602             = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]->dr));
3603           gcc_assert (init_a <= init_b
3604                       && init_a <= init_prev
3605                       && init_prev <= init_b);
3606
3607           /* Do not place the same access in the interleaving chain twice.  */
3608           if (init_b == init_prev)
3609             {
3610               gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]->dr))
3611                           < gimple_uid (DR_STMT (drb)));
3612               /* Simply link in duplicates and fix up the chain below.  */
3613             }
3614           else
3615             {
3616               /* If init_b == init_a + the size of the type * k, we have an
3617                  interleaving, and DRA is accessed before DRB.  */
3618               unsigned HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3619               if (type_size_a == 0
3620                   || (((unsigned HOST_WIDE_INT)init_b - init_a)
3621                       % type_size_a != 0))
3622                 break;
3623
3624               /* If we have a store, the accesses are adjacent.  This splits
3625                  groups into chunks we support (we don't support vectorization
3626                  of stores with gaps).  */
3627               if (!DR_IS_READ (dra)
3628                   && (((unsigned HOST_WIDE_INT)init_b - init_prev)
3629                       != type_size_a))
3630                 break;
3631
3632               /* If the step (if not zero or non-constant) is smaller than the
3633                  difference between data-refs' inits this splits groups into
3634                  suitable sizes.  */
3635               if (tree_fits_shwi_p (DR_STEP (dra)))
3636                 {
3637                   unsigned HOST_WIDE_INT step
3638                     = absu_hwi (tree_to_shwi (DR_STEP (dra)));
3639                   if (step != 0
3640                       && step <= ((unsigned HOST_WIDE_INT)init_b - init_a))
3641                     break;
3642                 }
3643             }
3644
3645           if (dump_enabled_p ())
3646             dump_printf_loc (MSG_NOTE, vect_location,
3647                              DR_IS_READ (dra)
3648                              ? "Detected interleaving load %T and %T\n"
3649                              : "Detected interleaving store %T and %T\n",
3650                              DR_REF (dra), DR_REF (drb));
3651
3652           /* Link the found element into the group list.  */
3653           if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3654             {
3655               DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
3656               lastinfo = stmtinfo_a;
3657             }
3658           DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
3659           DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
3660           lastinfo = stmtinfo_b;
3661
3662           if (! STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a))
3663             {
3664               STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a)
3665                 = !can_group_stmts_p (stmtinfo_a, stmtinfo_b, false);
3666
3667               if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a))
3668                 dump_printf_loc (MSG_NOTE, vect_location,
3669                                  "Load suitable for SLP vectorization only.\n");
3670             }
3671
3672           if (init_b == init_prev
3673               && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3674               && dump_enabled_p ())
3675             dump_printf_loc (MSG_NOTE, vect_location,
3676                              "Queuing group with duplicate access for fixup\n");
3677         }
3678     }
3679
3680   /* Fixup groups with duplicate entries by splitting it.  */
3681   while (1)
3682     {
3683       hash_set<stmt_vec_info>::iterator it = to_fixup.begin ();
3684       if (!(it != to_fixup.end ()))
3685         break;
3686       stmt_vec_info grp = *it;
3687       to_fixup.remove (grp);
3688
3689       /* Find the earliest duplicate group member.  */
3690       unsigned first_duplicate = -1u;
3691       stmt_vec_info next, g = grp;
3692       while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3693         {
3694           if (tree_int_cst_equal (DR_INIT (STMT_VINFO_DR_INFO (next)->dr),
3695                                   DR_INIT (STMT_VINFO_DR_INFO (g)->dr))
3696               && gimple_uid (STMT_VINFO_STMT (next)) < first_duplicate)
3697             first_duplicate = gimple_uid (STMT_VINFO_STMT (next));
3698           g = next;
3699         }
3700       if (first_duplicate == -1U)
3701         continue;
3702
3703       /* Then move all stmts after the first duplicate to a new group.
3704          Note this is a heuristic but one with the property that *it
3705          is fixed up completely.  */
3706       g = grp;
3707       stmt_vec_info newgroup = NULL, ng = grp;
3708       while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3709         {
3710           if (gimple_uid (STMT_VINFO_STMT (next)) >= first_duplicate)
3711             {
3712               DR_GROUP_NEXT_ELEMENT (g) = DR_GROUP_NEXT_ELEMENT (next);
3713               if (!newgroup)
3714                 {
3715                   newgroup = next;
3716                   STMT_VINFO_SLP_VECT_ONLY (newgroup)
3717                     = STMT_VINFO_SLP_VECT_ONLY (grp);
3718                 }
3719               else
3720                 DR_GROUP_NEXT_ELEMENT (ng) = next;
3721               ng = next;
3722               DR_GROUP_FIRST_ELEMENT (ng) = newgroup;
3723             }
3724           else
3725             g = DR_GROUP_NEXT_ELEMENT (g);
3726         }
3727       DR_GROUP_NEXT_ELEMENT (ng) = NULL;
3728
3729       /* Fixup the new group which still may contain duplicates.  */
3730       to_fixup.add (newgroup);
3731     }
3732
3733   dr_vec_info *dr_info;
3734   FOR_EACH_VEC_ELT (datarefs_copy, i, dr_info)
3735     {
3736       if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)
3737           && !vect_analyze_data_ref_access (vinfo, dr_info))
3738         {
3739           if (dump_enabled_p ())
3740             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3741                              "not vectorized: complicated access pattern.\n");
3742
3743           if (is_a <bb_vec_info> (vinfo))
3744             {
3745               /* Mark the statement as not vectorizable.  */
3746               STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false;
3747               continue;
3748             }
3749           else
3750             {
3751               datarefs_copy.release ();
3752               return opt_result::failure_at (dr_info->stmt->stmt,
3753                                              "not vectorized:"
3754                                              " complicated access pattern.\n");
3755             }
3756         }
3757     }
3758
3759   datarefs_copy.release ();
3760   return opt_result::success ();
3761 }
3762
3763 /* Function vect_vfa_segment_size.
3764
3765    Input:
3766      DR_INFO: The data reference.
3767      LENGTH_FACTOR: segment length to consider.
3768
3769    Return a value suitable for the dr_with_seg_len::seg_len field.
3770    This is the "distance travelled" by the pointer from the first
3771    iteration in the segment to the last.  Note that it does not include
3772    the size of the access; in effect it only describes the first byte.  */
3773
3774 static tree
3775 vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor)
3776 {
3777   length_factor = size_binop (MINUS_EXPR,
3778                               fold_convert (sizetype, length_factor),
3779                               size_one_node);
3780   return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)),
3781                      length_factor);
3782 }
3783
3784 /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3785    gives the worst-case number of bytes covered by the segment.  */
3786
3787 static unsigned HOST_WIDE_INT
3788 vect_vfa_access_size (vec_info *vinfo, dr_vec_info *dr_info)
3789 {
3790   stmt_vec_info stmt_vinfo = dr_info->stmt;
3791   tree ref_type = TREE_TYPE (DR_REF (dr_info->dr));
3792   unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3793   unsigned HOST_WIDE_INT access_size = ref_size;
3794   if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3795     {
3796       gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo);
3797       access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3798     }
3799   tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3800   int misalignment;
3801   if (STMT_VINFO_VEC_STMTS (stmt_vinfo).exists ()
3802       && ((misalignment = dr_misalignment (dr_info, vectype)), true)
3803       && (vect_supportable_dr_alignment (vinfo, dr_info, vectype, misalignment)
3804           == dr_explicit_realign_optimized))
3805     {
3806       /* We might access a full vector's worth.  */
3807       access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3808     }
3809   return access_size;
3810 }
3811
3812 /* Get the minimum alignment for all the scalar accesses that DR_INFO
3813    describes.  */
3814
3815 static unsigned int
3816 vect_vfa_align (dr_vec_info *dr_info)
3817 {
3818   return dr_alignment (dr_info->dr);
3819 }
3820
3821 /* Function vect_no_alias_p.
3822
3823    Given data references A and B with equal base and offset, see whether
3824    the alias relation can be decided at compilation time.  Return 1 if
3825    it can and the references alias, 0 if it can and the references do
3826    not alias, and -1 if we cannot decide at compile time.  SEGMENT_LENGTH_A,
3827    SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3828    of dr_with_seg_len::{seg_len,access_size} for A and B.  */
3829
3830 static int
3831 vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
3832                          tree segment_length_a, tree segment_length_b,
3833                          unsigned HOST_WIDE_INT access_size_a,
3834                          unsigned HOST_WIDE_INT access_size_b)
3835 {
3836   poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr));
3837   poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr));
3838   poly_uint64 const_length_a;
3839   poly_uint64 const_length_b;
3840
3841   /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3842      bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3843      [a, a+12) */
3844   if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0)
3845     {
3846       const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3847       offset_a -= const_length_a;
3848     }
3849   else
3850     const_length_a = tree_to_poly_uint64 (segment_length_a);
3851   if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0)
3852     {
3853       const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3854       offset_b -= const_length_b;
3855     }
3856   else
3857     const_length_b = tree_to_poly_uint64 (segment_length_b);
3858
3859   const_length_a += access_size_a;
3860   const_length_b += access_size_b;
3861
3862   if (ranges_known_overlap_p (offset_a, const_length_a,
3863                               offset_b, const_length_b))
3864     return 1;
3865
3866   if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3867                                offset_b, const_length_b))
3868     return 0;
3869
3870   return -1;
3871 }
3872
3873 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3874    in DDR is >= VF.  */
3875
3876 static bool
3877 dependence_distance_ge_vf (data_dependence_relation *ddr,
3878                            unsigned int loop_depth, poly_uint64 vf)
3879 {
3880   if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3881       || DDR_NUM_DIST_VECTS (ddr) == 0)
3882     return false;
3883
3884   /* If the dependence is exact, we should have limited the VF instead.  */
3885   gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3886
3887   unsigned int i;
3888   lambda_vector dist_v;
3889   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3890     {
3891       HOST_WIDE_INT dist = dist_v[loop_depth];
3892       if (dist != 0
3893           && !(dist > 0 && DDR_REVERSED_P (ddr))
3894           && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3895         return false;
3896     }
3897
3898   if (dump_enabled_p ())
3899     dump_printf_loc (MSG_NOTE, vect_location,
3900                      "dependence distance between %T and %T is >= VF\n",
3901                      DR_REF (DDR_A (ddr)), DR_REF (DDR_B (ddr)));
3902
3903   return true;
3904 }
3905
3906 /* Dump LOWER_BOUND using flags DUMP_KIND.  Dumps are known to be enabled.  */
3907
3908 static void
3909 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3910 {
3911   dump_printf (dump_kind, "%s (%T) >= ",
3912                lower_bound.unsigned_p ? "unsigned" : "abs",
3913                lower_bound.expr);
3914   dump_dec (dump_kind, lower_bound.min_value);
3915 }
3916
3917 /* Record that the vectorized loop requires the vec_lower_bound described
3918    by EXPR, UNSIGNED_P and MIN_VALUE.  */
3919
3920 static void
3921 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3922                         poly_uint64 min_value)
3923 {
3924   vec<vec_lower_bound> &lower_bounds
3925     = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3926   for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3927     if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3928       {
3929         unsigned_p &= lower_bounds[i].unsigned_p;
3930         min_value = upper_bound (lower_bounds[i].min_value, min_value);
3931         if (lower_bounds[i].unsigned_p != unsigned_p
3932             || maybe_lt (lower_bounds[i].min_value, min_value))
3933           {
3934             lower_bounds[i].unsigned_p = unsigned_p;
3935             lower_bounds[i].min_value = min_value;
3936             if (dump_enabled_p ())
3937               {
3938                 dump_printf_loc (MSG_NOTE, vect_location,
3939                                  "updating run-time check to ");
3940                 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3941                 dump_printf (MSG_NOTE, "\n");
3942               }
3943           }
3944         return;
3945       }
3946
3947   vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3948   if (dump_enabled_p ())
3949     {
3950       dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3951       dump_lower_bound (MSG_NOTE, lower_bound);
3952       dump_printf (MSG_NOTE, "\n");
3953     }
3954   LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3955 }
3956
3957 /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3958    will span fewer than GAP bytes.  */
3959
3960 static bool
3961 vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info,
3962                   poly_int64 gap)
3963 {
3964   stmt_vec_info stmt_info = dr_info->stmt;
3965   HOST_WIDE_INT count
3966     = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3967   if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3968     count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
3969   return (estimated_poly_value (gap)
3970           <= count * vect_get_scalar_dr_size (dr_info));
3971 }
3972
3973 /* Return true if we know that there is no alias between DR_INFO_A and
3974    DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3975    When returning true, set *LOWER_BOUND_OUT to this N.  */
3976
3977 static bool
3978 vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
3979                                 poly_uint64 *lower_bound_out)
3980 {
3981   /* Check that there is a constant gap of known sign between DR_A
3982      and DR_B.  */
3983   data_reference *dr_a = dr_info_a->dr;
3984   data_reference *dr_b = dr_info_b->dr;
3985   poly_int64 init_a, init_b;
3986   if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3987       || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3988       || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3989       || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3990       || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3991       || !ordered_p (init_a, init_b))
3992     return false;
3993
3994   /* Sort DR_A and DR_B by the address they access.  */
3995   if (maybe_lt (init_b, init_a))
3996     {
3997       std::swap (init_a, init_b);
3998       std::swap (dr_info_a, dr_info_b);
3999       std::swap (dr_a, dr_b);
4000     }
4001
4002   /* If the two accesses could be dependent within a scalar iteration,
4003      make sure that we'd retain their order.  */
4004   if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
4005       && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
4006     return false;
4007
4008   /* There is no alias if abs (DR_STEP) is greater than or equal to
4009      the bytes spanned by the combination of the two accesses.  */
4010   *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a;
4011   return true;
4012 }
4013
4014 /* Function vect_prune_runtime_alias_test_list.
4015
4016    Prune a list of ddrs to be tested at run-time by versioning for alias.
4017    Merge several alias checks into one if possible.
4018    Return FALSE if resulting list of ddrs is longer then allowed by
4019    PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE.  */
4020
4021 opt_result
4022 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
4023 {
4024   typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
4025   hash_set <tree_pair_hash> compared_objects;
4026
4027   const vec<ddr_p> &may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
4028   vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
4029     = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
4030   const vec<vec_object_pair> &check_unequal_addrs
4031     = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
4032   poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4033   tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
4034
4035   ddr_p ddr;
4036   unsigned int i;
4037   tree length_factor;
4038
4039   DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
4040
4041   /* Step values are irrelevant for aliasing if the number of vector
4042      iterations is equal to the number of scalar iterations (which can
4043      happen for fully-SLP loops).  */
4044   bool vf_one_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
4045
4046   if (!vf_one_p)
4047     {
4048       /* Convert the checks for nonzero steps into bound tests.  */
4049       tree value;
4050       FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
4051         vect_check_lower_bound (loop_vinfo, value, true, 1);
4052     }
4053
4054   if (may_alias_ddrs.is_empty ())
4055     return opt_result::success ();
4056
4057   comp_alias_ddrs.create (may_alias_ddrs.length ());
4058
4059   unsigned int loop_depth
4060     = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
4061                           LOOP_VINFO_LOOP_NEST (loop_vinfo));
4062
4063   /* First, we collect all data ref pairs for aliasing checks.  */
4064   FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
4065     {
4066       poly_uint64 lower_bound;
4067       tree segment_length_a, segment_length_b;
4068       unsigned HOST_WIDE_INT access_size_a, access_size_b;
4069       unsigned int align_a, align_b;
4070
4071       /* Ignore the alias if the VF we chose ended up being no greater
4072          than the dependence distance.  */
4073       if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
4074         continue;
4075
4076       if (DDR_OBJECT_A (ddr))
4077         {
4078           vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
4079           if (!compared_objects.add (new_pair))
4080             {
4081               if (dump_enabled_p ())
4082                 dump_printf_loc (MSG_NOTE, vect_location,
4083                                  "checking that %T and %T"
4084                                  " have different addresses\n",
4085                                  new_pair.first, new_pair.second);
4086               LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
4087             }
4088           continue;
4089         }
4090
4091       dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
4092       stmt_vec_info stmt_info_a = dr_info_a->stmt;
4093
4094       dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
4095       stmt_vec_info stmt_info_b = dr_info_b->stmt;
4096
4097       bool preserves_scalar_order_p
4098         = vect_preserves_scalar_order_p (dr_info_a, dr_info_b);
4099       bool ignore_step_p
4100           = (vf_one_p
4101              && (preserves_scalar_order_p
4102                  || operand_equal_p (DR_STEP (dr_info_a->dr),
4103                                      DR_STEP (dr_info_b->dr))));
4104
4105       /* Skip the pair if inter-iteration dependencies are irrelevant
4106          and intra-iteration dependencies are guaranteed to be honored.  */
4107       if (ignore_step_p
4108           && (preserves_scalar_order_p
4109               || vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
4110                                                  &lower_bound)))
4111         {
4112           if (dump_enabled_p ())
4113             dump_printf_loc (MSG_NOTE, vect_location,
4114                              "no need for alias check between "
4115                              "%T and %T when VF is 1\n",
4116                              DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
4117           continue;
4118         }
4119
4120       /* See whether we can handle the alias using a bounds check on
4121          the step, and whether that's likely to be the best approach.
4122          (It might not be, for example, if the minimum step is much larger
4123          than the number of bytes handled by one vector iteration.)  */
4124       if (!ignore_step_p
4125           && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST
4126           && vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
4127                                              &lower_bound)
4128           && (vect_small_gap_p (loop_vinfo, dr_info_a, lower_bound)
4129               || vect_small_gap_p (loop_vinfo, dr_info_b, lower_bound)))
4130         {
4131           bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr);
4132           if (dump_enabled_p ())
4133             {
4134               dump_printf_loc (MSG_NOTE, vect_location, "no alias between "
4135                                "%T and %T when the step %T is outside ",
4136                                DR_REF (dr_info_a->dr),
4137                                DR_REF (dr_info_b->dr),
4138                                DR_STEP (dr_info_a->dr));
4139               if (unsigned_p)
4140                 dump_printf (MSG_NOTE, "[0");
4141               else
4142                 {
4143                   dump_printf (MSG_NOTE, "(");
4144                   dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
4145                 }
4146               dump_printf (MSG_NOTE, ", ");
4147               dump_dec (MSG_NOTE, lower_bound);
4148               dump_printf (MSG_NOTE, ")\n");
4149             }
4150           vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr),
4151                                   unsigned_p, lower_bound);
4152           continue;
4153         }
4154
4155       stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a);
4156       if (dr_group_first_a)
4157         {
4158           stmt_info_a = dr_group_first_a;
4159           dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a);
4160         }
4161
4162       stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b);
4163       if (dr_group_first_b)
4164         {
4165           stmt_info_b = dr_group_first_b;
4166           dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b);
4167         }
4168
4169       if (ignore_step_p)
4170         {
4171           segment_length_a = size_zero_node;
4172           segment_length_b = size_zero_node;
4173         }
4174       else
4175         {
4176           if (!operand_equal_p (DR_STEP (dr_info_a->dr),
4177                                 DR_STEP (dr_info_b->dr), 0))
4178             length_factor = scalar_loop_iters;
4179           else
4180             length_factor = size_int (vect_factor);
4181           segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
4182           segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
4183         }
4184       access_size_a = vect_vfa_access_size (loop_vinfo, dr_info_a);
4185       access_size_b = vect_vfa_access_size (loop_vinfo, dr_info_b);
4186       align_a = vect_vfa_align (dr_info_a);
4187       align_b = vect_vfa_align (dr_info_b);
4188
4189       /* See whether the alias is known at compilation time.  */
4190       if (operand_equal_p (DR_BASE_ADDRESS (dr_info_a->dr),
4191                            DR_BASE_ADDRESS (dr_info_b->dr), 0)
4192           && operand_equal_p (DR_OFFSET (dr_info_a->dr),
4193                               DR_OFFSET (dr_info_b->dr), 0)
4194           && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST
4195           && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST
4196           && poly_int_tree_p (segment_length_a)
4197           && poly_int_tree_p (segment_length_b))
4198         {
4199           int res = vect_compile_time_alias (dr_info_a, dr_info_b,
4200                                              segment_length_a,
4201                                              segment_length_b,
4202                                              access_size_a,
4203                                              access_size_b);
4204           if (res >= 0 && dump_enabled_p ())
4205             {
4206               dump_printf_loc (MSG_NOTE, vect_location,
4207                                "can tell at compile time that %T and %T",
4208                                DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
4209               if (res == 0)
4210                 dump_printf (MSG_NOTE, " do not alias\n");
4211               else
4212                 dump_printf (MSG_NOTE, " alias\n");
4213             }
4214
4215           if (res == 0)
4216             continue;
4217
4218           if (res == 1)
4219             return opt_result::failure_at (stmt_info_b->stmt,
4220                                            "not vectorized:"
4221                                            " compilation time alias: %G%G",
4222                                            stmt_info_a->stmt,
4223                                            stmt_info_b->stmt);
4224         }
4225
4226       dr_with_seg_len dr_a (dr_info_a->dr, segment_length_a,
4227                             access_size_a, align_a);
4228       dr_with_seg_len dr_b (dr_info_b->dr, segment_length_b,
4229                             access_size_b, align_b);
4230       /* Canonicalize the order to be the one that's needed for accurate
4231          RAW, WAR and WAW flags, in cases where the data references are
4232          well-ordered.  The order doesn't really matter otherwise,
4233          but we might as well be consistent.  */
4234       if (get_later_stmt (stmt_info_a, stmt_info_b) == stmt_info_a)
4235         std::swap (dr_a, dr_b);
4236
4237       dr_with_seg_len_pair_t dr_with_seg_len_pair
4238         (dr_a, dr_b, (preserves_scalar_order_p
4239                       ? dr_with_seg_len_pair_t::WELL_ORDERED
4240                       : dr_with_seg_len_pair_t::REORDERED));
4241
4242       comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
4243     }
4244
4245   prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
4246
4247   unsigned int count = (comp_alias_ddrs.length ()
4248                         + check_unequal_addrs.length ());
4249
4250   if (count
4251       && (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo))
4252           == VECT_COST_MODEL_VERY_CHEAP))
4253     return opt_result::failure_at
4254       (vect_location, "would need a runtime alias check\n");
4255
4256   if (dump_enabled_p ())
4257     dump_printf_loc (MSG_NOTE, vect_location,
4258                      "improved number of alias checks from %d to %d\n",
4259                      may_alias_ddrs.length (), count);
4260   unsigned limit = param_vect_max_version_for_alias_checks;
4261   if (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo)) == VECT_COST_MODEL_CHEAP)
4262     limit = param_vect_max_version_for_alias_checks * 6 / 10;
4263   if (count > limit)
4264     return opt_result::failure_at
4265       (vect_location,
4266        "number of versioning for alias run-time tests exceeds %d "
4267        "(--param vect-max-version-for-alias-checks)\n", limit);
4268
4269   return opt_result::success ();
4270 }
4271
4272 /* Check whether we can use an internal function for a gather load
4273    or scatter store.  READ_P is true for loads and false for stores.
4274    MASKED_P is true if the load or store is conditional.  MEMORY_TYPE is
4275    the type of the memory elements being loaded or stored.  OFFSET_TYPE
4276    is the type of the offset that is being applied to the invariant
4277    base address.  SCALE is the amount by which the offset should
4278    be multiplied *after* it has been converted to address width.
4279
4280    Return true if the function is supported, storing the function id in
4281    *IFN_OUT and the vector type for the offset in *OFFSET_VECTYPE_OUT.
4282
4283    If we can use gather and store the possible else values in ELSVALS.  */
4284
4285 bool
4286 vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
4287                           tree vectype, tree memory_type, tree offset_type,
4288                           int scale, internal_fn *ifn_out,
4289                           tree *offset_vectype_out, vec<int> *elsvals)
4290 {
4291   unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
4292   unsigned int element_bits = vector_element_bits (vectype);
4293   if (element_bits != memory_bits)
4294     /* For now the vector elements must be the same width as the
4295        memory elements.  */
4296     return false;
4297
4298   /* Work out which function we need.  */
4299   internal_fn ifn, alt_ifn, alt_ifn2;
4300   if (read_p)
4301     {
4302       ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
4303       alt_ifn = IFN_MASK_GATHER_LOAD;
4304       /* When target supports MASK_LEN_GATHER_LOAD, we always
4305          use MASK_LEN_GATHER_LOAD regardless whether len and
4306          mask are valid or not.  */
4307       alt_ifn2 = IFN_MASK_LEN_GATHER_LOAD;
4308     }
4309   else
4310     {
4311       ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
4312       alt_ifn = IFN_MASK_SCATTER_STORE;
4313       /* When target supports MASK_LEN_SCATTER_STORE, we always
4314          use MASK_LEN_SCATTER_STORE regardless whether len and
4315          mask are valid or not.  */
4316       alt_ifn2 = IFN_MASK_LEN_SCATTER_STORE;
4317     }
4318
4319   for (;;)
4320     {
4321       tree offset_vectype = get_vectype_for_scalar_type (vinfo, offset_type);
4322       if (!offset_vectype)
4323         return false;
4324
4325       /* Test whether the target supports this combination.  */
4326       if (internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
4327                                                   offset_vectype, scale,
4328                                                   elsvals))
4329         {
4330           *ifn_out = ifn;
4331           *offset_vectype_out = offset_vectype;
4332           return true;
4333         }
4334       else if (!masked_p
4335                && internal_gather_scatter_fn_supported_p (alt_ifn, vectype,
4336                                                           memory_type,
4337                                                           offset_vectype,
4338                                                           scale, elsvals))
4339         {
4340           *ifn_out = alt_ifn;
4341           *offset_vectype_out = offset_vectype;
4342           return true;
4343         }
4344       else if (internal_gather_scatter_fn_supported_p (alt_ifn2, vectype,
4345                                                        memory_type,
4346                                                        offset_vectype, scale,
4347                                                        elsvals))
4348         {
4349           *ifn_out = alt_ifn2;
4350           *offset_vectype_out = offset_vectype;
4351           return true;
4352         }
4353
4354       if (TYPE_PRECISION (offset_type) >= POINTER_SIZE
4355           && TYPE_PRECISION (offset_type) >= element_bits)
4356         return false;
4357
4358       offset_type = build_nonstandard_integer_type
4359         (TYPE_PRECISION (offset_type) * 2, TYPE_UNSIGNED (offset_type));
4360     }
4361 }
4362
4363 /* STMT_INFO is a call to an internal gather load or scatter store function.
4364    Describe the operation in INFO.  */
4365
4366 static void
4367 vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
4368                                    gather_scatter_info *info)
4369 {
4370   gcall *call = as_a <gcall *> (stmt_info->stmt);
4371   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4372   data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4373
4374   info->ifn = gimple_call_internal_fn (call);
4375   info->decl = NULL_TREE;
4376   info->base = gimple_call_arg (call, 0);
4377   info->offset = gimple_call_arg (call, 1);
4378   info->offset_dt = vect_unknown_def_type;
4379   info->offset_vectype = NULL_TREE;
4380   info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
4381   info->element_type = TREE_TYPE (vectype);
4382   info->memory_type = TREE_TYPE (DR_REF (dr));
4383 }
4384
4385 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
4386    gather load or scatter store.  Describe the operation in *INFO if so.
4387    If it is suitable and ELSVALS is nonzero store the supported else values
4388    in the vector it points to.  */
4389
4390 bool
4391 vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
4392                            gather_scatter_info *info, vec<int> *elsvals)
4393 {
4394   HOST_WIDE_INT scale = 1;
4395   poly_int64 pbitpos, pbitsize;
4396   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4397   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4398   tree offtype = NULL_TREE;
4399   tree decl = NULL_TREE, base, off;
4400   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4401   tree memory_type = TREE_TYPE (DR_REF (dr));
4402   machine_mode pmode;
4403   int punsignedp, reversep, pvolatilep = 0;
4404   internal_fn ifn;
4405   tree offset_vectype;
4406   bool masked_p = false;
4407
4408   /* See whether this is already a call to a gather/scatter internal function.
4409      If not, see whether it's a masked load or store.  */
4410   gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
4411   if (call && gimple_call_internal_p (call))
4412     {
4413       ifn = gimple_call_internal_fn (call);
4414       if (internal_gather_scatter_fn_p (ifn))
4415         {
4416           vect_describe_gather_scatter_call (stmt_info, info);
4417
4418           /* In pattern recog we simply used a ZERO else value that
4419              we need to correct here.  To that end just re-use the
4420              (already succesful) check if we support a gather IFN
4421              and have it populate the else values.  */
4422           if (DR_IS_READ (dr) && internal_fn_mask_index (ifn) >= 0 && elsvals)
4423             supports_vec_gather_load_p (TYPE_MODE (vectype), elsvals);
4424           return true;
4425         }
4426       masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
4427     }
4428
4429   /* ???  For epilogues we adjust DR_REF to make the following stmt-based
4430      analysis work, but this adjustment doesn't work for epilogues of
4431      epilogues during transform, so disable gather/scatter in that case.  */
4432   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4433       && LOOP_VINFO_EPILOGUE_P (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)))
4434     return false;
4435
4436   /* True if we should aim to use internal functions rather than
4437      built-in functions.  */
4438   bool use_ifn_p = (DR_IS_READ (dr)
4439                     ? supports_vec_gather_load_p (TYPE_MODE (vectype),
4440                                                   elsvals)
4441                     : supports_vec_scatter_store_p (TYPE_MODE (vectype)));
4442
4443   base = DR_REF (dr);
4444   /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
4445      see if we can use the def stmt of the address.  */
4446   if (masked_p
4447       && TREE_CODE (base) == MEM_REF
4448       && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
4449       && integer_zerop (TREE_OPERAND (base, 1))
4450       && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
4451     {
4452       gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
4453       if (is_gimple_assign (def_stmt)
4454           && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
4455         base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
4456     }
4457
4458   /* The gather and scatter builtins need address of the form
4459      loop_invariant + vector * {1, 2, 4, 8}
4460      or
4461      loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
4462      Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
4463      of loop invariants/SSA_NAMEs defined in the loop, with casts,
4464      multiplications and additions in it.  To get a vector, we need
4465      a single SSA_NAME that will be defined in the loop and will
4466      contain everything that is not loop invariant and that can be
4467      vectorized.  The following code attempts to find such a preexistng
4468      SSA_NAME OFF and put the loop invariants into a tree BASE
4469      that can be gimplified before the loop.  */
4470   base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
4471                               &punsignedp, &reversep, &pvolatilep);
4472   if (reversep)
4473     return false;
4474
4475   /* PR 107346.  Packed structs can have fields at offsets that are not
4476      multiples of BITS_PER_UNIT.  Do not use gather/scatters in such cases.  */
4477   if (!multiple_p (pbitpos, BITS_PER_UNIT))
4478     return false;
4479
4480   /* We need to be able to form an address to the base which for example
4481      isn't possible for hard registers.  */
4482   if (may_be_nonaddressable_p (base))
4483     return false;
4484
4485   poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
4486
4487   if (TREE_CODE (base) == MEM_REF)
4488     {
4489       if (!integer_zerop (TREE_OPERAND (base, 1)))
4490         {
4491           if (off == NULL_TREE)
4492             off = wide_int_to_tree (sizetype, mem_ref_offset (base));
4493           else
4494             off = size_binop (PLUS_EXPR, off,
4495                               fold_convert (sizetype, TREE_OPERAND (base, 1)));
4496         }
4497       base = TREE_OPERAND (base, 0);
4498     }
4499   else
4500     base = build_fold_addr_expr (base);
4501
4502   if (off == NULL_TREE)
4503     off = size_zero_node;
4504
4505   /* If base is not loop invariant, either off is 0, then we start with just
4506      the constant offset in the loop invariant BASE and continue with base
4507      as OFF, otherwise give up.
4508      We could handle that case by gimplifying the addition of base + off
4509      into some SSA_NAME and use that as off, but for now punt.  */
4510   if (!expr_invariant_in_loop_p (loop, base))
4511     {
4512       if (!integer_zerop (off))
4513         return false;
4514       off = base;
4515       base = size_int (pbytepos);
4516     }
4517   /* Otherwise put base + constant offset into the loop invariant BASE
4518      and continue with OFF.  */
4519   else
4520     {
4521       base = fold_convert (sizetype, base);
4522       base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
4523     }
4524
4525   /* OFF at this point may be either a SSA_NAME or some tree expression
4526      from get_inner_reference.  Try to peel off loop invariants from it
4527      into BASE as long as possible.  */
4528   STRIP_NOPS (off);
4529   while (offtype == NULL_TREE)
4530     {
4531       enum tree_code code;
4532       tree op0, op1, add = NULL_TREE;
4533
4534       if (TREE_CODE (off) == SSA_NAME)
4535         {
4536           gimple *def_stmt = SSA_NAME_DEF_STMT (off);
4537
4538           if (expr_invariant_in_loop_p (loop, off))
4539             return false;
4540
4541           if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
4542             break;
4543
4544           op0 = gimple_assign_rhs1 (def_stmt);
4545           code = gimple_assign_rhs_code (def_stmt);
4546           op1 = gimple_assign_rhs2 (def_stmt);
4547         }
4548       else
4549         {
4550           if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
4551             return false;
4552           code = TREE_CODE (off);
4553           extract_ops_from_tree (off, &code, &op0, &op1);
4554         }
4555       switch (code)
4556         {
4557         case POINTER_PLUS_EXPR:
4558         case PLUS_EXPR:
4559           if (expr_invariant_in_loop_p (loop, op0))
4560             {
4561               add = op0;
4562               off = op1;
4563             do_add:
4564               add = fold_convert (sizetype, add);
4565               if (scale != 1)
4566                 add = size_binop (MULT_EXPR, add, size_int (scale));
4567               base = size_binop (PLUS_EXPR, base, add);
4568               continue;
4569             }
4570           if (expr_invariant_in_loop_p (loop, op1))
4571             {
4572               add = op1;
4573               off = op0;
4574               goto do_add;
4575             }
4576           break;
4577         case MINUS_EXPR:
4578           if (expr_invariant_in_loop_p (loop, op1))
4579             {
4580               add = fold_convert (sizetype, op1);
4581               add = size_binop (MINUS_EXPR, size_zero_node, add);
4582               off = op0;
4583               goto do_add;
4584             }
4585           break;
4586         case MULT_EXPR:
4587           if (scale == 1 && tree_fits_shwi_p (op1))
4588             {
4589               int new_scale = tree_to_shwi (op1);
4590               /* Only treat this as a scaling operation if the target
4591                  supports it for at least some offset type.  */
4592               if (use_ifn_p
4593                   && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4594                                                 masked_p, vectype, memory_type,
4595                                                 signed_char_type_node,
4596                                                 new_scale, &ifn,
4597                                                 &offset_vectype,
4598                                                 elsvals)
4599                   && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4600                                                 masked_p, vectype, memory_type,
4601                                                 unsigned_char_type_node,
4602                                                 new_scale, &ifn,
4603                                                 &offset_vectype,
4604                                                 elsvals))
4605                 break;
4606               scale = new_scale;
4607               off = op0;
4608               continue;
4609             }
4610           break;
4611         case SSA_NAME:
4612           off = op0;
4613           continue;
4614         CASE_CONVERT:
4615           if (!POINTER_TYPE_P (TREE_TYPE (op0))
4616               && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4617             break;
4618
4619           /* Don't include the conversion if the target is happy with
4620              the current offset type.  */
4621           if (use_ifn_p
4622               && TREE_CODE (off) == SSA_NAME
4623               && !POINTER_TYPE_P (TREE_TYPE (off))
4624               && vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4625                                            masked_p, vectype, memory_type,
4626                                            TREE_TYPE (off), scale, &ifn,
4627                                            &offset_vectype, elsvals))
4628             break;
4629
4630           if (TYPE_PRECISION (TREE_TYPE (op0))
4631               == TYPE_PRECISION (TREE_TYPE (off)))
4632             {
4633               off = op0;
4634               continue;
4635             }
4636
4637           /* Include the conversion if it is widening and we're using
4638              the IFN path or the target can handle the converted from
4639              offset or the current size is not already the same as the
4640              data vector element size.  */
4641           if ((TYPE_PRECISION (TREE_TYPE (op0))
4642                < TYPE_PRECISION (TREE_TYPE (off)))
4643               && (use_ifn_p
4644                   || (DR_IS_READ (dr)
4645                       ? (targetm.vectorize.builtin_gather
4646                          && targetm.vectorize.builtin_gather (vectype,
4647                                                               TREE_TYPE (op0),
4648                                                               scale))
4649                       : (targetm.vectorize.builtin_scatter
4650                          && targetm.vectorize.builtin_scatter (vectype,
4651                                                                TREE_TYPE (op0),
4652                                                                scale)))
4653                   || !operand_equal_p (TYPE_SIZE (TREE_TYPE (off)),
4654                                        TYPE_SIZE (TREE_TYPE (vectype)), 0)))
4655             {
4656               off = op0;
4657               offtype = TREE_TYPE (off);
4658               STRIP_NOPS (off);
4659               continue;
4660             }
4661           break;
4662         default:
4663           break;
4664         }
4665       break;
4666     }
4667
4668   /* If at the end OFF still isn't a SSA_NAME or isn't
4669      defined in the loop, punt.  */
4670   if (TREE_CODE (off) != SSA_NAME
4671       || expr_invariant_in_loop_p (loop, off))
4672     return false;
4673
4674   if (offtype == NULL_TREE)
4675     offtype = TREE_TYPE (off);
4676
4677   if (use_ifn_p)
4678     {
4679       if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
4680                                      vectype, memory_type, offtype, scale,
4681                                      &ifn, &offset_vectype, elsvals))
4682         ifn = IFN_LAST;
4683       decl = NULL_TREE;
4684     }
4685   else
4686     {
4687       if (DR_IS_READ (dr))
4688         {
4689           if (targetm.vectorize.builtin_gather)
4690             decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
4691         }
4692       else
4693         {
4694           if (targetm.vectorize.builtin_scatter)
4695             decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
4696         }
4697       ifn = IFN_LAST;
4698       /* The offset vector type will be read from DECL when needed.  */
4699       offset_vectype = NULL_TREE;
4700     }
4701
4702   info->ifn = ifn;
4703   info->decl = decl;
4704   info->base = base;
4705   info->offset = off;
4706   info->offset_dt = vect_unknown_def_type;
4707   info->offset_vectype = offset_vectype;
4708   info->scale = scale;
4709   info->element_type = TREE_TYPE (vectype);
4710   info->memory_type = memory_type;
4711   return true;
4712 }
4713
4714 /* Find the data references in STMT, analyze them with respect to LOOP and
4715    append them to DATAREFS.  Return false if datarefs in this stmt cannot
4716    be handled.  */
4717
4718 opt_result
4719 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
4720                                vec<data_reference_p> *datarefs,
4721                                vec<int> *dataref_groups, int group_id)
4722 {
4723   /* We can ignore clobbers for dataref analysis - they are removed during
4724      loop vectorization and BB vectorization checks dependences with a
4725      stmt walk.  */
4726   if (gimple_clobber_p (stmt))
4727     return opt_result::success ();
4728
4729   if (gimple_has_volatile_ops (stmt))
4730     return opt_result::failure_at (stmt, "not vectorized: volatile type: %G",
4731                                    stmt);
4732
4733   if (stmt_can_throw_internal (cfun, stmt))
4734     return opt_result::failure_at (stmt,
4735                                    "not vectorized:"
4736                                    " statement can throw an exception: %G",
4737                                    stmt);
4738
4739   auto_vec<data_reference_p, 2> refs;
4740   opt_result res = find_data_references_in_stmt (loop, stmt, &refs);
4741   if (!res)
4742     return res;
4743
4744   if (refs.is_empty ())
4745     return opt_result::success ();
4746
4747   if (refs.length () > 1)
4748     {
4749       while (!refs.is_empty ())
4750         free_data_ref (refs.pop ());
4751       return opt_result::failure_at (stmt,
4752                                      "not vectorized: more than one "
4753                                      "data ref in stmt: %G", stmt);
4754     }
4755
4756   data_reference_p dr = refs.pop ();
4757   if (gcall *call = dyn_cast <gcall *> (stmt))
4758     if (!gimple_call_internal_p (call)
4759         || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
4760             && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4761       {
4762         free_data_ref (dr);
4763         return opt_result::failure_at (stmt,
4764                                        "not vectorized: dr in a call %G", stmt);
4765       }
4766
4767   if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4768       && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4769     {
4770       free_data_ref (dr);
4771       return opt_result::failure_at (stmt,
4772                                      "not vectorized:"
4773                                      " statement is an unsupported"
4774                                      " bitfield access %G", stmt);
4775     }
4776
4777   if (DR_BASE_ADDRESS (dr)
4778       && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4779     {
4780       free_data_ref (dr);
4781       return opt_result::failure_at (stmt,
4782                                      "not vectorized:"
4783                                      " base addr of dr is a constant\n");
4784     }
4785
4786   /* Check whether this may be a SIMD lane access and adjust the
4787      DR to make it easier for us to handle it.  */
4788   if (loop
4789       && loop->simduid
4790       && (!DR_BASE_ADDRESS (dr)
4791           || !DR_OFFSET (dr)
4792           || !DR_INIT (dr)
4793           || !DR_STEP (dr)))
4794     {
4795       struct data_reference *newdr
4796         = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
4797                            DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
4798       if (DR_BASE_ADDRESS (newdr)
4799           && DR_OFFSET (newdr)
4800           && DR_INIT (newdr)
4801           && DR_STEP (newdr)
4802           && TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4803           && integer_zerop (DR_STEP (newdr)))
4804         {
4805           tree base_address = DR_BASE_ADDRESS (newdr);
4806           tree off = DR_OFFSET (newdr);
4807           tree step = ssize_int (1);
4808           if (integer_zerop (off)
4809               && TREE_CODE (base_address) == POINTER_PLUS_EXPR)
4810             {
4811               off = TREE_OPERAND (base_address, 1);
4812               base_address = TREE_OPERAND (base_address, 0);
4813             }
4814           STRIP_NOPS (off);
4815           if (TREE_CODE (off) == MULT_EXPR
4816               && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4817             {
4818               step = TREE_OPERAND (off, 1);
4819               off = TREE_OPERAND (off, 0);
4820               STRIP_NOPS (off);
4821             }
4822           if (CONVERT_EXPR_P (off)
4823               && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
4824                   < TYPE_PRECISION (TREE_TYPE (off))))
4825             off = TREE_OPERAND (off, 0);
4826           if (TREE_CODE (off) == SSA_NAME)
4827             {
4828               gimple *def = SSA_NAME_DEF_STMT (off);
4829               /* Look through widening conversion.  */
4830               if (is_gimple_assign (def)
4831                   && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def)))
4832                 {
4833                   tree rhs1 = gimple_assign_rhs1 (def);
4834                   if (TREE_CODE (rhs1) == SSA_NAME
4835                       && INTEGRAL_TYPE_P (TREE_TYPE (rhs1))
4836                       && (TYPE_PRECISION (TREE_TYPE (off))
4837                           > TYPE_PRECISION (TREE_TYPE (rhs1))))
4838                     def = SSA_NAME_DEF_STMT (rhs1);
4839                 }
4840               if (is_gimple_call (def)
4841                   && gimple_call_internal_p (def)
4842                   && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
4843                 {
4844                   tree arg = gimple_call_arg (def, 0);
4845                   tree reft = TREE_TYPE (DR_REF (newdr));
4846                   gcc_assert (TREE_CODE (arg) == SSA_NAME);
4847                   arg = SSA_NAME_VAR (arg);
4848                   if (arg == loop->simduid
4849                       /* For now.  */
4850                       && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
4851                     {
4852                       DR_BASE_ADDRESS (newdr) = base_address;
4853                       DR_OFFSET (newdr) = ssize_int (0);
4854                       DR_STEP (newdr) = step;
4855                       DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
4856                       DR_STEP_ALIGNMENT (newdr) = highest_pow2_factor (step);
4857                       /* Mark as simd-lane access.  */
4858                       tree arg2 = gimple_call_arg (def, 1);
4859                       newdr->aux = (void *) (-1 - tree_to_uhwi (arg2));
4860                       free_data_ref (dr);
4861                       datarefs->safe_push (newdr);
4862                       if (dataref_groups)
4863                         dataref_groups->safe_push (group_id);
4864                       return opt_result::success ();
4865                     }
4866                 }
4867             }
4868         }
4869       free_data_ref (newdr);
4870     }
4871
4872   datarefs->safe_push (dr);
4873   if (dataref_groups)
4874     dataref_groups->safe_push (group_id);
4875   return opt_result::success ();
4876 }
4877
4878 /* Function vect_analyze_data_refs.
4879
4880   Find all the data references in the loop or basic block.
4881
4882    The general structure of the analysis of data refs in the vectorizer is as
4883    follows:
4884    1- vect_analyze_data_refs(loop/bb): call
4885       compute_data_dependences_for_loop/bb to find and analyze all data-refs
4886       in the loop/bb and their dependences.
4887    2- vect_analyze_dependences(): apply dependence testing using ddrs.
4888    3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4889    4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4890
4891 */
4892
4893 opt_result
4894 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
4895 {
4896   class loop *loop = NULL;
4897   unsigned int i;
4898   struct data_reference *dr;
4899   tree scalar_type;
4900
4901   DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4902
4903   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4904     loop = LOOP_VINFO_LOOP (loop_vinfo);
4905
4906   /* Go through the data-refs, check that the analysis succeeded.  Update
4907      pointer from stmt_vec_info struct to DR and vectype.  */
4908
4909   vec<data_reference_p> datarefs = vinfo->shared->datarefs;
4910   FOR_EACH_VEC_ELT (datarefs, i, dr)
4911     {
4912       enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4913       poly_uint64 vf;
4914
4915       gcc_assert (DR_REF (dr));
4916       stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr));
4917       gcc_assert (!stmt_info->dr_aux.dr);
4918       stmt_info->dr_aux.dr = dr;
4919       stmt_info->dr_aux.stmt = stmt_info;
4920
4921       /* Check that analysis of the data-ref succeeded.  */
4922       if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4923           || !DR_STEP (dr))
4924         {
4925           bool maybe_gather
4926             = DR_IS_READ (dr)
4927               && !TREE_THIS_VOLATILE (DR_REF (dr));
4928           bool maybe_scatter
4929             = DR_IS_WRITE (dr)
4930               && !TREE_THIS_VOLATILE (DR_REF (dr));
4931
4932           /* If target supports vector gather loads or scatter stores,
4933              see if they can't be used.  */
4934           if (is_a <loop_vec_info> (vinfo)
4935               && !nested_in_vect_loop_p (loop, stmt_info))
4936             {
4937               if (maybe_gather || maybe_scatter)
4938                 {
4939                   if (maybe_gather)
4940                     gatherscatter = GATHER;
4941                   else
4942                     gatherscatter = SCATTER;
4943                 }
4944             }
4945
4946           if (gatherscatter == SG_NONE)
4947             {
4948               if (dump_enabled_p ())
4949                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4950                                  "not vectorized: data ref analysis "
4951                                  "failed %G", stmt_info->stmt);
4952               if (is_a <bb_vec_info> (vinfo))
4953                 {
4954                   /* In BB vectorization the ref can still participate
4955                      in dependence analysis, we just can't vectorize it.  */
4956                   STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4957                   continue;
4958                 }
4959               return opt_result::failure_at (stmt_info->stmt,
4960                                              "not vectorized:"
4961                                              " data ref analysis failed: %G",
4962                                              stmt_info->stmt);
4963             }
4964         }
4965
4966       /* See if this was detected as SIMD lane access.  */
4967       if (dr->aux == (void *)-1
4968           || dr->aux == (void *)-2
4969           || dr->aux == (void *)-3
4970           || dr->aux == (void *)-4)
4971         {
4972           if (nested_in_vect_loop_p (loop, stmt_info))
4973             return opt_result::failure_at (stmt_info->stmt,
4974                                            "not vectorized:"
4975                                            " data ref analysis failed: %G",
4976                                            stmt_info->stmt);
4977           STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info)
4978             = -(uintptr_t) dr->aux;
4979         }
4980
4981       tree base = get_base_address (DR_REF (dr));
4982       if (base && VAR_P (base) && DECL_NONALIASED (base))
4983         {
4984           if (dump_enabled_p ())
4985             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4986                              "not vectorized: base object not addressable "
4987                              "for stmt: %G", stmt_info->stmt);
4988           if (is_a <bb_vec_info> (vinfo))
4989             {
4990               /* In BB vectorization the ref can still participate
4991                  in dependence analysis, we just can't vectorize it.  */
4992               STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4993               continue;
4994             }
4995           return opt_result::failure_at (stmt_info->stmt,
4996                                          "not vectorized: base object not"
4997                                          " addressable for stmt: %G",
4998                                          stmt_info->stmt);
4999         }
5000
5001       if (is_a <loop_vec_info> (vinfo)
5002           && DR_STEP (dr)
5003           && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
5004         {
5005           if (nested_in_vect_loop_p (loop, stmt_info))
5006             return opt_result::failure_at (stmt_info->stmt,
5007                                            "not vectorized: "
5008                                            "not suitable for strided load %G",
5009                                            stmt_info->stmt);
5010           STMT_VINFO_STRIDED_P (stmt_info) = true;
5011         }
5012
5013       /* Update DR field in stmt_vec_info struct.  */
5014
5015       /* If the dataref is in an inner-loop of the loop that is considered for
5016          for vectorization, we also want to analyze the access relative to
5017          the outer-loop (DR contains information only relative to the
5018          inner-most enclosing loop).  We do that by building a reference to the
5019          first location accessed by the inner-loop, and analyze it relative to
5020          the outer-loop.  */
5021       if (loop && nested_in_vect_loop_p (loop, stmt_info))
5022         {
5023           /* Build a reference to the first location accessed by the
5024              inner loop: *(BASE + INIT + OFFSET).  By construction,
5025              this address must be invariant in the inner loop, so we
5026              can consider it as being used in the outer loop.  */
5027           tree base = unshare_expr (DR_BASE_ADDRESS (dr));
5028           tree offset = unshare_expr (DR_OFFSET (dr));
5029           tree init = unshare_expr (DR_INIT (dr));
5030           tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
5031                                           init, offset);
5032           tree init_addr = fold_build_pointer_plus (base, init_offset);
5033           tree init_ref = build_fold_indirect_ref (init_addr);
5034
5035           if (dump_enabled_p ())
5036             dump_printf_loc (MSG_NOTE, vect_location,
5037                              "analyze in outer loop: %T\n", init_ref);
5038
5039           opt_result res
5040             = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
5041                                     init_ref, loop, stmt_info->stmt);
5042           if (!res)
5043             /* dr_analyze_innermost already explained the failure.  */
5044             return res;
5045
5046           if (dump_enabled_p ())
5047             dump_printf_loc (MSG_NOTE, vect_location,
5048                              "\touter base_address: %T\n"
5049                              "\touter offset from base address: %T\n"
5050                              "\touter constant offset from base address: %T\n"
5051                              "\touter step: %T\n"
5052                              "\touter base alignment: %d\n\n"
5053                              "\touter base misalignment: %d\n"
5054                              "\touter offset alignment: %d\n"
5055                              "\touter step alignment: %d\n",
5056                              STMT_VINFO_DR_BASE_ADDRESS (stmt_info),
5057                              STMT_VINFO_DR_OFFSET (stmt_info),
5058                              STMT_VINFO_DR_INIT (stmt_info),
5059                              STMT_VINFO_DR_STEP (stmt_info),
5060                              STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info),
5061                              STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info),
5062                              STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info),
5063                              STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
5064         }
5065
5066       /* Set vectype for STMT.  */
5067       scalar_type = TREE_TYPE (DR_REF (dr));
5068       tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
5069       if (!vectype)
5070         {
5071           if (dump_enabled_p ())
5072             {
5073               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5074                                "not vectorized: no vectype for stmt: %G",
5075                                stmt_info->stmt);
5076               dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
5077               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
5078                                  scalar_type);
5079               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
5080             }
5081
5082           if (is_a <bb_vec_info> (vinfo))
5083             {
5084               /* No vector type is fine, the ref can still participate
5085                  in dependence analysis, we just can't vectorize it.  */
5086               STMT_VINFO_VECTORIZABLE (stmt_info) = false;
5087               continue;
5088             }
5089           if (fatal)
5090             *fatal = false;
5091           return opt_result::failure_at (stmt_info->stmt,
5092                                          "not vectorized:"
5093                                          " no vectype for stmt: %G"
5094                                          " scalar_type: %T\n",
5095                                          stmt_info->stmt, scalar_type);
5096         }
5097       else
5098         {
5099           if (dump_enabled_p ())
5100             dump_printf_loc (MSG_NOTE, vect_location,
5101                              "got vectype for stmt: %G%T\n",
5102                              stmt_info->stmt, vectype);
5103         }
5104
5105       /* Adjust the minimal vectorization factor according to the
5106          vector type.  */
5107       vf = TYPE_VECTOR_SUBPARTS (vectype);
5108       *min_vf = upper_bound (*min_vf, vf);
5109
5110       /* Leave the BB vectorizer to pick the vector type later, based on
5111          the final dataref group size and SLP node size.  */
5112       if (is_a <loop_vec_info> (vinfo))
5113         STMT_VINFO_VECTYPE (stmt_info) = vectype;
5114
5115       if (gatherscatter != SG_NONE)
5116         {
5117           gather_scatter_info gs_info;
5118           if (!vect_check_gather_scatter (stmt_info,
5119                                           as_a <loop_vec_info> (vinfo),
5120                                           &gs_info)
5121               || !get_vectype_for_scalar_type (vinfo,
5122                                                TREE_TYPE (gs_info.offset)))
5123             {
5124               if (fatal)
5125                 *fatal = false;
5126               return opt_result::failure_at
5127                         (stmt_info->stmt,
5128                          (gatherscatter == GATHER)
5129                          ? "not vectorized: not suitable for gather load %G"
5130                          : "not vectorized: not suitable for scatter store %G",
5131                          stmt_info->stmt);
5132             }
5133           STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
5134         }
5135     }
5136
5137   /* We used to stop processing and prune the list here.  Verify we no
5138      longer need to.  */
5139   gcc_assert (i == datarefs.length ());
5140
5141   return opt_result::success ();
5142 }
5143
5144
5145 /* Function vect_get_new_vect_var.
5146
5147    Returns a name for a new variable.  The current naming scheme appends the
5148    prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
5149    the name of vectorizer generated variables, and appends that to NAME if
5150    provided.  */
5151
5152 tree
5153 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
5154 {
5155   const char *prefix;
5156   tree new_vect_var;
5157
5158   switch (var_kind)
5159   {
5160   case vect_simple_var:
5161     prefix = "vect";
5162     break;
5163   case vect_scalar_var:
5164     prefix = "stmp";
5165     break;
5166   case vect_mask_var:
5167     prefix = "mask";
5168     break;
5169   case vect_pointer_var:
5170     prefix = "vectp";
5171     break;
5172   default:
5173     gcc_unreachable ();
5174   }
5175
5176   if (name)
5177     {
5178       char* tmp = concat (prefix, "_", name, NULL);
5179       new_vect_var = create_tmp_reg (type, tmp);
5180       free (tmp);
5181     }
5182   else
5183     new_vect_var = create_tmp_reg (type, prefix);
5184
5185   return new_vect_var;
5186 }
5187
5188 /* Like vect_get_new_vect_var but return an SSA name.  */
5189
5190 tree
5191 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
5192 {
5193   const char *prefix;
5194   tree new_vect_var;
5195
5196   switch (var_kind)
5197   {
5198   case vect_simple_var:
5199     prefix = "vect";
5200     break;
5201   case vect_scalar_var:
5202     prefix = "stmp";
5203     break;
5204   case vect_pointer_var:
5205     prefix = "vectp";
5206     break;
5207   default:
5208     gcc_unreachable ();
5209   }
5210
5211   if (name)
5212     {
5213       char* tmp = concat (prefix, "_", name, NULL);
5214       new_vect_var = make_temp_ssa_name (type, NULL, tmp);
5215       free (tmp);
5216     }
5217   else
5218     new_vect_var = make_temp_ssa_name (type, NULL, prefix);
5219
5220   return new_vect_var;
5221 }
5222
5223 /* Duplicate points-to info on NAME from DR_INFO.  */
5224
5225 static void
5226 vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info)
5227 {
5228   duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr));
5229   /* DR_PTR_INFO is for a base SSA name, not including constant or
5230      variable offsets in the ref so its alignment info does not apply.  */
5231   mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
5232 }
5233
5234 /* Function vect_create_addr_base_for_vector_ref.
5235
5236    Create an expression that computes the address of the first memory location
5237    that will be accessed for a data reference.
5238
5239    Input:
5240    STMT_INFO: The statement containing the data reference.
5241    NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
5242    OFFSET: Optional. If supplied, it is be added to the initial address.
5243    LOOP:    Specify relative to which loop-nest should the address be computed.
5244             For example, when the dataref is in an inner-loop nested in an
5245             outer-loop that is now being vectorized, LOOP can be either the
5246             outer-loop, or the inner-loop.  The first memory location accessed
5247             by the following dataref ('in' points to short):
5248
5249                 for (i=0; i<N; i++)
5250                    for (j=0; j<M; j++)
5251                      s += in[i+j]
5252
5253             is as follows:
5254             if LOOP=i_loop:     &in             (relative to i_loop)
5255             if LOOP=j_loop:     &in+i*2B        (relative to j_loop)
5256
5257    Output:
5258    1. Return an SSA_NAME whose value is the address of the memory location of
5259       the first vector of the data reference.
5260    2. If new_stmt_list is not NULL_TREE after return then the caller must insert
5261       these statement(s) which define the returned SSA_NAME.
5262
5263    FORNOW: We are only handling array accesses with step 1.  */
5264
5265 tree
5266 vect_create_addr_base_for_vector_ref (vec_info *vinfo, stmt_vec_info stmt_info,
5267                                       gimple_seq *new_stmt_list,
5268                                       tree offset)
5269 {
5270   dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5271   struct data_reference *dr = dr_info->dr;
5272   const char *base_name;
5273   tree addr_base;
5274   tree dest;
5275   gimple_seq seq = NULL;
5276   tree vect_ptr_type;
5277   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
5278   innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
5279
5280   tree data_ref_base = unshare_expr (drb->base_address);
5281   tree base_offset = unshare_expr (get_dr_vinfo_offset (vinfo, dr_info, true));
5282   tree init = unshare_expr (drb->init);
5283
5284   if (loop_vinfo)
5285     base_name = get_name (data_ref_base);
5286   else
5287     {
5288       base_offset = ssize_int (0);
5289       init = ssize_int (0);
5290       base_name = get_name (DR_REF (dr));
5291     }
5292
5293   /* Create base_offset */
5294   base_offset = size_binop (PLUS_EXPR,
5295                             fold_convert (sizetype, base_offset),
5296                             fold_convert (sizetype, init));
5297
5298   if (offset)
5299     {
5300       offset = fold_convert (sizetype, offset);
5301       base_offset = fold_build2 (PLUS_EXPR, sizetype,
5302                                  base_offset, offset);
5303     }
5304
5305   /* base + base_offset */
5306   if (loop_vinfo)
5307     addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
5308   else
5309     addr_base = build1 (ADDR_EXPR,
5310                         build_pointer_type (TREE_TYPE (DR_REF (dr))),
5311                         /* Strip zero offset components since we don't need
5312                            them and they can confuse late diagnostics if
5313                            we CSE them wrongly.  See PR106904 for example.  */
5314                         unshare_expr (strip_zero_offset_components
5315                                                                 (DR_REF (dr))));
5316
5317   vect_ptr_type = build_pointer_type (TREE_TYPE (DR_REF (dr)));
5318   dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
5319   addr_base = force_gimple_operand (addr_base, &seq, true, dest);
5320   gimple_seq_add_seq (new_stmt_list, seq);
5321
5322   if (DR_PTR_INFO (dr)
5323       && TREE_CODE (addr_base) == SSA_NAME
5324       /* We should only duplicate pointer info to newly created SSA names.  */
5325       && SSA_NAME_VAR (addr_base) == dest)
5326     {
5327       gcc_assert (!SSA_NAME_PTR_INFO (addr_base));
5328       vect_duplicate_ssa_name_ptr_info (addr_base, dr_info);
5329     }
5330
5331   if (dump_enabled_p ())
5332     dump_printf_loc (MSG_NOTE, vect_location, "created %T\n", addr_base);
5333
5334   return addr_base;
5335 }
5336
5337
5338 /* Function vect_create_data_ref_ptr.
5339
5340    Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
5341    location accessed in the loop by STMT_INFO, along with the def-use update
5342    chain to appropriately advance the pointer through the loop iterations.
5343    Also set aliasing information for the pointer.  This pointer is used by
5344    the callers to this function to create a memory reference expression for
5345    vector load/store access.
5346
5347    Input:
5348    1. STMT_INFO: a stmt that references memory. Expected to be of the form
5349          GIMPLE_ASSIGN <name, data-ref> or
5350          GIMPLE_ASSIGN <data-ref, name>.
5351    2. AGGR_TYPE: the type of the reference, which should be either a vector
5352         or an array.
5353    3. AT_LOOP: the loop where the vector memref is to be created.
5354    4. OFFSET (optional): a byte offset to be added to the initial address
5355         accessed by the data-ref in STMT_INFO.
5356    5. BSI: location where the new stmts are to be placed if there is no loop
5357    6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
5358         pointing to the initial address.
5359    8. IV_STEP (optional, defaults to NULL): the amount that should be added
5360         to the IV during each iteration of the loop.  NULL says to move
5361         by one copy of AGGR_TYPE up or down, depending on the step of the
5362         data reference.
5363
5364    Output:
5365    1. Declare a new ptr to vector_type, and have it point to the base of the
5366       data reference (initial addressed accessed by the data reference).
5367       For example, for vector of type V8HI, the following code is generated:
5368
5369       v8hi *ap;
5370       ap = (v8hi *)initial_address;
5371
5372       if OFFSET is not supplied:
5373          initial_address = &a[init];
5374       if OFFSET is supplied:
5375          initial_address = &a[init] + OFFSET;
5376       if BYTE_OFFSET is supplied:
5377          initial_address = &a[init] + BYTE_OFFSET;
5378
5379       Return the initial_address in INITIAL_ADDRESS.
5380
5381    2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
5382       update the pointer in each iteration of the loop.
5383
5384       Return the increment stmt that updates the pointer in PTR_INCR.
5385
5386    3. Return the pointer.  */
5387
5388 tree
5389 vect_create_data_ref_ptr (vec_info *vinfo, stmt_vec_info stmt_info,
5390                           tree aggr_type, class loop *at_loop, tree offset,
5391                           tree *initial_address, gimple_stmt_iterator *gsi,
5392                           gimple **ptr_incr, bool only_init,
5393                           tree iv_step)
5394 {
5395   const char *base_name;
5396   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
5397   class loop *loop = NULL;
5398   bool nested_in_vect_loop = false;
5399   class loop *containing_loop = NULL;
5400   tree aggr_ptr_type;
5401   tree aggr_ptr;
5402   tree new_temp;
5403   gimple_seq new_stmt_list = NULL;
5404   edge pe = NULL;
5405   basic_block new_bb;
5406   tree aggr_ptr_init;
5407   dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5408   struct data_reference *dr = dr_info->dr;
5409   tree aptr;
5410   gimple_stmt_iterator incr_gsi;
5411   bool insert_after;
5412   tree indx_before_incr, indx_after_incr;
5413   gimple *incr;
5414   bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
5415
5416   gcc_assert (iv_step != NULL_TREE
5417               || TREE_CODE (aggr_type) == ARRAY_TYPE
5418               || TREE_CODE (aggr_type) == VECTOR_TYPE);
5419
5420   if (loop_vinfo)
5421     {
5422       loop = LOOP_VINFO_LOOP (loop_vinfo);
5423       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
5424       containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
5425       pe = loop_preheader_edge (loop);
5426     }
5427   else
5428     {
5429       gcc_assert (bb_vinfo);
5430       only_init = true;
5431       *ptr_incr = NULL;
5432     }
5433
5434   /* Create an expression for the first address accessed by this load
5435      in LOOP.  */
5436   base_name = get_name (DR_BASE_ADDRESS (dr));
5437
5438   if (dump_enabled_p ())
5439     {
5440       tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
5441       dump_printf_loc (MSG_NOTE, vect_location,
5442                        "create %s-pointer variable to type: %T",
5443                        get_tree_code_name (TREE_CODE (aggr_type)),
5444                        aggr_type);
5445       if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
5446         dump_printf (MSG_NOTE, "  vectorizing an array ref: ");
5447       else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
5448         dump_printf (MSG_NOTE, "  vectorizing a vector ref: ");
5449       else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
5450         dump_printf (MSG_NOTE, "  vectorizing a record based array ref: ");
5451       else
5452         dump_printf (MSG_NOTE, "  vectorizing a pointer ref: ");
5453       dump_printf (MSG_NOTE, "%T\n", DR_BASE_OBJECT (dr));
5454     }
5455
5456   /* (1) Create the new aggregate-pointer variable.
5457      Vector and array types inherit the alias set of their component
5458      type by default so we need to use a ref-all pointer if the data
5459      reference does not conflict with the created aggregated data
5460      reference because it is not addressable.  */
5461   bool need_ref_all = false;
5462   if (!alias_sets_conflict_p (get_alias_set (aggr_type),
5463                               get_alias_set (DR_REF (dr))))
5464     need_ref_all = true;
5465   /* Likewise for any of the data references in the stmt group.  */
5466   else if (DR_GROUP_SIZE (stmt_info) > 1)
5467     {
5468       stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
5469       do
5470         {
5471           struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
5472           if (!alias_sets_conflict_p (get_alias_set (aggr_type),
5473                                       get_alias_set (DR_REF (sdr))))
5474             {
5475               need_ref_all = true;
5476               break;
5477             }
5478           sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);
5479         }
5480       while (sinfo);
5481     }
5482   aggr_ptr_type = build_pointer_type_for_mode (aggr_type, VOIDmode,
5483                                                need_ref_all);
5484   aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
5485
5486
5487   /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
5488      vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
5489      def-use update cycles for the pointer: one relative to the outer-loop
5490      (LOOP), which is what steps (3) and (4) below do.  The other is relative
5491      to the inner-loop (which is the inner-most loop containing the dataref),
5492      and this is done be step (5) below.
5493
5494      When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
5495      inner-most loop, and so steps (3),(4) work the same, and step (5) is
5496      redundant.  Steps (3),(4) create the following:
5497
5498         vp0 = &base_addr;
5499         LOOP:   vp1 = phi(vp0,vp2)
5500                 ...
5501                 ...
5502                 vp2 = vp1 + step
5503                 goto LOOP
5504
5505      If there is an inner-loop nested in loop, then step (5) will also be
5506      applied, and an additional update in the inner-loop will be created:
5507
5508         vp0 = &base_addr;
5509         LOOP:   vp1 = phi(vp0,vp2)
5510                 ...
5511         inner:     vp3 = phi(vp1,vp4)
5512                    vp4 = vp3 + inner_step
5513                    if () goto inner
5514                 ...
5515                 vp2 = vp1 + step
5516                 if () goto LOOP   */
5517
5518   /* (2) Calculate the initial address of the aggregate-pointer, and set
5519      the aggregate-pointer to point to it before the loop.  */
5520
5521   /* Create: (&(base[init_val]+offset) in the loop preheader.  */
5522
5523   new_temp = vect_create_addr_base_for_vector_ref (vinfo,
5524                                                    stmt_info, &new_stmt_list,
5525                                                    offset);
5526   if (new_stmt_list)
5527     {
5528       if (pe)
5529         {
5530           new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
5531           gcc_assert (!new_bb);
5532         }
5533       else
5534         gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
5535     }
5536
5537   *initial_address = new_temp;
5538   aggr_ptr_init = new_temp;
5539
5540   /* (3) Handle the updating of the aggregate-pointer inside the loop.
5541      This is needed when ONLY_INIT is false, and also when AT_LOOP is the
5542      inner-loop nested in LOOP (during outer-loop vectorization).  */
5543
5544   /* No update in loop is required.  */
5545   if (only_init && (!loop_vinfo || at_loop == loop))
5546     aptr = aggr_ptr_init;
5547   else
5548     {
5549       /* Accesses to invariant addresses should be handled specially
5550          by the caller.  */
5551       tree step = vect_dr_behavior (vinfo, dr_info)->step;
5552       gcc_assert (!integer_zerop (step));
5553
5554       if (iv_step == NULL_TREE)
5555         {
5556           /* The step of the aggregate pointer is the type size,
5557              negated for downward accesses.  */
5558           iv_step = TYPE_SIZE_UNIT (aggr_type);
5559           if (tree_int_cst_sgn (step) == -1)
5560             iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
5561         }
5562
5563       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5564
5565       create_iv (aggr_ptr_init, PLUS_EXPR,
5566                  fold_convert (aggr_ptr_type, iv_step),
5567                  aggr_ptr, loop, &incr_gsi, insert_after,
5568                  &indx_before_incr, &indx_after_incr);
5569       incr = gsi_stmt (incr_gsi);
5570
5571       /* Copy the points-to information if it exists. */
5572       if (DR_PTR_INFO (dr))
5573         {
5574           vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
5575           vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
5576         }
5577       if (ptr_incr)
5578         *ptr_incr = incr;
5579
5580       aptr = indx_before_incr;
5581     }
5582
5583   if (!nested_in_vect_loop || only_init)
5584     return aptr;
5585
5586
5587   /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
5588      nested in LOOP, if exists.  */
5589
5590   gcc_assert (nested_in_vect_loop);
5591   if (!only_init)
5592     {
5593       standard_iv_increment_position (containing_loop, &incr_gsi,
5594                                       &insert_after);
5595       create_iv (aptr, PLUS_EXPR, fold_convert (aggr_ptr_type, DR_STEP (dr)),
5596                  aggr_ptr, containing_loop, &incr_gsi, insert_after,
5597                  &indx_before_incr, &indx_after_incr);
5598       incr = gsi_stmt (incr_gsi);
5599
5600       /* Copy the points-to information if it exists. */
5601       if (DR_PTR_INFO (dr))
5602         {
5603           vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
5604           vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
5605         }
5606       if (ptr_incr)
5607         *ptr_incr = incr;
5608
5609       return indx_before_incr;
5610     }
5611   else
5612     gcc_unreachable ();
5613 }
5614
5615
5616 /* Function bump_vector_ptr
5617
5618    Increment a pointer (to a vector type) by vector-size. If requested,
5619    i.e. if PTR-INCR is given, then also connect the new increment stmt
5620    to the existing def-use update-chain of the pointer, by modifying
5621    the PTR_INCR as illustrated below:
5622
5623    The pointer def-use update-chain before this function:
5624                         DATAREF_PTR = phi (p_0, p_2)
5625                         ....
5626         PTR_INCR:       p_2 = DATAREF_PTR + step
5627
5628    The pointer def-use update-chain after this function:
5629                         DATAREF_PTR = phi (p_0, p_2)
5630                         ....
5631                         NEW_DATAREF_PTR = DATAREF_PTR + BUMP
5632                         ....
5633         PTR_INCR:       p_2 = NEW_DATAREF_PTR + step
5634
5635    Input:
5636    DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
5637                  in the loop.
5638    PTR_INCR - optional. The stmt that updates the pointer in each iteration of
5639               the loop.  The increment amount across iterations is expected
5640               to be vector_size.
5641    BSI - location where the new update stmt is to be placed.
5642    STMT_INFO - the original scalar memory-access stmt that is being vectorized.
5643    BUMP - optional. The offset by which to bump the pointer. If not given,
5644           the offset is assumed to be vector_size.
5645
5646    Output: Return NEW_DATAREF_PTR as illustrated above.
5647
5648 */
5649
5650 tree
5651 bump_vector_ptr (vec_info *vinfo,
5652                  tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
5653                  stmt_vec_info stmt_info, tree bump)
5654 {
5655   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5656   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5657   tree update = TYPE_SIZE_UNIT (vectype);
5658   gimple *incr_stmt;
5659   ssa_op_iter iter;
5660   use_operand_p use_p;
5661   tree new_dataref_ptr;
5662
5663   if (bump)
5664     update = bump;
5665
5666   if (TREE_CODE (dataref_ptr) == SSA_NAME)
5667     new_dataref_ptr = copy_ssa_name (dataref_ptr);
5668   else if (is_gimple_min_invariant (dataref_ptr))
5669     /* When possible avoid emitting a separate increment stmt that will
5670        force the addressed object addressable.  */
5671     return build1 (ADDR_EXPR, TREE_TYPE (dataref_ptr),
5672                    fold_build2 (MEM_REF,
5673                                 TREE_TYPE (TREE_TYPE (dataref_ptr)),
5674                                 dataref_ptr,
5675                                 fold_convert (ptr_type_node, update)));
5676   else
5677     new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
5678   incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
5679                                    dataref_ptr, update);
5680   vect_finish_stmt_generation (vinfo, stmt_info, incr_stmt, gsi);
5681   /* Fold the increment, avoiding excessive chains use-def chains of
5682      those, leading to compile-time issues for passes until the next
5683      forwprop pass which would do this as well.  */
5684   gimple_stmt_iterator fold_gsi = gsi_for_stmt (incr_stmt);
5685   if (fold_stmt (&fold_gsi, follow_all_ssa_edges))
5686     {
5687       incr_stmt = gsi_stmt (fold_gsi);
5688       update_stmt (incr_stmt);
5689     }
5690
5691   /* Copy the points-to information if it exists. */
5692   if (DR_PTR_INFO (dr))
5693     {
5694       duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
5695       mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
5696     }
5697
5698   if (!ptr_incr)
5699     return new_dataref_ptr;
5700
5701   /* Update the vector-pointer's cross-iteration increment.  */
5702   FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
5703     {
5704       tree use = USE_FROM_PTR (use_p);
5705
5706       if (use == dataref_ptr)
5707         SET_USE (use_p, new_dataref_ptr);
5708       else
5709         gcc_assert (operand_equal_p (use, update, 0));
5710     }
5711
5712   return new_dataref_ptr;
5713 }
5714
5715
5716 /* Copy memory reference info such as base/clique from the SRC reference
5717    to the DEST MEM_REF.  */
5718
5719 void
5720 vect_copy_ref_info (tree dest, tree src)
5721 {
5722   if (TREE_CODE (dest) != MEM_REF)
5723     return;
5724
5725   tree src_base = src;
5726   while (handled_component_p (src_base))
5727     src_base = TREE_OPERAND (src_base, 0);
5728   if (TREE_CODE (src_base) != MEM_REF
5729       && TREE_CODE (src_base) != TARGET_MEM_REF)
5730     return;
5731
5732   MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5733   MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5734 }
5735
5736
5737 /* Function vect_create_destination_var.
5738
5739    Create a new temporary of type VECTYPE.  */
5740
5741 tree
5742 vect_create_destination_var (tree scalar_dest, tree vectype)
5743 {
5744   tree vec_dest;
5745   const char *name;
5746   char *new_name;
5747   tree type;
5748   enum vect_var_kind kind;
5749
5750   kind = vectype
5751     ? VECTOR_BOOLEAN_TYPE_P (vectype)
5752     ? vect_mask_var
5753     : vect_simple_var
5754     : vect_scalar_var;
5755   type = vectype ? vectype : TREE_TYPE (scalar_dest);
5756
5757   gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5758
5759   name = get_name (scalar_dest);
5760   if (name)
5761     new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5762   else
5763     new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5764   vec_dest = vect_get_new_vect_var (type, kind, new_name);
5765   free (new_name);
5766
5767   return vec_dest;
5768 }
5769
5770 /* Function vect_grouped_store_supported.
5771
5772    Returns TRUE if interleave high and interleave low permutations
5773    are supported, and FALSE otherwise.  */
5774
5775 bool
5776 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5777 {
5778   machine_mode mode = TYPE_MODE (vectype);
5779
5780   /* vect_permute_store_chain requires the group size to be equal to 3 or
5781      be a power of two.  */
5782   if (count != 3 && exact_log2 (count) == -1)
5783     {
5784       if (dump_enabled_p ())
5785         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5786                          "the size of the group of accesses"
5787                          " is not a power of 2 or not eqaul to 3\n");
5788       return false;
5789     }
5790
5791   /* Check that the permutation is supported.  */
5792   if (VECTOR_MODE_P (mode))
5793     {
5794       unsigned int i;
5795       if (count == 3)
5796         {
5797           unsigned int j0 = 0, j1 = 0, j2 = 0;
5798           unsigned int i, j;
5799
5800           unsigned int nelt;
5801           if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5802             {
5803               if (dump_enabled_p ())
5804                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5805                                  "cannot handle groups of 3 stores for"
5806                                  " variable-length vectors\n");
5807               return false;
5808             }
5809
5810           vec_perm_builder sel (nelt, nelt, 1);
5811           sel.quick_grow (nelt);
5812           vec_perm_indices indices;
5813           for (j = 0; j < 3; j++)
5814             {
5815               int nelt0 = ((3 - j) * nelt) % 3;
5816               int nelt1 = ((3 - j) * nelt + 1) % 3;
5817               int nelt2 = ((3 - j) * nelt + 2) % 3;
5818               for (i = 0; i < nelt; i++)
5819                 {
5820                   if (3 * i + nelt0 < nelt)
5821                     sel[3 * i + nelt0] = j0++;
5822                   if (3 * i + nelt1 < nelt)
5823                     sel[3 * i + nelt1] = nelt + j1++;
5824                   if (3 * i + nelt2 < nelt)
5825                     sel[3 * i + nelt2] = 0;
5826                 }
5827               indices.new_vector (sel, 2, nelt);
5828               if (!can_vec_perm_const_p (mode, mode, indices))
5829                 {
5830                   if (dump_enabled_p ())
5831                     dump_printf (MSG_MISSED_OPTIMIZATION,
5832                                  "permutation op not supported by target.\n");
5833                   return false;
5834                 }
5835
5836               for (i = 0; i < nelt; i++)
5837                 {
5838                   if (3 * i + nelt0 < nelt)
5839                     sel[3 * i + nelt0] = 3 * i + nelt0;
5840                   if (3 * i + nelt1 < nelt)
5841                     sel[3 * i + nelt1] = 3 * i + nelt1;
5842                   if (3 * i + nelt2 < nelt)
5843                     sel[3 * i + nelt2] = nelt + j2++;
5844                 }
5845               indices.new_vector (sel, 2, nelt);
5846               if (!can_vec_perm_const_p (mode, mode, indices))
5847                 {
5848                   if (dump_enabled_p ())
5849                     dump_printf (MSG_MISSED_OPTIMIZATION,
5850                                  "permutation op not supported by target.\n");
5851                   return false;
5852                 }
5853             }
5854           return true;
5855         }
5856       else
5857         {
5858           /* If length is not equal to 3 then only power of 2 is supported.  */
5859           gcc_assert (pow2p_hwi (count));
5860           poly_uint64 nelt = GET_MODE_NUNITS (mode);
5861
5862           /* The encoding has 2 interleaved stepped patterns.  */
5863           if(!multiple_p (nelt, 2))
5864             return false;
5865           vec_perm_builder sel (nelt, 2, 3);
5866           sel.quick_grow (6);
5867           for (i = 0; i < 3; i++)
5868             {
5869               sel[i * 2] = i;
5870               sel[i * 2 + 1] = i + nelt;
5871             }
5872           vec_perm_indices indices (sel, 2, nelt);
5873           if (can_vec_perm_const_p (mode, mode, indices))
5874             {
5875               for (i = 0; i < 6; i++)
5876                 sel[i] += exact_div (nelt, 2);
5877               indices.new_vector (sel, 2, nelt);
5878               if (can_vec_perm_const_p (mode, mode, indices))
5879                 return true;
5880             }
5881         }
5882     }
5883
5884   if (dump_enabled_p ())
5885     dump_printf (MSG_MISSED_OPTIMIZATION,
5886                  "permutation op not supported by target.\n");
5887   return false;
5888 }
5889
5890 /* Return FN if vec_{mask_,mask_len_}store_lanes is available for COUNT vectors
5891    of type VECTYPE.  MASKED_P says whether the masked form is needed.  */
5892
5893 internal_fn
5894 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5895                             bool masked_p)
5896 {
5897   if (vect_lanes_optab_supported_p ("vec_mask_len_store_lanes",
5898                                     vec_mask_len_store_lanes_optab, vectype,
5899                                     count))
5900     return IFN_MASK_LEN_STORE_LANES;
5901   else if (masked_p)
5902     {
5903       if (vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5904                                         vec_mask_store_lanes_optab, vectype,
5905                                         count))
5906         return IFN_MASK_STORE_LANES;
5907     }
5908   else
5909     {
5910       if (vect_lanes_optab_supported_p ("vec_store_lanes",
5911                                         vec_store_lanes_optab, vectype, count))
5912         return IFN_STORE_LANES;
5913     }
5914   return IFN_LAST;
5915 }
5916
5917
5918 /* Function vect_permute_store_chain.
5919
5920    Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5921    a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5922    the data correctly for the stores.  Return the final references for stores
5923    in RESULT_CHAIN.
5924
5925    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5926    The input is 4 vectors each containing 8 elements.  We assign a number to
5927    each element, the input sequence is:
5928
5929    1st vec:   0  1  2  3  4  5  6  7
5930    2nd vec:   8  9 10 11 12 13 14 15
5931    3rd vec:  16 17 18 19 20 21 22 23
5932    4th vec:  24 25 26 27 28 29 30 31
5933
5934    The output sequence should be:
5935
5936    1st vec:  0  8 16 24  1  9 17 25
5937    2nd vec:  2 10 18 26  3 11 19 27
5938    3rd vec:  4 12 20 28  5 13 21 30
5939    4th vec:  6 14 22 30  7 15 23 31
5940
5941    i.e., we interleave the contents of the four vectors in their order.
5942
5943    We use interleave_high/low instructions to create such output.  The input of
5944    each interleave_high/low operation is two vectors:
5945    1st vec    2nd vec
5946    0 1 2 3    4 5 6 7
5947    the even elements of the result vector are obtained left-to-right from the
5948    high/low elements of the first vector.  The odd elements of the result are
5949    obtained left-to-right from the high/low elements of the second vector.
5950    The output of interleave_high will be:   0 4 1 5
5951    and of interleave_low:                   2 6 3 7
5952
5953
5954    The permutation is done in log LENGTH stages.  In each stage interleave_high
5955    and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5956    where the first argument is taken from the first half of DR_CHAIN and the
5957    second argument from it's second half.
5958    In our example,
5959
5960    I1: interleave_high (1st vec, 3rd vec)
5961    I2: interleave_low (1st vec, 3rd vec)
5962    I3: interleave_high (2nd vec, 4th vec)
5963    I4: interleave_low (2nd vec, 4th vec)
5964
5965    The output for the first stage is:
5966
5967    I1:  0 16  1 17  2 18  3 19
5968    I2:  4 20  5 21  6 22  7 23
5969    I3:  8 24  9 25 10 26 11 27
5970    I4: 12 28 13 29 14 30 15 31
5971
5972    The output of the second stage, i.e. the final result is:
5973
5974    I1:  0  8 16 24  1  9 17 25
5975    I2:  2 10 18 26  3 11 19 27
5976    I3:  4 12 20 28  5 13 21 30
5977    I4:  6 14 22 30  7 15 23 31.  */
5978
5979 void
5980 vect_permute_store_chain (vec_info *vinfo, vec<tree> &dr_chain,
5981                           unsigned int length,
5982                           stmt_vec_info stmt_info,
5983                           gimple_stmt_iterator *gsi,
5984                           vec<tree> *result_chain)
5985 {
5986   tree vect1, vect2, high, low;
5987   gimple *perm_stmt;
5988   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5989   tree perm_mask_low, perm_mask_high;
5990   tree data_ref;
5991   tree perm3_mask_low, perm3_mask_high;
5992   unsigned int i, j, n, log_length = exact_log2 (length);
5993
5994   result_chain->quick_grow (length);
5995   memcpy (result_chain->address (), dr_chain.address (),
5996           length * sizeof (tree));
5997
5998   if (length == 3)
5999     {
6000       /* vect_grouped_store_supported ensures that this is constant.  */
6001       unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6002       unsigned int j0 = 0, j1 = 0, j2 = 0;
6003
6004       vec_perm_builder sel (nelt, nelt, 1);
6005       sel.quick_grow (nelt);
6006       vec_perm_indices indices;
6007       for (j = 0; j < 3; j++)
6008         {
6009           int nelt0 = ((3 - j) * nelt) % 3;
6010           int nelt1 = ((3 - j) * nelt + 1) % 3;
6011           int nelt2 = ((3 - j) * nelt + 2) % 3;
6012
6013           for (i = 0; i < nelt; i++)
6014             {
6015               if (3 * i + nelt0 < nelt)
6016                 sel[3 * i + nelt0] = j0++;
6017               if (3 * i + nelt1 < nelt)
6018                 sel[3 * i + nelt1] = nelt + j1++;
6019               if (3 * i + nelt2 < nelt)
6020                 sel[3 * i + nelt2] = 0;
6021             }
6022           indices.new_vector (sel, 2, nelt);
6023           perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
6024
6025           for (i = 0; i < nelt; i++)
6026             {
6027               if (3 * i + nelt0 < nelt)
6028                 sel[3 * i + nelt0] = 3 * i + nelt0;
6029               if (3 * i + nelt1 < nelt)
6030                 sel[3 * i + nelt1] = 3 * i + nelt1;
6031               if (3 * i + nelt2 < nelt)
6032                 sel[3 * i + nelt2] = nelt + j2++;
6033             }
6034           indices.new_vector (sel, 2, nelt);
6035           perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
6036
6037           vect1 = dr_chain[0];
6038           vect2 = dr_chain[1];
6039
6040           /* Create interleaving stmt:
6041              low = VEC_PERM_EXPR <vect1, vect2,
6042                                   {j, nelt, *, j + 1, nelt + j + 1, *,
6043                                    j + 2, nelt + j + 2, *, ...}>  */
6044           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
6045           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
6046                                            vect2, perm3_mask_low);
6047           vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6048
6049           vect1 = data_ref;
6050           vect2 = dr_chain[2];
6051           /* Create interleaving stmt:
6052              low = VEC_PERM_EXPR <vect1, vect2,
6053                                   {0, 1, nelt + j, 3, 4, nelt + j + 1,
6054                                    6, 7, nelt + j + 2, ...}>  */
6055           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
6056           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
6057                                            vect2, perm3_mask_high);
6058           vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6059           (*result_chain)[j] = data_ref;
6060         }
6061     }
6062   else
6063     {
6064       /* If length is not equal to 3 then only power of 2 is supported.  */
6065       gcc_assert (pow2p_hwi (length));
6066
6067       /* The encoding has 2 interleaved stepped patterns.  */
6068       poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
6069       vec_perm_builder sel (nelt, 2, 3);
6070       sel.quick_grow (6);
6071       for (i = 0; i < 3; i++)
6072         {
6073           sel[i * 2] = i;
6074           sel[i * 2 + 1] = i + nelt;
6075         }
6076         vec_perm_indices indices (sel, 2, nelt);
6077         perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
6078
6079         for (i = 0; i < 6; i++)
6080           sel[i] += exact_div (nelt, 2);
6081         indices.new_vector (sel, 2, nelt);
6082         perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
6083
6084         for (i = 0, n = log_length; i < n; i++)
6085           {
6086             for (j = 0; j < length/2; j++)
6087               {
6088                 vect1 = dr_chain[j];
6089                 vect2 = dr_chain[j+length/2];
6090
6091                 /* Create interleaving stmt:
6092                    high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
6093                                                         ...}>  */
6094                 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
6095                 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
6096                                                  vect2, perm_mask_high);
6097                 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6098                 (*result_chain)[2*j] = high;
6099
6100                 /* Create interleaving stmt:
6101                    low = VEC_PERM_EXPR <vect1, vect2,
6102                                         {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
6103                                          ...}>  */
6104                 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
6105                 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
6106                                                  vect2, perm_mask_low);
6107                 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6108                 (*result_chain)[2*j+1] = low;
6109               }
6110             memcpy (dr_chain.address (), result_chain->address (),
6111                     length * sizeof (tree));
6112           }
6113     }
6114 }
6115
6116 /* Function vect_setup_realignment
6117
6118    This function is called when vectorizing an unaligned load using
6119    the dr_explicit_realign[_optimized] scheme.
6120    This function generates the following code at the loop prolog:
6121
6122       p = initial_addr;
6123    x  msq_init = *(floor(p));   # prolog load
6124       realignment_token = call target_builtin;
6125     loop:
6126    x  msq = phi (msq_init, ---)
6127
6128    The stmts marked with x are generated only for the case of
6129    dr_explicit_realign_optimized.
6130
6131    The code above sets up a new (vector) pointer, pointing to the first
6132    location accessed by STMT_INFO, and a "floor-aligned" load using that
6133    pointer.  It also generates code to compute the "realignment-token"
6134    (if the relevant target hook was defined), and creates a phi-node at the
6135    loop-header bb whose arguments are the result of the prolog-load (created
6136    by this function) and the result of a load that takes place in the loop
6137    (to be created by the caller to this function).
6138
6139    For the case of dr_explicit_realign_optimized:
6140    The caller to this function uses the phi-result (msq) to create the
6141    realignment code inside the loop, and sets up the missing phi argument,
6142    as follows:
6143     loop:
6144       msq = phi (msq_init, lsq)
6145       lsq = *(floor(p'));        # load in loop
6146       result = realign_load (msq, lsq, realignment_token);
6147
6148    For the case of dr_explicit_realign:
6149     loop:
6150       msq = *(floor(p));        # load in loop
6151       p' = p + (VS-1);
6152       lsq = *(floor(p'));       # load in loop
6153       result = realign_load (msq, lsq, realignment_token);
6154
6155    Input:
6156    STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
6157                a memory location that may be unaligned.
6158    BSI - place where new code is to be inserted.
6159    ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
6160                               is used.
6161
6162    Output:
6163    REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
6164                        target hook, if defined.
6165    Return value - the result of the loop-header phi node.  */
6166
6167 tree
6168 vect_setup_realignment (vec_info *vinfo, stmt_vec_info stmt_info,
6169                         gimple_stmt_iterator *gsi, tree *realignment_token,
6170                         enum dr_alignment_support alignment_support_scheme,
6171                         tree init_addr,
6172                         class loop **at_loop)
6173 {
6174   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6175   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6176   dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
6177   struct data_reference *dr = dr_info->dr;
6178   class loop *loop = NULL;
6179   edge pe = NULL;
6180   tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
6181   tree vec_dest;
6182   gimple *inc;
6183   tree ptr;
6184   tree data_ref;
6185   basic_block new_bb;
6186   tree msq_init = NULL_TREE;
6187   tree new_temp;
6188   gphi *phi_stmt;
6189   tree msq = NULL_TREE;
6190   gimple_seq stmts = NULL;
6191   bool compute_in_loop = false;
6192   bool nested_in_vect_loop = false;
6193   class loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
6194   class loop *loop_for_initial_load = NULL;
6195
6196   if (loop_vinfo)
6197     {
6198       loop = LOOP_VINFO_LOOP (loop_vinfo);
6199       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
6200     }
6201
6202   gcc_assert (alignment_support_scheme == dr_explicit_realign
6203               || alignment_support_scheme == dr_explicit_realign_optimized);
6204
6205   /* We need to generate three things:
6206      1. the misalignment computation
6207      2. the extra vector load (for the optimized realignment scheme).
6208      3. the phi node for the two vectors from which the realignment is
6209       done (for the optimized realignment scheme).  */
6210
6211   /* 1. Determine where to generate the misalignment computation.
6212
6213      If INIT_ADDR is NULL_TREE, this indicates that the misalignment
6214      calculation will be generated by this function, outside the loop (in the
6215      preheader).  Otherwise, INIT_ADDR had already been computed for us by the
6216      caller, inside the loop.
6217
6218      Background: If the misalignment remains fixed throughout the iterations of
6219      the loop, then both realignment schemes are applicable, and also the
6220      misalignment computation can be done outside LOOP.  This is because we are
6221      vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
6222      are a multiple of VS (the Vector Size), and therefore the misalignment in
6223      different vectorized LOOP iterations is always the same.
6224      The problem arises only if the memory access is in an inner-loop nested
6225      inside LOOP, which is now being vectorized using outer-loop vectorization.
6226      This is the only case when the misalignment of the memory access may not
6227      remain fixed throughout the iterations of the inner-loop (as explained in
6228      detail in vect_supportable_dr_alignment).  In this case, not only is the
6229      optimized realignment scheme not applicable, but also the misalignment
6230      computation (and generation of the realignment token that is passed to
6231      REALIGN_LOAD) have to be done inside the loop.
6232
6233      In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
6234      or not, which in turn determines if the misalignment is computed inside
6235      the inner-loop, or outside LOOP.  */
6236
6237   if (init_addr != NULL_TREE || !loop_vinfo)
6238     {
6239       compute_in_loop = true;
6240       gcc_assert (alignment_support_scheme == dr_explicit_realign);
6241     }
6242
6243
6244   /* 2. Determine where to generate the extra vector load.
6245
6246      For the optimized realignment scheme, instead of generating two vector
6247      loads in each iteration, we generate a single extra vector load in the
6248      preheader of the loop, and in each iteration reuse the result of the
6249      vector load from the previous iteration.  In case the memory access is in
6250      an inner-loop nested inside LOOP, which is now being vectorized using
6251      outer-loop vectorization, we need to determine whether this initial vector
6252      load should be generated at the preheader of the inner-loop, or can be
6253      generated at the preheader of LOOP.  If the memory access has no evolution
6254      in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
6255      to be generated inside LOOP (in the preheader of the inner-loop).  */
6256
6257   if (nested_in_vect_loop)
6258     {
6259       tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
6260       bool invariant_in_outerloop =
6261             (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
6262       loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
6263     }
6264   else
6265     loop_for_initial_load = loop;
6266   if (at_loop)
6267     *at_loop = loop_for_initial_load;
6268
6269   tree vuse = NULL_TREE;
6270   if (loop_for_initial_load)
6271     {
6272       pe = loop_preheader_edge (loop_for_initial_load);
6273       if (gphi *vphi = get_virtual_phi (loop_for_initial_load->header))
6274         vuse = PHI_ARG_DEF_FROM_EDGE (vphi, pe);
6275     }
6276   if (!vuse)
6277     vuse = gimple_vuse (gsi_stmt (*gsi));
6278
6279   /* 3. For the case of the optimized realignment, create the first vector
6280       load at the loop preheader.  */
6281
6282   if (alignment_support_scheme == dr_explicit_realign_optimized)
6283     {
6284       /* Create msq_init = *(floor(p1)) in the loop preheader  */
6285       gassign *new_stmt;
6286
6287       gcc_assert (!compute_in_loop);
6288       vec_dest = vect_create_destination_var (scalar_dest, vectype);
6289       ptr = vect_create_data_ref_ptr (vinfo, stmt_info, vectype,
6290                                       loop_for_initial_load, NULL_TREE,
6291                                       &init_addr, NULL, &inc, true);
6292       if (TREE_CODE (ptr) == SSA_NAME)
6293         new_temp = copy_ssa_name (ptr);
6294       else
6295         new_temp = make_ssa_name (TREE_TYPE (ptr));
6296       poly_uint64 align = DR_TARGET_ALIGNMENT (dr_info);
6297       tree type = TREE_TYPE (ptr);
6298       new_stmt = gimple_build_assign
6299                    (new_temp, BIT_AND_EXPR, ptr,
6300                     fold_build2 (MINUS_EXPR, type,
6301                                  build_int_cst (type, 0),
6302                                  build_int_cst (type, align)));
6303       new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
6304       gcc_assert (!new_bb);
6305       data_ref
6306         = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
6307                   build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
6308       vect_copy_ref_info (data_ref, DR_REF (dr));
6309       new_stmt = gimple_build_assign (vec_dest, data_ref);
6310       new_temp = make_ssa_name (vec_dest, new_stmt);
6311       gimple_assign_set_lhs (new_stmt, new_temp);
6312       gimple_set_vuse (new_stmt, vuse);
6313       if (pe)
6314         {
6315           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
6316           gcc_assert (!new_bb);
6317         }
6318       else
6319          gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6320
6321       msq_init = gimple_assign_lhs (new_stmt);
6322     }
6323
6324   /* 4. Create realignment token using a target builtin, if available.
6325       It is done either inside the containing loop, or before LOOP (as
6326       determined above).  */
6327
6328   if (targetm.vectorize.builtin_mask_for_load)
6329     {
6330       gcall *new_stmt;
6331       tree builtin_decl;
6332
6333       /* Compute INIT_ADDR - the initial addressed accessed by this memref.  */
6334       if (!init_addr)
6335         {
6336           /* Generate the INIT_ADDR computation outside LOOP.  */
6337           init_addr = vect_create_addr_base_for_vector_ref (vinfo,
6338                                                             stmt_info, &stmts,
6339                                                             NULL_TREE);
6340           if (loop)
6341             {
6342               pe = loop_preheader_edge (loop);
6343               new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6344               gcc_assert (!new_bb);
6345             }
6346           else
6347              gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
6348         }
6349
6350       builtin_decl = targetm.vectorize.builtin_mask_for_load ();
6351       new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
6352       vec_dest =
6353         vect_create_destination_var (scalar_dest,
6354                                      gimple_call_return_type (new_stmt));
6355       new_temp = make_ssa_name (vec_dest, new_stmt);
6356       gimple_call_set_lhs (new_stmt, new_temp);
6357
6358       if (compute_in_loop)
6359         gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6360       else
6361         {
6362           /* Generate the misalignment computation outside LOOP.  */
6363           pe = loop_preheader_edge (loop);
6364           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
6365           gcc_assert (!new_bb);
6366         }
6367
6368       *realignment_token = gimple_call_lhs (new_stmt);
6369
6370       /* The result of the CALL_EXPR to this builtin is determined from
6371          the value of the parameter and no global variables are touched
6372          which makes the builtin a "const" function.  Requiring the
6373          builtin to have the "const" attribute makes it unnecessary
6374          to call mark_call_clobbered.  */
6375       gcc_assert (TREE_READONLY (builtin_decl));
6376     }
6377
6378   if (alignment_support_scheme == dr_explicit_realign)
6379     return msq;
6380
6381   gcc_assert (!compute_in_loop);
6382   gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
6383
6384
6385   /* 5. Create msq = phi <msq_init, lsq> in loop  */
6386
6387   pe = loop_preheader_edge (containing_loop);
6388   vec_dest = vect_create_destination_var (scalar_dest, vectype);
6389   msq = make_ssa_name (vec_dest);
6390   phi_stmt = create_phi_node (msq, containing_loop->header);
6391   add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
6392
6393   return msq;
6394 }
6395
6396
6397 /* Function vect_grouped_load_supported.
6398
6399    COUNT is the size of the load group (the number of statements plus the
6400    number of gaps).  SINGLE_ELEMENT_P is true if there is actually
6401    only one statement, with a gap of COUNT - 1.
6402
6403    Returns true if a suitable permute exists.  */
6404
6405 bool
6406 vect_grouped_load_supported (tree vectype, bool single_element_p,
6407                              unsigned HOST_WIDE_INT count)
6408 {
6409   machine_mode mode = TYPE_MODE (vectype);
6410
6411   /* If this is single-element interleaving with an element distance
6412      that leaves unused vector loads around punt - we at least create
6413      very sub-optimal code in that case (and blow up memory,
6414      see PR65518).  */
6415   if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
6416     {
6417       if (dump_enabled_p ())
6418         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6419                          "single-element interleaving not supported "
6420                          "for not adjacent vector loads\n");
6421       return false;
6422     }
6423
6424   /* vect_permute_load_chain requires the group size to be equal to 3 or
6425      be a power of two.  */
6426   if (count != 3 && exact_log2 (count) == -1)
6427     {
6428       if (dump_enabled_p ())
6429         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6430                          "the size of the group of accesses"
6431                          " is not a power of 2 or not equal to 3\n");
6432       return false;
6433     }
6434
6435   /* Check that the permutation is supported.  */
6436   if (VECTOR_MODE_P (mode))
6437     {
6438       unsigned int i, j;
6439       if (count == 3)
6440         {
6441           unsigned int nelt;
6442           if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
6443             {
6444               if (dump_enabled_p ())
6445                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6446                                  "cannot handle groups of 3 loads for"
6447                                  " variable-length vectors\n");
6448               return false;
6449             }
6450
6451           vec_perm_builder sel (nelt, nelt, 1);
6452           sel.quick_grow (nelt);
6453           vec_perm_indices indices;
6454           unsigned int k;
6455           for (k = 0; k < 3; k++)
6456             {
6457               for (i = 0; i < nelt; i++)
6458                 if (3 * i + k < 2 * nelt)
6459                   sel[i] = 3 * i + k;
6460                 else
6461                   sel[i] = 0;
6462               indices.new_vector (sel, 2, nelt);
6463               if (!can_vec_perm_const_p (mode, mode, indices))
6464                 {
6465                   if (dump_enabled_p ())
6466                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6467                                      "shuffle of 3 loads is not supported by"
6468                                      " target\n");
6469                   return false;
6470                 }
6471               for (i = 0, j = 0; i < nelt; i++)
6472                 if (3 * i + k < 2 * nelt)
6473                   sel[i] = i;
6474                 else
6475                   sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
6476               indices.new_vector (sel, 2, nelt);
6477               if (!can_vec_perm_const_p (mode, mode, indices))
6478                 {
6479                   if (dump_enabled_p ())
6480                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6481                                      "shuffle of 3 loads is not supported by"
6482                                      " target\n");
6483                   return false;
6484                 }
6485             }
6486           return true;
6487         }
6488       else
6489         {
6490           /* If length is not equal to 3 then only power of 2 is supported.  */
6491           gcc_assert (pow2p_hwi (count));
6492           poly_uint64 nelt = GET_MODE_NUNITS (mode);
6493
6494           /* The encoding has a single stepped pattern.  */
6495           vec_perm_builder sel (nelt, 1, 3);
6496           sel.quick_grow (3);
6497           for (i = 0; i < 3; i++)
6498             sel[i] = i * 2;
6499           vec_perm_indices indices (sel, 2, nelt);
6500           if (can_vec_perm_const_p (mode, mode, indices))
6501             {
6502               for (i = 0; i < 3; i++)
6503                 sel[i] = i * 2 + 1;
6504               indices.new_vector (sel, 2, nelt);
6505               if (can_vec_perm_const_p (mode, mode, indices))
6506                 return true;
6507             }
6508         }
6509     }
6510
6511   if (dump_enabled_p ())
6512     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6513                      "extract even/odd not supported by target\n");
6514   return false;
6515 }
6516
6517 /* Return FN if vec_{masked_,mask_len_}load_lanes is available for COUNT vectors
6518    of type VECTYPE.  MASKED_P says whether the masked form is needed.
6519    If it is available and ELSVALS is nonzero store the possible else values
6520    in the vector it points to.  */
6521
6522 internal_fn
6523 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
6524                            bool masked_p, vec<int> *elsvals)
6525 {
6526   if (vect_lanes_optab_supported_p ("vec_mask_len_load_lanes",
6527                                     vec_mask_len_load_lanes_optab, vectype,
6528                                     count, elsvals))
6529     return IFN_MASK_LEN_LOAD_LANES;
6530   else if (masked_p)
6531     {
6532       if (vect_lanes_optab_supported_p ("vec_mask_load_lanes",
6533                                         vec_mask_load_lanes_optab, vectype,
6534                                         count, elsvals))
6535         return IFN_MASK_LOAD_LANES;
6536     }
6537   else
6538     {
6539       if (vect_lanes_optab_supported_p ("vec_load_lanes", vec_load_lanes_optab,
6540                                         vectype, count, elsvals))
6541         return IFN_LOAD_LANES;
6542     }
6543   return IFN_LAST;
6544 }
6545
6546 /* Function vect_permute_load_chain.
6547
6548    Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
6549    a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
6550    the input data correctly.  Return the final references for loads in
6551    RESULT_CHAIN.
6552
6553    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
6554    The input is 4 vectors each containing 8 elements. We assign a number to each
6555    element, the input sequence is:
6556
6557    1st vec:   0  1  2  3  4  5  6  7
6558    2nd vec:   8  9 10 11 12 13 14 15
6559    3rd vec:  16 17 18 19 20 21 22 23
6560    4th vec:  24 25 26 27 28 29 30 31
6561
6562    The output sequence should be:
6563
6564    1st vec:  0 4  8 12 16 20 24 28
6565    2nd vec:  1 5  9 13 17 21 25 29
6566    3rd vec:  2 6 10 14 18 22 26 30
6567    4th vec:  3 7 11 15 19 23 27 31
6568
6569    i.e., the first output vector should contain the first elements of each
6570    interleaving group, etc.
6571
6572    We use extract_even/odd instructions to create such output.  The input of
6573    each extract_even/odd operation is two vectors
6574    1st vec    2nd vec
6575    0 1 2 3    4 5 6 7
6576
6577    and the output is the vector of extracted even/odd elements.  The output of
6578    extract_even will be:   0 2 4 6
6579    and of extract_odd:     1 3 5 7
6580
6581
6582    The permutation is done in log LENGTH stages.  In each stage extract_even
6583    and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
6584    their order.  In our example,
6585
6586    E1: extract_even (1st vec, 2nd vec)
6587    E2: extract_odd (1st vec, 2nd vec)
6588    E3: extract_even (3rd vec, 4th vec)
6589    E4: extract_odd (3rd vec, 4th vec)
6590
6591    The output for the first stage will be:
6592
6593    E1:  0  2  4  6  8 10 12 14
6594    E2:  1  3  5  7  9 11 13 15
6595    E3: 16 18 20 22 24 26 28 30
6596    E4: 17 19 21 23 25 27 29 31
6597
6598    In order to proceed and create the correct sequence for the next stage (or
6599    for the correct output, if the second stage is the last one, as in our
6600    example), we first put the output of extract_even operation and then the
6601    output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
6602    The input for the second stage is:
6603
6604    1st vec (E1):  0  2  4  6  8 10 12 14
6605    2nd vec (E3): 16 18 20 22 24 26 28 30
6606    3rd vec (E2):  1  3  5  7  9 11 13 15
6607    4th vec (E4): 17 19 21 23 25 27 29 31
6608
6609    The output of the second stage:
6610
6611    E1: 0 4  8 12 16 20 24 28
6612    E2: 2 6 10 14 18 22 26 30
6613    E3: 1 5  9 13 17 21 25 29
6614    E4: 3 7 11 15 19 23 27 31
6615
6616    And RESULT_CHAIN after reordering:
6617
6618    1st vec (E1):  0 4  8 12 16 20 24 28
6619    2nd vec (E3):  1 5  9 13 17 21 25 29
6620    3rd vec (E2):  2 6 10 14 18 22 26 30
6621    4th vec (E4):  3 7 11 15 19 23 27 31.  */
6622
6623 static void
6624 vect_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6625                          unsigned int length,
6626                          stmt_vec_info stmt_info,
6627                          gimple_stmt_iterator *gsi,
6628                          vec<tree> *result_chain)
6629 {
6630   tree data_ref, first_vect, second_vect;
6631   tree perm_mask_even, perm_mask_odd;
6632   tree perm3_mask_low, perm3_mask_high;
6633   gimple *perm_stmt;
6634   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6635   unsigned int i, j, log_length = exact_log2 (length);
6636
6637   result_chain->quick_grow (length);
6638   memcpy (result_chain->address (), dr_chain.address (),
6639           length * sizeof (tree));
6640
6641   if (length == 3)
6642     {
6643       /* vect_grouped_load_supported ensures that this is constant.  */
6644       unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6645       unsigned int k;
6646
6647       vec_perm_builder sel (nelt, nelt, 1);
6648       sel.quick_grow (nelt);
6649       vec_perm_indices indices;
6650       for (k = 0; k < 3; k++)
6651         {
6652           for (i = 0; i < nelt; i++)
6653             if (3 * i + k < 2 * nelt)
6654               sel[i] = 3 * i + k;
6655             else
6656               sel[i] = 0;
6657           indices.new_vector (sel, 2, nelt);
6658           perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
6659
6660           for (i = 0, j = 0; i < nelt; i++)
6661             if (3 * i + k < 2 * nelt)
6662               sel[i] = i;
6663             else
6664               sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
6665           indices.new_vector (sel, 2, nelt);
6666           perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
6667
6668           first_vect = dr_chain[0];
6669           second_vect = dr_chain[1];
6670
6671           /* Create interleaving stmt (low part of):
6672              low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
6673                                                              ...}>  */
6674           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
6675           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6676                                            second_vect, perm3_mask_low);
6677           vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6678
6679           /* Create interleaving stmt (high part of):
6680              high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
6681                                                               ...}>  */
6682           first_vect = data_ref;
6683           second_vect = dr_chain[2];
6684           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
6685           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6686                                            second_vect, perm3_mask_high);
6687           vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6688           (*result_chain)[k] = data_ref;
6689         }
6690     }
6691   else
6692     {
6693       /* If length is not equal to 3 then only power of 2 is supported.  */
6694       gcc_assert (pow2p_hwi (length));
6695
6696       /* The encoding has a single stepped pattern.  */
6697       poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
6698       vec_perm_builder sel (nelt, 1, 3);
6699       sel.quick_grow (3);
6700       for (i = 0; i < 3; ++i)
6701         sel[i] = i * 2;
6702       vec_perm_indices indices (sel, 2, nelt);
6703       perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
6704
6705       for (i = 0; i < 3; ++i)
6706         sel[i] = i * 2 + 1;
6707       indices.new_vector (sel, 2, nelt);
6708       perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
6709
6710       for (i = 0; i < log_length; i++)
6711         {
6712           for (j = 0; j < length; j += 2)
6713             {
6714               first_vect = dr_chain[j];
6715               second_vect = dr_chain[j+1];
6716
6717               /* data_ref = permute_even (first_data_ref, second_data_ref);  */
6718               data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
6719               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6720                                                first_vect, second_vect,
6721                                                perm_mask_even);
6722               vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6723               (*result_chain)[j/2] = data_ref;
6724
6725               /* data_ref = permute_odd (first_data_ref, second_data_ref);  */
6726               data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
6727               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6728                                                first_vect, second_vect,
6729                                                perm_mask_odd);
6730               vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6731               (*result_chain)[j/2+length/2] = data_ref;
6732             }
6733           memcpy (dr_chain.address (), result_chain->address (),
6734                   length * sizeof (tree));
6735         }
6736     }
6737 }
6738
6739 /* Function vect_shift_permute_load_chain.
6740
6741    Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
6742    sequence of stmts to reorder the input data accordingly.
6743    Return the final references for loads in RESULT_CHAIN.
6744    Return true if successed, false otherwise.
6745
6746    E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
6747    The input is 3 vectors each containing 8 elements.  We assign a
6748    number to each element, the input sequence is:
6749
6750    1st vec:   0  1  2  3  4  5  6  7
6751    2nd vec:   8  9 10 11 12 13 14 15
6752    3rd vec:  16 17 18 19 20 21 22 23
6753
6754    The output sequence should be:
6755
6756    1st vec:  0 3 6  9 12 15 18 21
6757    2nd vec:  1 4 7 10 13 16 19 22
6758    3rd vec:  2 5 8 11 14 17 20 23
6759
6760    We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6761
6762    First we shuffle all 3 vectors to get correct elements order:
6763
6764    1st vec:  ( 0  3  6) ( 1  4  7) ( 2  5)
6765    2nd vec:  ( 8 11 14) ( 9 12 15) (10 13)
6766    3rd vec:  (16 19 22) (17 20 23) (18 21)
6767
6768    Next we unite and shift vector 3 times:
6769
6770    1st step:
6771      shift right by 6 the concatenation of:
6772      "1st vec" and  "2nd vec"
6773        ( 0  3  6) ( 1  4  7) |( 2  5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6774      "2nd vec" and  "3rd vec"
6775        ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6776      "3rd vec" and  "1st vec"
6777        (16 19 22) (17 20 23) |(18 21) _ ( 0  3  6) ( 1  4  7)| ( 2  5)
6778                              | New vectors                   |
6779
6780      So that now new vectors are:
6781
6782      1st vec:  ( 2  5) ( 8 11 14) ( 9 12 15)
6783      2nd vec:  (10 13) (16 19 22) (17 20 23)
6784      3rd vec:  (18 21) ( 0  3  6) ( 1  4  7)
6785
6786    2nd step:
6787      shift right by 5 the concatenation of:
6788      "1st vec" and  "3rd vec"
6789        ( 2  5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0  3  6)| ( 1  4  7)
6790      "2nd vec" and  "1st vec"
6791        (10 13) (16 19 22) |(17 20 23) _ ( 2  5) ( 8 11 14)| ( 9 12 15)
6792      "3rd vec" and  "2nd vec"
6793        (18 21) ( 0  3  6) |( 1  4  7) _ (10 13) (16 19 22)| (17 20 23)
6794                           | New vectors                   |
6795
6796      So that now new vectors are:
6797
6798      1st vec:  ( 9 12 15) (18 21) ( 0  3  6)
6799      2nd vec:  (17 20 23) ( 2  5) ( 8 11 14)
6800      3rd vec:  ( 1  4  7) (10 13) (16 19 22) READY
6801
6802    3rd step:
6803      shift right by 5 the concatenation of:
6804      "1st vec" and  "1st vec"
6805        ( 9 12 15) (18 21) |( 0  3  6) _ ( 9 12 15) (18 21)| ( 0  3  6)
6806      shift right by 3 the concatenation of:
6807      "2nd vec" and  "2nd vec"
6808                (17 20 23) |( 2  5) ( 8 11 14) _ (17 20 23)| ( 2  5) ( 8 11 14)
6809                           | New vectors                   |
6810
6811      So that now all vectors are READY:
6812      1st vec:  ( 0  3  6) ( 9 12 15) (18 21)
6813      2nd vec:  ( 2  5) ( 8 11 14) (17 20 23)
6814      3rd vec:  ( 1  4  7) (10 13) (16 19 22)
6815
6816    This algorithm is faster than one in vect_permute_load_chain if:
6817      1.  "shift of a concatination" is faster than general permutation.
6818          This is usually so.
6819      2.  The TARGET machine can't execute vector instructions in parallel.
6820          This is because each step of the algorithm depends on previous.
6821          The algorithm in vect_permute_load_chain is much more parallel.
6822
6823    The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6824 */
6825
6826 static bool
6827 vect_shift_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6828                                unsigned int length,
6829                                stmt_vec_info stmt_info,
6830                                gimple_stmt_iterator *gsi,
6831                                vec<tree> *result_chain)
6832 {
6833   tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6834   tree perm2_mask1, perm2_mask2, perm3_mask;
6835   tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6836   gimple *perm_stmt;
6837
6838   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6839   machine_mode vmode = TYPE_MODE (vectype);
6840   unsigned int i;
6841   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6842
6843   unsigned HOST_WIDE_INT nelt, vf;
6844   if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6845       || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6846     /* Not supported for variable-length vectors.  */
6847     return false;
6848
6849   vec_perm_builder sel (nelt, nelt, 1);
6850   sel.quick_grow (nelt);
6851
6852   result_chain->quick_grow (length);
6853   memcpy (result_chain->address (), dr_chain.address (),
6854           length * sizeof (tree));
6855
6856   if (pow2p_hwi (length) && vf > 4)
6857     {
6858       unsigned int j, log_length = exact_log2 (length);
6859       for (i = 0; i < nelt / 2; ++i)
6860         sel[i] = i * 2;
6861       for (i = 0; i < nelt / 2; ++i)
6862         sel[nelt / 2 + i] = i * 2 + 1;
6863       vec_perm_indices indices (sel, 2, nelt);
6864       if (!can_vec_perm_const_p (vmode, vmode, indices))
6865         {
6866           if (dump_enabled_p ())
6867             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6868                              "shuffle of 2 fields structure is not \
6869                               supported by target\n");
6870           return false;
6871         }
6872       perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6873
6874       for (i = 0; i < nelt / 2; ++i)
6875         sel[i] = i * 2 + 1;
6876       for (i = 0; i < nelt / 2; ++i)
6877         sel[nelt / 2 + i] = i * 2;
6878       indices.new_vector (sel, 2, nelt);
6879       if (!can_vec_perm_const_p (vmode, vmode, indices))
6880         {
6881           if (dump_enabled_p ())
6882             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6883                              "shuffle of 2 fields structure is not \
6884                               supported by target\n");
6885           return false;
6886         }
6887       perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6888
6889       /* Generating permutation constant to shift all elements.
6890          For vector length 8 it is {4 5 6 7 8 9 10 11}.  */
6891       for (i = 0; i < nelt; i++)
6892         sel[i] = nelt / 2 + i;
6893       indices.new_vector (sel, 2, nelt);
6894       if (!can_vec_perm_const_p (vmode, vmode, indices))
6895         {
6896           if (dump_enabled_p ())
6897             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6898                              "shift permutation is not supported by target\n");
6899           return false;
6900         }
6901       shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6902
6903       /* Generating permutation constant to select vector from 2.
6904          For vector length 8 it is {0 1 2 3 12 13 14 15}.  */
6905       for (i = 0; i < nelt / 2; i++)
6906         sel[i] = i;
6907       for (i = nelt / 2; i < nelt; i++)
6908         sel[i] = nelt + i;
6909       indices.new_vector (sel, 2, nelt);
6910       if (!can_vec_perm_const_p (vmode, vmode, indices))
6911         {
6912           if (dump_enabled_p ())
6913             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6914                              "select is not supported by target\n");
6915           return false;
6916         }
6917       select_mask = vect_gen_perm_mask_checked (vectype, indices);
6918
6919       for (i = 0; i < log_length; i++)
6920         {
6921           for (j = 0; j < length; j += 2)
6922             {
6923               first_vect = dr_chain[j];
6924               second_vect = dr_chain[j + 1];
6925
6926               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6927               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6928                                                first_vect, first_vect,
6929                                                perm2_mask1);
6930               vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6931               vect[0] = data_ref;
6932
6933               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6934               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6935                                                second_vect, second_vect,
6936                                                perm2_mask2);
6937               vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6938               vect[1] = data_ref;
6939
6940               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6941               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6942                                                vect[0], vect[1], shift1_mask);
6943               vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6944               (*result_chain)[j/2 + length/2] = data_ref;
6945
6946               data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6947               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6948                                                vect[0], vect[1], select_mask);
6949               vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6950               (*result_chain)[j/2] = data_ref;
6951             }
6952           memcpy (dr_chain.address (), result_chain->address (),
6953                   length * sizeof (tree));
6954         }
6955       return true;
6956     }
6957   if (length == 3 && vf > 2)
6958     {
6959       unsigned int k = 0, l = 0;
6960
6961       /* Generating permutation constant to get all elements in rigth order.
6962          For vector length 8 it is {0 3 6 1 4 7 2 5}.  */
6963       for (i = 0; i < nelt; i++)
6964         {
6965           if (3 * k + (l % 3) >= nelt)
6966             {
6967               k = 0;
6968               l += (3 - (nelt % 3));
6969             }
6970           sel[i] = 3 * k + (l % 3);
6971           k++;
6972         }
6973       vec_perm_indices indices (sel, 2, nelt);
6974       if (!can_vec_perm_const_p (vmode, vmode, indices))
6975         {
6976           if (dump_enabled_p ())
6977             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6978                              "shuffle of 3 fields structure is not \
6979                               supported by target\n");
6980           return false;
6981         }
6982       perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6983
6984       /* Generating permutation constant to shift all elements.
6985          For vector length 8 it is {6 7 8 9 10 11 12 13}.  */
6986       for (i = 0; i < nelt; i++)
6987         sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6988       indices.new_vector (sel, 2, nelt);
6989       if (!can_vec_perm_const_p (vmode, vmode, indices))
6990         {
6991           if (dump_enabled_p ())
6992             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6993                              "shift permutation is not supported by target\n");
6994           return false;
6995         }
6996       shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6997
6998       /* Generating permutation constant to shift all elements.
6999          For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
7000       for (i = 0; i < nelt; i++)
7001         sel[i] = 2 * (nelt / 3) + 1 + i;
7002       indices.new_vector (sel, 2, nelt);
7003       if (!can_vec_perm_const_p (vmode, vmode, indices))
7004         {
7005           if (dump_enabled_p ())
7006             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7007                              "shift permutation is not supported by target\n");
7008           return false;
7009         }
7010       shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
7011
7012       /* Generating permutation constant to shift all elements.
7013          For vector length 8 it is {3 4 5 6 7 8 9 10}.  */
7014       for (i = 0; i < nelt; i++)
7015         sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
7016       indices.new_vector (sel, 2, nelt);
7017       if (!can_vec_perm_const_p (vmode, vmode, indices))
7018         {
7019           if (dump_enabled_p ())
7020             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7021                              "shift permutation is not supported by target\n");
7022           return false;
7023         }
7024       shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
7025
7026       /* Generating permutation constant to shift all elements.
7027          For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
7028       for (i = 0; i < nelt; i++)
7029         sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
7030       indices.new_vector (sel, 2, nelt);
7031       if (!can_vec_perm_const_p (vmode, vmode, indices))
7032         {
7033           if (dump_enabled_p ())
7034             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7035                              "shift permutation is not supported by target\n");
7036           return false;
7037         }
7038       shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
7039
7040       for (k = 0; k < 3; k++)
7041         {
7042           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
7043           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
7044                                            dr_chain[k], dr_chain[k],
7045                                            perm3_mask);
7046           vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
7047           vect[k] = data_ref;
7048         }
7049
7050       for (k = 0; k < 3; k++)
7051         {
7052           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
7053           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
7054                                            vect[k % 3], vect[(k + 1) % 3],
7055                                            shift1_mask);
7056           vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
7057           vect_shift[k] = data_ref;
7058         }
7059
7060       for (k = 0; k < 3; k++)
7061         {
7062           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
7063           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
7064                                            vect_shift[(4 - k) % 3],
7065                                            vect_shift[(3 - k) % 3],
7066                                            shift2_mask);
7067           vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
7068           vect[k] = data_ref;
7069         }
7070
7071       (*result_chain)[3 - (nelt % 3)] = vect[2];
7072
7073       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
7074       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
7075                                        vect[0], shift3_mask);
7076       vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
7077       (*result_chain)[nelt % 3] = data_ref;
7078
7079       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
7080       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
7081                                        vect[1], shift4_mask);
7082       vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
7083       (*result_chain)[0] = data_ref;
7084       return true;
7085     }
7086   return false;
7087 }
7088
7089 /* Function vect_transform_grouped_load.
7090
7091    Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
7092    to perform their permutation and ascribe the result vectorized statements to
7093    the scalar statements.
7094 */
7095
7096 void
7097 vect_transform_grouped_load (vec_info *vinfo, stmt_vec_info stmt_info,
7098                              vec<tree> dr_chain,
7099                              int size, gimple_stmt_iterator *gsi)
7100 {
7101   machine_mode mode;
7102   vec<tree> result_chain = vNULL;
7103
7104   /* DR_CHAIN contains input data-refs that are a part of the interleaving.
7105      RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
7106      vectors, that are ready for vector computation.  */
7107   result_chain.create (size);
7108
7109   /* If reassociation width for vector type is 2 or greater target machine can
7110      execute 2 or more vector instructions in parallel.  Otherwise try to
7111      get chain for loads group using vect_shift_permute_load_chain.  */
7112   mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
7113   if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
7114       || pow2p_hwi (size)
7115       || !vect_shift_permute_load_chain (vinfo, dr_chain, size, stmt_info,
7116                                          gsi, &result_chain))
7117     vect_permute_load_chain (vinfo, dr_chain,
7118                              size, stmt_info, gsi, &result_chain);
7119   vect_record_grouped_load_vectors (vinfo, stmt_info, result_chain);
7120   result_chain.release ();
7121 }
7122
7123 /* RESULT_CHAIN contains the output of a group of grouped loads that were
7124    generated as part of the vectorization of STMT_INFO.  Assign the statement
7125    for each vector to the associated scalar statement.  */
7126
7127 void
7128 vect_record_grouped_load_vectors (vec_info *, stmt_vec_info stmt_info,
7129                                   vec<tree> result_chain)
7130 {
7131   stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
7132   unsigned int i, gap_count;
7133   tree tmp_data_ref;
7134
7135   /* Put a permuted data-ref in the VECTORIZED_STMT field.
7136      Since we scan the chain starting from it's first node, their order
7137      corresponds the order of data-refs in RESULT_CHAIN.  */
7138   stmt_vec_info next_stmt_info = first_stmt_info;
7139   gap_count = 1;
7140   FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
7141     {
7142       if (!next_stmt_info)
7143         break;
7144
7145       /* Skip the gaps.  Loads created for the gaps will be removed by dead
7146        code elimination pass later.  No need to check for the first stmt in
7147        the group, since it always exists.
7148        DR_GROUP_GAP is the number of steps in elements from the previous
7149        access (if there is no gap DR_GROUP_GAP is 1).  We skip loads that
7150        correspond to the gaps.  */
7151       if (next_stmt_info != first_stmt_info
7152           && gap_count < DR_GROUP_GAP (next_stmt_info))
7153         {
7154           gap_count++;
7155           continue;
7156         }
7157
7158       /* ???  The following needs cleanup after the removal of
7159          DR_GROUP_SAME_DR_STMT.  */
7160       if (next_stmt_info)
7161         {
7162           gimple *new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
7163           /* We assume that if VEC_STMT is not NULL, this is a case of multiple
7164              copies, and we put the new vector statement last.  */
7165           STMT_VINFO_VEC_STMTS (next_stmt_info).safe_push (new_stmt);
7166
7167           next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
7168           gap_count = 1;
7169         }
7170     }
7171 }
7172
7173 /* Function vect_force_dr_alignment_p.
7174
7175    Returns whether the alignment of a DECL can be forced to be aligned
7176    on ALIGNMENT bit boundary.  */
7177
7178 bool
7179 vect_can_force_dr_alignment_p (const_tree decl, poly_uint64 alignment)
7180 {
7181   if (!VAR_P (decl))
7182     return false;
7183
7184   if (decl_in_symtab_p (decl)
7185       && !symtab_node::get (decl)->can_increase_alignment_p ())
7186     return false;
7187
7188   if (TREE_STATIC (decl))
7189     return (known_le (alignment,
7190                       (unsigned HOST_WIDE_INT) MAX_OFILE_ALIGNMENT));
7191   else
7192     return (known_le (alignment, (unsigned HOST_WIDE_INT) MAX_STACK_ALIGNMENT));
7193 }
7194
7195 /* Return whether the data reference DR_INFO is supported with respect to its
7196    alignment.
7197    If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
7198    it is aligned, i.e., check if it is possible to vectorize it with different
7199    alignment.  */
7200
7201 enum dr_alignment_support
7202 vect_supportable_dr_alignment (vec_info *vinfo, dr_vec_info *dr_info,
7203                                tree vectype, int misalignment)
7204 {
7205   data_reference *dr = dr_info->dr;
7206   stmt_vec_info stmt_info = dr_info->stmt;
7207   machine_mode mode = TYPE_MODE (vectype);
7208   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7209   class loop *vect_loop = NULL;
7210   bool nested_in_vect_loop = false;
7211
7212   if (misalignment == 0)
7213     return dr_aligned;
7214   else if (dr_info->need_peeling_for_alignment)
7215     return dr_unaligned_unsupported;
7216
7217   /* For now assume all conditional loads/stores support unaligned
7218      access without any special code.  */
7219   if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
7220     if (gimple_call_internal_p (stmt)
7221         && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
7222             || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
7223       return dr_unaligned_supported;
7224
7225   if (loop_vinfo)
7226     {
7227       vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
7228       nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
7229     }
7230
7231   /* Possibly unaligned access.  */
7232
7233   /* We can choose between using the implicit realignment scheme (generating
7234      a misaligned_move stmt) and the explicit realignment scheme (generating
7235      aligned loads with a REALIGN_LOAD).  There are two variants to the
7236      explicit realignment scheme: optimized, and unoptimized.
7237      We can optimize the realignment only if the step between consecutive
7238      vector loads is equal to the vector size.  Since the vector memory
7239      accesses advance in steps of VS (Vector Size) in the vectorized loop, it
7240      is guaranteed that the misalignment amount remains the same throughout the
7241      execution of the vectorized loop.  Therefore, we can create the
7242      "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
7243      at the loop preheader.
7244
7245      However, in the case of outer-loop vectorization, when vectorizing a
7246      memory access in the inner-loop nested within the LOOP that is now being
7247      vectorized, while it is guaranteed that the misalignment of the
7248      vectorized memory access will remain the same in different outer-loop
7249      iterations, it is *not* guaranteed that is will remain the same throughout
7250      the execution of the inner-loop.  This is because the inner-loop advances
7251      with the original scalar step (and not in steps of VS).  If the inner-loop
7252      step happens to be a multiple of VS, then the misalignment remains fixed
7253      and we can use the optimized realignment scheme.  For example:
7254
7255       for (i=0; i<N; i++)
7256         for (j=0; j<M; j++)
7257           s += a[i+j];
7258
7259      When vectorizing the i-loop in the above example, the step between
7260      consecutive vector loads is 1, and so the misalignment does not remain
7261      fixed across the execution of the inner-loop, and the realignment cannot
7262      be optimized (as illustrated in the following pseudo vectorized loop):
7263
7264       for (i=0; i<N; i+=4)
7265         for (j=0; j<M; j++){
7266           vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
7267                          // when j is {0,1,2,3,4,5,6,7,...} respectively.
7268                          // (assuming that we start from an aligned address).
7269           }
7270
7271      We therefore have to use the unoptimized realignment scheme:
7272
7273       for (i=0; i<N; i+=4)
7274           for (j=k; j<M; j+=4)
7275           vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
7276                            // that the misalignment of the initial address is
7277                            // 0).
7278
7279      The loop can then be vectorized as follows:
7280
7281       for (k=0; k<4; k++){
7282         rt = get_realignment_token (&vp[k]);
7283         for (i=0; i<N; i+=4){
7284           v1 = vp[i+k];
7285           for (j=k; j<M; j+=4){
7286             v2 = vp[i+j+VS-1];
7287             va = REALIGN_LOAD <v1,v2,rt>;
7288             vs += va;
7289             v1 = v2;
7290           }
7291         }
7292     } */
7293
7294   if (DR_IS_READ (dr))
7295     {
7296       if (can_implement_p (vec_realign_load_optab, mode)
7297           && (!targetm.vectorize.builtin_mask_for_load
7298               || targetm.vectorize.builtin_mask_for_load ()))
7299         {
7300           /* If we are doing SLP then the accesses need not have the
7301              same alignment, instead it depends on the SLP group size.  */
7302           if (loop_vinfo
7303               && STMT_SLP_TYPE (stmt_info)
7304               && STMT_VINFO_GROUPED_ACCESS (stmt_info)
7305               && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7306                               * (DR_GROUP_SIZE
7307                                    (DR_GROUP_FIRST_ELEMENT (stmt_info))),
7308                               TYPE_VECTOR_SUBPARTS (vectype)))
7309             ;
7310           else if (!loop_vinfo
7311                    || (nested_in_vect_loop
7312                        && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
7313                                     GET_MODE_SIZE (TYPE_MODE (vectype)))))
7314             return dr_explicit_realign;
7315           else
7316             return dr_explicit_realign_optimized;
7317         }
7318     }
7319
7320   bool is_packed = false;
7321   tree type = TREE_TYPE (DR_REF (dr));
7322   if (misalignment == DR_MISALIGNMENT_UNKNOWN)
7323     is_packed = not_size_aligned (DR_REF (dr));
7324   if (targetm.vectorize.support_vector_misalignment (mode, type, misalignment,
7325                                                      is_packed))
7326     return dr_unaligned_supported;
7327
7328   /* Unsupported.  */
7329   return dr_unaligned_unsupported;
7330 }