gcc/gimple-loop-versioning.cc

   1 /* Loop versioning pass.
   2    Copyright (C) 2018-2024 Free Software Foundation, Inc.
   3
   4 This file is part of GCC.
   5
   6 GCC is free software; you can redistribute it and/or modify it
   7 under the terms of the GNU General Public License as published by the
   8 Free Software Foundation; either version 3, or (at your option) any
   9 later version.
  10
  11 GCC is distributed in the hope that it will be useful, but WITHOUT
  12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14 for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with GCC; see the file COPYING3.  If not see
  18 <http://www.gnu.org/licenses/>.  */
  19
  20 #include "config.h"
  21 #define INCLUDE_MEMORY
  22 #include "system.h"
  23 #include "coretypes.h"
  24 #include "backend.h"
  25 #include "tree.h"
  26 #include "gimple.h"
  27 #include "gimple-iterator.h"
  28 #include "tree-pass.h"
  29 #include "gimplify-me.h"
  30 #include "cfgloop.h"
  31 #include "tree-ssa-loop.h"
  32 #include "ssa.h"
  33 #include "tree-scalar-evolution.h"
  34 #include "tree-ssa-loop-ivopts.h"
  35 #include "fold-const.h"
  36 #include "tree-ssa-propagate.h"
  37 #include "tree-inline.h"
  38 #include "domwalk.h"
  39 #include "tree-vectorizer.h"
  40 #include "omp-general.h"
  41 #include "predict.h"
  42 #include "tree-into-ssa.h"
  43 #include "gimple-range.h"
  44 #include "tree-cfg.h"
  45
  46 namespace {
  47
  48 /* This pass looks for loops that could be simplified if certain loop
  49    invariant conditions were true.  It is effectively a form of loop
  50    splitting in which the pass produces the split conditions itself,
  51    instead of using ones that are already present in the IL.
  52
  53    Versioning for when strides are 1
  54    ---------------------------------
  55
  56    At the moment the only thing the pass looks for are memory references
  57    like:
  58
  59      for (auto i : ...)
  60        ...x[i * stride]...
  61
  62    It considers changing such loops to:
  63
  64      if (stride == 1)
  65        for (auto i : ...)    [A]
  66          ...x[i]...
  67      else
  68        for (auto i : ...)    [B]
  69          ...x[i * stride]...
  70
  71    This can have several benefits:
  72
  73    (1) [A] is often easier or cheaper to vectorize than [B].
  74
  75    (2) The scalar code in [A] is simpler than the scalar code in [B]
  76        (if the loops cannot be vectorized or need an epilogue loop).
  77
  78    (3) We might recognize [A] as a pattern, such as a memcpy or memset.
  79
  80    (4) [A] has simpler address evolutions, which can help other passes
  81        like loop interchange.
  82
  83    The optimization is particularly useful for assumed-shape arrays in
  84    Fortran, where the stride of the innermost dimension depends on the
  85    array descriptor but is often equal to 1 in practice.  For example:
  86
  87      subroutine f1(x)
  88        real :: x(:)
  89        x(:) = 100
  90      end subroutine f1
  91
  92    generates the equivalent of:
  93
  94      raw_stride = *x.dim[0].stride;
  95      stride = raw_stride != 0 ? raw_stride : 1;
  96      x_base = *x.data;
  97      ...
  98      tmp1 = stride * S;
  99      tmp2 = tmp1 - stride;
 100      *x_base[tmp2] = 1.0e+2;
 101
 102    but in the common case that stride == 1, the last three statements
 103    simplify to:
 104
 105      tmp3 = S + -1;
 106      *x_base[tmp3] = 1.0e+2;
 107
 108    The optimization is in principle very simple.  The difficult parts are:
 109
 110    (a) deciding which parts of a general address calculation correspond
 111        to the inner dimension of an array, since this usually isn't explicit
 112        in the IL, and for C often isn't even explicit in the source code
 113
 114    (b) estimating when the transformation is worthwhile
 115
 116    Structure
 117    ---------
 118
 119    The pass has four phases:
 120
 121    (1) Walk through the statements looking for and recording potential
 122        versioning opportunities.  Stop if there are none.
 123
 124    (2) Use context-sensitive range information to see whether any versioning
 125        conditions are impossible in practice.  Remove them if so, and stop
 126        if no opportunities remain.
 127
 128        (We do this only after (1) to keep compile time down when no
 129        versioning opportunities exist.)
 130
 131    (3) Apply the cost model.  Decide which versioning opportunities are
 132        worthwhile and at which nesting level they should be applied.
 133
 134    (4) Attempt to version all the loops selected by (3), so that:
 135
 136          for (...)
 137            ...
 138
 139        becomes:
 140
 141          if (!cond)
 142            for (...) // Original loop
 143              ...
 144          else
 145            for (...) // New loop
 146              ...
 147
 148        Use the version condition COND to simplify the new loop.  */
 149
 150 /* Enumerates the likelihood that a particular value indexes the inner
 151    dimension of an array.  */
 152 enum inner_likelihood {
 153   INNER_UNLIKELY,
 154   INNER_DONT_KNOW,
 155   INNER_LIKELY
 156 };
 157
 158 /* Information about one term of an address_info.  */
 159 struct address_term_info
 160 {
 161   /* The value of the term is EXPR * MULTIPLIER.  */
 162   tree expr;
 163   unsigned HOST_WIDE_INT multiplier;
 164
 165   /* The stride applied by EXPR in each iteration of some unrecorded loop,
 166      or null if no stride has been identified.  */
 167   tree stride;
 168
 169   /* Enumerates the likelihood that EXPR indexes the inner dimension
 170      of an array.  */
 171   enum inner_likelihood inner_likelihood;
 172
 173   /* True if STRIDE == 1 is a versioning opportunity when considered
 174      in isolation.  */
 175   bool versioning_opportunity_p;
 176 };
 177
 178 /* Information about an address calculation, and the range of constant
 179    offsets applied to it.  */
 180 class address_info
 181 {
 182 public:
 183   static const unsigned int MAX_TERMS = 8;
 184
 185   /* One statement that calculates the address.  If multiple statements
 186      share the same address, we only record the first.  */
 187   gimple *stmt;
 188
 189   /* The loop containing STMT (cached for convenience).  If multiple
 190      statements share the same address, they all belong to this loop.  */
 191   class loop *loop;
 192
 193   /* A decomposition of the calculation into a sum of terms plus an
 194      optional base.  When BASE is provided, it is never an SSA name.
 195      Once initialization is complete, all members of TERMs are SSA names.  */
 196   tree base;
 197   auto_vec<address_term_info, MAX_TERMS> terms;
 198
 199   /* All bytes accessed from the address fall in the offset range
 200      [MIN_OFFSET, MAX_OFFSET).  */
 201   HOST_WIDE_INT min_offset, max_offset;
 202 };
 203
 204 /* Stores addresses based on their base and terms (ignoring the offsets).  */
 205 struct address_info_hasher : nofree_ptr_hash <address_info>
 206 {
 207   static hashval_t hash (const address_info *);
 208   static bool equal (const address_info *, const address_info *);
 209 };
 210
 211 /* Information about the versioning we'd like to apply to a loop.  */
 212 class loop_info
 213 {
 214 public:
 215   bool worth_versioning_p () const;
 216
 217   /* True if we've decided not to version this loop.  The remaining
 218      fields are meaningless if so.  */
 219   bool rejected_p;
 220
 221   /* True if at least one subloop of this loop benefits from versioning.  */
 222   bool subloops_benefit_p;
 223
 224   /* An estimate of the total number of instructions in the loop,
 225      excluding those in subloops that benefit from versioning.  */
 226   unsigned int num_insns;
 227
 228   /* The outermost loop that can handle all the version checks
 229      described below.  */
 230   class loop *outermost;
 231
 232   /* The first entry in the list of blocks that belong to this loop
 233      (and not to subloops).  m_next_block_in_loop provides the chain
 234      pointers for the list.  */
 235   basic_block block_list;
 236
 237   /* We'd like to version the loop for the case in which these SSA names
 238      (keyed off their SSA_NAME_VERSION) are all equal to 1 at runtime.  */
 239   bitmap_head unity_names;
 240
 241   /* If versioning succeeds, this points the version of the loop that
 242      assumes the version conditions holds.  */
 243   class loop *optimized_loop;
 244 };
 245
 246 /* The main pass structure.  */
 247 class loop_versioning
 248 {
 249 public:
 250   loop_versioning (function *);
 251   ~loop_versioning ();
 252   unsigned int run ();
 253
 254 private:
 255   /* Used to walk the dominator tree to find loop versioning conditions
 256      that are always false.  */
 257   class lv_dom_walker : public dom_walker
 258   {
 259   public:
 260     lv_dom_walker (loop_versioning &);
 261
 262     edge before_dom_children (basic_block) final override;
 263
 264   private:
 265     /* The parent pass.  */
 266     loop_versioning &m_lv;
 267   };
 268
 269   /* Used to simplify statements based on conditions that are established
 270      by the version checks.  */
 271   class name_prop : public substitute_and_fold_engine
 272   {
 273   public:
 274     name_prop (loop_info &li) : m_li (li) {}
 275     tree value_of_expr (tree name, gimple *) final override;
 276
 277   private:
 278     /* Information about the versioning we've performed on the loop.  */
 279     loop_info &m_li;
 280   };
 281
 282   loop_info &get_loop_info (class loop *loop) { return m_loops[loop->num]; }
 283
 284   unsigned int max_insns_for_loop (class loop *);
 285   bool expensive_stmt_p (gimple *);
 286
 287   void version_for_unity (gimple *, tree);
 288   bool acceptable_multiplier_p (tree, unsigned HOST_WIDE_INT,
 289                                 unsigned HOST_WIDE_INT * = 0);
 290   bool acceptable_type_p (tree, unsigned HOST_WIDE_INT *);
 291   bool multiply_term_by (address_term_info &, tree);
 292   inner_likelihood get_inner_likelihood (tree, unsigned HOST_WIDE_INT);
 293   void dump_inner_likelihood (address_info &, address_term_info &);
 294   void analyze_stride (address_info &, address_term_info &,
 295                        tree, class loop *);
 296   bool find_per_loop_multiplication (address_info &, address_term_info &);
 297   bool analyze_term_using_scevs (address_info &, address_term_info &);
 298   void analyze_arbitrary_term (address_info &, address_term_info &);
 299   void analyze_address_fragment (address_info &);
 300   void record_address_fragment (gimple *, unsigned HOST_WIDE_INT,
 301                                 tree, unsigned HOST_WIDE_INT, HOST_WIDE_INT);
 302   void analyze_expr (gimple *, tree);
 303   bool analyze_block (basic_block);
 304   bool analyze_blocks ();
 305
 306   void prune_loop_conditions (class loop *);
 307   bool prune_conditions ();
 308
 309   void merge_loop_info (class loop *, class loop *);
 310   void add_loop_to_queue (class loop *);
 311   bool decide_whether_loop_is_versionable (class loop *);
 312   bool make_versioning_decisions ();
 313
 314   bool version_loop (class loop *);
 315   void implement_versioning_decisions ();
 316
 317   /* The function we're optimizing.  */
 318   function *m_fn;
 319
 320   /* The obstack to use for all pass-specific bitmaps.  */
 321   bitmap_obstack m_bitmap_obstack;
 322
 323   /* An obstack to use for general allocation.  */
 324   obstack m_obstack;
 325
 326   /* The total number of loop version conditions we've found.  */
 327   unsigned int m_num_conditions;
 328
 329   /* Assume that an address fragment of the form i * stride * scale
 330      (for variable stride and constant scale) will not benefit from
 331      versioning for stride == 1 when scale is greater than this value.  */
 332   unsigned HOST_WIDE_INT m_maximum_scale;
 333
 334   /* Information about each loop.  */
 335   auto_vec<loop_info> m_loops;
 336
 337   /* Used to form a linked list of blocks that belong to a loop,
 338      started by loop_info::block_list.  */
 339   auto_vec<basic_block> m_next_block_in_loop;
 340
 341   /* The list of loops that we've decided to version.  */
 342   auto_vec<class loop *> m_loops_to_version;
 343
 344   /* A table of addresses in the current loop, keyed off their values
 345      but not their offsets.  */
 346   hash_table <address_info_hasher> m_address_table;
 347
 348   /* A list of all addresses in M_ADDRESS_TABLE, in a predictable order.  */
 349   auto_vec <address_info *, 32> m_address_list;
 350 };
 351
 352 /* If EXPR is an SSA name and not a default definition, return the
 353    defining statement, otherwise return null.  */
 354
 355 static gimple *
 356 maybe_get_stmt (tree expr)
 357 {
 358   if (TREE_CODE (expr) == SSA_NAME && !SSA_NAME_IS_DEFAULT_DEF (expr))
 359     return SSA_NAME_DEF_STMT (expr);
 360   return NULL;
 361 }
 362
 363 /* Like maybe_get_stmt, but also return null if the defining
 364    statement isn't an assignment.  */
 365
 366 static gassign *
 367 maybe_get_assign (tree expr)
 368 {
 369   return safe_dyn_cast <gassign *> (maybe_get_stmt (expr));
 370 }
 371
 372 /* Return true if this pass should look through a cast of expression FROM
 373    to type TYPE when analyzing pieces of an address.  */
 374
 375 static bool
 376 look_through_cast_p (tree type, tree from)
 377 {
 378   return (INTEGRAL_TYPE_P (TREE_TYPE (from)) == INTEGRAL_TYPE_P (type)
 379           && POINTER_TYPE_P (TREE_TYPE (from)) == POINTER_TYPE_P (type));
 380 }
 381
 382 /* Strip all conversions of integers or pointers from EXPR, regardless
 383    of whether the conversions are nops.  This is useful in the context
 384    of this pass because we're not trying to fold or simulate the
 385    expression; we just want to see how it's structured.  */
 386
 387 static tree
 388 strip_casts (tree expr)
 389 {
 390   const unsigned int MAX_NITERS = 4;
 391
 392   tree type = TREE_TYPE (expr);
 393   while (CONVERT_EXPR_P (expr)
 394          && look_through_cast_p (type, TREE_OPERAND (expr, 0)))
 395     expr = TREE_OPERAND (expr, 0);
 396
 397   for (unsigned int niters = 0; niters < MAX_NITERS; ++niters)
 398     {
 399       gassign *assign = maybe_get_assign (expr);
 400       if (assign
 401           && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign))
 402           && look_through_cast_p (type, gimple_assign_rhs1 (assign)))
 403         expr = gimple_assign_rhs1 (assign);
 404       else
 405         break;
 406     }
 407   return expr;
 408 }
 409
 410 /* Compare two address_term_infos in the same address_info.  */
 411
 412 static int
 413 compare_address_terms (const void *a_uncast, const void *b_uncast)
 414 {
 415   const address_term_info *a = (const address_term_info *) a_uncast;
 416   const address_term_info *b = (const address_term_info *) b_uncast;
 417
 418   if (a->expr != b->expr)
 419     return SSA_NAME_VERSION (a->expr) < SSA_NAME_VERSION (b->expr) ? -1 : 1;
 420
 421   if (a->multiplier != b->multiplier)
 422     return a->multiplier < b->multiplier ? -1 : 1;
 423
 424   return 0;
 425 }
 426
 427 /* Dump ADDRESS using flags FLAGS.  */
 428
 429 static void
 430 dump_address_info (dump_flags_t flags, address_info &address)
 431 {
 432   if (address.base)
 433     dump_printf (flags, "%T + ", address.base);
 434   for (unsigned int i = 0; i < address.terms.length (); ++i)
 435     {
 436       if (i != 0)
 437         dump_printf (flags, " + ");
 438       dump_printf (flags, "%T", address.terms[i].expr);
 439       if (address.terms[i].multiplier != 1)
 440         dump_printf (flags, " * %wd", address.terms[i].multiplier);
 441     }
 442   dump_printf (flags, " + [%wd, %wd]",
 443                address.min_offset, address.max_offset - 1);
 444 }
 445
 446 /* Hash an address_info based on its base and terms.  */
 447
 448 hashval_t
 449 address_info_hasher::hash (const address_info *info)
 450 {
 451   inchash::hash hash;
 452   hash.add_int (info->base ? TREE_CODE (info->base) : 0);
 453   hash.add_int (info->terms.length ());
 454   for (unsigned int i = 0; i < info->terms.length (); ++i)
 455     {
 456       hash.add_int (SSA_NAME_VERSION (info->terms[i].expr));
 457       hash.add_hwi (info->terms[i].multiplier);
 458     }
 459   return hash.end ();
 460 }
 461
 462 /* Return true if two address_infos have equal bases and terms.  Other
 463    properties might be different (such as the statement or constant
 464    offset range).  */
 465
 466 bool
 467 address_info_hasher::equal (const address_info *a, const address_info *b)
 468 {
 469   if (a->base != b->base
 470       && (!a->base || !b->base || !operand_equal_p (a->base, b->base, 0)))
 471     return false;
 472
 473   if (a->terms.length () != b->terms.length ())
 474     return false;
 475
 476   for (unsigned int i = 0; i < a->terms.length (); ++i)
 477     if (a->terms[i].expr != b->terms[i].expr
 478         || a->terms[i].multiplier != b->terms[i].multiplier)
 479       return false;
 480
 481   return true;
 482 }
 483
 484 /* Return true if we want to version the loop, i.e. if we have a
 485    specific reason for doing so and no specific reason not to.  */
 486
 487 bool
 488 loop_info::worth_versioning_p () const
 489 {
 490   return (!rejected_p
 491           && (!bitmap_empty_p (&unity_names) || subloops_benefit_p));
 492 }
 493
 494 loop_versioning::lv_dom_walker::lv_dom_walker (loop_versioning &lv)
 495   : dom_walker (CDI_DOMINATORS), m_lv (lv)
 496 {
 497 }
 498
 499 /* Process BB before processing the blocks it dominates.  */
 500
 501 edge
 502 loop_versioning::lv_dom_walker::before_dom_children (basic_block bb)
 503 {
 504   if (bb == bb->loop_father->header)
 505     m_lv.prune_loop_conditions (bb->loop_father);
 506
 507   return NULL;
 508 }
 509
 510 /* Decide whether to replace VAL with a new value in a versioned loop.
 511    Return the new value if so, otherwise return null.  */
 512
 513 tree
 514 loop_versioning::name_prop::value_of_expr (tree val, gimple *)
 515 {
 516   if (TREE_CODE (val) == SSA_NAME
 517       && bitmap_bit_p (&m_li.unity_names, SSA_NAME_VERSION (val)))
 518     return build_one_cst (TREE_TYPE (val));
 519   return NULL_TREE;
 520 }
 521
 522 /* Initialize the structure to optimize FN.  */
 523
 524 loop_versioning::loop_versioning (function *fn)
 525   : m_fn (fn),
 526     m_num_conditions (0),
 527     m_address_table (31)
 528 {
 529   unsigned m_nloops = number_of_loops (fn);
 530   bitmap_obstack_initialize (&m_bitmap_obstack);
 531   gcc_obstack_init (&m_obstack);
 532
 533   /* Initialize the loop information.  */
 534   m_loops.safe_grow_cleared (m_nloops, true);
 535   for (unsigned int i = 0; i < m_nloops; ++i)
 536     {
 537       m_loops[i].outermost = get_loop (m_fn, 0);
 538       bitmap_initialize (&m_loops[i].unity_names, &m_bitmap_obstack);
 539     }
 540
 541   /* Initialize the list of blocks that belong to each loop.  */
 542   unsigned int nbbs = last_basic_block_for_fn (fn);
 543   m_next_block_in_loop.safe_grow (nbbs, true);
 544   basic_block bb;
 545   FOR_EACH_BB_FN (bb, fn)
 546     {
 547       loop_info &li = get_loop_info (bb->loop_father);
 548       m_next_block_in_loop[bb->index] = li.block_list;
 549       li.block_list = bb;
 550     }
 551
 552   /* MAX_FIXED_MODE_SIZE should be a reasonable maximum scale for
 553      unvectorizable code, since it is the largest size that can be
 554      handled efficiently by scalar code.  omp_max_vf calculates the
 555      maximum number of bytes in a vector, when such a value is relevant
 556      to loop optimization.  */
 557   m_maximum_scale = estimated_poly_value (omp_max_vf ());
 558   m_maximum_scale = MAX (m_maximum_scale, MAX_FIXED_MODE_SIZE);
 559 }
 560
 561 loop_versioning::~loop_versioning ()
 562 {
 563   bitmap_obstack_release (&m_bitmap_obstack);
 564   obstack_free (&m_obstack, NULL);
 565 }
 566
 567 /* Return the maximum number of instructions allowed in LOOP before
 568    it becomes too big for versioning.
 569
 570    There are separate limits for inner and outer loops.  The limit for
 571    inner loops applies only to loops that benefit directly from versioning.
 572    The limit for outer loops applies to all code in the outer loop and
 573    its subloops that *doesn't* benefit directly from versioning; such code
 574    would be "taken along for the ride".  The idea is that if the cost of
 575    the latter is small, it is better to version outer loops rather than
 576    inner loops, both to reduce the number of repeated checks and to enable
 577    more of the loop nest to be optimized as a natural nest (e.g. by loop
 578    interchange or outer-loop vectorization).  */
 579
 580 unsigned int
 581 loop_versioning::max_insns_for_loop (class loop *loop)
 582 {
 583   return (loop->inner
 584           ? param_loop_versioning_max_outer_insns
 585           : param_loop_versioning_max_inner_insns);
 586 }
 587
 588 /* Return true if for cost reasons we should avoid versioning any loop
 589    that contains STMT.
 590
 591    Note that we don't need to check whether versioning is invalid for
 592    correctness reasons, since the versioning process does that for us.
 593    The conditions involved are too rare to be worth duplicating here.  */
 594
 595 bool
 596 loop_versioning::expensive_stmt_p (gimple *stmt)
 597 {
 598   if (gcall *call = dyn_cast <gcall *> (stmt))
 599     /* Assume for now that the time spent in an "expensive" call would
 600        overwhelm any saving from versioning.  */
 601     return !gimple_inexpensive_call_p (call);
 602   return false;
 603 }
 604
 605 /* Record that we want to version the loop that contains STMT for the
 606    case in which SSA name NAME is equal to 1.  We already know that NAME
 607    is invariant in the loop.  */
 608
 609 void
 610 loop_versioning::version_for_unity (gimple *stmt, tree name)
 611 {
 612   class loop *loop = loop_containing_stmt (stmt);
 613   loop_info &li = get_loop_info (loop);
 614
 615   if (bitmap_set_bit (&li.unity_names, SSA_NAME_VERSION (name)))
 616     {
 617       /* This is the first time we've wanted to version LOOP for NAME.
 618          Keep track of the outermost loop that can handle all versioning
 619          checks in LI.  */
 620       class loop *outermost
 621         = outermost_invariant_loop_for_expr (loop, name);
 622       if (loop_depth (li.outermost) < loop_depth (outermost))
 623         li.outermost = outermost;
 624
 625       if (dump_enabled_p ())
 626         {
 627           dump_printf_loc (MSG_NOTE, stmt, "want to version containing loop"
 628                            " for when %T == 1", name);
 629           if (outermost == loop)
 630             dump_printf (MSG_NOTE, "; cannot hoist check further");
 631           else
 632             {
 633               dump_printf (MSG_NOTE, "; could implement the check at loop"
 634                            " depth %d", loop_depth (outermost));
 635               if (loop_depth (li.outermost) > loop_depth (outermost))
 636                 dump_printf (MSG_NOTE, ", but other checks only allow"
 637                              " a depth of %d", loop_depth (li.outermost));
 638             }
 639           dump_printf (MSG_NOTE, "\n");
 640         }
 641
 642       m_num_conditions += 1;
 643     }
 644   else
 645     {
 646       /* This is a duplicate request.  */
 647       if (dump_enabled_p ())
 648         dump_printf_loc (MSG_NOTE, stmt, "already asked to version containing"
 649                          " loop for when %T == 1\n", name);
 650     }
 651 }
 652
 653 /* Return true if OP1_TREE is constant and if in principle it is worth
 654    versioning an address fragment of the form:
 655
 656      i * OP1_TREE * OP2 * stride
 657
 658    for the case in which stride == 1.  This in practice means testing
 659    whether:
 660
 661      OP1_TREE * OP2 <= M_MAXIMUM_SCALE.
 662
 663    If RESULT is nonnull, store OP1_TREE * OP2 there when returning true.  */
 664
 665 bool
 666 loop_versioning::acceptable_multiplier_p (tree op1_tree,
 667                                           unsigned HOST_WIDE_INT op2,
 668                                           unsigned HOST_WIDE_INT *result)
 669 {
 670   if (tree_fits_uhwi_p (op1_tree))
 671     {
 672       unsigned HOST_WIDE_INT op1 = tree_to_uhwi (op1_tree);
 673       /* The first part checks for overflow.  */
 674       if (op1 * op2 >= op2 && op1 * op2 <= m_maximum_scale)
 675         {
 676           if (result)
 677             *result = op1 * op2;
 678           return true;
 679         }
 680     }
 681   return false;
 682 }
 683
 684 /* Return true if it is worth using loop versioning on a memory access
 685    of type TYPE.  Store the size of the access in *SIZE if so.  */
 686
 687 bool
 688 loop_versioning::acceptable_type_p (tree type, unsigned HOST_WIDE_INT *size)
 689 {
 690   return (TYPE_SIZE_UNIT (type)
 691           && acceptable_multiplier_p (TYPE_SIZE_UNIT (type), 1, size));
 692 }
 693
 694 /* See whether OP is constant and whether we can multiply TERM by that
 695    constant without exceeding M_MAXIMUM_SCALE.  Return true and update
 696    TERM if so.  */
 697
 698 bool
 699 loop_versioning::multiply_term_by (address_term_info &term, tree op)
 700 {
 701   return acceptable_multiplier_p (op, term.multiplier, &term.multiplier);
 702 }
 703
 704 /* Decide whether an address fragment of the form STRIDE * MULTIPLIER
 705    is likely to be indexing an innermost dimension, returning the result
 706    as an INNER_* probability.  */
 707
 708 inner_likelihood
 709 loop_versioning::get_inner_likelihood (tree stride,
 710                                        unsigned HOST_WIDE_INT multiplier)
 711 {
 712   const unsigned int MAX_NITERS = 8;
 713
 714   /* Iterate over possible values of STRIDE.  Return INNER_LIKELY if at
 715      least one of those values is likely to be for the innermost dimension.
 716      Record in UNLIKELY_P if at least one of those values is unlikely to be
 717      for the innermost dimension.
 718
 719      E.g. for:
 720
 721        stride = cond ? a * b : 1
 722
 723      we should treat STRIDE as being a likely inner dimension, since
 724      we know that it is 1 under at least some circumstances.  (See the
 725      Fortran example below.)  However:
 726
 727        stride = a * b
 728
 729      on its own is unlikely to be for the innermost dimension, since
 730      that would require both a and b to be 1 at runtime.  */
 731   bool unlikely_p = false;
 732   tree worklist[MAX_NITERS];
 733   unsigned int length = 0;
 734   worklist[length++] = stride;
 735   for (unsigned int i = 0; i < length; ++i)
 736     {
 737       tree expr = worklist[i];
 738
 739       if (CONSTANT_CLASS_P (expr))
 740         {
 741           /* See if EXPR * MULTIPLIER would be consistent with an individual
 742              access or a small grouped access.  */
 743           if (acceptable_multiplier_p (expr, multiplier))
 744             return INNER_LIKELY;
 745           else
 746             unlikely_p = true;
 747         }
 748       else if (gimple *stmt = maybe_get_stmt (expr))
 749         {
 750           /* If EXPR is set by a PHI node, queue its arguments in case
 751              we find one that is consistent with an inner dimension.
 752
 753              An important instance of this is the Fortran handling of array
 754              descriptors, which calculates the stride of the inner dimension
 755              using a PHI equivalent of:
 756
 757                 raw_stride = a.dim[0].stride;
 758                 stride = raw_stride != 0 ? raw_stride : 1;
 759
 760              (Strides for outer dimensions do not treat 0 specially.)  */
 761           if (gphi *phi = dyn_cast <gphi *> (stmt))
 762             {
 763               unsigned int nargs = gimple_phi_num_args (phi);
 764               for (unsigned int j = 0; j < nargs && length < MAX_NITERS; ++j)
 765                 worklist[length++] = strip_casts (gimple_phi_arg_def (phi, j));
 766             }
 767           /* If the value is set by an assignment, expect it to be read
 768              from memory (such as an array descriptor) rather than be
 769              calculated.  */
 770           else if (gassign *assign = dyn_cast <gassign *> (stmt))
 771             {
 772               if (!gimple_assign_load_p (assign))
 773                 unlikely_p = true;
 774             }
 775           /* Things like calls don't really tell us anything.  */
 776         }
 777     }
 778
 779   /* We didn't find any possible values of STRIDE that were likely to be
 780      for the innermost dimension.  If we found one that was actively
 781      unlikely to be for the innermost dimension, assume that that applies
 782      to STRIDE too.  */
 783   return unlikely_p ? INNER_UNLIKELY : INNER_DONT_KNOW;
 784 }
 785
 786 /* Dump the likelihood that TERM's stride is for the innermost dimension.
 787    ADDRESS is the address that contains TERM.  */
 788
 789 void
 790 loop_versioning::dump_inner_likelihood (address_info &address,
 791                                         address_term_info &term)
 792 {
 793   if (term.inner_likelihood == INNER_LIKELY)
 794     dump_printf_loc (MSG_NOTE, address.stmt, "%T is likely to be the"
 795                      " innermost dimension\n", term.stride);
 796   else if (term.inner_likelihood == INNER_UNLIKELY)
 797     dump_printf_loc (MSG_NOTE, address.stmt, "%T is probably not the"
 798                      " innermost dimension\n", term.stride);
 799   else
 800     dump_printf_loc (MSG_NOTE, address.stmt, "cannot tell whether %T"
 801                      " is the innermost dimension\n", term.stride);
 802 }
 803
 804 /* The caller has identified that STRIDE is the stride of interest
 805    in TERM, and that the stride is applied in OP_LOOP.  Record this
 806    information in TERM, deciding whether STRIDE is likely to be for
 807    the innermost dimension of an array and whether it represents a
 808    versioning opportunity.  ADDRESS is the address that contains TERM.  */
 809
 810 void
 811 loop_versioning::analyze_stride (address_info &address,
 812                                  address_term_info &term,
 813                                  tree stride, class loop *op_loop)
 814 {
 815   term.stride = stride;
 816
 817   term.inner_likelihood = get_inner_likelihood (stride, term.multiplier);
 818   if (dump_enabled_p ())
 819     dump_inner_likelihood (address, term);
 820
 821   /* To be a versioning opportunity we require:
 822
 823      - The multiplier applied by TERM is equal to the access size,
 824        so that when STRIDE is 1, the accesses in successive loop
 825        iterations are consecutive.
 826
 827        This is deliberately conservative.  We could relax it to handle
 828        other cases (such as those with gaps between iterations) if we
 829        find any real testcases for which it's useful.
 830
 831      - the stride is applied in the same loop as STMT rather than
 832        in an outer loop.  Although versioning for strides applied in
 833        outer loops could help in some cases -- such as enabling
 834        more loop interchange -- the savings are much lower than for
 835        inner loops.
 836
 837      - the stride is an SSA name that is invariant in STMT's loop,
 838        since otherwise versioning isn't possible.  */
 839   unsigned HOST_WIDE_INT access_size = address.max_offset - address.min_offset;
 840   if (term.multiplier == access_size
 841       && address.loop == op_loop
 842       && TREE_CODE (stride) == SSA_NAME
 843       && expr_invariant_in_loop_p (address.loop, stride))
 844     {
 845       term.versioning_opportunity_p = true;
 846       if (dump_enabled_p ())
 847         dump_printf_loc (MSG_NOTE, address.stmt, "%T == 1 is a versioning"
 848                          " opportunity\n", stride);
 849     }
 850 }
 851
 852 /* See whether address term TERM (which belongs to ADDRESS) is the result
 853    of multiplying a varying SSA name by a loop-invariant SSA name.
 854    Return true and update TERM if so.
 855
 856    This handles both cases that SCEV might handle, such as:
 857
 858      for (int i = 0; i < n; ++i)
 859        res += a[i * stride];
 860
 861    and ones in which the term varies arbitrarily between iterations, such as:
 862
 863      for (int i = 0; i < n; ++i)
 864        res += a[index[i] * stride];  */
 865
 866 bool
 867 loop_versioning::find_per_loop_multiplication (address_info &address,
 868                                                address_term_info &term)
 869 {
 870   gassign *mult = maybe_get_assign (term.expr);
 871   if (!mult || gimple_assign_rhs_code (mult) != MULT_EXPR)
 872     return false;
 873
 874   class loop *mult_loop = loop_containing_stmt (mult);
 875   if (!loop_outer (mult_loop))
 876     return false;
 877
 878   tree op1 = strip_casts (gimple_assign_rhs1 (mult));
 879   tree op2 = strip_casts (gimple_assign_rhs2 (mult));
 880   if (TREE_CODE (op1) != SSA_NAME || TREE_CODE (op2) != SSA_NAME)
 881     return false;
 882
 883   bool invariant1_p = expr_invariant_in_loop_p (mult_loop, op1);
 884   bool invariant2_p = expr_invariant_in_loop_p (mult_loop, op2);
 885   if (invariant1_p == invariant2_p)
 886     return false;
 887
 888   /* Make sure that the loop invariant is OP2 rather than OP1.  */
 889   if (invariant1_p)
 890     std::swap (op1, op2);
 891
 892   if (dump_enabled_p ())
 893     dump_printf_loc (MSG_NOTE, address.stmt, "address term %T = varying %T"
 894                      " * loop-invariant %T\n", term.expr, op1, op2);
 895   analyze_stride (address, term, op2, mult_loop);
 896   return true;
 897 }
 898
 899 /* Try to use scalar evolutions to find an address stride for TERM,
 900    which belongs to ADDRESS.  Return true and update TERM if so.
 901
 902    Here we are interested in any evolution information we can find,
 903    not just evolutions wrt ADDRESS->LOOP.  For example, if we find that
 904    an outer loop obviously iterates over the inner dimension of an array,
 905    that information can help us eliminate worthless versioning opportunities
 906    in inner loops.  */
 907
 908 bool
 909 loop_versioning::analyze_term_using_scevs (address_info &address,
 910                                            address_term_info &term)
 911 {
 912   gimple *setter = maybe_get_stmt (term.expr);
 913   if (!setter)
 914     return false;
 915
 916   class loop *wrt_loop = loop_containing_stmt (setter);
 917   if (!loop_outer (wrt_loop))
 918     return false;
 919
 920   tree chrec = strip_casts (analyze_scalar_evolution (wrt_loop, term.expr));
 921   if (TREE_CODE (chrec) == POLYNOMIAL_CHREC)
 922     {
 923       if (dump_enabled_p ())
 924         dump_printf_loc (MSG_NOTE, address.stmt,
 925                          "address term %T = %T\n", term.expr, chrec);
 926
 927       /* Peel casts and accumulate constant multiplications, up to the
 928          limit allowed by M_MAXIMUM_SCALE.  */
 929       tree stride = strip_casts (CHREC_RIGHT (chrec));
 930       while (TREE_CODE (stride) == MULT_EXPR
 931              && multiply_term_by (term, TREE_OPERAND (stride, 1)))
 932         stride = strip_casts (TREE_OPERAND (stride, 0));
 933
 934       gassign *assign;
 935       while ((assign = maybe_get_assign (stride))
 936              && gimple_assign_rhs_code (assign) == MULT_EXPR
 937              && multiply_term_by (term, gimple_assign_rhs2 (assign)))
 938         {
 939           if (dump_enabled_p ())
 940             dump_printf_loc (MSG_NOTE, address.stmt,
 941                              "looking through %G", (gimple *) assign);
 942           stride = strip_casts (gimple_assign_rhs1 (assign));
 943         }
 944
 945       analyze_stride (address, term, stride, get_chrec_loop (chrec));
 946       return true;
 947     }
 948
 949   return false;
 950 }
 951
 952 /* Address term TERM is an arbitrary term that provides no versioning
 953    opportunities.  Analyze it to see whether it contains any likely
 954    inner strides, so that we don't mistakenly version for other
 955    (less likely) candidates.
 956
 957    This copes with invariant innermost indices such as:
 958
 959      x(i, :) = 100
 960
 961    where the "i" component of the address is invariant in the loop
 962    but provides the real inner stride.
 963
 964    ADDRESS is the address that contains TERM.  */
 965
 966 void
 967 loop_versioning::analyze_arbitrary_term (address_info &address,
 968                                          address_term_info &term)
 969
 970 {
 971   /* A multiplication offers two potential strides.  Pick the one that
 972      is most likely to be an innermost stride.  */
 973   tree expr = term.expr, alt = NULL_TREE;
 974   gassign *mult = maybe_get_assign (expr);
 975   if (mult && gimple_assign_rhs_code (mult) == MULT_EXPR)
 976     {
 977       expr = strip_casts (gimple_assign_rhs1 (mult));
 978       alt = strip_casts (gimple_assign_rhs2 (mult));
 979     }
 980   term.stride = expr;
 981   term.inner_likelihood = get_inner_likelihood (expr, term.multiplier);
 982   if (alt)
 983     {
 984       inner_likelihood alt_l = get_inner_likelihood (alt, term.multiplier);
 985       if (alt_l > term.inner_likelihood)
 986         {
 987           term.stride = alt;
 988           term.inner_likelihood = alt_l;
 989         }
 990     }
 991   if (dump_enabled_p ())
 992     dump_inner_likelihood (address, term);
 993 }
 994
 995 /* Try to identify loop strides in ADDRESS and try to choose realistic
 996    versioning opportunities based on these strides.
 997
 998    The main difficulty here isn't finding strides that could be used
 999    in a version check (that's pretty easy).  The problem instead is to
1000    avoid versioning for some stride S that is unlikely ever to be 1 at
1001    runtime.  Versioning for S == 1 on its own would lead to unnecessary
1002    code bloat, while adding S == 1 to more realistic version conditions
1003    would lose the optimisation opportunity offered by those other conditions.
1004
1005    For example, versioning for a stride of 1 in the Fortran code:
1006
1007      integer :: a(:,:)
1008      a(1,:) = 1
1009
1010    is not usually a good idea, since the assignment is iterating over
1011    an outer dimension and is relatively unlikely to have a stride of 1.
1012    (It isn't impossible, since the inner dimension might be 1, or the
1013    array might be transposed.)  Similarly, in:
1014
1015      integer :: a(:,:), b(:,:)
1016      b(:,1) = a(1,:)
1017
1018    b(:,1) is relatively likely to have a stride of 1 while a(1,:) isn't.
1019    Versioning for when both strides are 1 would lose most of the benefit
1020    of versioning for b's access.
1021
1022    The approach we take is as follows:
1023
1024    - Analyze each term to see whether it has an identifiable stride,
1025      regardless of which loop applies the stride.
1026
1027    - Evaluate the likelihood that each such stride is for the innermost
1028      dimension of an array, on the scale "likely", "don't know" or "unlikely".
1029
1030    - If there is a single "likely" innermost stride, and that stride is
1031      applied in the loop that contains STMT, version the loop for when the
1032      stride is 1.  This deals with the cases in which we're fairly
1033      confident of doing the right thing, such as the b(:,1) reference above.
1034
1035    - If there are no "likely" innermost strides, and the loop that contains
1036      STMT uses a stride that we rated as "don't know", version for when
1037      that stride is 1.  This is principally used for C code such as:
1038
1039        for (int i = 0; i < n; ++i)
1040          a[i * x] = ...;
1041
1042      and:
1043
1044        for (int j = 0; j < n; ++j)
1045          for (int i = 0; i < n; ++i)
1046            a[i * x + j * y] = ...;
1047
1048      where nothing in the way "x" and "y" are set gives a hint as to
1049      whether "i" iterates over the innermost dimension of the array.
1050      In these situations it seems reasonable to assume the
1051      programmer has nested the loops appropriately (although of course
1052      there are examples like GEMM in which this assumption doesn't hold
1053      for all accesses in the loop).
1054
1055      This case is also useful for the Fortran equivalent of the
1056      above C code.  */
1057
1058 void
1059 loop_versioning::analyze_address_fragment (address_info &address)
1060 {
1061   if (dump_enabled_p ())
1062     {
1063       dump_printf_loc (MSG_NOTE, address.stmt, "analyzing address fragment ");
1064       dump_address_info (MSG_NOTE, address);
1065       dump_printf (MSG_NOTE, "\n");
1066     }
1067
1068   /* Analyze each component of the sum to see whether it involves an
1069      apparent stride.
1070
1071      There is an overlap between the addresses that
1072      find_per_loop_multiplication and analyze_term_using_scevs can handle,
1073      but the former is much cheaper than SCEV analysis, so try it first.  */
1074   for (unsigned int i = 0; i < address.terms.length (); ++i)
1075     if (!find_per_loop_multiplication (address, address.terms[i])
1076         && !analyze_term_using_scevs (address, address.terms[i])
1077         && !POINTER_TYPE_P (TREE_TYPE (address.terms[i].expr)))
1078       analyze_arbitrary_term (address, address.terms[i]);
1079
1080   /* Check for strides that are likely to be for the innermost dimension.
1081
1082      1. If there is a single likely inner stride, if it is an SSA name,
1083         and if it is worth versioning the loop for when the SSA name
1084         equals 1, record that we want to do so.
1085
1086      2. Otherwise, if there any likely inner strides, bail out.  This means
1087         one of:
1088
1089         (a) There are multiple likely inner strides.  This suggests we're
1090             confused and be can't be confident of doing the right thing.
1091
1092         (b) There is a single likely inner stride and it is a constant
1093             rather than an SSA name.  This can mean either that the access
1094             is a natural one without any variable strides, such as:
1095
1096               for (int i = 0; i < n; ++i)
1097                 a[i] += 1;
1098
1099             or that a variable stride is applied to an outer dimension,
1100             such as:
1101
1102               for (int i = 0; i < n; ++i)
1103                 for (int j = 0; j < n; ++j)
1104                   a[j * stride][i] += 1;
1105
1106         (c) There is a single likely inner stride, and it is an SSA name,
1107             but it isn't a worthwhile versioning opportunity.  This usually
1108             means that the variable stride is applied by an outer loop,
1109             such as:
1110
1111               for (int i = 0; i < n; ++i)
1112                 for (int j = 0; j < n; ++j)
1113                   a[j][i * stride] += 1;
1114
1115             or (using an example with a more natural loop nesting):
1116
1117               for (int i = 0; i < n; ++i)
1118                 for (int j = 0; j < n; ++j)
1119                   a[i][j] += b[i * stride];
1120
1121             in cases where b[i * stride] cannot (yet) be hoisted for
1122             aliasing reasons.
1123
1124      3. If there are no likely inner strides, fall through to the next
1125         set of checks.
1126
1127      Pointer equality is enough to check for uniqueness in (1), since we
1128      only care about SSA names.  */
1129   tree chosen_stride = NULL_TREE;
1130   tree version_stride = NULL_TREE;
1131   for (unsigned int i = 0; i < address.terms.length (); ++i)
1132     if (chosen_stride != address.terms[i].stride
1133         && address.terms[i].inner_likelihood == INNER_LIKELY)
1134       {
1135         if (chosen_stride)
1136           return;
1137         chosen_stride = address.terms[i].stride;
1138         if (address.terms[i].versioning_opportunity_p)
1139           version_stride = chosen_stride;
1140       }
1141
1142   /* If there are no likely inner strides, see if there is a single
1143      versioning opportunity for a stride that was rated as INNER_DONT_KNOW.
1144      See the comment above the function for the cases that this code
1145      handles.  */
1146   if (!chosen_stride)
1147     for (unsigned int i = 0; i < address.terms.length (); ++i)
1148       if (version_stride != address.terms[i].stride
1149           && address.terms[i].inner_likelihood == INNER_DONT_KNOW
1150           && address.terms[i].versioning_opportunity_p)
1151         {
1152           if (version_stride)
1153             return;
1154           version_stride = address.terms[i].stride;
1155         }
1156
1157   if (version_stride)
1158     version_for_unity (address.stmt, version_stride);
1159 }
1160
1161 /* Treat EXPR * MULTIPLIER + OFFSET as a fragment of an address that addresses
1162    TYPE_SIZE bytes and record this address fragment for later processing.
1163    STMT is the statement that contains the address.  */
1164
1165 void
1166 loop_versioning::record_address_fragment (gimple *stmt,
1167                                           unsigned HOST_WIDE_INT type_size,
1168                                           tree expr,
1169                                           unsigned HOST_WIDE_INT multiplier,
1170                                           HOST_WIDE_INT offset)
1171 {
1172   /* We're only interested in computed values.  */
1173   if (TREE_CODE (expr) != SSA_NAME)
1174     return;
1175
1176   /* Quick exit if no part of the address is calculated in STMT's loop,
1177      since such addresses have no versioning opportunities.  */
1178   class loop *loop = loop_containing_stmt (stmt);
1179   if (expr_invariant_in_loop_p (loop, expr))
1180     return;
1181
1182   /* Set up an address_info for EXPR * MULTIPLIER.  */
1183   address_info *address = XOBNEW (&m_obstack, address_info);
1184   new (address) address_info;
1185   address->stmt = stmt;
1186   address->loop = loop;
1187   address->base = NULL_TREE;
1188   address->terms.quick_grow (1);
1189   address->terms[0].expr = expr;
1190   address->terms[0].multiplier = multiplier;
1191   address->terms[0].stride = NULL_TREE;
1192   address->terms[0].inner_likelihood = INNER_UNLIKELY;
1193   address->terms[0].versioning_opportunity_p = false;
1194   address->min_offset = offset;
1195
1196   /* Peel apart the expression into a sum of address_terms, where each
1197      term is multiplied by a constant.  Treat a + b and a - b the same,
1198      since it doesn't matter for our purposes whether an address is
1199      increasing or decreasing.  Distribute (a + b) * constant into
1200      a * constant + b * constant.
1201
1202      We don't care which loop each term belongs to, since we want to
1203      examine as many candidate strides as possible when determining
1204      which is likely to be for the innermost dimension.  We therefore
1205      don't limit the search to statements in STMT's loop.  */
1206   for (unsigned int i = 0; i < address->terms.length (); )
1207     {
1208       if (gassign *assign = maybe_get_assign (address->terms[i].expr))
1209         {
1210           tree_code code = gimple_assign_rhs_code (assign);
1211           if (code == PLUS_EXPR
1212               || code == POINTER_PLUS_EXPR
1213               || code == MINUS_EXPR)
1214             {
1215               tree op1 = gimple_assign_rhs1 (assign);
1216               tree op2 = gimple_assign_rhs2 (assign);
1217               if (TREE_CODE (op2) == INTEGER_CST)
1218                 {
1219                   address->terms[i].expr = strip_casts (op1);
1220                   /* This is heuristic only, so don't worry about truncation
1221                      or overflow.  */
1222                   address->min_offset += (TREE_INT_CST_LOW (op2)
1223                                           * address->terms[i].multiplier);
1224                   continue;
1225                 }
1226               else if (address->terms.length () < address_info::MAX_TERMS)
1227                 {
1228                   unsigned int j = address->terms.length ();
1229                   address->terms.quick_push (address->terms[i]);
1230                   address->terms[i].expr = strip_casts (op1);
1231                   address->terms[j].expr = strip_casts (op2);
1232                   continue;
1233                 }
1234             }
1235           if (code == MULT_EXPR)
1236             {
1237               tree op1 = gimple_assign_rhs1 (assign);
1238               tree op2 = gimple_assign_rhs2 (assign);
1239               if (multiply_term_by (address->terms[i], op2))
1240                 {
1241                   address->terms[i].expr = strip_casts (op1);
1242                   continue;
1243                 }
1244             }
1245           if (CONVERT_EXPR_CODE_P (code))
1246             {
1247               tree op1 = gimple_assign_rhs1 (assign);
1248               address->terms[i].expr = strip_casts (op1);
1249               continue;
1250             }
1251         }
1252       i += 1;
1253     }
1254
1255   /* Peel off any symbolic pointer.  */
1256   if (TREE_CODE (address->terms[0].expr) != SSA_NAME
1257       && address->terms[0].multiplier == 1)
1258     {
1259       if (address->terms.length () == 1)
1260         {
1261           obstack_free (&m_obstack, address);
1262           return;
1263         }
1264       address->base = address->terms[0].expr;
1265       address->terms.ordered_remove (0);
1266     }
1267
1268   /* Require all remaining terms to be SSA names.  (This could be false
1269      for unfolded statements, but they aren't worth dealing with.)  */
1270   for (unsigned int i = 0; i < address->terms.length (); ++i)
1271     if (TREE_CODE (address->terms[i].expr) != SSA_NAME)
1272       {
1273         obstack_free (&m_obstack, address);
1274         return;
1275       }
1276
1277   /* The loop above set MIN_OFFSET based on the first byte of the
1278      referenced data.  Calculate the end + 1.  */
1279   address->max_offset = address->min_offset + type_size;
1280
1281   /* Put the terms into a canonical order for the hash table lookup below.  */
1282   address->terms.qsort (compare_address_terms);
1283
1284   if (dump_enabled_p ())
1285     {
1286       dump_printf_loc (MSG_NOTE, stmt, "recording address fragment %T", expr);
1287       if (multiplier != 1)
1288         dump_printf (MSG_NOTE, " * %wd", multiplier);
1289       dump_printf (MSG_NOTE, " = ");
1290       dump_address_info (MSG_NOTE, *address);
1291       dump_printf (MSG_NOTE, "\n");
1292     }
1293
1294   /* Pool address information with the same terms (but potentially
1295      different offsets).  */
1296   address_info **slot = m_address_table.find_slot (address, INSERT);
1297   if (address_info *old_address = *slot)
1298     {
1299       /* We've already seen an address with the same terms.  Extend the
1300          offset range to account for this access.  Doing this can paper
1301          over gaps, such as in:
1302
1303            a[i * stride * 4] + a[i * stride * 4 + 3];
1304
1305          where nothing references "+ 1" or "+ 2".  However, the vectorizer
1306          handles such gapped accesses without problems, so it's not worth
1307          trying to exclude them.  */
1308       if (old_address->min_offset > address->min_offset)
1309         old_address->min_offset = address->min_offset;
1310       if (old_address->max_offset < address->max_offset)
1311         old_address->max_offset = address->max_offset;
1312       obstack_free (&m_obstack, address);
1313     }
1314   else
1315     {
1316       /* This is the first time we've seen an address with these terms.  */
1317       *slot = address;
1318       m_address_list.safe_push (address);
1319     }
1320 }
1321
1322 /* Analyze expression EXPR, which occurs in STMT.  */
1323
1324 void
1325 loop_versioning::analyze_expr (gimple *stmt, tree expr)
1326 {
1327   unsigned HOST_WIDE_INT type_size;
1328
1329   while (handled_component_p (expr))
1330     {
1331       /* See whether we can use versioning to avoid a multiplication
1332          in an array index.  */
1333       if (TREE_CODE (expr) == ARRAY_REF
1334           && acceptable_type_p (TREE_TYPE (expr), &type_size))
1335         record_address_fragment (stmt, type_size,
1336                                  TREE_OPERAND (expr, 1), type_size, 0);
1337       expr = TREE_OPERAND (expr, 0);
1338     }
1339
1340   /* See whether we can use versioning to avoid a multiplication
1341      in the pointer calculation of a MEM_REF.  */
1342   if (TREE_CODE (expr) == MEM_REF
1343       && acceptable_type_p (TREE_TYPE (expr), &type_size))
1344     record_address_fragment (stmt, type_size, TREE_OPERAND (expr, 0), 1,
1345                              /* This is heuristic only, so don't worry
1346                                 about truncation or overflow.  */
1347                              TREE_INT_CST_LOW (TREE_OPERAND (expr, 1)));
1348
1349   /* These would be easy to handle if they existed at this stage.  */
1350   gcc_checking_assert (TREE_CODE (expr) != TARGET_MEM_REF);
1351 }
1352
1353 /* Analyze all the statements in BB looking for useful version checks.
1354    Return true on success, false if something prevents the block from
1355    being versioned.  */
1356
1357 bool
1358 loop_versioning::analyze_block (basic_block bb)
1359 {
1360   class loop *loop = bb->loop_father;
1361   loop_info &li = get_loop_info (loop);
1362   for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
1363        gsi_next (&gsi))
1364     {
1365       gimple *stmt = gsi_stmt (gsi);
1366       if (is_gimple_debug (stmt))
1367         continue;
1368
1369       if (expensive_stmt_p (stmt))
1370         {
1371           if (dump_enabled_p ())
1372             dump_printf_loc (MSG_NOTE, stmt, "expensive statement"
1373                              " prevents versioning: %G", stmt);
1374           return false;
1375         }
1376
1377       /* Only look for direct versioning opportunities in inner loops
1378          since the benefit tends to be much smaller for outer loops.  */
1379       if (!loop->inner)
1380         {
1381           unsigned int nops = gimple_num_ops (stmt);
1382           for (unsigned int i = 0; i < nops; ++i)
1383             if (tree op = gimple_op (stmt, i))
1384               analyze_expr (stmt, op);
1385         }
1386
1387       /* The point of the instruction limit is to prevent excessive
1388          code growth, so this is a size-based estimate even though
1389          the optimization is aimed at speed.  */
1390       li.num_insns += estimate_num_insns (stmt, &eni_size_weights);
1391     }
1392
1393   return true;
1394 }
1395
1396 /* Analyze all the blocks in the function, looking for useful version checks.
1397    Return true if we found one.  */
1398
1399 bool
1400 loop_versioning::analyze_blocks ()
1401 {
1402   AUTO_DUMP_SCOPE ("analyze_blocks",
1403                    dump_user_location_t::from_function_decl (m_fn->decl));
1404
1405   /* For now we don't try to version the whole function, although
1406      versioning at that level could be useful in some cases.  */
1407   get_loop_info (get_loop (m_fn, 0)).rejected_p = true;
1408
1409   for (auto loop : loops_list (cfun, LI_FROM_INNERMOST))
1410     {
1411       loop_info &linfo = get_loop_info (loop);
1412
1413       /* Ignore cold loops.  */
1414       if (!optimize_loop_for_speed_p (loop))
1415         linfo.rejected_p = true;
1416
1417       /* See whether an inner loop prevents versioning of this loop.  */
1418       if (!linfo.rejected_p)
1419         for (class loop *inner = loop->inner; inner; inner = inner->next)
1420           if (get_loop_info (inner).rejected_p)
1421             {
1422               linfo.rejected_p = true;
1423               break;
1424             }
1425
1426       /* If versioning the loop is still a possibility, examine the
1427          statements in the loop to look for versioning opportunities.  */
1428       if (!linfo.rejected_p)
1429         {
1430           void *start_point = obstack_alloc (&m_obstack, 0);
1431
1432           for (basic_block bb = linfo.block_list; bb;
1433                bb = m_next_block_in_loop[bb->index])
1434             if (!analyze_block (bb))
1435               {
1436                 linfo.rejected_p = true;
1437                 break;
1438               }
1439
1440           if (!linfo.rejected_p)
1441             {
1442               /* Process any queued address fragments, now that we have
1443                  complete grouping information.  */
1444               address_info *address;
1445               unsigned int i;
1446               FOR_EACH_VEC_ELT (m_address_list, i, address)
1447                 analyze_address_fragment (*address);
1448             }
1449
1450           m_address_table.empty ();
1451           m_address_list.truncate (0);
1452           obstack_free (&m_obstack, start_point);
1453         }
1454     }
1455
1456   return m_num_conditions != 0;
1457 }
1458
1459 /* Use the ranges in VRS to remove impossible versioning conditions from
1460    LOOP.  */
1461
1462 void
1463 loop_versioning::prune_loop_conditions (class loop *loop)
1464 {
1465   loop_info &li = get_loop_info (loop);
1466
1467   int to_remove = -1;
1468   bitmap_iterator bi;
1469   unsigned int i;
1470   int_range_max r;
1471   EXECUTE_IF_SET_IN_BITMAP (&li.unity_names, 0, i, bi)
1472     {
1473       tree name = ssa_name (i);
1474       gimple *stmt = first_stmt (loop->header);
1475
1476       if (get_range_query (cfun)->range_of_expr (r, name, stmt)
1477           && !r.contains_p (wi::one (TYPE_PRECISION (TREE_TYPE (name)))))
1478         {
1479           if (dump_enabled_p ())
1480             dump_printf_loc (MSG_NOTE, find_loop_location (loop),
1481                              "%T can never be 1 in this loop\n", name);
1482
1483           if (to_remove >= 0)
1484             bitmap_clear_bit (&li.unity_names, to_remove);
1485           to_remove = i;
1486           m_num_conditions -= 1;
1487         }
1488     }
1489   if (to_remove >= 0)
1490     bitmap_clear_bit (&li.unity_names, to_remove);
1491 }
1492
1493 /* Remove any scheduled loop version conditions that will never be true.
1494    Return true if any remain.  */
1495
1496 bool
1497 loop_versioning::prune_conditions ()
1498 {
1499   AUTO_DUMP_SCOPE ("prune_loop_conditions",
1500                    dump_user_location_t::from_function_decl (m_fn->decl));
1501
1502   calculate_dominance_info (CDI_DOMINATORS);
1503   lv_dom_walker dom_walker (*this);
1504   dom_walker.walk (ENTRY_BLOCK_PTR_FOR_FN (m_fn));
1505   return m_num_conditions != 0;
1506 }
1507
1508 /* Merge the version checks for INNER into immediately-enclosing loop
1509    OUTER.  */
1510
1511 void
1512 loop_versioning::merge_loop_info (class loop *outer, class loop *inner)
1513 {
1514   loop_info &inner_li = get_loop_info (inner);
1515   loop_info &outer_li = get_loop_info (outer);
1516
1517   if (dump_enabled_p ())
1518     {
1519       bitmap_iterator bi;
1520       unsigned int i;
1521       EXECUTE_IF_SET_IN_BITMAP (&inner_li.unity_names, 0, i, bi)
1522         if (!bitmap_bit_p (&outer_li.unity_names, i))
1523           dump_printf_loc (MSG_NOTE, find_loop_location (inner),
1524                            "hoisting check that %T == 1 to outer loop\n",
1525                            ssa_name (i));
1526     }
1527
1528   bitmap_ior_into (&outer_li.unity_names, &inner_li.unity_names);
1529   if (loop_depth (outer_li.outermost) < loop_depth (inner_li.outermost))
1530     outer_li.outermost = inner_li.outermost;
1531 }
1532
1533 /* Add LOOP to the queue of loops to version.  */
1534
1535 void
1536 loop_versioning::add_loop_to_queue (class loop *loop)
1537 {
1538   loop_info &li = get_loop_info (loop);
1539
1540   if (dump_enabled_p ())
1541     dump_printf_loc (MSG_NOTE, find_loop_location (loop),
1542                      "queuing this loop for versioning\n");
1543   m_loops_to_version.safe_push (loop);
1544
1545   /* Don't try to version superloops.  */
1546   li.rejected_p = true;
1547 }
1548
1549 /* Decide whether the cost model would allow us to version LOOP,
1550    either directly or as part of a parent loop, and return true if so.
1551    This does not imply that the loop is actually worth versioning in its
1552    own right, just that it would be valid to version it if something
1553    benefited.
1554
1555    We have already made this decision for all inner loops of LOOP.  */
1556
1557 bool
1558 loop_versioning::decide_whether_loop_is_versionable (class loop *loop)
1559 {
1560   loop_info &li = get_loop_info (loop);
1561
1562   if (li.rejected_p)
1563     return false;
1564
1565   /* Examine the decisions made for inner loops.  */
1566   for (class loop *inner = loop->inner; inner; inner = inner->next)
1567     {
1568       loop_info &inner_li = get_loop_info (inner);
1569       if (inner_li.rejected_p)
1570         {
1571           if (dump_enabled_p ())
1572             dump_printf_loc (MSG_NOTE, find_loop_location (loop),
1573                              "not versioning this loop because one of its"
1574                              " inner loops should not be versioned\n");
1575           return false;
1576         }
1577
1578       if (inner_li.worth_versioning_p ())
1579         li.subloops_benefit_p = true;
1580
1581       /* Accumulate the number of instructions from subloops that are not
1582          the innermost, or that don't benefit from versioning.  Only the
1583          instructions from innermost loops that benefit from versioning
1584          should be weighed against loop-versioning-max-inner-insns;
1585          everything else should be weighed against
1586          loop-versioning-max-outer-insns.  */
1587       if (!inner_li.worth_versioning_p () || inner->inner)
1588         {
1589           if (dump_enabled_p ())
1590             dump_printf_loc (MSG_NOTE, find_loop_location (loop),
1591                              "counting %d instructions from this loop"
1592                              " against its parent loop\n", inner_li.num_insns);
1593           li.num_insns += inner_li.num_insns;
1594         }
1595     }
1596
1597   /* Enforce the size limits.  */
1598   if (li.worth_versioning_p ())
1599     {
1600       unsigned int max_num_insns = max_insns_for_loop (loop);
1601       if (dump_enabled_p ())
1602         dump_printf_loc (MSG_NOTE, find_loop_location (loop),
1603                          "this loop has %d instructions, against"
1604                          " a versioning limit of %d\n",
1605                          li.num_insns, max_num_insns);
1606       if (li.num_insns > max_num_insns)
1607         {
1608           if (dump_enabled_p ())
1609             dump_printf_loc (MSG_MISSED_OPTIMIZATION
1610                              | MSG_PRIORITY_USER_FACING,
1611                              find_loop_location (loop),
1612                              "this loop is too big to version");
1613           return false;
1614         }
1615     }
1616
1617   /* Hoist all version checks from subloops to this loop.  */
1618   for (class loop *subloop = loop->inner; subloop; subloop = subloop->next)
1619     merge_loop_info (loop, subloop);
1620
1621   return true;
1622 }
1623
1624 /* Decide which loops to version and add them to the versioning queue.
1625    Return true if there are any loops to version.  */
1626
1627 bool
1628 loop_versioning::make_versioning_decisions ()
1629 {
1630   AUTO_DUMP_SCOPE ("make_versioning_decisions",
1631                    dump_user_location_t::from_function_decl (m_fn->decl));
1632
1633   for (auto loop : loops_list (cfun, LI_FROM_INNERMOST))
1634     {
1635       loop_info &linfo = get_loop_info (loop);
1636       if (decide_whether_loop_is_versionable (loop))
1637         {
1638           /* Commit to versioning LOOP directly if we can't hoist the
1639              version checks any further.  */
1640           if (linfo.worth_versioning_p ()
1641               && (loop_depth (loop) == 1 || linfo.outermost == loop))
1642             add_loop_to_queue (loop);
1643         }
1644       else
1645         {
1646           /* We can't version this loop, so individually version any
1647              subloops that would benefit and haven't been versioned yet.  */
1648           linfo.rejected_p = true;
1649           for (class loop *subloop = loop->inner; subloop;
1650                subloop = subloop->next)
1651             if (get_loop_info (subloop).worth_versioning_p ())
1652               add_loop_to_queue (subloop);
1653         }
1654     }
1655
1656   return !m_loops_to_version.is_empty ();
1657 }
1658
1659 /* Attempt to implement loop versioning for LOOP, using the information
1660    cached in the associated loop_info.  Return true on success.  */
1661
1662 bool
1663 loop_versioning::version_loop (class loop *loop)
1664 {
1665   loop_info &li = get_loop_info (loop);
1666
1667   /* Build up a condition that selects the original loop instead of
1668      the simplified loop.  */
1669   tree cond = boolean_false_node;
1670   bitmap_iterator bi;
1671   unsigned int i;
1672   EXECUTE_IF_SET_IN_BITMAP (&li.unity_names, 0, i, bi)
1673     {
1674       tree name = ssa_name (i);
1675       tree ne_one = fold_build2 (NE_EXPR, boolean_type_node, name,
1676                                  build_one_cst (TREE_TYPE (name)));
1677       cond = fold_build2 (TRUTH_OR_EXPR, boolean_type_node, cond, ne_one);
1678     }
1679
1680   /* Convert the condition into a suitable gcond.  */
1681   gimple_seq stmts = NULL;
1682   cond = force_gimple_operand_1 (cond, &stmts, is_gimple_condexpr_for_cond,
1683                                  NULL_TREE);
1684
1685   /* Version the loop.  */
1686   initialize_original_copy_tables ();
1687   basic_block cond_bb;
1688   li.optimized_loop = loop_version (loop, cond, &cond_bb,
1689                                     profile_probability::unlikely (),
1690                                     profile_probability::likely (),
1691                                     profile_probability::unlikely (),
1692                                     profile_probability::likely (), true);
1693   free_original_copy_tables ();
1694   if (!li.optimized_loop)
1695     {
1696       if (dump_enabled_p ())
1697         dump_printf_loc (MSG_MISSED_OPTIMIZATION, find_loop_location (loop),
1698                          "tried but failed to version this loop for when"
1699                          " certain strides are 1\n");
1700       return false;
1701     }
1702
1703   if (dump_enabled_p ())
1704     dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, find_loop_location (loop),
1705                      "versioned this loop for when certain strides are 1\n");
1706
1707   /* Insert the statements that feed COND.  */
1708   if (stmts)
1709     {
1710       gimple_stmt_iterator gsi = gsi_last_bb (cond_bb);
1711       gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
1712     }
1713
1714   return true;
1715 }
1716
1717 /* Attempt to version all loops in the versioning queue.  */
1718
1719 void
1720 loop_versioning::implement_versioning_decisions ()
1721 {
1722   /* No AUTO_DUMP_SCOPE here since all messages are top-level and
1723      user-facing at this point.  */
1724
1725   bool any_succeeded_p = false;
1726   class loop *loop;
1727   unsigned int i;
1728   FOR_EACH_VEC_ELT (m_loops_to_version, i, loop)
1729     if (version_loop (loop))
1730       any_succeeded_p = true;
1731   if (!any_succeeded_p)
1732     return;
1733
1734   update_ssa (TODO_update_ssa);
1735
1736   /* Simplify the new loop, which is used when COND is false.  */
1737   FOR_EACH_VEC_ELT (m_loops_to_version, i, loop)
1738     {
1739       loop_info &linfo = get_loop_info (loop);
1740       if (linfo.optimized_loop)
1741         name_prop (linfo).substitute_and_fold (linfo.optimized_loop->header);
1742     }
1743 }
1744
1745 /* Run the pass and return a set of TODO_* flags.  */
1746
1747 unsigned int
1748 loop_versioning::run ()
1749 {
1750   gcc_assert (scev_initialized_p ());
1751
1752   if (analyze_blocks ()
1753       && prune_conditions ()
1754       && make_versioning_decisions ())
1755     implement_versioning_decisions ();
1756
1757   return 0;
1758 }
1759
1760 /* Loop versioning pass.  */
1761
1762 const pass_data pass_data_loop_versioning =
1763 {
1764   GIMPLE_PASS, /* type */
1765   "lversion", /* name */
1766   OPTGROUP_LOOP, /* optinfo_flags */
1767   TV_LOOP_VERSIONING, /* tv_id */
1768   PROP_cfg, /* properties_required */
1769   0, /* properties_provided */
1770   0, /* properties_destroyed */
1771   0, /* todo_flags_start */
1772   0, /* todo_flags_finish */
1773 };
1774
1775 class pass_loop_versioning : public gimple_opt_pass
1776 {
1777 public:
1778   pass_loop_versioning (gcc::context *ctxt)
1779     : gimple_opt_pass (pass_data_loop_versioning, ctxt)
1780   {}
1781
1782   /* opt_pass methods: */
1783   bool gate (function *) final override
1784   {
1785     return flag_version_loops_for_strides;
1786   }
1787   unsigned int execute (function *) final override;
1788 };
1789
1790 unsigned int
1791 pass_loop_versioning::execute (function *fn)
1792 {
1793   if (number_of_loops (fn) <= 1)
1794     return 0;
1795
1796   enable_ranger (fn);
1797   unsigned int ret = loop_versioning (fn).run ();
1798   disable_ranger (fn);
1799   return ret;
1800 }
1801
1802 } // anon namespace
1803
1804 gimple_opt_pass *
1805 make_pass_loop_versioning (gcc::context *ctxt)
1806 {
1807   return new pass_loop_versioning (ctxt);
1808 }