gcc/loop-doloop.cc

   1 /* Perform doloop optimizations
   2    Copyright (C) 2004-2025 Free Software Foundation, Inc.
   3    Based on code by Michael P. Hayes (m.hayes@elec.canterbury.ac.nz)
   4
   5 This file is part of GCC.
   6
   7 GCC is free software; you can redistribute it and/or modify it under
   8 the terms of the GNU General Public License as published by the Free
   9 Software Foundation; either version 3, or (at your option) any later
  10 version.
  11
  12 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  14 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  15 for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with GCC; see the file COPYING3.  If not see
  19 <http://www.gnu.org/licenses/>.  */
  20
  21 #include "config.h"
  22 #include "system.h"
  23 #include "coretypes.h"
  24 #include "backend.h"
  25 #include "target.h"
  26 #include "rtl.h"
  27 #include "tree.h"
  28 #include "cfghooks.h"
  29 #include "memmodel.h"
  30 #include "emit-rtl.h"
  31 #include "dojump.h"
  32 #include "expr.h"
  33 #include "cfgloop.h"
  34 #include "cfgrtl.h"
  35 #include "dumpfile.h"
  36 #include "loop-unroll.h"
  37 #include "regs.h"
  38 #include "df.h"
  39 #include "targhooks.h"
  40
  41 /* This module is used to modify loops with a determinable number of
  42    iterations to use special low-overhead looping instructions.
  43
  44    It first validates whether the loop is well behaved and has a
  45    determinable number of iterations (either at compile or run-time).
  46    It then modifies the loop to use a low-overhead looping pattern as
  47    follows:
  48
  49    1. A pseudo register is allocated as the loop iteration counter.
  50
  51    2. The number of loop iterations is calculated and is stored
  52       in the loop counter.
  53
  54    3. At the end of the loop, the jump insn is replaced by the
  55       doloop_end pattern.  The compare must remain because it might be
  56       used elsewhere.  If the loop-variable or condition register are
  57       used elsewhere, they will be eliminated by flow.
  58
  59    4. An optional doloop_begin pattern is inserted at the top of the
  60       loop.
  61
  62    TODO The optimization should only performed when either the biv used for exit
  63    condition is unused at all except for the exit test, or if we do not have to
  64    change its value, since otherwise we have to add a new induction variable,
  65    which usually will not pay up (unless the cost of the doloop pattern is
  66    somehow extremely lower than the cost of compare & jump, or unless the bct
  67    register cannot be used for anything else but doloop -- ??? detect these
  68    cases).  */
  69
  70 /* Return the loop termination condition for PATTERN or zero
  71    if it is not a decrement and branch jump insn.  */
  72
  73 rtx
  74 doloop_condition_get (rtx_insn *doloop_pat)
  75 {
  76   rtx cmp;
  77   rtx inc;
  78   rtx reg;
  79   rtx inc_src;
  80   rtx condition;
  81   rtx pattern;
  82   rtx cc_reg = NULL_RTX;
  83   rtx reg_orig = NULL_RTX;
  84
  85   /* The canonical doloop pattern we expect has one of the following
  86      forms:
  87
  88      1)  (parallel [(set (pc) (if_then_else (condition)
  89                                             (label_ref (label))
  90                                             (pc)))
  91                      (set (reg) (plus (reg) (const_int -1)))
  92                      (additional clobbers and uses)])
  93
  94      The branch must be the first entry of the parallel (also required
  95      by jump.cc), and the second entry of the parallel must be a set of
  96      the loop counter register.  Some targets (IA-64) wrap the set of
  97      the loop counter in an if_then_else too.
  98
  99      2)  (set (reg) (plus (reg) (const_int -1))
 100          (set (pc) (if_then_else (reg != 0)
 101                                  (label_ref (label))
 102                                  (pc))).
 103
 104      3) Some targets (Arm) do the comparison before the branch, as in the
 105      following form:
 106
 107      (parallel [(set (cc) (compare (plus (reg) (const_int -1)) 0))
 108                 (set (reg) (plus (reg) (const_int -1)))])
 109      (set (pc) (if_then_else (cc == NE)
 110                              (label_ref (label))
 111                              (pc)))
 112
 113       4) This form supports a construct that is used to represent a vectorized
 114       do loop with predication, however we do not need to care about the
 115       details of the predication here.
 116       Arm uses this construct to support MVE tail predication.
 117
 118       (parallel
 119        [(set (pc)
 120              (if_then_else (gtu (plus (reg) (const_int -n))
 121                                 (const_int n-1))
 122                            (label_ref)
 123                            (pc)))
 124         (set (reg) (plus (reg) (const_int -n)))
 125         (additional clobbers and uses)])
 126      */
 127   pattern = PATTERN (doloop_pat);
 128
 129   if (GET_CODE (pattern) != PARALLEL)
 130     {
 131       rtx cond;
 132       rtx_insn *prev_insn = prev_nondebug_insn (doloop_pat);
 133       rtx cmp_arg1, cmp_arg2;
 134       rtx cmp_orig;
 135
 136       /* In case the pattern is not PARALLEL we expect two forms
 137          of doloop which are cases 2) and 3) above: in case 2) the
 138          decrement immediately precedes the branch, while in case 3)
 139          the compare and decrement instructions immediately precede
 140          the branch.  */
 141
 142       if (prev_insn == NULL_RTX || !INSN_P (prev_insn))
 143         return 0;
 144
 145       cmp = pattern;
 146       if (GET_CODE (PATTERN (prev_insn)) == PARALLEL)
 147         {
 148           /* The third case: the compare and decrement instructions
 149              immediately precede the branch.  */
 150           cmp_orig = XVECEXP (PATTERN (prev_insn), 0, 0);
 151           if (GET_CODE (cmp_orig) != SET)
 152             return 0;
 153           if (GET_CODE (SET_SRC (cmp_orig)) != COMPARE)
 154             return 0;
 155           cmp_arg1 = XEXP (SET_SRC (cmp_orig), 0);
 156           cmp_arg2 = XEXP (SET_SRC (cmp_orig), 1);
 157           if (cmp_arg2 != const0_rtx
 158               || GET_CODE (cmp_arg1) != PLUS)
 159             return 0;
 160           reg_orig = XEXP (cmp_arg1, 0);
 161           if (XEXP (cmp_arg1, 1) != GEN_INT (-1)
 162               || !REG_P (reg_orig))
 163             return 0;
 164           cc_reg = SET_DEST (cmp_orig);
 165
 166           inc = XVECEXP (PATTERN (prev_insn), 0, 1);
 167         }
 168       else
 169         inc = PATTERN (prev_insn);
 170       if (GET_CODE (cmp) == SET && GET_CODE (SET_SRC (cmp)) == IF_THEN_ELSE)
 171         {
 172           /* We expect the condition to be of the form (reg != 0)  */
 173           cond = XEXP (SET_SRC (cmp), 0);
 174           if (GET_CODE (cond) != NE || XEXP (cond, 1) != const0_rtx)
 175             return 0;
 176         }
 177     }
 178   else
 179     {
 180       cmp = XVECEXP (pattern, 0, 0);
 181       inc = XVECEXP (pattern, 0, 1);
 182     }
 183
 184   /* Check for (set (reg) (something)).  */
 185   if (GET_CODE (inc) != SET)
 186     return 0;
 187   reg = SET_DEST (inc);
 188   if (! REG_P (reg))
 189     return 0;
 190
 191   /* Check if something = (plus (reg) (const_int -n)).
 192      On IA-64, this decrement is wrapped in an if_then_else.  */
 193   inc_src = SET_SRC (inc);
 194   if (GET_CODE (inc_src) == IF_THEN_ELSE)
 195     inc_src = XEXP (inc_src, 1);
 196   if (GET_CODE (inc_src) != PLUS
 197       || !rtx_equal_p (XEXP (inc_src, 0), reg)
 198       || !CONST_INT_P (XEXP (inc_src, 1))
 199       || INTVAL (XEXP (inc_src, 1)) >= 0)
 200     return 0;
 201   int dec_num = -INTVAL (XEXP (inc_src, 1));
 202
 203   /* Check for (set (pc) (if_then_else (condition)
 204                                        (label_ref (label))
 205                                        (pc))).  */
 206   if (GET_CODE (cmp) != SET
 207       || SET_DEST (cmp) != pc_rtx
 208       || GET_CODE (SET_SRC (cmp)) != IF_THEN_ELSE
 209       || GET_CODE (XEXP (SET_SRC (cmp), 1)) != LABEL_REF
 210       || XEXP (SET_SRC (cmp), 2) != pc_rtx)
 211     return 0;
 212
 213   /* Extract loop termination condition.  */
 214   condition = XEXP (SET_SRC (cmp), 0);
 215
 216   /* We expect a GE or NE comparison with 0 or 1, or a GTU comparison with
 217      dec_num - 1.  */
 218   if (!((GET_CODE (condition) == GE
 219          || GET_CODE (condition) == NE)
 220         && (XEXP (condition, 1) == const0_rtx
 221             || XEXP (condition, 1) == const1_rtx ))
 222       &&!(GET_CODE (condition) == GTU
 223           && ((INTVAL (XEXP (condition, 1))) == (dec_num - 1))))
 224     return 0;
 225
 226   if (rtx_equal_p (XEXP (condition, 0), reg)
 227       /* For the third case:  */
 228       || ((cc_reg != NULL_RTX)
 229           && (XEXP (condition, 0) == cc_reg)
 230           && (rtx_equal_p (reg_orig, reg)))
 231       || (GET_CODE (XEXP (condition, 0)) == PLUS
 232           && rtx_equal_p (XEXP (XEXP (condition, 0), 0), reg, NULL)))
 233     {
 234       if (GET_CODE (pattern) != PARALLEL)
 235       /* For the second form we expect:
 236
 237          (set (reg) (plus (reg) (const_int -1))
 238          (set (pc) (if_then_else (reg != 0)
 239                                  (label_ref (label))
 240                                  (pc))).
 241
 242          is equivalent to the following:
 243
 244          (parallel [(set (pc) (if_then_else (reg != 1)
 245                                             (label_ref (label))
 246                                             (pc)))
 247                      (set (reg) (plus (reg) (const_int -1)))
 248                      (additional clobbers and uses)])
 249
 250         For the third form we expect:
 251
 252         (parallel [(set (cc) (compare ((plus (reg) (const_int -1)), 0))
 253                    (set (reg) (plus (reg) (const_int -1)))])
 254         (set (pc) (if_then_else (cc == NE)
 255                                 (label_ref (label))
 256                                 (pc)))
 257
 258         which is equivalent to the following:
 259
 260         (parallel [(set (cc) (compare (reg,  1))
 261                    (set (reg) (plus (reg) (const_int -1)))
 262                    (set (pc) (if_then_else (NE == cc)
 263                                            (label_ref (label))
 264                                            (pc))))])
 265
 266         So we return the second form instead for the two cases.
 267
 268      */
 269         condition = gen_rtx_fmt_ee (NE, VOIDmode, inc_src, const1_rtx);
 270
 271     return condition;
 272     }
 273
 274   /* ??? If a machine uses a funny comparison, we could return a
 275      canonicalized form here.  */
 276
 277   return 0;
 278 }
 279
 280 /* Return nonzero if the loop specified by LOOP is suitable for
 281    the use of special low-overhead looping instructions.  DESC
 282    describes the number of iterations of the loop.  */
 283
 284 static bool
 285 doloop_valid_p (class loop *loop, class niter_desc *desc)
 286 {
 287   basic_block *body = get_loop_body (loop), bb;
 288   rtx_insn *insn;
 289   unsigned i;
 290   bool result = true;
 291
 292   /* Check for loops that may not terminate under special conditions.  */
 293   if (!desc->simple_p
 294       || desc->assumptions
 295       || desc->infinite)
 296     {
 297       /* There are some cases that would require a special attention.
 298          For example if the comparison is LEU and the comparison value
 299          is UINT_MAX then the loop will not terminate.  Similarly, if the
 300          comparison code is GEU and the comparison value is 0, the
 301          loop will not terminate.
 302
 303          If the absolute increment is not 1, the loop can be infinite
 304          even with LTU/GTU, e.g. for (i = 3; i > 0; i -= 2)
 305
 306          ??? We could compute these conditions at run-time and have a
 307          additional jump around the loop to ensure an infinite loop.
 308          However, it is very unlikely that this is the intended
 309          behavior of the loop and checking for these rare boundary
 310          conditions would pessimize all other code.
 311
 312          If the loop is executed only a few times an extra check to
 313          restart the loop could use up most of the benefits of using a
 314          count register loop.  Note however, that normally, this
 315          restart branch would never execute, so it could be predicted
 316          well by the CPU.  We should generate the pessimistic code by
 317          default, and have an option, e.g. -funsafe-loops that would
 318          enable count-register loops in this case.  */
 319       if (dump_file)
 320         fprintf (dump_file, "Doloop: Possible infinite iteration case.\n");
 321       result = false;
 322       goto cleanup;
 323     }
 324
 325   for (i = 0; i < loop->num_nodes; i++)
 326     {
 327       bb = body[i];
 328
 329       for (insn = BB_HEAD (bb);
 330            insn != NEXT_INSN (BB_END (bb));
 331            insn = NEXT_INSN (insn))
 332         {
 333           /* Different targets have different necessities for low-overhead
 334              looping.  Call the back end for each instruction within the loop
 335              to let it decide whether the insn prohibits a low-overhead loop.
 336              It will then return the cause for it to emit to the dump file.  */
 337           const char * invalid = targetm.invalid_within_doloop (insn);
 338           if (invalid)
 339             {
 340               if (dump_file)
 341                 fprintf (dump_file, "Doloop: %s\n", invalid);
 342               result = false;
 343               goto cleanup;
 344             }
 345         }
 346     }
 347   result = true;
 348
 349 cleanup:
 350   free (body);
 351
 352   return result;
 353 }
 354
 355 /* Adds test of COND jumping to DEST on edge *E and set *E to the new fallthru
 356    edge.  If the condition is always false, do not do anything.  If it is always
 357    true, redirect E to DEST and return false.  In all other cases, true is
 358    returned.  */
 359
 360 static bool
 361 add_test (rtx cond, edge *e, basic_block dest)
 362 {
 363   rtx_insn *seq, *jump;
 364   rtx_code_label *label;
 365   machine_mode mode;
 366   rtx op0 = XEXP (cond, 0), op1 = XEXP (cond, 1);
 367   enum rtx_code code = GET_CODE (cond);
 368   basic_block bb;
 369   /* The jump is supposed to handle an unlikely special case.  */
 370   profile_probability prob = profile_probability::guessed_never ();
 371
 372   mode = GET_MODE (XEXP (cond, 0));
 373   if (mode == VOIDmode)
 374     mode = GET_MODE (XEXP (cond, 1));
 375
 376   start_sequence ();
 377   op0 = force_operand (op0, NULL_RTX);
 378   op1 = force_operand (op1, NULL_RTX);
 379   label = block_label (dest);
 380   do_compare_rtx_and_jump (op0, op1, code, 0, mode, NULL_RTX, NULL, label,
 381                            prob);
 382
 383   jump = get_last_insn ();
 384   if (!jump || !JUMP_P (jump))
 385     {
 386       /* The condition is always false and the jump was optimized out.  */
 387       end_sequence ();
 388       return true;
 389     }
 390
 391   seq = get_insns ();
 392   unshare_all_rtl_in_chain (seq);
 393   end_sequence ();
 394
 395   /* There always is at least the jump insn in the sequence.  */
 396   gcc_assert (seq != NULL_RTX);
 397
 398   bb = split_edge_and_insert (*e, seq);
 399   *e = single_succ_edge (bb);
 400
 401   if (any_uncondjump_p (jump) && onlyjump_p (jump))
 402     {
 403       /* The condition is always true.  */
 404       delete_insn (jump);
 405       redirect_edge_and_branch_force (*e, dest);
 406       return false;
 407     }
 408
 409   JUMP_LABEL (jump) = label;
 410
 411   LABEL_NUSES (label)++;
 412
 413   edge e2 = make_edge (bb, dest, (*e)->flags & ~EDGE_FALLTHRU);
 414   e2->probability = prob;
 415   (*e)->probability = prob.invert ();
 416   update_br_prob_note (e2->src);
 417   return true;
 418 }
 419
 420 /* Fold (add -1; zero_ext; add +1) operations to zero_ext if not wrapping. i.e:
 421
 422    73: r145:SI=r123:DI#0-0x1
 423    74: r144:DI=zero_extend (r145:SI)
 424    75: r143:DI=r144:DI+0x1
 425    ...
 426    31: r135:CC=cmp (r123:DI,0)
 427    72: {pc={(r143:DI!=0x1)?L70:pc};r143:DI=r143:DI-0x1;...}
 428
 429    r123:DI#0-0x1 is param count derived from loop->niter_expr equal to number of
 430    loop iterations, if loop iterations expression doesn't overflow, then
 431    (zero_extend (r123:DI#0-1))+1 can be simplified to zero_extend.  */
 432
 433 static rtx
 434 doloop_simplify_count (class loop *loop, scalar_int_mode mode, rtx count)
 435 {
 436   widest_int iterations;
 437   if (GET_CODE (count) == ZERO_EXTEND)
 438     {
 439       rtx extop0 = XEXP (count, 0);
 440       if (GET_CODE (extop0) == PLUS)
 441         {
 442           rtx addop0 = XEXP (extop0, 0);
 443           rtx addop1 = XEXP (extop0, 1);
 444
 445           if (get_max_loop_iterations (loop, &iterations)
 446               && wi::ltu_p (iterations, GET_MODE_MASK (GET_MODE (addop0)))
 447               && addop1 == constm1_rtx)
 448             return simplify_gen_unary (ZERO_EXTEND, mode, addop0,
 449                                        GET_MODE (addop0));
 450         }
 451     }
 452
 453   return simplify_gen_binary (PLUS, mode, count, const1_rtx);
 454 }
 455
 456 /* Modify the loop to use the low-overhead looping insn where LOOP
 457    describes the loop, DESC describes the number of iterations of the
 458    loop, and DOLOOP_INSN is the low-overhead looping insn to emit at the
 459    end of the loop.  CONDITION is the condition separated from the
 460    DOLOOP_SEQ.  COUNT is the number of iterations of the LOOP.  */
 461
 462 static void
 463 doloop_modify (class loop *loop, class niter_desc *desc,
 464                rtx_insn *doloop_seq, rtx condition, rtx count)
 465 {
 466   rtx counter_reg;
 467   rtx tmp, noloop = NULL_RTX;
 468   rtx_insn *sequence;
 469   rtx_insn *jump_insn;
 470   rtx_code_label *jump_label;
 471   int nonneg = 0;
 472   bool increment_count;
 473   basic_block loop_end = desc->out_edge->src;
 474   scalar_int_mode mode;
 475   widest_int iterations;
 476
 477   jump_insn = BB_END (loop_end);
 478
 479   if (dump_file)
 480     {
 481       fprintf (dump_file, "Doloop: Inserting doloop pattern (");
 482       if (desc->const_iter)
 483         fprintf (dump_file, "%" PRId64, desc->niter);
 484       else
 485         fputs ("runtime", dump_file);
 486       fputs (" iterations).\n", dump_file);
 487     }
 488
 489   /* Discard original jump to continue loop.  The original compare
 490      result may still be live, so it cannot be discarded explicitly.  */
 491   delete_insn (jump_insn);
 492
 493   counter_reg = XEXP (condition, 0);
 494   if (GET_CODE (counter_reg) == PLUS)
 495     counter_reg = XEXP (counter_reg, 0);
 496   /* These patterns must operate on integer counters.  */
 497   mode = as_a <scalar_int_mode> (GET_MODE (counter_reg));
 498
 499   increment_count = false;
 500   switch (GET_CODE (condition))
 501     {
 502     case NE:
 503       /* Currently only NE tests against zero and one are supported.  */
 504       noloop = XEXP (condition, 1);
 505       if (noloop != const0_rtx)
 506         {
 507           gcc_assert (noloop == const1_rtx);
 508           increment_count = true;
 509         }
 510       break;
 511
 512     case GE:
 513       /* Currently only GE tests against zero are supported.  */
 514       gcc_assert (XEXP (condition, 1) == const0_rtx);
 515
 516       noloop = constm1_rtx;
 517
 518       /* The iteration count does not need incrementing for a GE test.  */
 519       increment_count = false;
 520
 521       /* Determine if the iteration counter will be non-negative.
 522          Note that the maximum value loaded is iterations_max - 1.  */
 523       if (get_max_loop_iterations (loop, &iterations)
 524           && wi::leu_p (iterations,
 525                         wi::set_bit_in_zero <widest_int>
 526                         (GET_MODE_PRECISION (mode) - 1)))
 527         nonneg = 1;
 528       break;
 529
 530     case GTU:
 531       /* The iteration count does not need incrementing for a GTU test.  */
 532       increment_count = false;
 533       break;
 534
 535       /* Abort if an invalid doloop pattern has been generated.  */
 536     default:
 537       gcc_unreachable ();
 538     }
 539
 540   if (increment_count)
 541     count = doloop_simplify_count (loop, mode, count);
 542
 543   /* Insert initialization of the count register into the loop header.  */
 544   start_sequence ();
 545   /* count has been already copied through copy_rtx.  */
 546   reset_used_flags (count);
 547   set_used_flags (condition);
 548   tmp = force_operand (count, counter_reg);
 549   convert_move (counter_reg, tmp, 1);
 550   sequence = get_insns ();
 551   unshare_all_rtl_in_chain (sequence);
 552   end_sequence ();
 553   emit_insn_after (sequence, BB_END (loop_preheader_edge (loop)->src));
 554
 555   if (desc->noloop_assumptions)
 556     {
 557       /* The GTU case has only been implemented for Arm, where
 558          noloop_assumptions gets explicitly set to NULL for that case, so
 559          assert here for safety.  */
 560       gcc_assert (GET_CODE (condition) != GTU);
 561       rtx ass = copy_rtx (desc->noloop_assumptions);
 562       basic_block preheader = loop_preheader_edge (loop)->src;
 563       basic_block set_zero = split_edge (loop_preheader_edge (loop));
 564       basic_block new_preheader = split_edge (loop_preheader_edge (loop));
 565       edge te;
 566
 567       /* Expand the condition testing the assumptions and if it does not pass,
 568          reset the count register to 0.  */
 569       redirect_edge_and_branch_force (single_succ_edge (preheader), new_preheader);
 570       set_immediate_dominator (CDI_DOMINATORS, new_preheader, preheader);
 571
 572       set_zero->count = profile_count::uninitialized ();
 573
 574       te = single_succ_edge (preheader);
 575       for (; ass; ass = XEXP (ass, 1))
 576         if (!add_test (XEXP (ass, 0), &te, set_zero))
 577           break;
 578
 579       if (ass)
 580         {
 581           /* We reached a condition that is always true.  This is very hard to
 582              reproduce (such a loop does not roll, and thus it would most
 583              likely get optimized out by some of the preceding optimizations).
 584              In fact, I do not have any testcase for it.  However, it would
 585              also be very hard to show that it is impossible, so we must
 586              handle this case.  */
 587           set_zero->count = preheader->count;
 588         }
 589
 590       if (EDGE_COUNT (set_zero->preds) == 0)
 591         {
 592           /* All the conditions were simplified to false, remove the
 593              unreachable set_zero block.  */
 594           delete_basic_block (set_zero);
 595         }
 596       else
 597         {
 598           /* Reset the counter to zero in the set_zero block.  */
 599           start_sequence ();
 600           convert_move (counter_reg, noloop, 0);
 601           sequence = get_insns ();
 602           end_sequence ();
 603           emit_insn_after (sequence, BB_END (set_zero));
 604
 605           set_immediate_dominator (CDI_DOMINATORS, set_zero,
 606                                    recompute_dominator (CDI_DOMINATORS,
 607                                                         set_zero));
 608         }
 609
 610       set_immediate_dominator (CDI_DOMINATORS, new_preheader,
 611                                recompute_dominator (CDI_DOMINATORS,
 612                                                     new_preheader));
 613     }
 614
 615   /* Some targets (eg, C4x) need to initialize special looping
 616      registers.  */
 617   if (targetm.have_doloop_begin ())
 618     if (rtx_insn *seq = targetm.gen_doloop_begin (counter_reg, doloop_seq))
 619       emit_insn_after (seq, BB_END (loop_preheader_edge (loop)->src));
 620
 621   /* Insert the new low-overhead looping insn.  */
 622   emit_jump_insn_after (doloop_seq, BB_END (loop_end));
 623   jump_insn = BB_END (loop_end);
 624   jump_label = block_label (desc->in_edge->dest);
 625   JUMP_LABEL (jump_insn) = jump_label;
 626   LABEL_NUSES (jump_label)++;
 627
 628   /* Ensure the right fallthru edge is marked, for case we have reversed
 629      the condition.  */
 630   desc->in_edge->flags &= ~EDGE_FALLTHRU;
 631   desc->out_edge->flags |= EDGE_FALLTHRU;
 632
 633   /* Add a REG_NONNEG note if the actual or estimated maximum number
 634      of iterations is non-negative.  */
 635   if (nonneg)
 636     add_reg_note (jump_insn, REG_NONNEG, NULL_RTX);
 637
 638   /* Update the REG_BR_PROB note.  */
 639   if (desc->in_edge->probability.initialized_p ())
 640     add_reg_br_prob_note (jump_insn, desc->in_edge->probability);
 641 }
 642
 643 /* Called through note_stores.  */
 644
 645 static void
 646 record_reg_sets (rtx x, const_rtx pat ATTRIBUTE_UNUSED, void *data)
 647 {
 648   bitmap mod = (bitmap)data;
 649   if (REG_P (x))
 650     {
 651       unsigned int regno = REGNO (x);
 652       if (HARD_REGISTER_P (x))
 653         {
 654           unsigned int end_regno = end_hard_regno (GET_MODE (x), regno);
 655           do
 656             bitmap_set_bit (mod, regno);
 657           while (++regno < end_regno);
 658         }
 659       else
 660         bitmap_set_bit (mod, regno);
 661     }
 662 }
 663
 664 /* Process loop described by LOOP validating that the loop is suitable for
 665    conversion to use a low overhead looping instruction, replacing the jump
 666    insn where suitable.  Returns true if the loop was successfully
 667    modified.  */
 668
 669 static bool
 670 doloop_optimize (class loop *loop)
 671 {
 672   scalar_int_mode mode;
 673   rtx doloop_reg;
 674   rtx count = NULL_RTX;
 675   widest_int iterations, iterations_max;
 676   rtx_code_label *start_label;
 677   rtx condition;
 678   unsigned level;
 679   HOST_WIDE_INT est_niter;
 680   int max_cost;
 681   class niter_desc *desc;
 682   unsigned word_mode_size;
 683   unsigned HOST_WIDE_INT word_mode_max;
 684   int entered_at_top;
 685
 686   if (dump_file)
 687     fprintf (dump_file, "Doloop: Processing loop %d.\n", loop->num);
 688
 689   iv_analysis_loop_init (loop);
 690
 691   /* Find the simple exit of a LOOP.  */
 692   desc = get_simple_loop_desc (loop);
 693
 694   /* Check that loop is a candidate for a low-overhead looping insn.  */
 695   if (!doloop_valid_p (loop, desc))
 696     {
 697       if (dump_file)
 698         fprintf (dump_file,
 699                  "Doloop: The loop is not suitable.\n");
 700       return false;
 701     }
 702   mode = desc->mode;
 703
 704   est_niter = get_estimated_loop_iterations_int (loop);
 705   if (est_niter == -1)
 706     est_niter = get_likely_max_loop_iterations_int (loop);
 707
 708   if (est_niter >= 0 && est_niter < 3)
 709     {
 710       if (dump_file)
 711         fprintf (dump_file,
 712                  "Doloop: Too few iterations (%u) to be profitable.\n",
 713                  (unsigned int)est_niter);
 714       return false;
 715     }
 716
 717   if (desc->const_iter)
 718     iterations = widest_int::from (rtx_mode_t (desc->niter_expr, mode),
 719                                    UNSIGNED);
 720   else
 721     iterations = 0;
 722   if (!get_max_loop_iterations (loop, &iterations_max))
 723     iterations_max = 0;
 724   level = get_loop_level (loop) + 1;
 725   entered_at_top = (loop->latch == desc->in_edge->dest
 726                     && contains_no_active_insn_p (loop->latch));
 727   if (!targetm.can_use_doloop_p (iterations, iterations_max, level,
 728                                  entered_at_top))
 729     {
 730       if (dump_file)
 731         fprintf (dump_file, "Loop rejected by can_use_doloop_p.\n");
 732       return false;
 733     }
 734
 735   /* Generate looping insn.  If the pattern FAILs then give up trying
 736      to modify the loop since there is some aspect the back-end does
 737      not like.  If this succeeds, there is a chance that the loop
 738      desc->niter_expr has been altered by the backend, so only extract
 739      that data after the gen_doloop_end.  */
 740   start_label = block_label (desc->in_edge->dest);
 741   doloop_reg = gen_reg_rtx (mode);
 742   rtx_insn *doloop_seq = targetm.gen_doloop_end (doloop_reg, start_label);
 743
 744   max_cost
 745     = COSTS_N_INSNS (param_max_iterations_computation_cost);
 746   if (set_src_cost (desc->niter_expr, mode, optimize_loop_for_speed_p (loop))
 747       > max_cost)
 748     {
 749       if (dump_file)
 750         fprintf (dump_file,
 751                  "Doloop: number of iterations too costly to compute.\n");
 752       return false;
 753     }
 754
 755   count = copy_rtx (desc->niter_expr);
 756   word_mode_size = GET_MODE_PRECISION (word_mode);
 757   word_mode_max = (HOST_WIDE_INT_1U << (word_mode_size - 1) << 1) - 1;
 758   if (! doloop_seq
 759       && mode != word_mode
 760       /* Before trying mode different from the one in that # of iterations is
 761          computed, we must be sure that the number of iterations fits into
 762          the new mode.  */
 763       && (word_mode_size >= GET_MODE_PRECISION (mode)
 764           || wi::leu_p (iterations_max, word_mode_max)))
 765     {
 766       if (word_mode_size > GET_MODE_PRECISION (mode))
 767         count = simplify_gen_unary (ZERO_EXTEND, word_mode, count, mode);
 768       else
 769         count = lowpart_subreg (word_mode, count, mode);
 770       PUT_MODE (doloop_reg, word_mode);
 771       doloop_seq = targetm.gen_doloop_end (doloop_reg, start_label);
 772     }
 773   if (! doloop_seq)
 774     {
 775       if (dump_file)
 776         fprintf (dump_file,
 777                  "Doloop: Target unwilling to use doloop pattern!\n");
 778       return false;
 779     }
 780
 781   /* If multiple instructions were created, the last must be the
 782      jump instruction.  */
 783   rtx_insn *doloop_insn = doloop_seq;
 784   while (NEXT_INSN (doloop_insn) != NULL_RTX)
 785     doloop_insn = NEXT_INSN (doloop_insn);
 786   if (!JUMP_P (doloop_insn)
 787       || !(condition = doloop_condition_get (doloop_insn)))
 788     {
 789       if (dump_file)
 790         fprintf (dump_file, "Doloop: Unrecognizable doloop pattern!\n");
 791       return false;
 792     }
 793
 794   /* Ensure that the new sequence doesn't clobber a register that
 795      is live at the end of the block.  */
 796   {
 797     bitmap modified = BITMAP_ALLOC (NULL);
 798
 799     for (rtx_insn *i = doloop_seq; i != NULL; i = NEXT_INSN (i))
 800       note_stores (i, record_reg_sets, modified);
 801
 802     basic_block loop_end = desc->out_edge->src;
 803     bool fail = bitmap_intersect_p (df_get_live_out (loop_end), modified);
 804     /* iv_analysis_loop_init calls df_analyze_loop, which computes just
 805        partial df for blocks of the loop only.  The above will catch if
 806        any of the modified registers are use inside of the loop body, but
 807        it will most likely not have accurate info on registers used
 808        at the destination of the out_edge.  We call df_analyze on the
 809        whole function at the start of the pass though and iterate only
 810        on innermost loops or from innermost loops, so
 811        live in on desc->out_edge->dest should be still unmodified from
 812        the initial df_analyze.  */
 813     if (!fail)
 814       fail = bitmap_intersect_p (df_get_live_in (desc->out_edge->dest),
 815                                  modified);
 816     BITMAP_FREE (modified);
 817
 818     if (fail)
 819       {
 820         if (dump_file)
 821           fprintf (dump_file, "Doloop: doloop pattern clobbers live out\n");
 822         return false;
 823       }
 824   }
 825
 826   doloop_modify (loop, desc, doloop_seq, condition, count);
 827   return true;
 828 }
 829
 830 /* This is the main entry point.  Process all loops using doloop_optimize.  */
 831
 832 void
 833 doloop_optimize_loops (void)
 834 {
 835   if (optimize == 1)
 836     {
 837       df_live_add_problem ();
 838       df_live_set_all_dirty ();
 839     }
 840
 841   df_analyze ();
 842
 843   for (auto loop : loops_list (cfun,
 844                                targetm.can_use_doloop_p
 845                                == can_use_doloop_if_innermost
 846                                ? LI_ONLY_INNERMOST : LI_FROM_INNERMOST))
 847     doloop_optimize (loop);
 848
 849   if (optimize == 1)
 850     df_remove_problem (df_live);
 851
 852   iv_analysis_done ();
 853
 854   checking_verify_loop_structure ();
 855 }