gcc/config/aarch64/aarch64-cc-fusion.cc

   1 // Pass to fuse CC operations with other instructions.
   2 // Copyright (C) 2021-2025 Free Software Foundation, Inc.
   3 //
   4 // This file is part of GCC.
   5 //
   6 // GCC is free software; you can redistribute it and/or modify it under
   7 // the terms of the GNU General Public License as published by the Free
   8 // Software Foundation; either version 3, or (at your option) any later
   9 // version.
  10 //
  11 // GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  12 // WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13 // FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14 // for more details.
  15 //
  16 // You should have received a copy of the GNU General Public License
  17 // along with GCC; see the file COPYING3.  If not see
  18 // <http://www.gnu.org/licenses/>.
  19
  20 // This pass looks for sequences of the form:
  21 //
  22 //    A: (set (reg R1) X1)
  23 //    B: ...instructions that might change the value of X1...
  24 //    C: (set (reg CC) X2) // X2 uses R1
  25 //
  26 // and tries to change them to:
  27 //
  28 //    C': [(set (reg CC) X2')
  29 //         (set (reg R1) X1)]
  30 //    B: ...instructions that might change the value of X1...
  31 //
  32 // where X2' is the result of replacing R1 with X1 in X2.
  33 //
  34 // This sequence occurs in SVE code in two important cases:
  35 //
  36 // (a) Sometimes, to deal correctly with overflow, we need to increment
  37 //     an IV after a WHILELO rather than before it.  In this case:
  38 //     - A is a WHILELO,
  39 //     - B includes an IV increment and
  40 //     - C is a separate PTEST.
  41 //
  42 // (b) ACLE code of the form:
  43 //
  44 //       svbool_t ok = svrdffr ();
  45 //       if (svptest_last (pg, ok))
  46 //         ...
  47 //
  48 //     must, for performance reasons, be code-generated as:
  49 //
  50 //       RDFFRS Pok.B, Pg/Z
  51 //       ...branch on flags result...
  52 //
  53 //     without a separate PTEST of Pok.  In this case:
  54 //     - A is an aarch64_rdffr
  55 //     - B includes an aarch64_update_ffrt
  56 //     - C is a separate PTEST
  57 //
  58 // Combine can handle this optimization if B doesn't exist and if A and
  59 // C are in the same BB.  This pass instead handles cases where B does
  60 // exist and cases where A and C are in different BBs of the same EBB.
  61
  62 #define IN_TARGET_CODE 1
  63
  64 #define INCLUDE_ALGORITHM
  65 #define INCLUDE_FUNCTIONAL
  66 #define INCLUDE_ARRAY
  67 #include "config.h"
  68 #include "system.h"
  69 #include "coretypes.h"
  70 #include "backend.h"
  71 #include "rtl.h"
  72 #include "df.h"
  73 #include "rtl-ssa.h"
  74 #include "tree-pass.h"
  75
  76 using namespace rtl_ssa;
  77
  78 namespace {
  79 const pass_data pass_data_cc_fusion =
  80 {
  81   RTL_PASS, // type
  82   "cc_fusion", // name
  83   OPTGROUP_NONE, // optinfo_flags
  84   TV_NONE, // tv_id
  85   0, // properties_required
  86   0, // properties_provided
  87   0, // properties_destroyed
  88   0, // todo_flags_start
  89   TODO_df_finish, // todo_flags_finish
  90 };
  91
  92 // Class that represents one run of the pass.
  93 class cc_fusion
  94 {
  95 public:
  96   cc_fusion ()  : m_parallel () {}
  97   void execute ();
  98
  99 private:
 100   rtx optimizable_set (const insn_info *);
 101   bool parallelize_insns (def_info *, rtx, def_info *, rtx);
 102   void optimize_cc_setter (def_info *, rtx);
 103
 104   // A spare PARALLEL rtx, or null if none.
 105   rtx m_parallel;
 106 };
 107
 108 // See whether INSN is a single_set that we can optimize.  Return the
 109 // set if so, otherwise return null.
 110 rtx
 111 cc_fusion::optimizable_set (const insn_info *insn)
 112 {
 113   if (!insn->can_be_optimized ()
 114       || insn->is_asm ()
 115       || insn->has_volatile_refs ()
 116       || insn->has_pre_post_modify ())
 117     return NULL_RTX;
 118
 119   return single_set (insn->rtl ());
 120 }
 121
 122 // CC_SET is a single_set that sets (only) CC_DEF; OTHER_SET is likewise
 123 // a single_set that sets (only) OTHER_DEF.  CC_SET is known to set the
 124 // CC register and the instruction that contains CC_SET is known to use
 125 // OTHER_DEF.  Try to do CC_SET and OTHER_SET in parallel.
 126 bool
 127 cc_fusion::parallelize_insns (def_info *cc_def, rtx cc_set,
 128                               def_info *other_def, rtx other_set)
 129 {
 130   auto attempt = crtl->ssa->new_change_attempt ();
 131
 132   insn_info *cc_insn = cc_def->insn ();
 133   insn_info *other_insn = other_def->insn ();
 134   if (dump_file && (dump_flags & TDF_DETAILS))
 135     fprintf (dump_file, "trying to parallelize insn %d and insn %d\n",
 136              other_insn->uid (), cc_insn->uid ());
 137
 138   // Try to substitute OTHER_SET into CC_INSN.
 139   insn_change_watermark rtl_watermark;
 140   rtx_insn *cc_rtl = cc_insn->rtl ();
 141   insn_propagation prop (cc_rtl, SET_DEST (other_set),
 142                          SET_SRC (other_set));
 143   if (!prop.apply_to_pattern (&PATTERN (cc_rtl))
 144       || prop.num_replacements == 0)
 145     {
 146       if (dump_file && (dump_flags & TDF_DETAILS))
 147         fprintf (dump_file, "-- failed to substitute all uses of r%d\n",
 148                  other_def->regno ());
 149       return false;
 150     }
 151
 152   // Restrict the uses to those outside notes.
 153   use_array cc_uses = remove_note_accesses (attempt, cc_insn->uses ());
 154   use_array other_set_uses = remove_note_accesses (attempt,
 155                                                    other_insn->uses ());
 156
 157   // Remove the use of the substituted value.
 158   access_array_builder uses_builder (attempt);
 159   uses_builder.reserve (cc_uses.size ());
 160   for (use_info *use : cc_uses)
 161     if (use->def () != other_def)
 162       uses_builder.quick_push (use);
 163   cc_uses = use_array (uses_builder.finish ());
 164
 165   // Get the list of uses for the new instruction.
 166   insn_change cc_change (cc_insn);
 167   cc_change.new_uses = merge_access_arrays (attempt, other_set_uses, cc_uses);
 168   if (!cc_change.new_uses.is_valid ())
 169     {
 170       if (dump_file && (dump_flags & TDF_DETAILS))
 171         fprintf (dump_file, "-- cannot merge uses\n");
 172       return false;
 173     }
 174
 175   // The instruction initially defines just two registers.  recog can add
 176   // extra clobbers if necessary.
 177   auto_vec<access_info *, 2> new_defs;
 178   new_defs.quick_push (cc_def);
 179   new_defs.quick_push (other_def);
 180   sort_accesses (new_defs);
 181   cc_change.new_defs = def_array (access_array (new_defs));
 182
 183   // Make sure there is somewhere that the new instruction could live.
 184   auto other_change = insn_change::delete_insn (other_insn);
 185   insn_change *changes[] = { &other_change, &cc_change };
 186   cc_change.move_range = cc_insn->ebb ()->insn_range ();
 187   if (!restrict_movement (cc_change, ignore_changing_insns (changes)))
 188     {
 189       if (dump_file && (dump_flags & TDF_DETAILS))
 190         fprintf (dump_file, "-- cannot satisfy all definitions and uses\n");
 191       return false;
 192     }
 193
 194   // Tentatively install the new pattern.  By convention, the CC set
 195   // must be first.
 196   if (m_parallel)
 197     {
 198       XVECEXP (m_parallel, 0, 0) = cc_set;
 199       XVECEXP (m_parallel, 0, 1) = other_set;
 200     }
 201   else
 202     {
 203       rtvec vec = gen_rtvec (2, cc_set, other_set);
 204       m_parallel = gen_rtx_PARALLEL (VOIDmode, vec);
 205     }
 206   validate_change (cc_rtl, &PATTERN (cc_rtl), m_parallel, 1);
 207
 208   // These routines report failures themselves.
 209   if (!recog (attempt, cc_change, ignore_changing_insns (changes))
 210       || !changes_are_worthwhile (changes)
 211       || !crtl->ssa->verify_insn_changes (changes))
 212     return false;
 213
 214   remove_reg_equal_equiv_notes (cc_rtl);
 215   confirm_change_group ();
 216   crtl->ssa->change_insns (changes);
 217   m_parallel = NULL_RTX;
 218   return true;
 219 }
 220
 221 // Try to optimize the instruction that contains CC_DEF, where CC_DEF describes
 222 // a definition of the CC register by CC_SET.
 223 void
 224 cc_fusion::optimize_cc_setter (def_info *cc_def, rtx cc_set)
 225 {
 226   // Search the registers used by the CC setter for an easily-substitutable
 227   // def-use chain.
 228   for (use_info *other_use : cc_def->insn ()->uses ())
 229     if (def_info *other_def = other_use->def ())
 230       if (other_use->regno () != CC_REGNUM
 231           && other_def->ebb () == cc_def->ebb ())
 232         if (rtx other_set = optimizable_set (other_def->insn ()))
 233           {
 234             rtx dest = SET_DEST (other_set);
 235             if (REG_P (dest)
 236                 && REGNO (dest) == other_def->regno ()
 237                 && REG_NREGS (dest) == 1
 238                 && parallelize_insns (cc_def, cc_set, other_def, other_set))
 239               return;
 240           }
 241 }
 242
 243 // Run the pass on the current function.
 244 void
 245 cc_fusion::execute ()
 246 {
 247   // Initialization.
 248   calculate_dominance_info (CDI_DOMINATORS);
 249   df_analyze ();
 250   crtl->ssa = new rtl_ssa::function_info (cfun);
 251
 252   // Walk through all instructions that set CC.  Look for a PTEST instruction
 253   // that we can optimize.
 254   //
 255   // ??? The PTEST test isn't needed for correctness, but it ensures that the
 256   // pass no effect on non-SVE code.
 257   for (def_info *def : crtl->ssa->reg_defs (CC_REGNUM))
 258     if (rtx cc_set = optimizable_set (def->insn ()))
 259       if (REG_P (SET_DEST (cc_set))
 260           && REGNO (SET_DEST (cc_set)) == CC_REGNUM
 261           && GET_CODE (SET_SRC (cc_set)) == UNSPEC
 262           && XINT (SET_SRC (cc_set), 1) == UNSPEC_PTEST)
 263         optimize_cc_setter (def, cc_set);
 264
 265   // Finalization.
 266   crtl->ssa->perform_pending_updates ();
 267   free_dominance_info (CDI_DOMINATORS);
 268 }
 269
 270 class pass_cc_fusion : public rtl_opt_pass
 271 {
 272 public:
 273   pass_cc_fusion (gcc::context *ctxt)
 274     : rtl_opt_pass (pass_data_cc_fusion, ctxt)
 275   {}
 276
 277   // opt_pass methods:
 278   virtual bool gate (function *) { return TARGET_SVE && optimize >= 2; }
 279   virtual unsigned int execute (function *);
 280 };
 281
 282 unsigned int
 283 pass_cc_fusion::execute (function *)
 284 {
 285   cc_fusion ().execute ();
 286   return 0;
 287 }
 288
 289 } // end namespace
 290
 291 // Create a new CC fusion pass instance.
 292
 293 rtl_opt_pass *
 294 make_pass_cc_fusion (gcc::context *ctxt)
 295 {
 296   return new pass_cc_fusion (ctxt);
 297 }