/* AVL propagation pass for RISC-V 'V' Extension for GNU compiler.
   Copyright (C) 2023-2025 Free Software Foundation, Inc.
   Contributed by Juzhe Zhong (juzhe.zhong@rivai.ai), RiVAI Technologies Ltd.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.

GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
/* Pre-RA RTL_SSA-based pass that propagates AVL for RVV instructions.
   A standalone AVL propagation pass is designed because:

   - Better code maintainability:
     The current LCM-based VSETVL pass is so complicated that the code
     there will only become harder to maintain.  A straightforward
     AVL propagation pass is much easier to maintain.

   - Reduced scalar register pressure:
     One type of AVL propagation is to propagate the AVL from a
     NON-VLMAX instruction to a VLMAX instruction.
     Note: the VLMAX instruction must be tail agnostic (TA) and its
     result must be used by the NON-VLMAX instruction.
     This optimization mostly targets auto-vectorized code:

       vsetvli r136, r137       --- SELECT_VL
       vle8.v (use avl = r136)  --- IFN_MASK_LEN_LOAD
       vadd.vv (use VLMAX)      --- PLUS_EXPR
       vse8.v (use avl = r136)  --- IFN_MASK_LEN_STORE

     Without AVL propagation:

       vsetvli a5, a4, ta
       vle8.v v1
       vsetvli t0, zero, ta
       vadd.vv v2, v1, v1
       vse8.v v2

     We can propagate the AVL to 'vadd.vv' since its result
     is consumed by a 'vse8.v' which has AVL = a5 and its
     tail elements are agnostic.
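
     A sketch of the expected result, assuming the later VSETVL pass
     then removes the now-redundant 'vsetvli t0, zero' once 'vadd.vv'
     carries AVL a5 (illustrative output, not from an actual compiler
     run):

       vsetvli a5, a4, ta
       vle8.v v1
       vadd.vv v2, v1, v1
       vse8.v v2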
     We DON'T do this optimization in the VSETVL pass, since that is a
     post-RA pass which has already consumed 't0', whereas a standalone
     pre-RA AVL propagation pass allows us to elide the consumption
     of the pseudo register of 't0' and thus reduce scalar
     register pressure.

   - More AVL propagation opportunities:
     A pre-RA pass is more flexible for the AVL REG def-use chain,
     thus we will get more potential AVL propagations as long as
     they don't increase the scalar register pressure.  */
#define IN_TARGET_CODE 1
#define INCLUDE_ALGORITHM
#define INCLUDE_FUNCTIONAL
#define INCLUDE_ARRAY

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "tm.h"
#include "backend.h"
#include "rtl.h"
#include "target.h"
#include "tree-pass.h"
#include "df.h"
#include "rtl-ssa.h"
#include "cfgcleanup.h"
#include "insn-attr.h"
#include "tm-constrs.h"
#include "insn-opinit.h"

using namespace rtl_ssa;
using namespace riscv_vector;
enum avlprop_type
{
  /* VLMAX AVL and tail agnostic candidates.  */
  AVLPROP_VLMAX_TA,
  AVLPROP_NONE
};

/* Dump helper functions.  */
static const char *
avlprop_type_to_str (enum avlprop_type type)
{
  switch (type)
    {
    case AVLPROP_VLMAX_TA:
      return "vlmax_ta";

    default:
      gcc_unreachable ();
    }
}
/* Return true if the AVL of the INSN can be propagated.  */
static bool
avl_can_be_propagated_p (rtx_insn *rinsn)
{
  /* We can't do AVL propagation when the instruction may touch an
     element with index i >= AVL.  So we don't do AVL propagation in
     the following situations:

     vrgather:
       - The index operand of "vrgather dest, source, index" may pick
	 an element which has index >= AVL, so we can't strip the
	 elements of the source register that have index >= AVL
	 (see the sketch after this comment).
     vslide1down:
       - The last element of vslide1down is at AVL + 1 according to
	 the RVV ISA:
	   vstart <= i < vl-1   vd[i] = vs2[i+1] if v0.mask[i] enabled
     vslidedown:
       - The last several elements of vslidedown can be elements with
	 index >= AVL according to the RVV ISA:
	   0 <= i+OFFSET < VLMAX   src[i] = vs2[i+OFFSET]
	   vstart <= i < vl        vd[i] = src[i] if v0.mask[i] enabled.
     vcompress:
       - According to the ISA, the first vl elements of vector register
	 group vs2 should be extracted and packed for vcompress, and the
	 highest element of vs2 may be touched by the mask.  For
	 example, given vlmax = 4 here:
	   v0 = 0b1000
	   v1 = {0x1, 0x2, 0x3, 0x4}
	   v2 = {0x5, 0x6, 0x7, 0x8}
	 vcompress v1, v2, v0 with avl = 4: v1 = {0x8, 0x2, 0x3, 0x4}.
	 vcompress v1, v2, v0 with avl = 2: v1 is unchanged.
       Thus, we cannot propagate the AVL of vcompress because doing so
       may change the semantics of the result.  */
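
  /* A concrete vrgather sketch (hypothetical values chosen purely for
     illustration): with vlmax = 4,
       index  v2 = {3, 0, 1, 2}
       source v1 = {0x1, 0x2, 0x3, 0x4}
     "vrgather.vv vd, v1, v2" with avl = 2 still reads v1[3] = 0x4 even
     though 3 >= avl, so shrinking the AVL of the instruction that
     defines v1 could change the gathered result.  */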
  return get_attr_type (rinsn) != TYPE_VGATHER
	 && get_attr_type (rinsn) != TYPE_VSLIDEDOWN
	 && get_attr_type (rinsn) != TYPE_VISLIDE1DOWN
	 && get_attr_type (rinsn) != TYPE_VFSLIDE1DOWN
	 && get_attr_type (rinsn) != TYPE_VCOMPRESS;
}
static bool
vlmax_ta_p (rtx_insn *rinsn)
{
  return vlmax_avl_type_p (rinsn) && tail_agnostic_p (rinsn);
}
static machine_mode
get_insn_vtype_mode (rtx_insn *rinsn)
{
  extract_insn_cached (rinsn);
  int mode_idx = get_attr_mode_idx (rinsn);
  gcc_assert (mode_idx != INVALID_ATTRIBUTE);
  return GET_MODE (recog_data.operand[mode_idx]);
}
/* Return a new pattern for AVL propagation.
   Normally, we just replace the AVL operand for most
   of the instructions.  However, instructions like
   fault-only-first loads use the AVL TYPE twice in the pattern,
   which would cause an ICE in the later AVL TYPE change, so we
   regenerate the whole pattern for such instructions.  */
static rtx
simplify_replace_avl (rtx_insn *rinsn, rtx new_avl)
{
  /* Replace the AVL operand.  */
  extract_insn_cached (rinsn);
  rtx avl = recog_data.operand[get_attr_vl_op_idx (rinsn)];
  int count = count_regno_occurrences (rinsn, REGNO (avl));
  gcc_assert (count == 1);
  rtx new_pat = NULL_RTX;
  if (get_attr_type (rinsn) == TYPE_VLDFF
      || get_attr_type (rinsn) == TYPE_VLSEGDFF)
    new_pat
      = gen_pred_fault_load (recog_data.operand_mode[0], recog_data.operand[0],
			     recog_data.operand[1], recog_data.operand[2],
			     recog_data.operand[3], new_avl,
			     recog_data.operand[5], recog_data.operand[6],
			     get_avl_type_rtx (avl_type::NONVLMAX));
  else
    new_pat = simplify_replace_rtx (PATTERN (rinsn), avl, new_avl);
  return new_pat;
}
static void
simplify_replace_vlmax_avl (rtx_insn *rinsn, rtx new_avl)
{
  if (dump_file && (dump_flags & TDF_DETAILS))
    {
      fprintf (dump_file, "\nPropagating AVL: ");
      print_rtl_single (dump_file, new_avl);
      fprintf (dump_file, "into: ");
      print_rtl_single (dump_file, rinsn);
    }
  rtx new_pat = simplify_replace_avl (rinsn, new_avl);
  validate_change_or_fail (rinsn, &PATTERN (rinsn), new_pat, false);

  /* Change the AVL TYPE into NONVLMAX if it is VLMAX.  */
  if (vlmax_avl_type_p (rinsn))
    {
      int index = get_attr_avl_type_idx (rinsn);
      gcc_assert (index != INVALID_ATTRIBUTE);
      validate_change_or_fail (rinsn, recog_data.operand_loc[index],
			       get_avl_type_rtx (avl_type::NONVLMAX), false);
    }
  if (dump_file && (dump_flags & TDF_DETAILS))
    {
      fprintf (dump_file, "Successfully matched this instruction: ");
      print_rtl_single (dump_file, rinsn);
    }
}
const pass_data pass_data_avlprop = {
  RTL_PASS, /* type */
  "avlprop", /* name */
  OPTGROUP_NONE, /* optinfo_flags */
  TV_NONE, /* tv_id */
  0, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  0, /* todo_flags_finish */
};
class pass_avlprop : public rtl_opt_pass
{
public:
  pass_avlprop (gcc::context *ctxt) : rtl_opt_pass (pass_data_avlprop, ctxt) {}

  /* opt_pass methods: */
  virtual bool gate (function *) final override
  {
    return TARGET_VECTOR && optimize > 0;
  }
  virtual unsigned int execute (function *) final override;

private:
  /* The AVL propagation instructions and corresponding preferred AVL.
     It will be updated during the analysis.  */
  hash_map<insn_info *, rtx> *m_avl_propagations;

  /* Potentially feasible AVL propagation candidates.  */
  auto_vec<std::pair<enum avlprop_type, insn_info *>> m_candidates;

  rtx get_preferred_avl (const std::pair<enum avlprop_type, insn_info *>) const;
  rtx get_vlmax_ta_preferred_avl (insn_info *) const;
  rtx get_nonvlmax_avl (insn_info *) const;

  void avlprop_init (function *);
  void avlprop_done (void);
}; // class pass_avlprop
void
pass_avlprop::avlprop_init (function *fn)
{
  calculate_dominance_info (CDI_DOMINATORS);
  df_analyze ();
  crtl->ssa = new function_info (fn);
  m_avl_propagations = new hash_map<insn_info *, rtx>;
}
void
pass_avlprop::avlprop_done (void)
{
  free_dominance_info (CDI_DOMINATORS);
  if (crtl->ssa->perform_pending_updates ())
    cleanup_cfg (0);
  delete crtl->ssa;
  crtl->ssa = nullptr;
  delete m_avl_propagations;
  m_avl_propagations = NULL;
  if (!m_candidates.is_empty ())
    m_candidates.release ();
}
/* If we have a preferred AVL to propagate, return that AVL.
   Otherwise, return NULL_RTX as we don't have any preferred
   AVL.  */

rtx
pass_avlprop::get_preferred_avl (
  const std::pair<enum avlprop_type, insn_info *> candidate) const
{
  switch (candidate.first)
    {
    case AVLPROP_VLMAX_TA:
      return get_vlmax_ta_preferred_avl (candidate.second);
    default:
      gcc_unreachable ();
    }
  return NULL_RTX;
}
/* This is a straightforward pattern that ALWAYS appears in partial
   auto-vectorization:

     VL = SELECT_AVL (AVL, ...)
     V0 = MASK_LEN_LOAD (..., VL)
     V1 = MASK_LEN_LOAD (..., VL)
     V2 = V0 + V1 --- Missed LEN information.
     MASK_LEN_STORE (..., V2, VL)

   We prefer PLUS_EXPR (V0 + V1) over COND_LEN_ADD (V0, V1, dummy LEN)
   because:

   - Fewer code changes in the Loop Vectorizer.
   - It reuses the current clean flow of partial vectorization; that is,
     apply the predicate LEN or MASK to LOAD/STORE operations and other
     special arithmetic operations (e.g. DIV), then perform the whole
     vector register operation if it DOESN'T affect correctness.
     This flow is used by all other targets like x86, sve, s390, ... etc.
   - PLUS_EXPR has better gimple optimizations than COND_LEN_ADD.

   We propagate the AVL from NON-VLMAX to VLMAX for gimple IR like
   PLUS_EXPR, which generates the VLMAX instruction due to missed LEN
   information.  The later VSETVL pass will elide the redundant
   vsetvls.  */
rtx
pass_avlprop::get_vlmax_ta_preferred_avl (insn_info *insn) const
{
  if (!avl_can_be_propagated_p (insn->rtl ()))
    return NULL_RTX;
  int sew = get_sew (insn->rtl ());
  enum vlmul_type vlmul = get_vlmul (insn->rtl ());
  int ratio = calculate_ratio (sew, vlmul);

  rtx use_avl = NULL_RTX;
  for (def_info *def : insn->defs ())
    {
      if (!is_a<set_info *> (def) || def->is_mem ())
	return NULL_RTX;
      const auto *set = dyn_cast<set_info *> (def);

      /* FIXME: Stop AVL propagation if any USE is not an RVV real
	 instruction.  This should be entirely sufficient for vectorized
	 code, since its uses are always located in extended basic
	 blocks.

	 TODO: We can extend the PHI checking for intrinsic code if
	 necessary in the future.  */
      if (!set->is_local_to_ebb ())
	return NULL_RTX;

      for (use_info *use : set->nondebug_insn_uses ())
	{
	  insn_info *use_insn = use->insn ();
	  if (!use_insn->can_be_optimized () || use_insn->is_asm ()
	      || use_insn->is_call () || use_insn->has_volatile_refs ()
	      || use_insn->has_pre_post_modify ()
	      || !has_vl_op (use_insn->rtl ())
	      || !avl_can_be_propagated_p (use_insn->rtl ()))
	    return NULL_RTX;

	  /* We should only propagate a non-VLMAX AVL into a VLMAX insn
	     when the potential tail elements of that insn (after
	     propagation) are not used.  So we must make sure nothing
	     depends on the outcome of those tail elements.  */
	  extract_insn_cached (use_insn->rtl ());
	  int merge_op_idx = get_attr_merge_op_idx (use_insn->rtl ());
	  if (merge_op_idx != INVALID_ATTRIBUTE
	      && !satisfies_constraint_vu (recog_data.operand[merge_op_idx])
	      && refers_to_regno_p (set->regno (),
				    recog_data.operand[merge_op_idx])
	      && !tail_agnostic_p (use_insn->rtl ()))
	    return NULL_RTX;
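
	  /* A given AVL yields the same VL for two instructions only
	     when their SEW/LMUL ratios match, since VLMAX
	     = VLEN * LMUL / SEW.  E.g. (SEW=8, LMUL=1) and
	     (SEW=16, LMUL=2) both have ratio 8 and hence the same
	     VLMAX, while (SEW=16, LMUL=1) has ratio 16 and half that
	     VLMAX, so propagation between differing ratios could
	     change the VL.  */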
	  int new_sew = get_sew (use_insn->rtl ());
	  enum vlmul_type new_vlmul = get_vlmul (use_insn->rtl ());
	  int new_ratio = calculate_ratio (new_sew, new_vlmul);
	  if (new_ratio != ratio)
	    return NULL_RTX;

	  rtx new_use_avl = get_nonvlmax_avl (use_insn);
	  if (!new_use_avl || SUBREG_P (new_use_avl))
	    return NULL_RTX;
	  if (REG_P (new_use_avl))
	    {
	      resource_info resource = full_register (REGNO (new_use_avl));
	      def_lookup dl = crtl->ssa->find_def (resource, use_insn);
	      if (dl.matching_set ())
		return NULL_RTX;
	      def_info *def1 = dl.prev_def (insn);
	      def_info *def2 = dl.prev_def (use_insn);
	      if (!def1 || !def2 || def1 != def2)
		return NULL_RTX;

	      /* For vectorized code, we always use SELECT_VL/MIN_EXPR to
		 calculate the loop len at the header of the loop.
		 We only allow AVL propagation for real instructions for
		 now.
		 TODO: We may enhance it for intrinsic code if necessary.  */
	      if (!def1->insn ()->is_real ())
		return NULL_RTX;

	      /* FIXME: We only allow AVL propagation within a block, which
		 should be totally sufficient for vectorized code.

		 TODO: We can enhance this for intrinsic code in the future
		 if necessary.  */
	      if (def1->insn ()->bb () != insn->bb ()
		  && !dominated_by_p (CDI_DOMINATORS, insn->bb ()->cfg_bb (),
				      def1->insn ()->bb ()->cfg_bb ()))
		return NULL_RTX;
	      if (def1->insn ()->bb () == insn->bb ()
		  && def1->insn ()->compare_with (insn) >= 0)
		return NULL_RTX;
	    }

	  if (!use_avl)
	    use_avl = new_use_avl;
	  else if (!rtx_equal_p (use_avl, new_use_avl))
	    return NULL_RTX;
	}
    }

  return use_avl;
}
/* Try to get the NONVLMAX AVL of the INSN.
   INSN can either have a NON-VLMAX AVL itself, or be an insn with a
   VLMAX AVL before the pass that was given a NON-VLMAX AVL in an
   earlier propagation round.  */
rtx
pass_avlprop::get_nonvlmax_avl (insn_info *insn) const
{
  if (m_avl_propagations->get (insn))
    return (*m_avl_propagations->get (insn));
  else if (nonvlmax_avl_type_p (insn->rtl ()))
    {
      extract_insn_cached (insn->rtl ());
      return recog_data.operand[get_attr_vl_op_idx (insn->rtl ())];
    }

  return NULL_RTX;
}
/* Main entry point for this pass.  */
unsigned int
pass_avlprop::execute (function *fn)
{
  avlprop_init (fn);

  /* Iterate the whole function in reverse order (which could speed up
     convergence) to collect all potential candidates whose AVL could be
     propagated.

     Note that **NOT** all the candidates will be successfully AVL
     propagated.  */
  for (bb_info *bb : crtl->ssa->reverse_bbs ())
    {
      for (insn_info *insn : bb->reverse_real_nondebug_insns ())
	{
	  /* We only forward the AVL to instructions that have an AVL/VL
	     operand and can be optimized at the RTL_SSA level.  */
	  if (!insn->can_be_optimized () || !has_vl_op (insn->rtl ()))
	    continue;

	  /* TODO: We only do AVL propagation for VLMAX AVL with tail
	     agnostic policy, since partial autovectorization has missed
	     LEN information.  We could add more AVL propagation
	     for intrinsic code in the future.  */
	  if (vlmax_ta_p (insn->rtl ()))
	    m_candidates.safe_push (std::make_pair (AVLPROP_VLMAX_TA, insn));
	}
    }
  if (dump_file && (dump_flags & TDF_DETAILS))
    {
      fprintf (dump_file, "\nNumber of potential AVL propagations: %d\n",
	       m_candidates.length ());
      for (const auto &candidate : m_candidates)
	{
	  fprintf (dump_file, "\nAVL propagation type: %s\n",
		   avlprop_type_to_str (candidate.first));
	  print_rtl_single (dump_file, candidate.second->rtl ());
	}
    }
  /* Go through all the candidates looking for AVLs that we could
     propagate.  Iterate to a fixpoint: a propagation into one candidate
     may enable a propagation into another.  */
  bool change_p = true;
  while (change_p)
    {
      change_p = false;
      for (auto &candidate : m_candidates)
	{
	  rtx new_avl = get_preferred_avl (candidate);
	  if (new_avl)
	    {
	      gcc_assert (!vlmax_avl_p (new_avl));
	      auto &update
		= m_avl_propagations->get_or_insert (candidate.second);
	      /* Keep iterating if any propagation changed; don't let a
		 later unchanged candidate reset the flag.  */
	      change_p |= !rtx_equal_p (update, new_avl);
	      update = new_avl;
	    }
	}
    }
  if (dump_file && (dump_flags & TDF_DETAILS))
    fprintf (dump_file, "\nNumber of successful AVL propagations: %d\n\n",
	     (int) m_avl_propagations->elements ());

  for (const auto prop : *m_avl_propagations)
    {
      rtx_insn *rinsn = prop.first->rtl ();
      simplify_replace_vlmax_avl (rinsn, prop.second);
    }
  if (rvv_vector_bits == RVV_VECTOR_BITS_ZVL)
    {
      /* Simplify VLMAX AVL into immediate AVL.
	 E.g. simplify the following case:

	   vsetvl a5, zero, e32, m1
	   vadd.vv

	 into:

	   vsetvl zero, 4, e32, m1
	   vadd.vv

	 if GET_MODE_NUNITS (RVVM1SImode) == 4.  */
      if (dump_file && (dump_flags & TDF_DETAILS))
	fprintf (dump_file, "\nSimplifying VLMAX AVL into IMM AVL\n\n");
      for (auto &candidate : m_candidates)
	{
	  rtx_insn *rinsn = candidate.second->rtl ();
	  machine_mode vtype_mode = get_insn_vtype_mode (rinsn);
	  if (candidate.first == AVLPROP_VLMAX_TA
	      && !m_avl_propagations->get (candidate.second)
	      && imm_avl_p (vtype_mode))
	    {
	      rtx new_avl = gen_int_mode (GET_MODE_NUNITS (vtype_mode), Pmode);
	      simplify_replace_vlmax_avl (rinsn, new_avl);
	    }
	}
    }

  avlprop_done ();
  return 0;
}
rtl_opt_pass *
make_pass_avlprop (gcc::context *ctxt)
{
  return new pass_avlprop (ctxt);
}