gcc/config/riscv/riscv-v.cc
1 /* Subroutines used for code generation for RISC-V 'V' Extension for
2 GNU compiler.
3 Copyright (C) 2022-2025 Free Software Foundation, Inc.
4 Contributed by Juzhe Zhong (juzhe.zhong@rivai.ai), RiVAI Technologies Ltd.
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it
9 under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 3, or (at your option)
11 any later version.
13 GCC is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #define IN_TARGET_CODE 1
24 /* We have a maximum of 11 operands for RVV instruction patterns according to
25 the vector.md. */
26 #define RVV_INSN_OPERANDS_MAX 11
28 #include "config.h"
29 #include "system.h"
30 #include "coretypes.h"
31 #include "tm.h"
32 #include "backend.h"
33 #include "rtl.h"
34 #include "insn-config.h"
35 #include "insn-attr.h"
36 #include "recog.h"
37 #include "alias.h"
38 #include "tree.h"
39 #include "stringpool.h"
40 #include "attribs.h"
41 #include "explow.h"
42 #include "memmodel.h"
43 #include "emit-rtl.h"
44 #include "tm_p.h"
45 #include "target.h"
46 #include "targhooks.h"
47 #include "expr.h"
48 #include "optabs.h"
49 #include "tm-constrs.h"
50 #include "rtx-vector-builder.h"
51 #include "targhooks.h"
52 #include "predict.h"
53 #include "errors.h"
54 #include "riscv-v.h"
56 using namespace riscv_vector;
58 namespace riscv_vector {
60 /* Return true if NUNITS <=31 so that we can use immediate AVL in vsetivli. */
61 bool
62 imm_avl_p (machine_mode mode)
64 poly_uint64 nunits = GET_MODE_NUNITS (mode);
66 return nunits.is_constant ()
67 /* The vsetivli can only hold register 0~31. */
68 ? (IN_RANGE (nunits.to_constant (), 0, 31))
69 /* Only allowed in VLS-VLMAX mode. */
70 : false;
73 /* Return true if LEN is equal to NUNITS, i.e. the VLMAX length (which may be outside the range [0, 31]). */
74 static bool
75 is_vlmax_len_p (machine_mode mode, rtx len)
77 poly_int64 value;
78 return poly_int_rtx_p (len, &value)
79 && known_eq (value, GET_MODE_NUNITS (mode));
82 /* Helper functions for insn_flags and insn_types. */
84 /* Return true if the caller needs to pass a mask operand for the insn
85 pattern with INSN_FLAGS. */
87 static bool
88 need_mask_operand_p (unsigned insn_flags)
90 return (insn_flags & HAS_MASK_P)
91 && !(insn_flags & (USE_ONE_TRUE_MASK_P | USE_ALL_TRUES_MASK_P));
94 template <int MAX_OPERANDS> class insn_expander
96 public:
97 insn_expander () = delete;
99 insn_expander (unsigned insn_flags, bool vlmax_p)
100 : m_insn_flags (insn_flags), m_opno (0), m_vlmax_p (vlmax_p),
101 m_vl_op (NULL_RTX)
103 check_insn_flags ();
106 void check_insn_flags () const
108 if (m_insn_flags & USE_ONE_TRUE_MASK_P)
109 /* USE_ONE_TRUE_MASK_P is dependent on HAS_MASK_P. */
110 gcc_assert ((m_insn_flags & HAS_MASK_P));
112 if (m_insn_flags & USE_ALL_TRUES_MASK_P)
113 /* USE_ALL_TRUES_MASK_P is dependent on HAS_MASK_P. */
114 gcc_assert ((m_insn_flags & HAS_MASK_P));
116 /* USE_ONE_TRUE_MASK_P and USE_ALL_TRUES_MASK_P are mutually exclusive. */
117 gcc_assert (!((m_insn_flags & USE_ONE_TRUE_MASK_P)
118 && (m_insn_flags & USE_ALL_TRUES_MASK_P)));
120 if (m_insn_flags & USE_VUNDEF_MERGE_P)
121 /* USE_VUNDEF_MERGE_P is dependent on HAS_MERGE_P. */
122 gcc_assert ((m_insn_flags & HAS_MERGE_P));
124 /* TU_POLICY_P and TDEFAULT_POLICY_P are mutually exclusive. */
125 gcc_assert (
126 !((m_insn_flags & TU_POLICY_P) && (m_insn_flags & TDEFAULT_POLICY_P)));
128 /* MU_POLICY_P and MDEFAULT_POLICY_P are mutually exclusive. */
129 gcc_assert (
130 !((m_insn_flags & MU_POLICY_P) && (m_insn_flags & MDEFAULT_POLICY_P)));
132 /* NULLARY_OP_P, UNARY_OP_P, BINARY_OP_P, TERNARY_OP_P are mutually
133 exclusive. */
134 gcc_assert (
135 !((m_insn_flags & NULLARY_OP_P)
136 && ((m_insn_flags & UNARY_OP_P) || (m_insn_flags & BINARY_OP_P)
137 || (m_insn_flags & TERNARY_OP_P))));
138 gcc_assert (
139 !((m_insn_flags & UNARY_OP_P)
140 && ((m_insn_flags & NULLARY_OP_P) || (m_insn_flags & BINARY_OP_P)
141 || (m_insn_flags & TERNARY_OP_P))));
142 gcc_assert (
143 !((m_insn_flags & BINARY_OP_P)
144 && ((m_insn_flags & NULLARY_OP_P) || (m_insn_flags & UNARY_OP_P)
145 || (m_insn_flags & TERNARY_OP_P))));
146 gcc_assert (
147 !((m_insn_flags & TERNARY_OP_P)
148 && ((m_insn_flags & NULLARY_OP_P) || (m_insn_flags & UNARY_OP_P)
149 || (m_insn_flags & BINARY_OP_P))));
152 void set_vl (rtx vl) { m_vl_op = vl; }
154 void add_output_operand (rtx x, machine_mode mode)
156 create_output_operand (&m_ops[m_opno++], x, mode);
157 gcc_assert (m_opno <= MAX_OPERANDS);
159 void add_input_operand (rtx x, machine_mode mode)
161 create_input_operand (&m_ops[m_opno++], x, mode);
162 gcc_assert (m_opno <= MAX_OPERANDS);
164 void add_all_one_mask_operand (machine_mode mask_mode)
166 add_input_operand (CONSTM1_RTX (mask_mode), mask_mode);
168 void add_first_one_true_mask_operand (machine_mode mask_mode)
170 add_input_operand (gen_scalar_move_mask (mask_mode), mask_mode);
172 void add_vundef_operand (machine_mode dest_mode)
174 add_input_operand (RVV_VUNDEF (dest_mode), dest_mode);
176 void add_policy_operand ()
178 if (m_insn_flags & TU_POLICY_P)
180 rtx tail_policy_rtx = gen_int_mode (TAIL_UNDISTURBED, Pmode);
181 add_input_operand (tail_policy_rtx, Pmode);
183 else if (m_insn_flags & TDEFAULT_POLICY_P)
185 rtx tail_policy_rtx = gen_int_mode (get_prefer_tail_policy (), Pmode);
186 add_input_operand (tail_policy_rtx, Pmode);
189 if (m_insn_flags & MU_POLICY_P)
191 rtx mask_policy_rtx = gen_int_mode (MASK_UNDISTURBED, Pmode);
192 add_input_operand (mask_policy_rtx, Pmode);
194 else if (m_insn_flags & MDEFAULT_POLICY_P)
196 rtx mask_policy_rtx = gen_int_mode (get_prefer_mask_policy (), Pmode);
197 add_input_operand (mask_policy_rtx, Pmode);
200 void add_avl_type_operand (avl_type type)
202 add_input_operand (gen_int_mode (type, Pmode), Pmode);
205 void
206 add_rounding_mode_operand (enum floating_point_rounding_mode rounding_mode)
208 rtx frm_rtx = gen_int_mode (rounding_mode, Pmode);
209 add_input_operand (frm_rtx, Pmode);
212 void
213 add_rounding_mode_operand (enum fixed_point_rounding_mode rounding_mode)
215 rtx frm_rtx = gen_int_mode (rounding_mode, Pmode);
216 add_input_operand (frm_rtx, Pmode);
219 /* Return the vtype mode based on insn_flags.
220 The vtype mode is the mode that the vsetvl insn sets. */
221 machine_mode
222 get_vtype_mode (rtx *ops)
224 machine_mode vtype_mode;
225 if (m_insn_flags & VTYPE_MODE_FROM_OP1_P)
226 vtype_mode = GET_MODE (ops[1]);
227 else
228 vtype_mode = GET_MODE (ops[0]);
229 return vtype_mode;
232 void emit_insn (enum insn_code icode, rtx *ops)
234 int opno = 0;
235 int num_ops;
236 /* True if any operand is a memory operand. */
237 bool any_mem_p = false;
239 machine_mode vtype_mode = get_vtype_mode (ops);
240 machine_mode mask_mode = get_mask_mode (vtype_mode);
242 /* Add dest operand. */
243 if (m_insn_flags & HAS_DEST_P)
245 rtx op = ops[opno++];
246 any_mem_p |= MEM_P (op);
247 add_output_operand (op, GET_MODE (op));
250 /* Add mask operand. */
251 if (m_insn_flags & USE_ONE_TRUE_MASK_P)
252 add_first_one_true_mask_operand (mask_mode);
253 else if (m_insn_flags & USE_ALL_TRUES_MASK_P)
254 add_all_one_mask_operand (mask_mode);
255 else if (m_insn_flags & HAS_MASK_P)
257 machine_mode mode = insn_data[(int) icode].operand[m_opno].mode;
258 gcc_assert (mode != VOIDmode);
259 add_input_operand (ops[opno++], mode);
262 /* Add merge operand. */
263 if (m_insn_flags & USE_VUNDEF_MERGE_P)
264 /* Same as dest operand. */
265 add_vundef_operand (GET_MODE (ops[0]));
266 else if (m_insn_flags & HAS_MERGE_P)
268 machine_mode mode = insn_data[(int) icode].operand[m_opno].mode;
269 gcc_assert (mode != VOIDmode);
270 add_input_operand (ops[opno++], mode);
273 if (m_insn_flags & NULLARY_OP_P)
274 num_ops = 0;
275 else if (m_insn_flags & UNARY_OP_P)
276 num_ops = 1;
277 else if (m_insn_flags & BINARY_OP_P)
278 num_ops = 2;
279 else if (m_insn_flags & TERNARY_OP_P)
280 num_ops = 3;
281 else
282 gcc_unreachable ();
284 /* Add the remaining operands. */
285 for (; num_ops; num_ops--, opno++)
287 any_mem_p |= MEM_P (ops[opno]);
288 machine_mode mode = insn_data[(int) icode].operand[m_opno].mode;
289 /* create_input_operand doesn't allow VOIDmode.
290 According to vector.md, we may have some patterns that do not have
291 an explicit machine mode specifying the operand. Such operands are
292 always Pmode. */
293 if (mode == VOIDmode)
294 mode = Pmode;
296 /* Early assertion ensures same mode since maybe_legitimize_operand
297 will check this. */
298 machine_mode required_mode = GET_MODE (ops[opno]);
299 if (required_mode != VOIDmode && required_mode != mode)
300 internal_error ("expected mode %s for operand %d of "
301 "insn %s but got mode %s.\n",
302 GET_MODE_NAME (mode),
303 opno,
304 insn_data[(int) icode].name,
305 GET_MODE_NAME (required_mode));
307 add_input_operand (ops[opno], mode);
310 /* Add vl operand. */
311 rtx len = m_vl_op;
312 bool vls_p = false;
313 if (m_vlmax_p)
315 if (riscv_v_ext_vls_mode_p (vtype_mode))
317 /* VLS modes always set VSETVL by
318 "vsetvl zero, rs1/imm". */
319 poly_uint64 nunits = GET_MODE_NUNITS (vtype_mode);
320 len = gen_int_mode (nunits, Pmode);
321 vls_p = true;
323 else if (can_create_pseudo_p ())
325 len = gen_reg_rtx (Pmode);
326 emit_vlmax_vsetvl (vtype_mode, len);
330 gcc_assert (len != NULL_RTX);
331 add_input_operand (len, Pmode);
333 /* Add tail and mask policy operands. */
334 add_policy_operand ();
336 /* Add avl_type operand. */
337 add_avl_type_operand (
338 vls_p ? avl_type::VLS
339 : (m_vlmax_p ? avl_type::VLMAX : avl_type::NONVLMAX));
341 /* Add rounding mode operand. */
342 if (m_insn_flags & FRM_DYN_P)
343 add_rounding_mode_operand (FRM_DYN);
344 else if (m_insn_flags & FRM_RUP_P)
345 add_rounding_mode_operand (FRM_RUP);
346 else if (m_insn_flags & FRM_RDN_P)
347 add_rounding_mode_operand (FRM_RDN);
348 else if (m_insn_flags & FRM_RMM_P)
349 add_rounding_mode_operand (FRM_RMM);
350 else if (m_insn_flags & FRM_RNE_P)
351 add_rounding_mode_operand (FRM_RNE);
352 else if (m_insn_flags & VXRM_RNU_P)
353 add_rounding_mode_operand (VXRM_RNU);
354 else if (m_insn_flags & VXRM_RDN_P)
355 add_rounding_mode_operand (VXRM_RDN);
358 if (insn_data[(int) icode].n_operands != m_opno)
359 internal_error ("invalid number of operands for insn %s, "
360 "expected %d but got %d.\n",
361 insn_data[(int) icode].name,
362 insn_data[(int) icode].n_operands, m_opno);
364 expand (icode, any_mem_p);
367 void expand (enum insn_code icode, bool temporary_volatile_p = false)
369 if (temporary_volatile_p)
371 temporary_volatile_ok v (true);
372 expand_insn (icode, m_opno, m_ops);
374 else
375 expand_insn (icode, m_opno, m_ops);
378 private:
379 unsigned m_insn_flags;
380 int m_opno;
381 bool m_vlmax_p;
382 rtx m_vl_op;
383 expand_operand m_ops[MAX_OPERANDS];
386 /* Emit an RVV insn with a vector length that equals the number of units of the
387 vector mode. For VLA modes this corresponds to VLMAX.
389 Unless the vector length can be encoded in the vsetivli instruction, this
390 function must only be used as long as we can create pseudo registers. This is
391 because it will set a pseudo register to VLMAX using vsetvl and use this as
392 the definition of the vector length. */
393 void
394 emit_vlmax_insn (unsigned icode, unsigned insn_flags, rtx *ops)
396 insn_expander<RVV_INSN_OPERANDS_MAX> e (insn_flags, true);
397 gcc_assert (can_create_pseudo_p () || imm_avl_p (e.get_vtype_mode (ops)));
399 e.emit_insn ((enum insn_code) icode, ops);
402 /* Like emit_vlmax_insn but must only be used when we cannot create pseudo
403 registers anymore. This function, however, takes a predefined vector length
404 from the value in VL. */
405 void
406 emit_vlmax_insn_lra (unsigned icode, unsigned insn_flags, rtx *ops, rtx vl)
408 gcc_assert (!can_create_pseudo_p ());
409 machine_mode mode = GET_MODE (ops[0]);
411 if (imm_avl_p (mode))
413 /* Even though VL is a real hard register already allocated (we
414 are post-RA now), we still benefit from emitting
415 vsetivli zero, imm instead of vsetvli VL, zero, since it
416 gives us more flexibility in post-RA instruction scheduling. */
417 insn_expander<RVV_INSN_OPERANDS_MAX> e (insn_flags, false);
418 e.set_vl (gen_int_mode (GET_MODE_NUNITS (mode), Pmode));
419 e.emit_insn ((enum insn_code) icode, ops);
421 else
423 insn_expander<RVV_INSN_OPERANDS_MAX> e (insn_flags, true);
424 e.set_vl (vl);
425 e.emit_insn ((enum insn_code) icode, ops);
429 /* Emit an RVV insn with a predefined vector length. Contrary to
430 emit_vlmax_insn the instruction's vector length is not deduced from its mode
431 but taken from the value in VL. */
432 void
433 emit_nonvlmax_insn (unsigned icode, unsigned insn_flags, rtx *ops, rtx vl)
435 insn_expander<RVV_INSN_OPERANDS_MAX> e (insn_flags, false);
436 e.set_vl (vl);
437 e.emit_insn ((enum insn_code) icode, ops);
440 /* Return true if the vector can be duplicated via a super element which is
441 the fusion of consecutive elements.
443 E.g. v = { a, b, a, b }: super element = ab, so v = { ab, ab }. */
444 bool
445 rvv_builder::can_duplicate_repeating_sequence_p ()
447 poly_uint64 new_size = exact_div (full_nelts (), npatterns ());
448 unsigned int new_inner_size = m_inner_bits_size * npatterns ();
449 if (m_inner_mode == Pmode
450 || !int_mode_for_size (new_inner_size, 0).exists (&m_new_inner_mode)
451 || GET_MODE_SIZE (m_new_inner_mode) > UNITS_PER_WORD
452 || !get_vector_mode (m_new_inner_mode, new_size).exists (&m_new_mode))
453 return false;
454 return repeating_sequence_p (0, encoded_nelts (), npatterns ());
457 /* Return true if the vector is a simple sequence with one pattern and all
458 elements the same. */
459 bool
460 rvv_builder::is_repeating_sequence ()
462 if (npatterns () > 1)
463 return false;
464 return repeating_sequence_p (0, encoded_nelts (), 1);
467 /* Return true if it is a repeating sequence for which the
468 merge approach has better codegen than the default
469 approach (slide1down).
471 Sequence A:
472 {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b}
474 nelts = 16
475 npatterns = 2
477 for merging a we need mask 101010....
478 for merging b we need mask 010101....
480 For each element in the npatterns, we need to build a mask in a scalar register.
481 Typically we need 3 instructions (aka COST = 3), which consist of 2 scalar
482 instructions and 1 scalar move to the v0 register. Finally we need a vector
483 merge to merge them.
485 lui a5, #imm
486 add a5, #imm
487 vmov.s.x v0, a5
488 vmerge.vxm v9, v9, a1, v0
490 So the overall (roughly) COST of Sequence A = (3 + 1) * npatterns = 8.
491 If we use slide1down, the COST = nelts = 16 > 8 (COST of merge).
492 So return true in this case as it is profitable.
494 Sequence B:
495 {a, b, c, d, e, f, g, h, a, b, c, d, e, f, g, h}
497 nelts = 16
498 npatterns = 8
500 COST of merge approach = (3 + 1) * npatterns = 32
501 COST of slide1down approach = nelts = 16
502 Return false in this case as the merge approach is NOT profitable. */
504 bool
505 rvv_builder::repeating_sequence_use_merge_profitable_p ()
507 if (inner_bytes_size () > UNITS_PER_WORD)
508 return false;
510 unsigned int nelts = full_nelts ().to_constant ();
512 if (!repeating_sequence_p (0, encoded_nelts (), npatterns ()))
513 return false;
515 unsigned int merge_cost = 1;
516 unsigned int build_merge_mask_cost = 3;
517 unsigned int slide1down_cost = nelts;
519 return (build_merge_mask_cost + merge_cost) * npatterns () < slide1down_cost;
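/* A standalone sketch (not part of this file; the name and helper are made
   up for illustration) of the cost comparison described above: each pattern
   needs roughly 3 instructions to build its mask plus 1 vmerge, while the
   slide1down approach costs one instruction per element.  */

static bool
sketch_merge_cheaper_than_slide1down (unsigned npatterns, unsigned nelts)
{
  const unsigned build_merge_mask_cost = 3; /* e.g. lui + addi + vmv.s.x  */
  const unsigned merge_cost = 1;            /* vmerge.vxm  */
  const unsigned slide1down_cost = nelts;   /* one vslide1down per element  */
  return (build_merge_mask_cost + merge_cost) * npatterns < slide1down_cost;
}

/* Sequence A above: npatterns = 2, nelts = 16 -> 8 < 16, merge wins.
   Sequence B above: npatterns = 8, nelts = 16 -> 32 < 16 fails, slide1down wins.  */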
522 /* Return true if it's worthwhile to use slideup to combine 2 vectors. */
523 bool
524 rvv_builder::combine_sequence_use_slideup_profitable_p ()
526 int nelts = full_nelts ().to_constant ();
527 int leading_ndups = this->count_dups (0, nelts - 1, 1);
528 int trailing_ndups = this->count_dups (nelts - 1, -1, -1);
530 /* ??? The current heuristic is to combine 2 vectors
531 by slideup when:
532 1. the # of leading identical elements equals the # of trailing identical elements.
533 2. both of the above are equal to nelts / 2.
534 Otherwise, it is not profitable. */
535 return leading_ndups == trailing_ndups && trailing_ndups == nelts / 2;
538 /* Return true if it's worthwhile to use merge to combine a vector with a scalar. */
539 bool
540 rvv_builder::combine_sequence_use_merge_profitable_p ()
542 int nelts = full_nelts ().to_constant ();
543 int leading_ndups = this->count_dups (0, nelts - 1, 1);
544 int trailing_ndups = this->count_dups (nelts - 1, -1, -1);
545 int nregs = riscv_get_v_regno_alignment (int_mode ());
547 if (leading_ndups + trailing_ndups != nelts)
548 return false;
550 /* If the number of leading elements is > 255, which exceeds the maximum
551 value of QImode, we will need to use HImode. */
552 machine_mode mode;
553 if (leading_ndups > 255 || nregs > 2)
555 if (!get_vector_mode (HImode, nelts).exists (&mode))
556 return false;
557 /* We will need one more AVL/VL toggling vsetvl instruction. */
558 return leading_ndups > 4 && trailing_ndups > 4;
561 /* { a, a, a, b, b, ... , b } and { b, b, b, a, a, ... , a }
562 consume 3 slide instructions. */
563 return leading_ndups > 3 && trailing_ndups > 3;
566 /* Merge the repeating sequence into a single element and return the RTX. */
568 rvv_builder::get_merged_repeating_sequence ()
570 scalar_int_mode mode = Pmode;
571 rtx target = gen_reg_rtx (mode);
572 emit_move_insn (target, const0_rtx);
573 rtx imm = gen_int_mode ((1ULL << m_inner_bits_size) - 1, mode);
574 /* { a, b, a, b }: Generate duplicate element = b << bits | a. */
575 for (unsigned int i = 0; i < npatterns (); i++)
577 unsigned int loc = m_inner_bits_size * i;
578 rtx shift = gen_int_mode (loc, mode);
579 rtx ele = gen_lowpart (mode, elt (i));
580 rtx tmp = expand_simple_binop (mode, AND, ele, imm, NULL_RTX, false,
581 OPTAB_DIRECT);
582 rtx tmp2 = expand_simple_binop (mode, ASHIFT, tmp, shift, NULL_RTX, false,
583 OPTAB_DIRECT);
584 rtx tmp3 = expand_simple_binop (mode, IOR, tmp2, target, NULL_RTX, false,
585 OPTAB_DIRECT);
586 emit_move_insn (target, tmp3);
588 if (GET_MODE_SIZE (m_new_inner_mode) < UNITS_PER_WORD)
589 return gen_lowpart (m_new_inner_mode, target);
590 return target;
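/* A standalone sketch (not part of this file; the name is made up) of the
   packing step above: fuse NPATTERNS inner elements of INNER_BITS bits each
   into one wide scalar, with element 0 in the least significant bits, i.e.
   merged = ... | b << bits | a for the { a, b } example.  */

#include <cstdint>

static uint64_t
sketch_merge_repeating_sequence (const uint64_t *elts, unsigned npatterns,
                                 unsigned inner_bits)
{
  uint64_t merged = 0;
  uint64_t mask = inner_bits >= 64 ? ~0ULL : (1ULL << inner_bits) - 1;
  for (unsigned i = 0; i < npatterns; i++)
    merged |= (elts[i] & mask) << (inner_bits * i);
  return merged;
}

/* E.g. elts = { 0, 1, 2, 3, 4, 5, 6, 7 } with inner_bits = 8 gives
   0x0706050403020100, the value broadcast in the example further below.  */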
593 /* Get the mask for merge approach.
595 Consider the following case:
596 {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b}
597 To merge "a", the mask should be 1010....
598 To merge "b", the mask should be 0101....
601 rvv_builder::get_merge_scalar_mask (unsigned int index_in_pattern,
602 machine_mode inner_mode) const
604 unsigned HOST_WIDE_INT mask = 0;
605 unsigned HOST_WIDE_INT base_mask = (1ULL << index_in_pattern);
606 /* Here we construct a mask pattern that will later be broadcast
607 to a vector register. The maximum broadcast size for vmv.v.x/vmv.s.x
608 is determined by the length of a vector element (ELEN) and not by
609 XLEN so make sure we do not exceed it. One example is -march=zve32*
610 which mandates ELEN == 32 but can be combined with -march=rv64
611 with XLEN == 64. */
612 unsigned int elen = TARGET_VECTOR_ELEN_64 ? 64 : 32;
614 gcc_assert (elen % npatterns () == 0);
616 int limit = elen / npatterns ();
618 for (int i = 0; i < limit; i++)
619 mask |= base_mask << (i * npatterns ());
621 return gen_int_mode (mask, inner_mode);
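/* A standalone sketch (not part of this file; the name is made up) of the
   mask construction above: set bit INDEX_IN_PATTERN in every group of
   NPATTERNS bits, limited to ELEN bits so the scalar fits in one vector
   element.  */

#include <cstdint>

static uint64_t
sketch_merge_scalar_mask (unsigned index_in_pattern, unsigned npatterns,
                          unsigned elen)
{
  uint64_t mask = 0;
  uint64_t base_mask = 1ULL << index_in_pattern;
  for (unsigned i = 0; i < elen / npatterns; i++)
    mask |= base_mask << (i * npatterns);
  return mask;
}

/* For the { a, b, a, b, ... } example above with npatterns = 2 and elen = 32,
   index 0 sets bits 0, 2, 4, ... (the mask selecting "a") and index 1 sets
   bits 1, 3, 5, ... (the mask selecting "b").  */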
624 /* Return true if the variable-length vector is single step.
625 Single step means the steps of all patterns in NPATTERNS are equal.
626 Consider the following cases:
628 CASE 1: NPATTERNS = 2, NELTS_PER_PATTERN = 3.
629 { 0, 2, 2, 4, 4, 6, ... }
630 First pattern: step1 = 2 - 0 = 2
631 step2 = 4 - 2 = 2
632 Second pattern: step1 = 4 - 2 = 2
633 step2 = 6 - 4 = 2
634 Since all steps of NPATTERNS are equal step = 2.
635 Return true in this case.
637 CASE 2: NPATTERNS = 2, NELTS_PER_PATTERN = 3.
638 { 0, 1, 2, 4, 4, 7, ... }
639 First pattern: step1 = 2 - 0 = 2
640 step2 = 4 - 2 = 2
641 Second pattern: step1 = 4 - 1 = 3
642 step2 = 7 - 4 = 3
643 Since not all steps are equal, return false. */
644 bool
645 rvv_builder::single_step_npatterns_p () const
647 if (nelts_per_pattern () != 3)
648 return false;
650 poly_int64 step
651 = rtx_to_poly_int64 (elt (npatterns ())) - rtx_to_poly_int64 (elt (0));
652 for (unsigned int i = 0; i < npatterns (); i++)
654 poly_int64 ele0 = rtx_to_poly_int64 (elt (i));
655 poly_int64 ele1 = rtx_to_poly_int64 (elt (npatterns () + i));
656 poly_int64 ele2 = rtx_to_poly_int64 (elt (npatterns () * 2 + i));
657 poly_int64 diff1 = ele1 - ele0;
658 poly_int64 diff2 = ele2 - ele1;
659 if (maybe_ne (step, diff1) || maybe_ne (step, diff2))
660 return false;
662 return true;
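/* A standalone sketch (not part of this file; the name is made up) of the
   check above on a plain integer encoding: ELTS holds NPATTERNS * 3 encoded
   elements and every pattern must step by the same amount as pattern 0.  */

static bool
sketch_single_step_npatterns_p (const long *elts, unsigned npatterns)
{
  long step = elts[npatterns] - elts[0];
  for (unsigned i = 0; i < npatterns; i++)
    {
      long diff1 = elts[npatterns + i] - elts[i];
      long diff2 = elts[2 * npatterns + i] - elts[npatterns + i];
      if (diff1 != step || diff2 != step)
        return false;
    }
  return true;
}

/* CASE 1 above: { 0, 2, 2, 4, 4, 6 } with npatterns = 2 -> true (step = 2).
   CASE 2 above: { 0, 1, 2, 4, 4, 7 } with npatterns = 2 -> false.  */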
665 /* Return true if the diff between the const vector and the vid sequence
666 is repeated. The diff means the const vector - vid.
667 For example, consider the cases below:
668 CASE 1:
669 CONST VECTOR: {3, 2, 1, 0, 7, 6, 5, 4, ... }
670 VID : {0, 1, 2, 3, 4, 5, 6, 7, ... }
671 DIFF(MINUS) : {3, 1,-1,-3, 3, 1,-1,-3, ... }
672 The diff sequence {3, 1,-1,-3} is repeated within the npatterns, so
673 we return TRUE for case 1.
675 CASE 2:
676 CONST VECTOR: {-4, 4,-3, 5,-2, 6,-1, 7, ...}
677 VID : { 0, 1, 2, 3, 4, 5, 6, 7, ... }
678 DIFF(MINUS) : {-4, 3,-5, 2,-6, 1,-7, 0, ... }
679 The diff sequence {-4, 3} is not repeated within the npatterns, so
680 we return FALSE for case 2. */
681 bool
682 rvv_builder::npatterns_vid_diff_repeated_p () const
684 if (nelts_per_pattern () != 3)
685 return false;
686 else if (npatterns () == 0)
687 return false;
689 for (unsigned i = 0; i < npatterns (); i++)
691 poly_int64 diff_0 = rtx_to_poly_int64 (elt (i)) - i;
692 poly_int64 diff_1
693 = rtx_to_poly_int64 (elt (npatterns () + i)) - npatterns () - i;
695 if (maybe_ne (diff_0, diff_1))
696 return false;
699 return true;
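/* A standalone sketch (not part of this file; the name is made up) of the
   diff check above on a plain integer encoding: element minus its index
   (vid) must be identical between the first and second group of NPATTERNS
   encoded elements.  */

static bool
sketch_vid_diff_repeated_p (const long *elts, unsigned npatterns)
{
  for (unsigned i = 0; i < npatterns; i++)
    {
      long diff_0 = elts[i] - (long) i;
      long diff_1 = elts[npatterns + i] - (long) (npatterns + i);
      if (diff_0 != diff_1)
        return false;
    }
  return true;
}

/* CASE 1 above: { 3, 2, 1, 0, 7, 6, 5, 4 } with npatterns = 4 -> true, since
   the diff { 3, 1, -1, -3 } repeats.
   CASE 2 above: { -4, 4, -3, 5 } with npatterns = 2 -> false.  */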
702 /* Return true if the permutation consists of two
703 interleaved patterns with a constant step each.
704 TODO: We currently only support NPATTERNS = 2. */
705 bool
706 rvv_builder::interleaved_stepped_npatterns_p () const
708 if (npatterns () != 2 || nelts_per_pattern () != 3)
709 return false;
710 for (unsigned int i = 0; i < npatterns (); i++)
712 poly_int64 ele0 = rtx_to_poly_int64 (elt (i));
713 poly_int64 ele1 = rtx_to_poly_int64 (elt (npatterns () + i));
714 poly_int64 ele2 = rtx_to_poly_int64 (elt (npatterns () * 2 + i));
715 poly_int64 diff1 = ele1 - ele0;
716 poly_int64 diff2 = ele2 - ele1;
717 if (maybe_ne (diff1, diff2))
718 return false;
720 return true;
723 /* Return true if all elements of NPATTERNS are equal.
725 E.g. NPATTERNS = 4:
726 { 2, 2, 2, 2, 4, 4, 4, 4, 8, 8, 8, 8, 16, 16, 16, 16, ... }
727 E.g. NPATTERNS = 8:
728 { 2, 2, 2, 2, 2, 2, 2, 2, 8, 8, 8, 8, 8, 8, 8, 8, ... }
729 We only check whether ele[0] ~ ele[NPATTERNS - 1] are the same.
730 We don't need to check elements[n] with n >= NPATTERNS since
731 they don't belong to the same pattern. */
733 bool
734 rvv_builder::npatterns_all_equal_p () const
736 poly_int64 ele0 = rtx_to_poly_int64 (elt (0));
737 for (unsigned int i = 1; i < npatterns (); i++)
739 poly_int64 ele = rtx_to_poly_int64 (elt (i));
740 if (!known_eq (ele, ele0))
741 return false;
743 return true;
746 static unsigned
747 get_sew (machine_mode mode)
749 unsigned int sew = GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
751 : GET_MODE_BITSIZE (GET_MODE_INNER (mode));
752 return sew;
755 /* Return true if X is a const_vector whose elements are all the same and
756 lie in the range between MINVAL and MAXVAL. */
757 bool
758 const_vec_all_same_in_range_p (rtx x, HOST_WIDE_INT minval,
759 HOST_WIDE_INT maxval)
761 rtx elt;
762 return (const_vec_duplicate_p (x, &elt) && CONST_INT_P (elt)
763 && IN_RANGE (INTVAL (elt), minval, maxval));
766 /* Return true if VEC is a constant in which every element is in the range
767 [MINVAL, MAXVAL]. The elements do not need to have the same value.
769 This function also exists in aarch64, we may unify it in middle-end in the
770 future. */
772 static bool
773 const_vec_all_in_range_p (rtx vec, poly_int64 minval, poly_int64 maxval)
775 if (!CONST_VECTOR_P (vec)
776 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
777 return false;
779 int nunits;
780 if (!CONST_VECTOR_STEPPED_P (vec))
781 nunits = const_vector_encoded_nelts (vec);
782 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
783 return false;
785 for (int i = 0; i < nunits; i++)
787 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
788 poly_int64 value;
789 if (!poly_int_rtx_p (vec_elem, &value)
790 || maybe_lt (value, minval)
791 || maybe_gt (value, maxval))
792 return false;
794 return true;
797 /* Return true if the vector's elements are all duplicates of an integer
798 in the range -16 ~ 15 or of the floating-point value 0.0. */
800 bool
801 valid_vec_immediate_p (rtx x)
803 return (satisfies_constraint_vi (x) || satisfies_constraint_Wc0 (x));
806 /* Return a const vector of VAL. The VAL can be either const_int or
807 const_poly_int. */
809 static rtx
810 gen_const_vector_dup (machine_mode mode, poly_int64 val)
812 scalar_mode smode = GET_MODE_INNER (mode);
813 rtx c = gen_int_mode (val, smode);
814 if (!val.is_constant () && GET_MODE_SIZE (smode) > GET_MODE_SIZE (Pmode))
816 /* When VAL is const_poly_int value, we need to explicitly broadcast
817 it into a vector using RVV broadcast instruction. */
818 return expand_vector_broadcast (mode, c);
820 return gen_const_vec_duplicate (mode, c);
823 /* Emit a vlmax vsetvl instruction. This should only be used when
824 optimization is disabled or after the vsetvl insertion pass. */
825 void
826 emit_hard_vlmax_vsetvl (machine_mode vmode, rtx vl)
828 unsigned int sew = get_sew (vmode);
829 emit_insn (gen_vsetvl (Pmode, vl, RVV_VLMAX, gen_int_mode (sew, Pmode),
830 gen_int_mode (get_vlmul (vmode), Pmode), const0_rtx,
831 const0_rtx));
834 void
835 emit_vlmax_vsetvl (machine_mode vmode, rtx vl)
837 unsigned int sew = get_sew (vmode);
838 enum vlmul_type vlmul = get_vlmul (vmode);
839 unsigned int ratio = calculate_ratio (sew, vlmul);
841 if (!optimize)
842 emit_hard_vlmax_vsetvl (vmode, vl);
843 else
844 emit_insn (gen_vlmax_avl (Pmode, vl, gen_int_mode (ratio, Pmode)));
847 /* Calculate SEW/LMUL ratio. */
848 unsigned int
849 calculate_ratio (unsigned int sew, enum vlmul_type vlmul)
851 unsigned int ratio;
852 switch (vlmul)
854 case LMUL_1:
855 ratio = sew;
856 break;
857 case LMUL_2:
858 ratio = sew / 2;
859 break;
860 case LMUL_4:
861 ratio = sew / 4;
862 break;
863 case LMUL_8:
864 ratio = sew / 8;
865 break;
866 case LMUL_F8:
867 ratio = sew * 8;
868 break;
869 case LMUL_F4:
870 ratio = sew * 4;
871 break;
872 case LMUL_F2:
873 ratio = sew * 2;
874 break;
875 default:
876 gcc_unreachable ();
878 return ratio;
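/* A standalone sketch (not part of this file; the name is made up) of the
   SEW/LMUL ratio above, with LMUL expressed in eighths so that fractional
   values need no special casing: ratio = SEW / LMUL = SEW * 8 / eighths.  */

static unsigned
sketch_sew_lmul_ratio (unsigned sew, unsigned lmul_in_eighths)
{
  return sew * 8 / lmul_in_eighths;
}

/* E.g. SEW = 32, LMUL = 1   (8 eighths)  -> 32, matching the LMUL_1 case.
        SEW = 32, LMUL = 4   (32 eighths) -> 8,  matching the LMUL_4 case.
        SEW = 64, LMUL = 1/2 (4 eighths)  -> 128, matching the LMUL_F2 case.  */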
881 /* SCALABLE means that the vector length is agnostic (run-time invariant and
882 compile-time unknown). ZVL means that the vector length is specific
883 (compile-time known from -march options like zvl*b). Both SCALABLE and ZVL
884 do auto-vectorization using the VLMAX vsetvl configuration. */
885 static bool
886 autovec_use_vlmax_p (void)
888 return rvv_vector_bits == RVV_VECTOR_BITS_SCALABLE
889 || rvv_vector_bits == RVV_VECTOR_BITS_ZVL;
892 /* This function emits a VLMAX vrgather instruction. Emit vrgather.vx/vi when
893 SEL is a const duplicate vector. Otherwise, emit vrgather.vv. */
894 static void
895 emit_vlmax_gather_insn (rtx target, rtx op, rtx sel)
897 rtx elt;
898 insn_code icode;
899 machine_mode data_mode = GET_MODE (target);
900 machine_mode sel_mode = GET_MODE (sel);
901 if (const_vec_duplicate_p (sel, &elt))
903 icode = code_for_pred_gather_scalar (data_mode);
904 sel = elt;
906 else if (maybe_ne (GET_MODE_SIZE (data_mode), GET_MODE_SIZE (sel_mode)))
907 icode = code_for_pred_gatherei16 (data_mode);
908 else
909 icode = code_for_pred_gather (data_mode);
910 rtx ops[] = {target, op, sel};
911 emit_vlmax_insn (icode, BINARY_OP, ops);
914 static void
915 emit_vlmax_masked_gather_mu_insn (rtx target, rtx op, rtx sel, rtx mask)
917 rtx elt;
918 insn_code icode;
919 machine_mode data_mode = GET_MODE (target);
920 machine_mode sel_mode = GET_MODE (sel);
921 if (const_vec_duplicate_p (sel, &elt))
923 icode = code_for_pred_gather_scalar (data_mode);
924 sel = elt;
926 else if (maybe_ne (GET_MODE_SIZE (data_mode), GET_MODE_SIZE (sel_mode)))
927 icode = code_for_pred_gatherei16 (data_mode);
928 else
929 icode = code_for_pred_gather (data_mode);
930 rtx ops[] = {target, mask, target, op, sel};
931 emit_vlmax_insn (icode, BINARY_OP_TAMU, ops);
934 /* According to RVV ISA spec (16.5.1. Synthesizing vdecompress):
935 https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc
937 There is no inverse vdecompress provided, as this operation can be readily
938 synthesized using iota and a masked vrgather:
940 Desired functionality of 'vdecompress'
941 7 6 5 4 3 2 1 0 # vid
943 e d c b a # packed vector of 5 elements
944 1 0 0 1 1 1 0 1 # mask vector of 8 elements
945 p q r s t u v w # destination register before vdecompress
947 e q r d c b v a # result of vdecompress
948 # v0 holds mask
949 # v1 holds packed data
950 # v11 holds input expanded vector and result
951 viota.m v10, v0 # Calc iota from mask in v0
952 vrgather.vv v11, v1, v10, v0.t # Expand into destination
953 p q r s t u v w # v11 destination register
954 e d c b a # v1 source vector
955 1 0 0 1 1 1 0 1 # v0 mask vector
957 4 4 4 3 2 1 1 0 # v10 result of viota.m
958 e q r d c b v a # v11 destination after vrgather using viota.m under mask
960 static void
961 emit_vlmax_decompress_insn (rtx target, rtx op0, rtx op1, rtx mask)
963 machine_mode data_mode = GET_MODE (target);
964 machine_mode sel_mode = related_int_vector_mode (data_mode).require ();
965 if (GET_MODE_INNER (data_mode) == QImode)
966 sel_mode = get_vector_mode (HImode, GET_MODE_NUNITS (data_mode)).require ();
968 rtx sel = gen_reg_rtx (sel_mode);
969 rtx iota_ops[] = {sel, mask};
970 emit_vlmax_insn (code_for_pred_iota (sel_mode), UNARY_OP, iota_ops);
971 emit_vlmax_gather_insn (target, op0, sel);
972 emit_vlmax_masked_gather_mu_insn (target, op1, sel, mask);
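/* A standalone scalar model (not part of this file; the name is made up) of
   the vdecompress synthesis above: viota.m yields, for each element index,
   the number of set mask bits below it; the masked vrgather then pulls
   packed element iota[i] into each active lane and leaves inactive lanes
   untouched.  */

static void
sketch_vdecompress (long *dest, const long *packed, const bool *mask,
                    unsigned nelts)
{
  unsigned iota = 0;                   /* running result of viota.m  */
  for (unsigned i = 0; i < nelts; i++)
    {
      if (mask[i])
        dest[i] = packed[iota];        /* vrgather.vv ..., v0.t on active lanes  */
      iota += mask[i] ? 1 : 0;
    }
}

/* With the example above read element 0 first, mask = { 1, 0, 1, 1, 1, 0, 0, 1 },
   packed = { a, b, c, d, e } and dest = { w, v, u, t, s, r, q, p }, the model
   produces { a, v, b, c, d, r, q, e }, i.e. "e q r d c b v a" when printed
   from element 7 down to element 0 as in the table above.  */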
975 /* Emit merge instruction. */
977 static machine_mode
978 get_repeating_sequence_dup_machine_mode (const rvv_builder &builder,
979 machine_mode mask_bit_mode)
981 unsigned mask_precision = GET_MODE_PRECISION (mask_bit_mode).to_constant ();
982 unsigned mask_scalar_size = mask_precision > builder.inner_bits_size ()
983 ? builder.inner_bits_size () : mask_precision;
985 scalar_mode inner_mode;
986 unsigned minimal_bits_size;
988 switch (mask_scalar_size)
990 case 8:
991 inner_mode = QImode;
992 minimal_bits_size = TARGET_MIN_VLEN / 8; /* AKA RVVMF8. */
993 break;
994 case 16:
995 inner_mode = HImode;
996 minimal_bits_size = TARGET_MIN_VLEN / 4; /* AKA RVVMF4. */
997 break;
998 case 32:
999 inner_mode = SImode;
1000 minimal_bits_size = TARGET_MIN_VLEN / 2; /* AKA RVVMF2. */
1001 break;
1002 case 64:
1003 inner_mode = DImode;
1004 minimal_bits_size = TARGET_MIN_VLEN / 1; /* AKA RVVM1. */
1005 break;
1006 default:
1007 gcc_unreachable ();
1008 break;
1011 gcc_assert (mask_precision % mask_scalar_size == 0);
1013 uint64_t dup_nunit = mask_precision > mask_scalar_size
1014 ? mask_precision / mask_scalar_size : minimal_bits_size / mask_scalar_size;
1016 return get_vector_mode (inner_mode, dup_nunit).require ();
1019 /* Expand a series const vector. If VID is NULL_RTX, we use a vid.v
1020 instruction to generate the sequence for VID:
1022 VID = { 0, 1, 2, 3, ... }
1024 Otherwise, we use the VID argument directly. */
1026 void
1027 expand_vec_series (rtx dest, rtx base, rtx step, rtx vid)
1029 machine_mode mode = GET_MODE (dest);
1030 poly_int64 nunits_m1 = GET_MODE_NUNITS (mode) - 1;
1031 poly_int64 value;
1032 rtx result = register_operand (dest, mode) ? dest : gen_reg_rtx (mode);
1034 /* VECT_IV = BASE + I * STEP. */
1036 /* Step 1: Generate I = { 0, 1, 2, ... } by vid.v. */
1037 bool reverse_p = !vid && rtx_equal_p (step, constm1_rtx)
1038 && poly_int_rtx_p (base, &value)
1039 && known_eq (nunits_m1, value);
1040 if (!vid)
1042 vid = gen_reg_rtx (mode);
1043 rtx op[] = {vid};
1044 emit_vlmax_insn (code_for_pred_series (mode), NULLARY_OP, op);
1047 rtx step_adj;
1048 if (reverse_p)
1050 /* Special case:
1051 {nunits - 1, nunits - 2, ... , 0}.
1052 nunits can be either const_int or const_poly_int.
1054 Code sequence:
1055 vid.v v
1056 vrsub nunits - 1, v. */
1057 rtx ops[]
1058 = {result, vid, gen_int_mode (nunits_m1, GET_MODE_INNER (mode))};
1059 insn_code icode = code_for_pred_sub_reverse_scalar (mode);
1060 emit_vlmax_insn (icode, BINARY_OP, ops);
1062 else
1064 /* Step 2: Generate I * STEP.
1065 - STEP is 1, we don't emit any instructions.
1066 - STEP is power of 2, we use vsll.vi/vsll.vx.
1067 - STEP is non-power of 2, we use vmul.vx. */
1068 if (rtx_equal_p (step, const1_rtx))
1069 step_adj = vid;
1070 else
1072 step_adj = gen_reg_rtx (mode);
1073 if (CONST_INT_P (step) && pow2p_hwi (INTVAL (step)))
1075 /* Emit logical left shift operation. */
1076 int shift = exact_log2 (INTVAL (step));
1077 rtx shift_amount = gen_int_mode (shift, Pmode);
1078 insn_code icode = code_for_pred_scalar (ASHIFT, mode);
1079 rtx ops[] = {step_adj, vid, shift_amount};
1080 emit_vlmax_insn (icode, BINARY_OP, ops);
1082 else
1084 insn_code icode = code_for_pred_scalar (MULT, mode);
1085 rtx ops[] = {step_adj, vid, step};
1086 emit_vlmax_insn (icode, BINARY_OP, ops);
1090 /* Step 3: Generate BASE + I * STEP.
1091 - BASE is 0, use result of vid.
1092 - BASE is not 0, we use vadd.vx/vadd.vi. */
1093 if (rtx_equal_p (base, const0_rtx))
1094 emit_move_insn (result, step_adj);
1095 else
1097 insn_code icode = code_for_pred_scalar (PLUS, mode);
1098 rtx ops[] = {result, step_adj, base};
1099 emit_vlmax_insn (icode, BINARY_OP, ops);
1103 if (result != dest)
1104 emit_move_insn (dest, result);
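/* A standalone scalar model (not part of this file; the name is made up) of
   the expansion above: result[i] = BASE + i * STEP, where the vid sequence
   plays the role of i.  The reverse special case { nunits - 1, ..., 1, 0 }
   corresponds to BASE = nunits - 1 and STEP = -1, handled with vrsub.  */

static void
sketch_vec_series (long *result, unsigned nunits, long base, long step)
{
  for (unsigned i = 0; i < nunits; i++)
    result[i] = base + (long) i * step;  /* vid.v, then vsll/vmul and vadd  */
}

/* E.g. base = 0, step = 2  -> { 0, 2, 4, 6, ... } (vid.v plus vsll.vi by 1).
        base = 3, step = -1 -> { 3, 2, 1, 0 } for nunits = 4, the vrsub case.  */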
1107 /* Subroutine of riscv_vector_expand_vector_init.
1108 Works as follows:
1109 (a) Initialize TARGET by broadcasting element 0 of BUILDER.
1110 (b) Skip the leading elements of BUILDER that are the same as
1111 element 0.
1112 (c) Insert the remaining elements in order into TARGET using vslide1down. */
1114 static void
1115 expand_vector_init_insert_elems (rtx target, const rvv_builder &builder,
1116 int nelts_reqd)
1118 machine_mode mode = GET_MODE (target);
1119 rtx dup = expand_vector_broadcast (mode, builder.elt (0));
1120 emit_move_insn (target, dup);
1121 int ndups = builder.count_dups (0, nelts_reqd - 1, 1);
1122 for (int i = ndups; i < nelts_reqd; i++)
1124 unsigned int unspec
1125 = FLOAT_MODE_P (mode) ? UNSPEC_VFSLIDE1DOWN : UNSPEC_VSLIDE1DOWN;
1126 insn_code icode = code_for_pred_slide (unspec, mode);
1127 rtx ops[] = {target, target, builder.elt (i)};
1128 emit_vlmax_insn (icode, BINARY_OP, ops);
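/* A standalone scalar model (not part of this file; the name is made up) of
   the sequence above: start from a broadcast of element 0, then each
   vslide1down shifts every element one position toward index 0 and writes
   the new scalar into the last element.  Inserting elements NDUPS .. NELTS-1
   this way reproduces the original vector.  */

static void
sketch_init_by_slide1down (long *vec, const long *elts, unsigned nelts,
                           unsigned ndups)
{
  for (unsigned i = 0; i < nelts; i++)
    vec[i] = elts[0];                  /* broadcast of element 0  */
  for (unsigned i = ndups; i < nelts; i++)
    {
      for (unsigned j = 0; j + 1 < nelts; j++)
        vec[j] = vec[j + 1];           /* vslide1down body  */
      vec[nelts - 1] = elts[i];        /* scalar lands in the last element  */
    }
}

/* E.g. elts = { 5, 5, 5, 7, 9 }: ndups = 3, so after the broadcast only two
   slide1down steps (with 7 and 9) are needed.  */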
1132 /* Subroutine of expand_vec_init to handle the case
1133 when all trailing elements of the builder are the same.
1134 This works as follows:
1135 (a) Use the expand_insn interface to broadcast the last vector element into TARGET.
1136 (b) Insert the remaining elements into TARGET using vslide1up.
1138 ??? The heuristic used is to do the above if the number of identical trailing
1139 elements is greater than leading_ndups, loosely based on the
1140 heuristic from mostly_zeros_p. May need fine-tuning. */
1142 static bool
1143 expand_vector_init_trailing_same_elem (rtx target,
1144 const rtx_vector_builder &builder,
1145 int nelts_reqd)
1147 int leading_ndups = builder.count_dups (0, nelts_reqd - 1, 1);
1148 int trailing_ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
1149 machine_mode mode = GET_MODE (target);
1151 if (trailing_ndups > leading_ndups)
1153 rtx dup = expand_vector_broadcast (mode, builder.elt (nelts_reqd - 1));
1154 for (int i = nelts_reqd - trailing_ndups - 1; i >= 0; i--)
1156 unsigned int unspec
1157 = FLOAT_MODE_P (mode) ? UNSPEC_VFSLIDE1UP : UNSPEC_VSLIDE1UP;
1158 insn_code icode = code_for_pred_slide (unspec, mode);
1159 rtx tmp = gen_reg_rtx (mode);
1160 rtx ops[] = {tmp, dup, builder.elt (i)};
1161 emit_vlmax_insn (icode, BINARY_OP, ops);
1162 /* slide1up needs source and dest to be different REGs. */
1163 dup = tmp;
1166 emit_move_insn (target, dup);
1167 return true;
1170 return false;
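/* A standalone scalar model (not part of this file; the name is made up) of
   the slide1up variant above: broadcast the last element, then each
   vslide1up shifts every element one position up and writes the new scalar
   into element 0, so only the leading NELTS - TRAILING_NDUPS elements need
   to be inserted.  */

static void
sketch_init_by_slide1up (long *vec, const long *elts, unsigned nelts,
                         unsigned trailing_ndups)
{
  for (unsigned i = 0; i < nelts; i++)
    vec[i] = elts[nelts - 1];          /* broadcast of the last element  */
  for (int i = (int) nelts - (int) trailing_ndups - 1; i >= 0; i--)
    {
      for (unsigned j = nelts - 1; j > 0; j--)
        vec[j] = vec[j - 1];           /* vslide1up body  */
      vec[0] = elts[i];                /* scalar lands in element 0  */
    }
}

/* E.g. elts = { 1, 2, 9, 9, 9 }: trailing_ndups = 3, so two slide1up steps
   (with 2, then 1) after the broadcast reproduce the vector.  */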
1173 static void
1174 expand_const_vector (rtx target, rtx src)
1176 machine_mode mode = GET_MODE (target);
1177 rtx result = register_operand (target, mode) ? target : gen_reg_rtx (mode);
1178 rtx elt;
1179 if (const_vec_duplicate_p (src, &elt))
1181 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
1183 gcc_assert (rtx_equal_p (elt, const0_rtx)
1184 || rtx_equal_p (elt, const1_rtx));
1185 rtx ops[] = {result, src};
1186 emit_vlmax_insn (code_for_pred_mov (mode), UNARY_MASK_OP, ops);
1188 /* Element in range -16 ~ 15 integer or 0.0 floating-point,
1189 we use vmv.v.i instruction. */
1190 else if (valid_vec_immediate_p (src))
1192 rtx ops[] = {result, src};
1193 emit_vlmax_insn (code_for_pred_mov (mode), UNARY_OP, ops);
1195 else
1197 /* Emit the vec_duplicate<mode> split pattern before RA so that
1198 we have a better optimization opportunity in LICM,
1199 which will hoist vmv.v.x outside the loop, and in fwprop && combine,
1200 which will transform a 'vv' into a 'vx' instruction.
1202 The reason we don't emit the vec_duplicate<mode> split pattern during
1203 RA is that the split stage after RA is too late to generate an
1204 RVV instruction which needs an additional register (we can't
1205 allocate a new register after RA) for the VL operand of the vsetvl
1206 instruction (vsetvl a5, zero). */
1207 if (lra_in_progress)
1209 rtx ops[] = {result, elt};
1210 emit_vlmax_insn (code_for_pred_broadcast (mode), UNARY_OP, ops);
1212 else
1214 struct expand_operand ops[2];
1215 enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
1216 gcc_assert (icode != CODE_FOR_nothing);
1217 create_output_operand (&ops[0], result, mode);
1218 create_input_operand (&ops[1], elt, GET_MODE_INNER (mode));
1219 expand_insn (icode, 2, ops);
1220 result = ops[0].value;
1224 if (result != target)
1225 emit_move_insn (target, result);
1226 return;
1229 /* Support scalable const series vector. */
1230 rtx base, step;
1231 if (const_vec_series_p (src, &base, &step))
1233 expand_vec_series (result, base, step);
1235 if (result != target)
1236 emit_move_insn (target, result);
1237 return;
1240 /* Handle variable-length vector. */
1241 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
1242 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
1243 rvv_builder builder (mode, npatterns, nelts_per_pattern);
1244 for (unsigned int i = 0; i < nelts_per_pattern; i++)
1246 for (unsigned int j = 0; j < npatterns; j++)
1247 builder.quick_push (CONST_VECTOR_ELT (src, i * npatterns + j));
1249 builder.finalize ();
1251 if (CONST_VECTOR_DUPLICATE_P (src))
1253 /* Handle the case of a repeating sequence with NELTS_PER_PATTERN = 1.
1254 E.g. NPATTERNS = 4, v = { 0, 2, 6, 7, ... }
1255 NPATTERNS = 8, v = { 0, 2, 6, 7, 19, 20, 8, 7 ... }
1256 The elements within NPATTERNS are not necessarily regular. */
1257 if (builder.can_duplicate_repeating_sequence_p ())
1259 /* We handle the case where we can find a vector container to hold
1260 an element of bitsize = NPATTERNS * ele_bitsize.
1262 NPATTERNS = 8, element width = 8
1263 v = { 0, 1, 2, 3, 4, 5, 6, 7, ... }
1264 In this case, we can combine NPATTERNS elements into a larger
1265 element. Use element width = 64 and broadcast a vector with
1266 all elements equal to 0x0706050403020100. */
1267 rtx ele = builder.get_merged_repeating_sequence ();
1268 rtx dup = expand_vector_broadcast (builder.new_mode (), ele);
1269 emit_move_insn (result, gen_lowpart (mode, dup));
1271 else
1273 /* We handle the case where we can't find a vector container to hold
1274 an element of bitsize = NPATTERNS * ele_bitsize.
1276 NPATTERNS = 8, element width = 16
1277 v = { 0, 1, 2, 3, 4, 5, 6, 7, ... }
1278 Since NPATTERNS * element width = 128, we can't find a container
1279 to hold it.
1281 In this case, we use NPATTERNS merge operations to generate such
1282 a vector. */
1283 unsigned int nbits = npatterns - 1;
1285 /* Generate vid = { 0, 1, 2, 3, 4, 5, 6, 7, ... }. */
1286 rtx vid = gen_reg_rtx (builder.int_mode ());
1287 rtx op[] = {vid};
1288 emit_vlmax_insn (code_for_pred_series (builder.int_mode ()),
1289 NULLARY_OP, op);
1291 /* Generate vid_repeat = { 0, 1, ... nbits, ... } */
1292 rtx vid_repeat = gen_reg_rtx (builder.int_mode ());
1293 rtx and_ops[] = {vid_repeat, vid,
1294 gen_int_mode (nbits, builder.inner_int_mode ())};
1295 emit_vlmax_insn (code_for_pred_scalar (AND, builder.int_mode ()),
1296 BINARY_OP, and_ops);
1298 rtx tmp1 = gen_reg_rtx (builder.mode ());
1299 rtx dup_ops[] = {tmp1, builder.elt (0)};
1300 emit_vlmax_insn (code_for_pred_broadcast (builder.mode ()), UNARY_OP,
1301 dup_ops);
1302 for (unsigned int i = 1; i < builder.npatterns (); i++)
1304 /* Generate mask according to i. */
1305 rtx mask = gen_reg_rtx (builder.mask_mode ());
1306 rtx const_vec = gen_const_vector_dup (builder.int_mode (), i);
1307 expand_vec_cmp (mask, EQ, vid_repeat, const_vec);
1309 /* Merge scalar to each i. */
1310 rtx tmp2 = gen_reg_rtx (builder.mode ());
1311 rtx merge_ops[] = {tmp2, tmp1, builder.elt (i), mask};
1312 insn_code icode = code_for_pred_merge_scalar (builder.mode ());
1313 emit_vlmax_insn (icode, MERGE_OP, merge_ops);
1314 tmp1 = tmp2;
1316 emit_move_insn (result, tmp1);
1319 else if (CONST_VECTOR_STEPPED_P (src))
1321 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
1322 if (builder.single_step_npatterns_p ())
1324 /* Describe the case by choosing NPATTERNS = 4 as an example. */
1325 insn_code icode;
1327 /* Step 1: Generate vid = { 0, 1, 2, 3, 4, 5, 6, 7, ... }. */
1328 rtx vid = gen_reg_rtx (builder.mode ());
1329 rtx vid_ops[] = {vid};
1330 icode = code_for_pred_series (builder.mode ());
1331 emit_vlmax_insn (icode, NULLARY_OP, vid_ops);
1333 if (builder.npatterns_all_equal_p ())
1335 /* Generate the variable-length vector following this rule:
1336 { a, a, a + step, a + step, a + step * 2, a + step * 2, ...}
1337 E.g. { 0, 0, 8, 8, 16, 16, ... } */
1339 /* We want to create a pattern where value[idx] = floor (idx /
1340 NPATTERNS). As NPATTERNS is always a power of two we can
1341 rewrite this as = idx & -NPATTERNS. */
1342 /* Step 2: VID AND -NPATTERNS:
1343 { 0&-4, 1&-4, 2&-4, 3 &-4, 4 &-4, 5 &-4, 6 &-4, 7 &-4, ... }
1345 rtx imm
1346 = gen_int_mode (-builder.npatterns (), builder.inner_mode ());
1347 rtx tmp1 = gen_reg_rtx (builder.mode ());
1348 rtx and_ops[] = {tmp1, vid, imm};
1349 icode = code_for_pred_scalar (AND, builder.mode ());
1350 emit_vlmax_insn (icode, BINARY_OP, and_ops);
1352 /* Step 3: Convert to step size 1. */
1353 rtx tmp2 = gen_reg_rtx (builder.mode ());
1354 /* log2 (npatterns) to get the shift amount to convert
1355 Eg. { 0, 0, 0, 0, 4, 4, ... }
1356 into { 0, 0, 0, 0, 1, 1, ... }. */
1357 HOST_WIDE_INT shift_amt = exact_log2 (builder.npatterns ()) ;
1358 rtx shift = gen_int_mode (shift_amt, builder.inner_mode ());
1359 rtx shift_ops[] = {tmp2, tmp1, shift};
1360 icode = code_for_pred_scalar (ASHIFTRT, builder.mode ());
1361 emit_vlmax_insn (icode, BINARY_OP, shift_ops);
1363 /* Step 4: Multiply to step size n. */
1364 HOST_WIDE_INT step_size =
1365 INTVAL (builder.elt (builder.npatterns ()))
1366 - INTVAL (builder.elt (0));
1367 rtx tmp3 = gen_reg_rtx (builder.mode ());
1368 if (pow2p_hwi (step_size))
1370 /* Power of 2 can be handled with a left shift. */
1371 HOST_WIDE_INT shift = exact_log2 (step_size);
1372 rtx shift_amount = gen_int_mode (shift, Pmode);
1373 insn_code icode = code_for_pred_scalar (ASHIFT, mode);
1374 rtx ops[] = {tmp3, tmp2, shift_amount};
1375 emit_vlmax_insn (icode, BINARY_OP, ops);
1377 else
1379 rtx mult_amt = gen_int_mode (step_size, builder.inner_mode ());
1380 insn_code icode = code_for_pred_scalar (MULT, builder.mode ());
1381 rtx ops[] = {tmp3, tmp2, mult_amt};
1382 emit_vlmax_insn (icode, BINARY_OP, ops);
1385 /* Step 5: Add starting value to all elements. */
1386 HOST_WIDE_INT init_val = INTVAL (builder.elt (0));
1387 if (init_val == 0)
1388 emit_move_insn (result, tmp3);
1389 else
1391 rtx dup = gen_const_vector_dup (builder.mode (), init_val);
1392 rtx add_ops[] = {result, tmp3, dup};
1393 icode = code_for_pred (PLUS, builder.mode ());
1394 emit_vlmax_insn (icode, BINARY_OP, add_ops);
1397 else
1399 /* Generate the variable-length vector following this rule:
1400 { a, b, a + step, b + step, a + step*2, b + step*2, ... } */
1402 if (builder.npatterns_vid_diff_repeated_p ())
1404 /* Case 1: For example as below:
1405 {3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8... }
1406 We have 3 - 0 = 3 equals 7 - 4 = 3, the sequence is
1407 repeated as below after minus vid.
1408 {3, 1, -1, -3, 3, 1, -1, -3...}
1409 Then we can simplify the diff code gen to at most
1410 npatterns(). */
1411 rvv_builder v (builder.mode (), builder.npatterns (), 1);
1413 /* Step 1: Generate diff = TARGET - VID. */
1414 for (unsigned int i = 0; i < v.npatterns (); ++i)
1416 poly_int64 diff = rtx_to_poly_int64 (builder.elt (i)) - i;
1417 v.quick_push (gen_int_mode (diff, v.inner_mode ()));
1420 /* Step 2: Generate result = VID + diff. */
1421 rtx vec = v.build ();
1422 rtx add_ops[] = {result, vid, vec};
1423 emit_vlmax_insn (code_for_pred (PLUS, builder.mode ()),
1424 BINARY_OP, add_ops);
1426 else
1428 /* Case 2: For example as below:
1429 { -4, 4, -4 + 1, 4 + 1, -4 + 2, 4 + 2, -4 + 3, 4 + 3, ... }
1431 rvv_builder v (builder.mode (), builder.npatterns (), 1);
1433 /* Step 1: Generate { a, b, a, b, ... } */
1434 for (unsigned int i = 0; i < v.npatterns (); ++i)
1435 v.quick_push (builder.elt (i));
1436 rtx new_base = v.build ();
1438 /* Step 2: Generate tmp1 = VID >> LOG2 (NPATTERNS).  */
1439 rtx shift_count
1440 = gen_int_mode (exact_log2 (builder.npatterns ()),
1441 builder.inner_mode ());
1442 rtx tmp1 = gen_reg_rtx (builder.mode ());
1443 rtx shift_ops[] = {tmp1, vid, shift_count};
1444 emit_vlmax_insn (code_for_pred_scalar
1445 (LSHIFTRT, builder.mode ()), BINARY_OP,
1446 shift_ops);
1448 /* Step 3: Generate tmp2 = tmp1 * step.  */
1449 rtx tmp2 = gen_reg_rtx (builder.mode ());
1450 rtx step
1451 = simplify_binary_operation (MINUS, builder.inner_mode (),
1452 builder.elt (v.npatterns()),
1453 builder.elt (0));
1454 expand_vec_series (tmp2, const0_rtx, step, tmp1);
1456 /* Step 4: Generate result = tmp2 + new_base.  */
1457 rtx add_ops[] = {result, tmp2, new_base};
1458 emit_vlmax_insn (code_for_pred (PLUS, builder.mode ()),
1459 BINARY_OP, add_ops);
1463 else if (builder.interleaved_stepped_npatterns_p ())
1465 rtx base1 = builder.elt (0);
1466 rtx base2 = builder.elt (1);
1467 poly_int64 step1
1468 = rtx_to_poly_int64 (builder.elt (builder.npatterns ()))
1469 - rtx_to_poly_int64 (base1);
1470 poly_int64 step2
1471 = rtx_to_poly_int64 (builder.elt (builder.npatterns () + 1))
1472 - rtx_to_poly_int64 (base2);
1474 /* For { 1, 0, 2, 0, ... , n - 1, 0 }, we can use larger EEW
1475 integer vector mode to generate such vector efficiently.
1477 E.g. EEW = 16, { 2, 0, 4, 0, ... }
1479 can be interpreted into:
1481 EEW = 32, { 2, 4, ... }.
1483 This only works as long as the larger type does not overflow
1484 as we can't guarantee a zero value for each second element
1485 of the sequence with smaller EEW.
1486 ??? For now we assume that no overflow happens with positive
1487 steps and forbid negative steps altogether. */
1488 unsigned int new_smode_bitsize = builder.inner_bits_size () * 2;
1489 scalar_int_mode new_smode;
1490 machine_mode new_mode;
1491 poly_uint64 new_nunits
1492 = exact_div (GET_MODE_NUNITS (builder.mode ()), 2);
1493 if (known_ge (step1, 0) && known_ge (step2, 0)
1494 && int_mode_for_size (new_smode_bitsize, 0).exists (&new_smode)
1495 && get_vector_mode (new_smode, new_nunits).exists (&new_mode))
1497 rtx tmp1 = gen_reg_rtx (new_mode);
1498 base1 = gen_int_mode (rtx_to_poly_int64 (base1), new_smode);
1499 expand_vec_series (tmp1, base1, gen_int_mode (step1, new_smode));
1501 if (rtx_equal_p (base2, const0_rtx) && known_eq (step2, 0))
1502 /* { 1, 0, 2, 0, ... }. */
1503 emit_move_insn (result, gen_lowpart (mode, tmp1));
1504 else if (known_eq (step2, 0))
1506 /* { 1, 1, 2, 1, ... }. */
1507 rtx scalar = expand_simple_binop (
1508 Xmode, ASHIFT,
1509 gen_int_mode (rtx_to_poly_int64 (base2), Xmode),
1510 gen_int_mode (builder.inner_bits_size (), Xmode),
1511 NULL_RTX, false, OPTAB_DIRECT);
1512 scalar = simplify_gen_subreg (new_smode, scalar, Xmode, 0);
1513 rtx tmp2 = gen_reg_rtx (new_mode);
1514 rtx ior_ops[] = {tmp2, tmp1, scalar};
1515 emit_vlmax_insn (code_for_pred_scalar (IOR, new_mode),
1516 BINARY_OP, ior_ops);
1517 emit_move_insn (result, gen_lowpart (mode, tmp2));
1519 else
1521 /* { 1, 3, 2, 6, ... }. */
1522 rtx tmp2 = gen_reg_rtx (new_mode);
1523 base2 = gen_int_mode (rtx_to_poly_int64 (base2), new_smode);
1524 expand_vec_series (tmp2, base2,
1525 gen_int_mode (step2, new_smode));
1526 rtx shifted_tmp2 = expand_simple_binop (
1527 new_mode, ASHIFT, tmp2,
1528 gen_int_mode (builder.inner_bits_size (), Pmode), NULL_RTX,
1529 false, OPTAB_DIRECT);
1530 rtx tmp3 = gen_reg_rtx (new_mode);
1531 rtx ior_ops[] = {tmp3, tmp1, shifted_tmp2};
1532 emit_vlmax_insn (code_for_pred (IOR, new_mode), BINARY_OP,
1533 ior_ops);
1534 emit_move_insn (result, gen_lowpart (mode, tmp3));
1537 else
1539 rtx vid = gen_reg_rtx (mode);
1540 expand_vec_series (vid, const0_rtx, const1_rtx);
1541 /* Transform into { 0, 0, 1, 1, 2, 2, ... }. */
1542 rtx shifted_vid
1543 = expand_simple_binop (mode, LSHIFTRT, vid, const1_rtx,
1544 NULL_RTX, false, OPTAB_DIRECT);
1545 rtx tmp1 = gen_reg_rtx (mode);
1546 rtx tmp2 = gen_reg_rtx (mode);
1547 expand_vec_series (tmp1, base1,
1548 gen_int_mode (step1, builder.inner_mode ()),
1549 shifted_vid);
1550 expand_vec_series (tmp2, base2,
1551 gen_int_mode (step2, builder.inner_mode ()),
1552 shifted_vid);
1554 /* Transform into { 0, 1, 0, 1, 0, 1, ... }. */
1555 rtx and_vid = gen_reg_rtx (mode);
1556 rtx and_ops[] = {and_vid, vid, const1_rtx};
1557 emit_vlmax_insn (code_for_pred_scalar (AND, mode), BINARY_OP,
1558 and_ops);
1559 rtx mask = gen_reg_rtx (builder.mask_mode ());
1560 expand_vec_cmp (mask, EQ, and_vid, CONST1_RTX (mode));
1562 rtx ops[] = {result, tmp1, tmp2, mask};
1563 emit_vlmax_insn (code_for_pred_merge (mode), MERGE_OP, ops);
1566 else
1567 /* TODO: We will enable more variable-length vectors in the future. */
1568 gcc_unreachable ();
1570 else
1571 gcc_unreachable ();
1573 if (result != target)
1574 emit_move_insn (target, result);
1577 /* Get the frm mode from the given CONST_INT rtx; the default mode is
1578 FRM_DYN. */
1579 enum floating_point_rounding_mode
1580 get_frm_mode (rtx operand)
1582 gcc_assert (CONST_INT_P (operand));
1584 switch (INTVAL (operand))
1586 case FRM_RNE:
1587 return FRM_RNE;
1588 case FRM_RTZ:
1589 return FRM_RTZ;
1590 case FRM_RDN:
1591 return FRM_RDN;
1592 case FRM_RUP:
1593 return FRM_RUP;
1594 case FRM_RMM:
1595 return FRM_RMM;
1596 case FRM_DYN:
1597 return FRM_DYN;
1598 default:
1599 gcc_unreachable ();
1602 gcc_unreachable ();
1605 /* Expand a pre-RA RVV data move from SRC to DEST.
1606 It expands moves for RVV fractional vector modes.
1607 Return true if the move has already been emitted. */
1608 bool
1609 legitimize_move (rtx dest, rtx *srcp)
1611 rtx src = *srcp;
1612 machine_mode mode = GET_MODE (dest);
1613 if (CONST_VECTOR_P (src))
1615 expand_const_vector (dest, src);
1616 return true;
1619 if (riscv_v_ext_vls_mode_p (mode))
1621 if (GET_MODE_NUNITS (mode).to_constant () <= 31)
1623 /* For NUNITS <= 31 VLS modes, we don't need to extract
1624 scalar registers, so we apply the naive (set (op0) (op1)) pattern. */
1625 if (can_create_pseudo_p ())
1627 /* Need to force register if mem <- !reg. */
1628 if (MEM_P (dest) && !REG_P (src))
1629 *srcp = force_reg (mode, src);
1631 return false;
1634 else if (GET_MODE_NUNITS (mode).to_constant () > 31 && lra_in_progress)
1636 emit_insn (gen_mov_lra (mode, Pmode, dest, src));
1637 return true;
1640 else
1642 /* In order to decrease the memory traffic, we don't use whole register
1643 * load/store for LMUL less than 1 and mask modes, so those cases
1644 * require one extra general purpose register, but that's not allowed during
1645 * the LRA process, so we have a special move pattern used for LRA, which
1646 * defers the expansion until after LRA. */
1647 if ((known_lt (GET_MODE_SIZE (mode), BYTES_PER_RISCV_VECTOR)
1648 || GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
1649 && lra_in_progress)
1651 emit_insn (gen_mov_lra (mode, Pmode, dest, src));
1652 return true;
1655 if (known_ge (GET_MODE_SIZE (mode), BYTES_PER_RISCV_VECTOR)
1656 && GET_MODE_CLASS (mode) != MODE_VECTOR_BOOL)
1658 /* Need to force register if mem <- !reg. */
1659 if (MEM_P (dest) && !REG_P (src))
1660 *srcp = force_reg (mode, src);
1662 return false;
1666 if (register_operand (src, mode) && register_operand (dest, mode))
1668 emit_insn (gen_rtx_SET (dest, src));
1669 return true;
1672 unsigned insn_flags
1673 = GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL ? UNARY_MASK_OP : UNARY_OP;
1674 if (!register_operand (src, mode) && !register_operand (dest, mode))
1676 rtx tmp = gen_reg_rtx (mode);
1677 if (MEM_P (src))
1679 rtx ops[] = {tmp, src};
1680 emit_vlmax_insn (code_for_pred_mov (mode), insn_flags, ops);
1682 else
1683 emit_move_insn (tmp, src);
1684 src = tmp;
1687 if (satisfies_constraint_vu (src))
1688 return false;
1690 rtx ops[] = {dest, src};
1691 emit_vlmax_insn (code_for_pred_mov (mode), insn_flags, ops);
1692 return true;
1695 /* VTYPE information for machine_mode. */
1696 struct mode_vtype_group
1698 enum vlmul_type vlmul[NUM_MACHINE_MODES];
1699 uint8_t ratio[NUM_MACHINE_MODES];
1700 machine_mode subpart_mode[NUM_MACHINE_MODES];
1701 uint8_t nf[NUM_MACHINE_MODES];
1702 mode_vtype_group ()
1704 #define ENTRY(MODE, REQUIREMENT, VLMUL, RATIO) \
1705 vlmul[MODE##mode] = VLMUL; \
1706 ratio[MODE##mode] = RATIO;
1707 #define TUPLE_ENTRY(MODE, REQUIREMENT, SUBPART_MODE, NF, VLMUL, RATIO) \
1708 subpart_mode[MODE##mode] = SUBPART_MODE##mode; \
1709 nf[MODE##mode] = NF; \
1710 vlmul[MODE##mode] = VLMUL; \
1711 ratio[MODE##mode] = RATIO;
1712 #include "riscv-vector-switch.def"
1713 #undef ENTRY
1714 #undef TUPLE_ENTRY
1718 static mode_vtype_group mode_vtype_infos;
1720 /* Get vlmul field value by comparing LMUL with BYTES_PER_RISCV_VECTOR. */
1721 enum vlmul_type
1722 get_vlmul (machine_mode mode)
1724 /* For VLS modes, the vlmul should be dynamically
1725 calculated since we need to adjust VLMUL according
1726 to TARGET_MIN_VLEN. */
1727 if (riscv_v_ext_vls_mode_p (mode))
1729 int size = GET_MODE_BITSIZE (mode).to_constant ();
1730 int inner_size = GET_MODE_BITSIZE (GET_MODE_INNER (mode));
1731 if (size < TARGET_MIN_VLEN)
1733 int factor = TARGET_MIN_VLEN / size;
1734 if (inner_size == 8)
1735 factor = MIN (factor, 8);
1736 else if (inner_size == 16)
1737 factor = MIN (factor, 4);
1738 else if (inner_size == 32)
1739 factor = MIN (factor, 2);
1740 else if (inner_size == 64)
1741 factor = MIN (factor, 1);
1742 else
1743 gcc_unreachable ();
1745 switch (factor)
1747 case 1:
1748 return LMUL_1;
1749 case 2:
1750 return LMUL_F2;
1751 case 4:
1752 return LMUL_F4;
1753 case 8:
1754 return LMUL_F8;
1756 default:
1757 gcc_unreachable ();
1760 else
1762 int factor = size / TARGET_MIN_VLEN;
1763 switch (factor)
1765 case 1:
1766 return LMUL_1;
1767 case 2:
1768 return LMUL_2;
1769 case 4:
1770 return LMUL_4;
1771 case 8:
1772 return LMUL_8;
1774 default:
1775 gcc_unreachable ();
1779 return mode_vtype_infos.vlmul[mode];
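/* A standalone sketch (not part of this file; the name is made up, and it
   assumes all sizes are powers of two and ELEN == 64, like the code above)
   of the VLS-mode LMUL selection: return LMUL as a signed power-of-two
   exponent, so LMUL_F2 is -1, LMUL_1 is 0, LMUL_2 is 1, and so on.  */

static int
sketch_vls_lmul_log2 (unsigned mode_bits, unsigned inner_bits,
                      unsigned min_vlen)
{
  if (mode_bits >= min_vlen)
    /* Whole multiples of a vector register: LMUL = size / VLEN.  */
    return __builtin_ctz (mode_bits / min_vlen);
  /* Fractional LMUL = 1 / factor, clamped so that SEW / LMUL <= 64,
     i.e. factor <= 64 / inner_bits.  */
  unsigned factor = min_vlen / mode_bits;
  unsigned max_factor = 64 / inner_bits;
  if (factor > max_factor)
    factor = max_factor;
  return -(int) __builtin_ctz (factor);
}

/* E.g. min_vlen = 128: a 32-bit mode with 8-bit elements gives factor 4,
   i.e. LMUL_F4; a 256-bit mode gives LMUL_2.  */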
1782 /* Return the VLMAX rtx of vector mode MODE. */
1784 get_vlmax_rtx (machine_mode mode)
1786 gcc_assert (riscv_v_ext_vector_mode_p (mode));
1787 return gen_int_mode (GET_MODE_NUNITS (mode), Pmode);
1790 /* Return the NF value of the corresponding mode. */
1791 unsigned int
1792 get_nf (machine_mode mode)
1794 /* We don't allow non-tuple modes to go through this function. */
1795 gcc_assert (riscv_v_ext_tuple_mode_p (mode));
1796 return mode_vtype_infos.nf[mode];
1799 /* Return the subpart mode of the tuple mode. For RVVM2x2SImode,
1800 the subpart mode is RVVM2SImode. This will help to build
1801 array/struct type in builtins. */
1802 machine_mode
1803 get_subpart_mode (machine_mode mode)
1805 /* We don't allow non-tuple modes to go through this function. */
1806 gcc_assert (riscv_v_ext_tuple_mode_p (mode));
1807 return mode_vtype_infos.subpart_mode[mode];
1810 /* Get ratio according to machine mode. */
1811 unsigned int
1812 get_ratio (machine_mode mode)
1814 if (riscv_v_ext_vls_mode_p (mode))
1816 unsigned int sew = get_sew (mode);
1817 vlmul_type vlmul = get_vlmul (mode);
1818 switch (vlmul)
1820 case LMUL_1:
1821 return sew;
1822 case LMUL_2:
1823 return sew / 2;
1824 case LMUL_4:
1825 return sew / 4;
1826 case LMUL_8:
1827 return sew / 8;
1828 case LMUL_F8:
1829 return sew * 8;
1830 case LMUL_F4:
1831 return sew * 4;
1832 case LMUL_F2:
1833 return sew * 2;
1835 default:
1836 gcc_unreachable ();
1839 return mode_vtype_infos.ratio[mode];
1842 /* Get ta according to operand[tail_op_idx]. */
1844 get_ta (rtx ta)
1846 if (INTVAL (ta) == TAIL_ANY)
1847 return INVALID_ATTRIBUTE;
1848 return INTVAL (ta);
1851 /* Get ma according to operand[mask_op_idx]. */
1853 get_ma (rtx ma)
1855 if (INTVAL (ma) == MASK_ANY)
1856 return INVALID_ATTRIBUTE;
1857 return INTVAL (ma);
1860 /* Get prefer tail policy. */
1861 enum tail_policy
1862 get_prefer_tail_policy ()
1864 /* TODO: By default, we choose TAIL_ANY, which allows the
1865 compiler to pick either agnostic or undisturbed. Maybe we
1866 will add a compile option like -mprefer=agnostic to set
1867 this value in the future. */
1868 return TAIL_ANY;
1871 /* Get prefer mask policy. */
1872 enum mask_policy
1873 get_prefer_mask_policy ()
1875 /* TODO: By default, we choose MASK_ANY, which allows the
1876 compiler to pick either agnostic or undisturbed. Maybe we
1877 will add a compile option like -mprefer=agnostic to set
1878 this value in the future. */
1879 return MASK_ANY;
1882 /* Get avl_type rtx. */
1884 get_avl_type_rtx (enum avl_type type)
1886 return gen_int_mode (type, Pmode);
1889 /* Return the appropriate mask mode for MODE. */
1891 machine_mode
1892 get_mask_mode (machine_mode mode)
1894 poly_int64 nunits = GET_MODE_NUNITS (mode);
1895 if (riscv_v_ext_tuple_mode_p (mode))
1897 unsigned int nf = get_nf (mode);
1898 nunits = exact_div (nunits, nf);
1900 return get_vector_mode (BImode, nunits).require ();
1903 /* Return the appropriate LMUL mode for MODE. */
1905 opt_machine_mode
1906 get_lmul_mode (scalar_mode mode, int lmul)
1908 poly_uint64 lmul_nunits;
1909 unsigned int bytes = GET_MODE_SIZE (mode);
1910 if (multiple_p (BYTES_PER_RISCV_VECTOR * lmul, bytes, &lmul_nunits))
1911 return get_vector_mode (mode, lmul_nunits);
1912 return E_VOIDmode;
1915 /* Return the appropriate M1 mode for MODE. */
1917 static opt_machine_mode
1918 get_m1_mode (machine_mode mode)
1920 scalar_mode smode = GET_MODE_INNER (mode);
1921 unsigned int bytes = GET_MODE_SIZE (smode);
1922 poly_uint64 m1_nunits = exact_div (BYTES_PER_RISCV_VECTOR, bytes);
1923 return get_vector_mode (smode, m1_nunits);
1926 /* Return the RVV vector mode that has NUNITS elements of mode INNER_MODE.
1927 This function is not only used by builtins, but also will be used by
1928 auto-vectorization in the future. */
1929 opt_machine_mode
1930 get_vector_mode (scalar_mode inner_mode, poly_uint64 nunits)
1932 enum mode_class mclass;
1933 if (inner_mode == E_BImode)
1934 mclass = MODE_VECTOR_BOOL;
1935 else if (FLOAT_MODE_P (inner_mode))
1936 mclass = MODE_VECTOR_FLOAT;
1937 else
1938 mclass = MODE_VECTOR_INT;
1939 machine_mode mode;
1940 FOR_EACH_MODE_IN_CLASS (mode, mclass)
1941 if (inner_mode == GET_MODE_INNER (mode)
1942 && known_eq (nunits, GET_MODE_NUNITS (mode))
1943 && (riscv_v_ext_vector_mode_p (mode)
1944 || riscv_v_ext_vls_mode_p (mode)))
1945 return mode;
1946 return opt_machine_mode ();
1949 /* Return the RVV tuple mode if we can find the legal tuple mode for the
1950 corresponding subpart mode and NF. */
1951 opt_machine_mode
1952 get_tuple_mode (machine_mode subpart_mode, unsigned int nf)
1954 poly_uint64 nunits = GET_MODE_NUNITS (subpart_mode) * nf;
1955 scalar_mode inner_mode = GET_MODE_INNER (subpart_mode);
1956 enum mode_class mclass = GET_MODE_CLASS (subpart_mode);
1957 machine_mode mode;
1958 FOR_EACH_MODE_IN_CLASS (mode, mclass)
1959 if (inner_mode == GET_MODE_INNER (mode)
1960 && known_eq (nunits, GET_MODE_NUNITS (mode))
1961 && riscv_v_ext_tuple_mode_p (mode)
1962 && get_subpart_mode (mode) == subpart_mode)
1963 return mode;
1964 return opt_machine_mode ();
1967 bool
1968 simm5_p (rtx x)
1970 if (!CONST_INT_P (x))
1971 return false;
1972 return IN_RANGE (INTVAL (x), -16, 15);
1975 bool
1976 neg_simm5_p (rtx x)
1978 if (!CONST_INT_P (x))
1979 return false;
1980 return IN_RANGE (INTVAL (x), -15, 16);
1983 bool
1984 has_vi_variant_p (rtx_code code, rtx x)
1986 switch (code)
1988 case PLUS:
1989 case AND:
1990 case IOR:
1991 case XOR:
1992 case SS_PLUS:
1993 case US_PLUS:
1994 case EQ:
1995 case NE:
1996 case LE:
1997 case LEU:
1998 case GT:
1999 case GTU:
2000 return simm5_p (x);
2002 case LT:
2003 case LTU:
2004 case GE:
2005 case GEU:
2006 case MINUS:
2007 case SS_MINUS:
2008 return neg_simm5_p (x);
2010 default:
2011 return false;
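/* Helper for .vx patterns with a 64-bit SEW scalar operand. Legitimize
*SCALAR_OP so the caller can still emit the scalar form and return false,
or, when the scalar cannot be held in a GPR (!TARGET_64BIT and the value
is neither zero nor a sign-extendable immediate), broadcast it into a
temporary vector, emit the whole operation via EMIT_VECTOR_FUNC and
return true. */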
2015 bool
2016 sew64_scalar_helper (rtx *operands, rtx *scalar_op, rtx vl,
2017 machine_mode vector_mode, bool has_vi_variant_p,
2018 void (*emit_vector_func) (rtx *, rtx), enum avl_type type)
2020 machine_mode scalar_mode = GET_MODE_INNER (vector_mode);
2021 if (has_vi_variant_p)
2023 *scalar_op = force_reg (scalar_mode, *scalar_op);
2024 return false;
2027 if (TARGET_64BIT)
2029 if (!rtx_equal_p (*scalar_op, const0_rtx))
2030 *scalar_op = force_reg (scalar_mode, *scalar_op);
2031 return false;
2034 if (immediate_operand (*scalar_op, Pmode))
2036 if (!rtx_equal_p (*scalar_op, const0_rtx))
2037 *scalar_op = force_reg (Pmode, *scalar_op);
2039 *scalar_op = gen_rtx_SIGN_EXTEND (scalar_mode, *scalar_op);
2040 return false;
2043 if (CONST_INT_P (*scalar_op))
2045 if (maybe_gt (GET_MODE_SIZE (scalar_mode), GET_MODE_SIZE (Pmode)))
2046 *scalar_op = force_const_mem (scalar_mode, *scalar_op);
2047 else
2048 *scalar_op = force_reg (scalar_mode, *scalar_op);
2051 rtx tmp = gen_reg_rtx (vector_mode);
2052 rtx ops[] = {tmp, *scalar_op};
2053 if (type == VLMAX)
2054 emit_vlmax_insn (code_for_pred_broadcast (vector_mode), UNARY_OP, ops);
2055 else
2056 emit_nonvlmax_insn (code_for_pred_broadcast (vector_mode), UNARY_OP, ops,
2057 vl);
2058 emit_vector_func (operands, tmp);
2060 return true;
2063 /* Get a mask where only element 0 is set, i.e. { 1, 0, 0, ..., 0 }. */
2065 gen_scalar_move_mask (machine_mode mode)
2067 rtx_vector_builder builder (mode, 1, 2);
2068 builder.quick_push (const1_rtx);
2069 builder.quick_push (const0_rtx);
2070 return builder.build ();
2073 static unsigned
2074 compute_vlmax (unsigned vector_bits, unsigned elt_size, unsigned min_size)
2076 // Original equation:
2077 // VLMAX = (VectorBits / EltSize) * LMUL
2078 // where LMUL = MinSize / TARGET_MIN_VLEN
2079 // The following equations have been reordered to prevent loss of precision
2080 // when calculating fractional LMUL.
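// Worked example (illustrative values): vector_bits = 256, elt_size = 32 and
// min_size = TARGET_MIN_VLEN * 2 (LMUL = 2) give
// VLMAX = ((256 / 32) * (TARGET_MIN_VLEN * 2)) / TARGET_MIN_VLEN = 16.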
2081 return ((vector_bits / elt_size) * min_size) / TARGET_MIN_VLEN;
2084 static unsigned
2085 get_unknown_min_value (machine_mode mode)
2087 enum vlmul_type vlmul = get_vlmul (mode);
2088 switch (vlmul)
2090 case LMUL_1:
2091 return TARGET_MIN_VLEN;
2092 case LMUL_2:
2093 return TARGET_MIN_VLEN * 2;
2094 case LMUL_4:
2095 return TARGET_MIN_VLEN * 4;
2096 case LMUL_8:
2097 return TARGET_MIN_VLEN * 8;
2098 default:
2099 gcc_unreachable ();
2103 static rtx
2104 force_vector_length_operand (rtx vl)
2106 if (CONST_INT_P (vl) && !satisfies_constraint_K (vl))
2107 return force_reg (Pmode, vl);
2108 return vl;
2112 gen_no_side_effects_vsetvl_rtx (machine_mode vmode, rtx vl, rtx avl)
2114 unsigned int sew = get_sew (vmode);
2115 rtx tail_policy = gen_int_mode (get_prefer_tail_policy (), Pmode);
2116 rtx mask_policy = gen_int_mode (get_prefer_mask_policy (), Pmode);
2117 return gen_vsetvl_no_side_effects (Pmode, vl, avl, gen_int_mode (sew, Pmode),
2118 gen_int_mode (get_vlmul (vmode), Pmode),
2119 tail_policy, mask_policy);
2122 /* Get the VL * 2 rtx. */
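/* This is used when an SEW64 operation on !TARGET_64BIT is emulated by two
SEW32 operations on DEMOTE_MODE: the demoted mode has twice as many
elements, so it needs twice the requested AVL. */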
2123 static rtx
2124 get_vl_x2_rtx (rtx avl, machine_mode mode, machine_mode demote_mode)
2126 rtx i32vl = NULL_RTX;
2127 if (CONST_INT_P (avl))
2129 unsigned elt_size = GET_MODE_BITSIZE (GET_MODE_INNER (mode));
2130 unsigned min_size = get_unknown_min_value (mode);
2131 unsigned vlen_max = RVV_65536;
2132 unsigned vlmax_max = compute_vlmax (vlen_max, elt_size, min_size);
2133 unsigned vlen_min = TARGET_MIN_VLEN;
2134 unsigned vlmax_min = compute_vlmax (vlen_min, elt_size, min_size);
2136 unsigned HOST_WIDE_INT avl_int = INTVAL (avl);
2137 if (avl_int <= vlmax_min)
2138 i32vl = gen_int_mode (2 * avl_int, Pmode);
2139 else if (avl_int >= 2 * vlmax_max)
2141 // Just set i32vl to VLMAX in this situation
2142 i32vl = gen_reg_rtx (Pmode);
2143 emit_insn (
2144 gen_no_side_effects_vsetvl_rtx (demote_mode, i32vl, RVV_VLMAX));
2146 else
2148 // For AVL between (MinVLMAX, 2 * MaxVLMAX), the actual working vl
2149 // depends on the hardware implementation,
2150 // so let the following code handle it.
2153 if (!i32vl)
2155 // Use a vsetvli instruction to get the actually used length, which is
2156 // related to the hardware implementation.
2157 rtx i64vl = gen_reg_rtx (Pmode);
2158 emit_insn (
2159 gen_no_side_effects_vsetvl_rtx (mode, i64vl, force_reg (Pmode, avl)));
2160 // Scale by 2 for the 32-bit length.
2161 i32vl = gen_reg_rtx (Pmode);
2162 emit_insn (
2163 gen_rtx_SET (i32vl, gen_rtx_ASHIFT (Pmode, i64vl, const1_rtx)));
2166 return force_vector_length_operand (i32vl);
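/* Helper for vslide1up/vslide1down with a 64-bit SEW scalar on
!TARGET_64BIT. Return false if the normal pattern can be used after
legitimizing the operands. Otherwise split the 64-bit scalar into two
32-bit halves and emit two slide1 instructions in the demoted DEMOTE_MODE
with a doubled vector length, then return true. */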
2169 bool
2170 slide1_sew64_helper (int unspec, machine_mode mode, machine_mode demote_mode,
2171 machine_mode demote_mask_mode, rtx *ops)
2173 rtx scalar_op = ops[4];
2174 rtx avl = ops[5];
2175 machine_mode scalar_mode = GET_MODE_INNER (mode);
2176 if (rtx_equal_p (scalar_op, const0_rtx))
2178 ops[5] = force_vector_length_operand (ops[5]);
2179 return false;
2182 if (TARGET_64BIT)
2184 ops[4] = force_reg (scalar_mode, scalar_op);
2185 ops[5] = force_vector_length_operand (ops[5]);
2186 return false;
2189 if (immediate_operand (scalar_op, Pmode))
2191 ops[4] = gen_rtx_SIGN_EXTEND (scalar_mode, force_reg (Pmode, scalar_op));
2192 ops[5] = force_vector_length_operand (ops[5]);
2193 return false;
2196 if (CONST_INT_P (scalar_op))
2197 scalar_op = force_reg (scalar_mode, scalar_op);
2199 rtx vl_x2 = get_vl_x2_rtx (avl, mode, demote_mode);
2201 rtx demote_scalar_op1, demote_scalar_op2;
2202 if (unspec == UNSPEC_VSLIDE1UP)
2204 demote_scalar_op1 = gen_highpart (Pmode, scalar_op);
2205 demote_scalar_op2 = gen_lowpart (Pmode, scalar_op);
2207 else
2209 demote_scalar_op1 = gen_lowpart (Pmode, scalar_op);
2210 demote_scalar_op2 = gen_highpart (Pmode, scalar_op);
2213 rtx temp = gen_reg_rtx (demote_mode);
2214 rtx ta = gen_int_mode (get_prefer_tail_policy (), Pmode);
2215 rtx ma = gen_int_mode (get_prefer_mask_policy (), Pmode);
2216 rtx merge = RVV_VUNDEF (demote_mode);
2217 /* Handle vslide1<ud>_tu. */
2218 if (register_operand (ops[2], mode)
2219 && rtx_equal_p (ops[1], CONSTM1_RTX (GET_MODE (ops[1]))))
2221 merge = gen_lowpart (demote_mode, ops[2]);
2222 ta = ops[6];
2223 ma = ops[7];
2226 emit_insn (gen_pred_slide (unspec, demote_mode, temp,
2227 CONSTM1_RTX (demote_mask_mode), merge,
2228 gen_lowpart (demote_mode, ops[3]),
2229 demote_scalar_op1, vl_x2, ta, ma, ops[8]));
2230 emit_insn (gen_pred_slide (unspec, demote_mode,
2231 gen_lowpart (demote_mode, ops[0]),
2232 CONSTM1_RTX (demote_mask_mode), merge, temp,
2233 demote_scalar_op2, vl_x2, ta, ma, ops[8]));
2235 if (!rtx_equal_p (ops[1], CONSTM1_RTX (GET_MODE (ops[1])))
2236 && !rtx_equal_p (ops[2], RVV_VUNDEF (GET_MODE (ops[2]))))
2237 emit_insn (gen_pred_merge (mode, ops[0], ops[2], ops[2], ops[0], ops[1],
2238 force_vector_length_operand (ops[5]), ops[6],
2239 ops[8]));
2240 return true;
2244 gen_avl_for_scalar_move (rtx avl)
2246 /* AVL for a scalar move behaves differently for 0 and for values larger than 0. */
2247 if (CONST_INT_P (avl))
2249 /* So we could just set AVL to 1 for any constant other than 0. */
2250 if (rtx_equal_p (avl, const0_rtx))
2251 return const0_rtx;
2252 else
2253 return const1_rtx;
2255 else
2257 /* For a non-constant value, we set any non-zero value to 1 by
2258 `sgtu new_avl,input_avl,zero` + `vsetvli`. */
2259 rtx tmp = gen_reg_rtx (Pmode);
2260 emit_insn (
2261 gen_rtx_SET (tmp, gen_rtx_fmt_ee (GTU, Pmode, avl, const0_rtx)));
2262 return tmp;
2266 /* Expand data movement for tuple modes. */
2267 void
2268 expand_tuple_move (rtx *ops)
2270 unsigned int i;
2271 machine_mode tuple_mode = GET_MODE (ops[0]);
2272 machine_mode subpart_mode = get_subpart_mode (tuple_mode);
2273 poly_int64 subpart_size = GET_MODE_SIZE (subpart_mode);
2274 unsigned int nf = get_nf (tuple_mode);
2275 bool fractional_p = known_lt (subpart_size, BYTES_PER_RISCV_VECTOR);
2277 if (REG_P (ops[0]) && CONST_VECTOR_P (ops[1]))
2279 rtx val;
2280 gcc_assert (can_create_pseudo_p ()
2281 && const_vec_duplicate_p (ops[1], &val));
2282 for (i = 0; i < nf; ++i)
2284 poly_int64 offset = i * subpart_size;
2285 rtx subreg
2286 = simplify_gen_subreg (subpart_mode, ops[0], tuple_mode, offset);
2287 rtx dup = gen_const_vec_duplicate (subpart_mode, val);
2288 emit_move_insn (subreg, dup);
2291 else if (REG_P (ops[0]) && REG_P (ops[1]))
2293 for (i = 0; i < nf; ++i)
2295 int index = i;
2297 /* Take NF = 2 and LMUL = 1 for example:
2299 - move v8 to v9:
2300 vmv1r v10,v9
2301 vmv1r v9,v8
2303 - move v8 to v7:
2304 vmv1r v7,v8
2305 vmv1r v8,v9 */
2306 if (REGNO (ops[0]) > REGNO (ops[1]))
2307 index = nf - 1 - i;
2308 poly_int64 offset = index * subpart_size;
2309 rtx dst_subreg
2310 = simplify_gen_subreg (subpart_mode, ops[0], tuple_mode, offset);
2311 rtx src_subreg
2312 = simplify_gen_subreg (subpart_mode, ops[1], tuple_mode, offset);
2313 emit_insn (gen_rtx_SET (dst_subreg, src_subreg));
2316 else
2318 /* Expand tuple memory data movement. */
2319 gcc_assert (MEM_P (ops[0]) || MEM_P (ops[1]));
2320 rtx offset = gen_int_mode (subpart_size, Pmode);
2321 if (!subpart_size.is_constant ())
2323 emit_move_insn (ops[2], gen_int_mode (BYTES_PER_RISCV_VECTOR, Pmode));
2324 if (fractional_p)
2326 unsigned int factor
2327 = exact_div (BYTES_PER_RISCV_VECTOR, subpart_size)
2328 .to_constant ();
2329 rtx pat
2330 = gen_rtx_ASHIFTRT (Pmode, ops[2],
2331 gen_int_mode (exact_log2 (factor), Pmode));
2332 emit_insn (gen_rtx_SET (ops[2], pat));
2335 if (known_gt (subpart_size, BYTES_PER_RISCV_VECTOR))
2337 unsigned int factor
2338 = exact_div (subpart_size, BYTES_PER_RISCV_VECTOR)
2339 .to_constant ();
2340 rtx pat
2341 = gen_rtx_ASHIFT (Pmode, ops[2],
2342 gen_int_mode (exact_log2 (factor), Pmode));
2343 emit_insn (gen_rtx_SET (ops[2], pat));
2345 offset = ops[2];
2348 /* Non-fractional LMUL has whole register moves that don't require a
2349 vsetvl for VLMAX. */
2350 if (fractional_p)
2351 emit_vlmax_vsetvl (subpart_mode, ops[4]);
2352 if (MEM_P (ops[1]))
2354 /* Load operations. */
2355 emit_move_insn (ops[3], XEXP (ops[1], 0));
2356 for (i = 0; i < nf; i++)
2358 rtx subreg = simplify_gen_subreg (subpart_mode, ops[0],
2359 tuple_mode, i * subpart_size);
2360 if (i != 0)
2362 rtx new_addr = gen_rtx_PLUS (Pmode, ops[3], offset);
2363 emit_insn (gen_rtx_SET (ops[3], new_addr));
2365 rtx mem = gen_rtx_MEM (subpart_mode, ops[3]);
2367 if (fractional_p)
2369 rtx operands[] = {subreg, mem};
2370 emit_vlmax_insn_lra (code_for_pred_mov (subpart_mode),
2371 UNARY_OP, operands, ops[4]);
2373 else
2374 emit_move_insn (subreg, mem);
2377 else
2379 /* Store operations. */
2380 emit_move_insn (ops[3], XEXP (ops[0], 0));
2381 for (i = 0; i < nf; i++)
2383 rtx subreg = simplify_gen_subreg (subpart_mode, ops[1],
2384 tuple_mode, i * subpart_size);
2385 if (i != 0)
2387 rtx new_addr = gen_rtx_PLUS (Pmode, ops[3], offset);
2388 emit_insn (gen_rtx_SET (ops[3], new_addr));
2390 rtx mem = gen_rtx_MEM (subpart_mode, ops[3]);
2392 if (fractional_p)
2394 rtx operands[] = {mem, subreg};
2395 emit_vlmax_insn_lra (code_for_pred_mov (subpart_mode),
2396 UNARY_OP, operands, ops[4]);
2398 else
2399 emit_move_insn (mem, subreg);
2405 /* Return the vectorization machine mode for RVV according to LMUL. */
2406 machine_mode
2407 preferred_simd_mode (scalar_mode mode)
2409 if (autovec_use_vlmax_p ())
2411 /* We use LMUL = 1 as the base byte size, which is BYTES_PER_RISCV_VECTOR,
2412 and rvv_max_lmul as the multiplication factor to calculate NUNITS and
2413 get the auto-vectorization mode. */
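/* E.g. with the maximum LMUL set to 8, an SImode element gives
nunits = (BYTES_PER_RISCV_VECTOR * 8) / 4, which selects an LMUL = 8
SImode vector mode such as RVVM8SI when it exists. */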
2414 poly_uint64 nunits;
2415 poly_uint64 vector_size = BYTES_PER_RISCV_VECTOR * TARGET_MAX_LMUL;
2416 poly_uint64 scalar_size = GET_MODE_SIZE (mode);
2417 /* Disable vectorization when we can't find an RVV mode for it.
2418 E.g. -march=rv64gc_zve32x doesn't have a vector mode to vectorize
2419 a double (DFmode) type. */
2420 if (!multiple_p (vector_size, scalar_size, &nunits))
2421 return word_mode;
2422 machine_mode rvv_mode;
2423 if (get_vector_mode (mode, nunits).exists (&rvv_mode))
2424 return rvv_mode;
2426 return word_mode;
2429 /* Use merge approach to initialize the vector with repeating sequence.
2430 v = {a, b, a, b, a, b, a, b}.
2432 v = broadcast (a).
2433 mask = 0b01010101....
2434 v = merge (v, b, mask)
2436 static void
2437 expand_vector_init_merge_repeating_sequence (rtx target,
2438 const rvv_builder &builder)
2440 /* We can't use BIT mode (BI) directly to generate mask = 0b01010...
2441 since we don't have such an instruction in RVV.
2442 Instead, we should use an INT mode (QI/HI/SI/DI) with an integer move
2443 instruction to generate the mask data we want. */
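/* For instance, for v = { a, b, a, b, ... } with 16 elements of 16-bit SEW,
the second pattern selects the odd elements, so the scalar mask value is
0b1010101010101010; since 16 <= the 16-bit inner size it is written with a
single vmv.s.x below, otherwise it is chunked and broadcast with vmv.v.x. */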
2444 machine_mode mask_bit_mode = get_mask_mode (builder.mode ());
2445 machine_mode mask_int_mode
2446 = get_repeating_sequence_dup_machine_mode (builder, mask_bit_mode);
2447 uint64_t full_nelts = builder.full_nelts ().to_constant ();
2449 /* Step 1: Broadcast the first pattern. */
2450 rtx ops[] = {target, force_reg (builder.inner_mode (), builder.elt (0))};
2451 emit_vlmax_insn (code_for_pred_broadcast (builder.mode ()),
2452 UNARY_OP, ops);
2453 /* Step 2: Merge the remaining iterations of the pattern. */
2454 for (unsigned int i = 1; i < builder.npatterns (); i++)
2456 /* Step 2-1: Generate mask register v0 for each merge. */
2457 rtx merge_mask
2458 = builder.get_merge_scalar_mask (i, GET_MODE_INNER (mask_int_mode));
2459 rtx mask = gen_reg_rtx (mask_bit_mode);
2460 rtx dup = gen_reg_rtx (mask_int_mode);
2462 if (full_nelts <= builder.inner_bits_size ()) /* vmv.s.x. */
2464 rtx ops[] = {dup, merge_mask};
2465 emit_nonvlmax_insn (code_for_pred_broadcast (GET_MODE (dup)),
2466 SCALAR_MOVE_OP, ops, CONST1_RTX (Pmode));
2468 else /* vmv.v.x. */
2470 rtx ops[] = {dup,
2471 force_reg (GET_MODE_INNER (mask_int_mode), merge_mask)};
2472 rtx vl = gen_int_mode (CEIL (full_nelts, builder.inner_bits_size ()),
2473 Pmode);
2474 emit_nonvlmax_insn (code_for_pred_broadcast (mask_int_mode), UNARY_OP,
2475 ops, vl);
2478 emit_move_insn (mask, gen_lowpart (mask_bit_mode, dup));
2480 /* Step 2-2: Merge pattern according to the mask. */
2481 rtx ops[] = {target, target, builder.elt (i), mask};
2482 emit_vlmax_insn (code_for_pred_merge_scalar (GET_MODE (target)),
2483 MERGE_OP, ops);
2487 /* Use slideup approach to combine the vectors.
2488 v = {a, a, a, a, b, b, b, b}
2490 First:
2491 v1 = {a, a, a, a, a, a, a, a}
2492 v2 = {b, b, b, b, b, b, b, b}
2493 v = slideup (v1, v2, nelt / 2)
2495 static void
2496 expand_vector_init_slideup_combine_sequence (rtx target,
2497 const rvv_builder &builder)
2499 machine_mode mode = GET_MODE (target);
2500 int nelts = builder.full_nelts ().to_constant ();
2501 rtx first_elt = builder.elt (0);
2502 rtx last_elt = builder.elt (nelts - 1);
2503 rtx low = expand_vector_broadcast (mode, first_elt);
2504 rtx high = expand_vector_broadcast (mode, last_elt);
2505 insn_code icode = code_for_pred_slide (UNSPEC_VSLIDEUP, mode);
2506 rtx ops[] = {target, low, high, gen_int_mode (nelts / 2, Pmode)};
2507 emit_vlmax_insn (icode, SLIDEUP_OP_MERGE, ops);
2510 /* Use merge approach to merge a scalar into a vector.
2511 v = {a, a, a, a, a, a, b, b}
2513 v1 = {a, a, a, a, a, a, a, a}
2514 scalar = b
2515 mask = {0, 0, 0, 0, 0, 0, 1, 1}
2517 static void
2518 expand_vector_init_merge_combine_sequence (rtx target,
2519 const rvv_builder &builder)
2521 machine_mode mode = GET_MODE (target);
2522 machine_mode imode = builder.int_mode ();
2523 machine_mode mmode = builder.mask_mode ();
2524 int nelts = builder.full_nelts ().to_constant ();
2525 int leading_ndups = builder.count_dups (0, nelts - 1, 1);
2526 if ((leading_ndups > 255 && GET_MODE_INNER (imode) == QImode)
2527 || riscv_get_v_regno_alignment (imode) > 1)
2528 imode = get_vector_mode (HImode, nelts).require ();
2530 /* Generate vid = { 0, 1, 2, ..., n }. */
2531 rtx vid = gen_reg_rtx (imode);
2532 expand_vec_series (vid, const0_rtx, const1_rtx);
2534 /* Generate mask. */
2535 rtx mask = gen_reg_rtx (mmode);
2536 insn_code icode = code_for_pred_cmp_scalar (imode);
2537 rtx index = gen_int_mode (leading_ndups - 1, builder.inner_int_mode ());
2538 rtx dup_rtx = gen_rtx_VEC_DUPLICATE (imode, index);
2539 /* vmsgtu.vi/vmsgtu.vx. */
2540 rtx cmp = gen_rtx_fmt_ee (GTU, mmode, vid, dup_rtx);
2541 rtx sel = builder.elt (nelts - 1);
2542 rtx mask_ops[] = {mask, cmp, vid, index};
2543 emit_vlmax_insn (icode, COMPARE_OP, mask_ops);
2545 /* Duplicate the first elements. */
2546 rtx dup = expand_vector_broadcast (mode, builder.elt (0));
2547 /* Merge scalar into vector according to mask. */
2548 rtx merge_ops[] = {target, dup, sel, mask};
2549 icode = code_for_pred_merge_scalar (mode);
2550 emit_vlmax_insn (icode, MERGE_OP, merge_ops);
2553 /* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
2555 void
2556 expand_vec_init (rtx target, rtx vals)
2558 machine_mode mode = GET_MODE (target);
2559 int nelts = XVECLEN (vals, 0);
2561 rvv_builder v (mode, nelts, 1);
2562 for (int i = 0; i < nelts; i++)
2563 v.quick_push (XVECEXP (vals, 0, i));
2564 v.finalize ();
2566 /* If the sequence is v = { a, a, a, a } just broadcast an element. */
2567 if (v.is_repeating_sequence ())
2569 machine_mode mode = GET_MODE (target);
2570 rtx dup = expand_vector_broadcast (mode, v.elt (0));
2571 emit_move_insn (target, dup);
2572 return;
2575 if (nelts > 3)
2577 /* Case 1: Convert v = { a, b, a, b } into v = { ab, ab }. */
2578 if (v.can_duplicate_repeating_sequence_p ())
2580 rtx ele = v.get_merged_repeating_sequence ();
2581 rtx dup = expand_vector_broadcast (v.new_mode (), ele);
2582 emit_move_insn (target, gen_lowpart (mode, dup));
2583 return;
2586 /* Case 2: Optimize repeating sequence cases that Case 1 cannot
2587 handle and where it is profitable. For example:
2588 ELEMENT BITSIZE = 64.
2589 v = {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b}.
2590 We can't find a vector mode for "ab" which will be combined into
2591 128-bit element to duplicate. */
2592 if (v.repeating_sequence_use_merge_profitable_p ())
2594 expand_vector_init_merge_repeating_sequence (target, v);
2595 return;
2598 /* Case 3: Optimize combine sequence.
2599 E.g. v = {a, a, a, a, a, a, a, a, b, b, b, b, b, b, b, b}.
2600 We can combine:
2601 v1 = {a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a}.
2603 v2 = {b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b}.
2604 by slideup. */
2605 if (v.combine_sequence_use_slideup_profitable_p ())
2607 expand_vector_init_slideup_combine_sequence (target, v);
2608 return;
2611 /* Case 4: Optimize combine sequence.
2612 E.g. v = {a, a, a, a, a, a, a, a, a, a, a, b, b, b, b, b}.
2614 Generate vector:
2615 v = {a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a}.
2617 Generate mask:
2618 mask = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1}.
2620 Merge b into v by mask:
2621 v = {a, a, a, a, a, a, a, a, a, a, a, b, b, b, b, b}. */
2622 if (v.combine_sequence_use_merge_profitable_p ())
2624 expand_vector_init_merge_combine_sequence (target, v);
2625 return;
2629 /* Optimize trailing same elements sequence:
2630 v = {y, y2, y3, y4, y5, x, x, x, x, x, x, x, x, x, x, x}; */
2631 if (!expand_vector_init_trailing_same_elem (target, v, nelts))
2632 /* Handle the common situation by vslide1down. This function can handle any
2633 situation of vec_init<mode>. Only the cases that are not optimized above
2634 will fall through here. */
2635 expand_vector_init_insert_elems (target, v, nelts);
2638 /* Get insn code for corresponding comparison. */
2640 static insn_code
2641 get_cmp_insn_code (rtx_code code, machine_mode mode)
2643 insn_code icode;
2644 switch (code)
2646 case EQ:
2647 case NE:
2648 case LE:
2649 case LEU:
2650 case GT:
2651 case GTU:
2652 case LTGT:
2653 icode = code_for_pred_cmp (mode);
2654 break;
2655 case LT:
2656 case LTU:
2657 case GE:
2658 case GEU:
2659 if (FLOAT_MODE_P (mode))
2660 icode = code_for_pred_cmp (mode);
2661 else
2662 icode = code_for_pred_ltge (mode);
2663 break;
2664 default:
2665 gcc_unreachable ();
2667 return icode;
2670 /* This hook gives the vectorizer more vector mode options. We want it to not
2671 only try modes with the maximum number of units a full vector can hold but
2672 for example also half the number of units for a smaller elements size.
2673 Such vectors can be promoted to a full vector of widened elements
2674 (still with the same number of elements, essentially vectorizing at a
2675 fixed number of units rather than a fixed number of bytes). */
2676 unsigned int
2677 autovectorize_vector_modes (vector_modes *modes, bool)
2679 if (autovec_use_vlmax_p ())
2681 poly_uint64 full_size = BYTES_PER_RISCV_VECTOR * TARGET_MAX_LMUL;
2683 /* Start with a RVV<LMUL>QImode where LMUL is the number of units that
2684 fit a whole vector.
2685 Then try LMUL = nunits / 2, nunits / 4 and nunits / 8 which
2686 is guided by the extensions we have available (vf2, vf4 and vf8).
2688 - full_size: Try using full vectors for all element types.
2689 - full_size / 2:
2690 Try using 16-bit containers for 8-bit elements and full vectors
2691 for wider elements.
2692 - full_size / 4:
2693 Try using 32-bit containers for 8-bit and 16-bit elements and
2694 full vectors for wider elements.
2695 - full_size / 8:
2696 Try using 64-bit containers for all element types. */
2697 static const int rvv_factors[] = {1, 2, 4, 8, 16, 32, 64};
2698 for (unsigned int i = 0; i < sizeof (rvv_factors) / sizeof (int); i++)
2700 poly_uint64 units;
2701 machine_mode mode;
2702 if (can_div_trunc_p (full_size, rvv_factors[i], &units)
2703 && get_vector_mode (QImode, units).exists (&mode))
2704 modes->safe_push (mode);
2707 /* Push all VLSmodes according to TARGET_MIN_VLEN. */
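/* E.g. with illustrative TARGET_MIN_VLEN = 128 and TARGET_MAX_LMUL = 8,
base_size = 128 and the loop below tries V128QI, V64QI, V32QI, ... down
to V1QI, pushing each one that is a valid VLS mode. */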
2708 unsigned int i = 0;
2709 unsigned int base_size = TARGET_MIN_VLEN * TARGET_MAX_LMUL / 8;
2710 unsigned int size = base_size;
2711 machine_mode mode;
2712 while (size > 0 && get_vector_mode (QImode, size).exists (&mode))
2714 if (vls_mode_valid_p (mode))
2715 modes->safe_push (mode);
2717 i++;
2718 size = base_size / (1U << i);
2720 /* Enable LOOP_VINFO comparison in COST model. */
2721 return VECT_COMPARE_COSTS;
2724 /* Return true if we can find the related MODE according to default LMUL. */
2725 static bool
2726 can_find_related_mode_p (machine_mode vector_mode, scalar_mode element_mode,
2727 poly_uint64 *nunits)
2729 if (!autovec_use_vlmax_p ())
2730 return false;
2731 if (riscv_v_ext_vector_mode_p (vector_mode)
2732 && multiple_p (BYTES_PER_RISCV_VECTOR * TARGET_MAX_LMUL,
2733 GET_MODE_SIZE (element_mode), nunits))
2734 return true;
2735 if (riscv_v_ext_vls_mode_p (vector_mode)
2736 && multiple_p (TARGET_MIN_VLEN * TARGET_MAX_LMUL,
2737 GET_MODE_SIZE (element_mode), nunits))
2738 return true;
2739 return false;
2742 /* If the given VECTOR_MODE is an RVV mode, first get the largest number
2743 of units that fit into a full vector at the given ELEMENT_MODE.
2744 We will have the vectorizer call us with a successively decreasing
2745 number of units (as specified in autovectorize_vector_modes).
2746 The starting mode is always the one specified by preferred_simd_mode. */
2747 opt_machine_mode
2748 vectorize_related_mode (machine_mode vector_mode, scalar_mode element_mode,
2749 poly_uint64 nunits)
2751 /* TODO: We will support RVV VLS auto-vectorization mode in the future. */
2752 poly_uint64 min_units;
2753 if (can_find_related_mode_p (vector_mode, element_mode, &min_units))
2755 machine_mode rvv_mode;
2756 if (maybe_ne (nunits, 0U))
2758 /* If we were given a number of units NUNITS, try to find an
2759 RVV vector mode of inner mode ELEMENT_MODE with the same
2760 number of units. */
2761 if (multiple_p (min_units, nunits)
2762 && get_vector_mode (element_mode, nunits).exists (&rvv_mode))
2763 return rvv_mode;
2765 else
2767 /* Look for a vector mode with the same number of units as the
2768 VECTOR_MODE we were given. We keep track of the minimum
2769 number of units so far which determines the smallest necessary
2770 but largest possible, suitable mode for vectorization. */
2771 min_units = ordered_min (min_units, GET_MODE_SIZE (vector_mode));
2772 if (get_vector_mode (element_mode, min_units).exists (&rvv_mode))
2773 return rvv_mode;
2777 return default_vectorize_related_mode (vector_mode, element_mode, nunits);
2780 /* Expand an RVV comparison. */
2782 void
2783 expand_vec_cmp (rtx target, rtx_code code, rtx op0, rtx op1, rtx mask,
2784 rtx maskoff)
2786 machine_mode mask_mode = GET_MODE (target);
2787 machine_mode data_mode = GET_MODE (op0);
2788 insn_code icode = get_cmp_insn_code (code, data_mode);
2790 if (code == LTGT)
2792 rtx lt = gen_reg_rtx (mask_mode);
2793 rtx gt = gen_reg_rtx (mask_mode);
2794 expand_vec_cmp (lt, LT, op0, op1, mask, maskoff);
2795 expand_vec_cmp (gt, GT, op0, op1, mask, maskoff);
2796 icode = code_for_pred (IOR, mask_mode);
2797 rtx ops[] = {target, lt, gt};
2798 emit_vlmax_insn (icode, BINARY_MASK_OP, ops);
2799 return;
2802 rtx cmp = gen_rtx_fmt_ee (code, mask_mode, op0, op1);
2803 if (!mask && !maskoff)
2805 rtx ops[] = {target, cmp, op0, op1};
2806 emit_vlmax_insn (icode, COMPARE_OP, ops);
2808 else
2810 rtx ops[] = {target, mask, maskoff, cmp, op0, op1};
2811 emit_vlmax_insn (icode, COMPARE_OP_MU, ops);
2815 /* Expand an RVV floating-point comparison:
2817 If CAN_INVERT_P is true, the caller can also handle inverted results;
2818 return true if the result is in fact inverted. */
2820 bool
2821 expand_vec_cmp_float (rtx target, rtx_code code, rtx op0, rtx op1,
2822 bool can_invert_p)
2824 machine_mode mask_mode = GET_MODE (target);
2825 machine_mode data_mode = GET_MODE (op0);
2827 /* If can_invert_p = true:
2828 It suffices to implement a u>= b as !(a < b) but with the NaNs masked off:
2830 vmfeq.vv v0, va, va
2831 vmfeq.vv v1, vb, vb
2832 vmand.mm v0, v0, v1
2833 vmflt.vv v0, va, vb, v0.t
2834 vmnot.m v0, v0
2836 And, if !HONOR_SNANS, then you can remove the vmand.mm by masking the
2837 second vmfeq.vv:
2839 vmfeq.vv v0, va, va
2840 vmfeq.vv v0, vb, vb, v0.t
2841 vmflt.vv v0, va, vb, v0.t
2842 vmnot.m v0, v0
2844 If can_invert_p = false:
2846 # Example of implementing isgreater()
2847 vmfeq.vv v0, va, va # Only set where A is not NaN.
2848 vmfeq.vv v1, vb, vb # Only set where B is not NaN.
2849 vmand.mm v0, v0, v1 # Only set where A and B are ordered,
2850 vmfgt.vv v0, va, vb, v0.t # so only set flags on ordered values.
2853 rtx eq0 = gen_reg_rtx (mask_mode);
2854 rtx eq1 = gen_reg_rtx (mask_mode);
2855 switch (code)
2857 case EQ:
2858 case NE:
2859 case LT:
2860 case LE:
2861 case GT:
2862 case GE:
2863 case LTGT:
2864 /* There is native support for the comparison. */
2865 expand_vec_cmp (target, code, op0, op1);
2866 return false;
2867 case UNEQ:
2868 case ORDERED:
2869 case UNORDERED:
2870 case UNLT:
2871 case UNLE:
2872 case UNGT:
2873 case UNGE:
2874 /* vmfeq.vv v0, va, va */
2875 expand_vec_cmp (eq0, EQ, op0, op0);
2876 if (HONOR_SNANS (data_mode))
2879 vmfeq.vv v1, vb, vb
2880 vmand.mm v0, v0, v1
2882 expand_vec_cmp (eq1, EQ, op1, op1);
2883 insn_code icode = code_for_pred (AND, mask_mode);
2884 rtx ops[] = {eq0, eq0, eq1};
2885 emit_vlmax_insn (icode, BINARY_MASK_OP, ops);
2887 else
2889 /* vmfeq.vv v0, vb, vb, v0.t */
2890 expand_vec_cmp (eq0, EQ, op1, op1, eq0, eq0);
2892 break;
2893 default:
2894 gcc_unreachable ();
2897 if (code == ORDERED)
2899 emit_move_insn (target, eq0);
2900 return false;
2903 /* There is native support for the inverse comparison. */
2904 code = reverse_condition_maybe_unordered (code);
2905 if (code == ORDERED)
2906 emit_move_insn (target, eq0);
2907 else
2908 expand_vec_cmp (eq0, code, op0, op1, eq0, eq0);
2910 if (can_invert_p)
2912 emit_move_insn (target, eq0);
2913 return true;
2916 /* We use one_cmpl<mode>2 so that the combine pass can combine mask
2917 instructions into vmand.mm/vmnor.mm/vmnand.mm/vmxnor.mm. */
2918 emit_insn (gen_rtx_SET (target, gen_rtx_NOT (mask_mode, eq0)));
2919 return false;
2922 /* Modulo all SEL indices to ensure they are all in the range [0, MAX_SEL].
2923 MAX_SEL is nunits - 1 if rtx_equal_p (op0, op1). Otherwise, it is
2924 2 * nunits - 1. */
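/* E.g. with nunits = 4 and op0 != op1, MAX_SEL = 7 and a selector of
{ 9, 2, 12, 5 } becomes { 1, 2, 4, 5 } after the AND below; the AND acts
as a modulo because the number of units is a power of two, so MAX_SEL is
an all-ones bit mask. */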
2925 static rtx
2926 modulo_sel_indices (rtx op0, rtx op1, rtx sel)
2928 rtx sel_mod;
2929 machine_mode sel_mode = GET_MODE (sel);
2930 poly_uint64 nunits = GET_MODE_NUNITS (sel_mode);
2931 poly_uint64 max_sel = rtx_equal_p (op0, op1) ? nunits - 1 : 2 * nunits - 1;
2932 /* If SEL is a variable-length CONST_VECTOR, we don't need to modulo it.
2933 Likewise, if SEL is constant-length with all indices within [0, MAX_SEL],
2934 there is no need to modulo the indices. */
2935 if (CONST_VECTOR_P (sel)
2936 && (!nunits.is_constant () || const_vec_all_in_range_p (sel, 0, max_sel)))
2937 sel_mod = sel;
2938 else
2940 rtx mod = gen_const_vector_dup (sel_mode, max_sel);
2941 sel_mod
2942 = expand_simple_binop (sel_mode, AND, sel, mod, NULL, 0, OPTAB_DIRECT);
2944 return sel_mod;
2947 /* Implement vec_perm<mode>. */
2949 void
2950 expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
2952 machine_mode data_mode = GET_MODE (target);
2953 machine_mode sel_mode = GET_MODE (sel);
2954 poly_uint64 nunits = GET_MODE_NUNITS (sel_mode);
2956 /* Check if the sel only references the first values vector. If each select
2957 index is in the range [0, nunits - 1], a single vrgather instruction is
2958 enough. Since we will use vrgatherei16.vv for variable-length vectors,
2959 it is never out of range and we don't need to modulo the index. */
2960 if (nunits.is_constant () && const_vec_all_in_range_p (sel, 0, nunits - 1))
2962 emit_vlmax_gather_insn (target, op0, sel);
2963 return;
2966 /* Check if all the indices are same. */
2967 rtx elt;
2968 if (const_vec_duplicate_p (sel, &elt))
2970 poly_uint64 value = rtx_to_poly_int64 (elt);
2971 rtx op = op0;
2972 if (maybe_gt (value, nunits - 1))
2974 sel = gen_const_vector_dup (sel_mode, value - nunits);
2975 op = op1;
2977 emit_vlmax_gather_insn (target, op, sel);
2980 /* Note: vec_perm indices are supposed to wrap when they go beyond the
2981 size of the two value vectors, i.e. the upper bits of the indices
2982 are effectively ignored. RVV vrgather instead produces 0 for any
2983 out-of-range indices, so we need to modulo all the vec_perm indices
2984 to ensure they are all in range of [0, nunits - 1] when op0 == op1
2985 or all in range of [0, 2 * nunits - 1] when op0 != op1. */
2986 rtx sel_mod = modulo_sel_indices (op0, op1, sel);
2988 /* Check if the two values vectors are the same. */
2989 if (rtx_equal_p (op0, op1))
2991 emit_vlmax_gather_insn (target, op0, sel_mod);
2992 return;
2995 /* The following sequence handles the case of
2996 __builtin_shufflevector (vec1, vec2, index...) where the index can be any
2997 value in the range [0, 2 * nunits - 1]. */
2998 machine_mode mask_mode;
2999 mask_mode = get_mask_mode (data_mode);
3000 rtx mask = gen_reg_rtx (mask_mode);
3001 rtx max_sel = gen_const_vector_dup (sel_mode, nunits);
3003 /* Step 1: generate a mask that should select everything >= nunits into the
3004 * mask. */
3005 expand_vec_cmp (mask, GEU, sel_mod, max_sel);
3007 /* Step 2: gather the op0 values indexed by sel into target;
3008 we don't need to care about the result of the elements
3009 whose index >= nunits. */
3010 emit_vlmax_gather_insn (target, op0, sel_mod);
3012 /* Step 3: shift the range from (nunits, max_of_mode] to
3013 [0, max_of_mode - nunits]. */
3014 rtx tmp = gen_reg_rtx (sel_mode);
3015 rtx ops[] = {tmp, sel_mod, max_sel};
3016 emit_vlmax_insn (code_for_pred (MINUS, sel_mode), BINARY_OP, ops);
3018 /* Step 4: gather those into the previously masked-out elements
3019 of target. */
3020 emit_vlmax_masked_gather_mu_insn (target, op1, tmp, mask);
3023 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST for RVV. */
3025 /* vec_perm support. */
3027 struct expand_vec_perm_d
3029 rtx target, op0, op1;
3030 vec_perm_indices perm;
3031 machine_mode vmode;
3032 machine_mode op_mode;
3033 bool one_vector_p;
3034 bool testing_p;
3037 /* Return the appropriate index mode for gather instructions. */
3038 opt_machine_mode
3039 get_gather_index_mode (struct expand_vec_perm_d *d)
3041 machine_mode sel_mode = related_int_vector_mode (d->vmode).require ();
3042 poly_uint64 nunits = GET_MODE_NUNITS (d->vmode);
3044 if (GET_MODE_INNER (d->vmode) == QImode)
3046 if (nunits.is_constant ())
3048 /* If the index is an LMUL8 CONST_VECTOR and any element value
3049 exceeds the range of 0 ~ 255, forbid such a permutation
3050 since we would need a vector HI mode to hold such indices and
3051 we don't have it. */
3052 if (!d->perm.all_in_range_p (0, 255)
3053 && !get_vector_mode (HImode, nunits).exists (&sel_mode))
3054 return opt_machine_mode ();
3056 else
3058 /* Permuting two SEW8 variable-length vectors needs vrgatherei16.vv.
3059 Otherwise, it could overflow the index range. */
3060 if (!get_vector_mode (HImode, nunits).exists (&sel_mode))
3061 return opt_machine_mode ();
3064 else if (riscv_get_v_regno_alignment (sel_mode) > 1
3065 && GET_MODE_INNER (sel_mode) != HImode)
3066 sel_mode = get_vector_mode (HImode, nunits).require ();
3067 return sel_mode;
3070 /* Recognize the patterns where we can use a merge operation to shuffle the
3071 vectors. The value of each element (index i) in the selector can only be
3072 either i or nunits + i. We will check that the pattern is actually monotonic.
3074 E.g.
3075 v = VEC_PERM_EXPR (v0, v1, selector),
3076 selector = { 0, nunits + 1, 2, nunits + 3, 4, nunits + 5, ... }
3078 We can transform such pattern into:
3080 v = vcond_mask (v0, v1, mask),
3081 mask = { 0, 1, 0, 1, 0, 1, ... }. */
3083 static bool
3084 shuffle_merge_patterns (struct expand_vec_perm_d *d)
3086 machine_mode vmode = d->vmode;
3087 machine_mode sel_mode = related_int_vector_mode (vmode).require ();
3088 int n_patterns = d->perm.encoding ().npatterns ();
3089 poly_int64 vec_len = d->perm.length ();
3091 for (int i = 0; i < n_patterns; ++i)
3092 if (!known_eq (d->perm[i], i) && !known_eq (d->perm[i], vec_len + i))
3093 return false;
3095 /* Check that the pattern is monotonic here; otherwise, return false. */
3096 for (int i = n_patterns; i < n_patterns * 2; i++)
3097 if (!d->perm.series_p (i, n_patterns, i, n_patterns)
3098 && !d->perm.series_p (i, n_patterns, vec_len + i, n_patterns))
3099 return false;
3101 /* We need to use a precomputed mask for such a situation, and such a mask
3102 can only be computed for modes whose size is known at compile time. */
3103 bool indices_fit_selector_p
3104 = GET_MODE_BITSIZE (GET_MODE_INNER (vmode)) > 8 || known_lt (vec_len, 256);
3105 if (!indices_fit_selector_p && !vec_len.is_constant ())
3106 return false;
3108 if (d->testing_p)
3109 return true;
3111 machine_mode mask_mode = get_mask_mode (vmode);
3112 rtx mask = gen_reg_rtx (mask_mode);
3114 if (indices_fit_selector_p && vec_len.is_constant ())
3116 /* For a constant vector length we can generate the needed mask at
3117 compile time and load it as mask at runtime.
3118 This saves a compare at runtime. */
3119 rtx_vector_builder sel (mask_mode, d->perm.encoding ().npatterns (),
3120 d->perm.encoding ().nelts_per_pattern ());
3121 unsigned int encoded_nelts = sel.encoded_nelts ();
3122 for (unsigned int i = 0; i < encoded_nelts; i++)
3123 sel.quick_push (gen_int_mode (d->perm[i].to_constant ()
3124 < vec_len.to_constant (),
3125 GET_MODE_INNER (mask_mode)));
3126 mask = sel.build ();
3128 else if (indices_fit_selector_p)
3130 /* For a dynamic vector length < 256 we keep the permutation
3131 indices in the literal pool, load it at runtime and create the
3132 mask by selecting either OP0 or OP1 by
3134 INDICES < NUNITS ? 1 : 0. */
3135 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
3136 rtx x = gen_int_mode (vec_len, GET_MODE_INNER (sel_mode));
3137 insn_code icode = code_for_pred_cmp_scalar (sel_mode);
3138 rtx cmp = gen_rtx_fmt_ee (LTU, mask_mode, sel, x);
3139 rtx ops[] = {mask, cmp, sel, x};
3140 emit_vlmax_insn (icode, COMPARE_OP, ops);
3142 else
3144 /* For EEW8, NUNITS may be larger than 255, so we can't use vmsltu
3145 directly to generate the selector mask; instead, we can only use a
3146 precomputed mask.
3148 E.g. for selector = <0, 257, 2, 259> on an EEW8 vector with NUNITS = 256,
3149 we don't have a QImode scalar register to hold values larger than 255.
3150 We also cannot hold that in a vector QImode register if LMUL = 8, and,
3151 since there is no larger HI mode vector we cannot create a larger
3152 selector.
3154 As the mask is a simple {0, 1, ...} pattern and the length is known we
3155 can store it in a scalar register and broadcast it to a mask register.
3157 gcc_assert (vec_len.is_constant ());
3158 int size = CEIL (GET_MODE_NUNITS (mask_mode).to_constant (), 8);
3159 machine_mode mode = get_vector_mode (QImode, size).require ();
3160 rtx tmp = gen_reg_rtx (mode);
3161 rvv_builder v (mode, 1, size);
3162 for (int i = 0; i < vec_len.to_constant () / 8; i++)
3164 uint8_t value = 0;
3165 for (int j = 0; j < 8; j++)
3167 int index = i * 8 + j;
3168 if (known_lt (d->perm[index], 256))
3169 value |= 1 << j;
3171 v.quick_push (gen_int_mode (value, QImode));
3173 emit_move_insn (tmp, v.build ());
3174 emit_move_insn (mask, gen_lowpart (mask_mode, tmp));
3177 /* TARGET = MASK ? OP0 : OP1. */
3178 /* swap op0 and op1 since the order is opposite to pred_merge. */
3179 rtx ops2[] = {d->target, d->op1, d->op0, mask};
3180 emit_vlmax_insn (code_for_pred_merge (vmode), MERGE_OP, ops2);
3181 return true;
3184 /* Recognize consecutive index patterns where we can use a single
3185 vrgather.v[x|i] to shuffle the vectors.
3187 e.g. short[8] = VEC_PERM_EXPR <a, a, {0,1,0,1,0,1,0,1}>
3188 Use SEW = 32 and index = 0 with vrgather.vi to get the result. */
3189 static bool
3190 shuffle_consecutive_patterns (struct expand_vec_perm_d *d)
3192 machine_mode vmode = d->vmode;
3193 scalar_mode smode = GET_MODE_INNER (vmode);
3194 poly_int64 vec_len = d->perm.length ();
3195 HOST_WIDE_INT elt;
3197 if (!vec_len.is_constant () || !d->perm[0].is_constant (&elt))
3198 return false;
3199 int vlen = vec_len.to_constant ();
3201 /* Compute the last element index of consecutive pattern from the leading
3202 consecutive elements. */
3203 int last_consecutive_idx = -1;
3204 int consecutive_num = -1;
3205 for (int i = 1; i < vlen; i++)
3207 if (maybe_ne (d->perm[i], d->perm[i - 1] + 1))
3208 break;
3209 last_consecutive_idx = i;
3210 consecutive_num = last_consecutive_idx + 1;
3213 int new_vlen = vlen / consecutive_num;
3214 if (last_consecutive_idx < 0 || consecutive_num == vlen
3215 || !pow2p_hwi (consecutive_num) || !pow2p_hwi (new_vlen))
3216 return false;
3217 /* VEC_PERM <..., (index, index + 1, ... index + consecutive_num - 1)>.
3218 All elements index, index + 1, ... index + consecutive_num - 1 should
3219 be located in the same vector. */
3220 if (maybe_ge (d->perm[0], vec_len)
3221 != maybe_ge (d->perm[last_consecutive_idx], vec_len))
3222 return false;
3223 /* If a vector has 8 elements, we allow optimizations on consecutive
3224 patterns, e.g. <0, 1, 2, 3, 0, 1, 2, 3> or <4, 5, 6, 7, 4, 5, 6, 7>.
3225 Other patterns like <2, 3, 4, 5, 2, 3, 4, 5> are not feasible patterns
3226 to optimize. */
3227 if (d->perm[0].to_constant () % consecutive_num != 0)
3228 return false;
3229 unsigned int container_bits = consecutive_num * GET_MODE_BITSIZE (smode);
3230 if (container_bits > 64)
3231 return false;
3232 else if (container_bits == 64)
3234 if (!TARGET_VECTOR_ELEN_64)
3235 return false;
3236 else if (FLOAT_MODE_P (smode) && !TARGET_VECTOR_ELEN_FP_64)
3237 return false;
3240 /* Check the rest of elements are the same consecutive pattern. */
3241 for (int i = consecutive_num; i < vlen; i++)
3242 if (maybe_ne (d->perm[i], d->perm[i % consecutive_num]))
3243 return false;
3245 if (FLOAT_MODE_P (smode))
3246 smode = float_mode_for_size (container_bits).require ();
3247 else
3248 smode = int_mode_for_size (container_bits, 0).require ();
3249 if (!get_vector_mode (smode, new_vlen).exists (&vmode))
3250 return false;
3251 machine_mode sel_mode = related_int_vector_mode (vmode).require ();
3253 /* Success! */
3254 if (d->testing_p)
3255 return true;
3257 int index = elt / consecutive_num;
3258 if (index >= new_vlen)
3259 index = index - new_vlen;
3260 rtx sel = gen_const_vector_dup (sel_mode, index);
3261 rtx op = elt >= vlen ? d->op0 : d->op1;
3262 emit_vlmax_gather_insn (gen_lowpart (vmode, d->target),
3263 gen_lowpart (vmode, op), sel);
3264 return true;
3267 /* Recognize the patterns where we can use a compress operation to shuffle
3268 the vectors. The perm selector of the compress pattern is divided into 2 parts:
3269 the first part consists of index numbers < NUNITS,
3270 the second part is the last N consecutive index numbers >= NUNITS.
3272 E.g.
3273 v = VEC_PERM_EXPR (v0, v1, selector),
3274 selector = { 0, 2, 6, 7 }
3276 We can transform such pattern into:
3278 op1 = vcompress (op0, mask)
3279 mask = { 1, 0, 1, 0 }
3280 v = op1. */
3282 static bool
3283 shuffle_compress_patterns (struct expand_vec_perm_d *d)
3285 machine_mode vmode = d->vmode;
3286 poly_int64 vec_len = d->perm.length ();
3288 if (!vec_len.is_constant ())
3289 return false;
3291 int vlen = vec_len.to_constant ();
3293 /* The compress pattern is not worthwhile when there are fewer than 4
3294 elements, and we can't modulo indices for the compress pattern. */
3295 if (known_ge (d->perm[vlen - 1], vlen * 2) || vlen < 4)
3296 return false;
3298 /* Compress pattern doesn't work for one vector. */
3299 if (d->one_vector_p)
3300 return false;
3302 /* The compress point is the point at which all selector values with index
3303 i >= compress point form a consecutive increasing series and each such
3304 selector value is >= NUNITS. In this case, we can compress all elements
3305 with i < compress point into op1. */
3306 int compress_point = -1;
3307 for (int i = 0; i < vlen; i++)
3309 if (compress_point < 0 && known_ge (d->perm[i], vec_len))
3311 compress_point = i;
3312 break;
3316 /* We don't apply compress approach if we can't find the compress point. */
3317 if (compress_point < 0)
3318 return false;
3320 /* We can only apply the compress approach when all index values from 0 to
3321 the compress point are increasing. */
3322 for (int i = 1; i < compress_point; i++)
3323 if (maybe_le (d->perm[i], d->perm[i - 1]))
3324 return false;
3326 /* It must be a consecutively increasing series from the compress point. */
3327 for (int i = 1 + compress_point; i < vlen; i++)
3328 if (maybe_ne (d->perm[i], d->perm[i - 1] + 1))
3329 return false;
3331 /* Success! */
3332 if (d->testing_p)
3333 return true;
3335 /* Check whether we need to slide up op1 to apply the compress approach.
3337 E.g. for index = { 0, 2, 6, 7 }, d->perm[vlen - 1] = 7, which
3338 is 2 * NUNITS - 1, so we don't need to slide up.
3340 For index = { 0, 2, 5, 6 }, we need to slide op1 up before
3341 we apply the compress approach. */
3342 bool need_slideup_p = maybe_ne (d->perm[vlen - 1], 2 * vec_len - 1)
3343 && !const_vec_duplicate_p (d->op1);
3345 /* If we leave it to be handled directly by the general gather,
3346 the code sequence will be:
3347 VECTOR LOAD selector
3348 GEU mask, selector, NUNITS
3349 GATHER dest, op0, selector
3350 SUB selector, selector, NUNITS
3351 GATHER dest, op1, selector, mask
3352 Each ALU operation is considered as COST = 1 and VECTOR LOAD is considered
3353 as COST = 4. So, we consider the general gather handling COST = 9.
3354 TODO: This cost is not accurate, we can adjust it by tune info. */
3355 int general_cost = 9;
3357 /* If we can use compress approach, the code sequence will be:
3358 MASK LOAD mask
3359 COMPRESS op1, op0, mask
3360 If it needs slide up, it will be:
3361 MASK LOAD mask
3362 SLIDEUP op1
3363 COMPRESS op1, op0, mask
3364 By default, mask load COST = 2.
3365 TODO: This cost is not accurate, we can adjust it by tune info. */
3366 int compress_cost = 4;
3368 if (general_cost <= compress_cost)
3369 return false;
3371 /* Build a mask that is true for each op0 element selected before the compress point. */
3372 machine_mode mask_mode = get_mask_mode (vmode);
3373 rvv_builder builder (mask_mode, vlen, 1);
3374 for (int i = 0; i < vlen; i++)
3376 bool is_compress_index = false;
3377 for (int j = 0; j < compress_point; j++)
3379 if (known_eq (d->perm[j], i))
3381 is_compress_index = true;
3382 break;
3385 if (is_compress_index)
3386 builder.quick_push (CONST1_RTX (BImode));
3387 else
3388 builder.quick_push (CONST0_RTX (BImode));
3390 rtx mask = force_reg (mask_mode, builder.build ());
3392 rtx merge = d->op1;
3393 if (need_slideup_p)
3395 int slideup_cnt = vlen - (d->perm[vlen - 1].to_constant () % vlen) - 1;
3396 merge = gen_reg_rtx (vmode);
3397 rtx ops[] = {merge, d->op1, gen_int_mode (slideup_cnt, Pmode)};
3398 insn_code icode = code_for_pred_slide (UNSPEC_VSLIDEUP, vmode);
3399 emit_vlmax_insn (icode, BINARY_OP, ops);
3402 insn_code icode = code_for_pred_compress (vmode);
3403 rtx ops[] = {d->target, merge, d->op0, mask};
3404 emit_nonvlmax_insn (icode, COMPRESS_OP_MERGE, ops,
3405 gen_int_mode (vlen, Pmode));
3406 return true;
3409 /* Recognize patterns like [4 5 6 7 12 13 14 15] where either the lower
3410 or the higher parts of both vectors are combined into one. */
3412 static bool
3413 shuffle_slide_patterns (struct expand_vec_perm_d *d)
3415 machine_mode vmode = d->vmode;
3416 poly_int64 vec_len = d->perm.length ();
3418 if (!vec_len.is_constant ())
3419 return false;
3421 int vlen = vec_len.to_constant ();
3422 if (vlen < 4)
3423 return false;
3425 if (d->one_vector_p)
3426 return false;
3428 /* For a slideup OP0 can stay, for a slidedown OP1 can.
3429 The former requires that the first element of the permutation
3430 is the first element of OP0, the latter that the last permutation
3431 element is the last element of OP1. */
3432 bool slideup = false;
3433 bool slidedown = false;
3435 /* For a slideup the permutation must start at OP0's first element. */
3436 if (known_eq (d->perm[0], 0))
3437 slideup = true;
3439 /* For a slidedown the permutation must end at OP1's last element. */
3440 if (known_eq (d->perm[vlen - 1], 2 * vlen - 1))
3441 slidedown = true;
3443 if (slideup && slidedown)
3444 return false;
3446 if (!slideup && !slidedown)
3447 return false;
3449 /* Check for a monotonic sequence with one pivot. */
3450 int pivot = -1;
3451 for (int i = 0; i < vlen; i++)
3453 if (pivot == -1 && known_ge (d->perm[i], vec_len))
3454 pivot = i;
3455 if (i > 0 && i != pivot
3456 && maybe_ne (d->perm[i], d->perm[i - 1] + 1))
3457 return false;
3460 if (pivot == -1)
3461 return false;
3463 /* For a slideup OP1's part (to be slid up) must be a low part,
3464 i.e. starting with its first element. */
3465 if (slideup && maybe_ne (d->perm[pivot], vlen))
3466 return false;
3468 /* For a slidedown OP0's part (to be slid down) must be a high part,
3469 i.e. ending with its last element. */
3470 if (slidedown && maybe_ne (d->perm[pivot - 1], vlen - 1))
3471 return false;
3473 /* Success! */
3474 if (d->testing_p)
3475 return true;
3477 /* PIVOT is the start of the lower/higher part of OP1 or OP2.
3478 For a slideup it indicates how many elements of OP1 to
3479 skip/slide over. For a slidedown it indicates how long
3480 OP1's high part is, while VLEN - PIVOT is the amount to slide. */
3481 int slide_cnt = slideup ? pivot : vlen - pivot;
3482 insn_code icode;
3483 if (slideup)
3485 /* No need for a vector length because we slide up until the
3486 end of OP1 anyway. */
3487 rtx ops[] = {d->target, d->op0, d->op1, gen_int_mode (slide_cnt, Pmode)};
3488 icode = code_for_pred_slide (UNSPEC_VSLIDEUP, vmode);
3489 emit_vlmax_insn (icode, SLIDEUP_OP_MERGE, ops);
3491 else
3493 /* Here we need a length because we slide to the beginning of OP1
3494 leaving the remaining elements undisturbed. */
3495 int len = pivot;
3496 rtx ops[] = {d->target, d->op1, d->op0,
3497 gen_int_mode (slide_cnt, Pmode)};
3498 icode = code_for_pred_slide (UNSPEC_VSLIDEDOWN, vmode);
3499 emit_nonvlmax_insn (icode, BINARY_OP_TUMA, ops,
3500 gen_int_mode (len, Pmode));
3503 return true;
3506 /* Recognize interleaving patterns like [0 4 1 5]. */
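/* E.g. for vlen = 4 and the "low" variant: slide op1 up by vlen / 2 = 2 so
that tmp = { op0[0], op0[1], op1[0], op1[1] }, then vrgather tmp with the
selector { 0, 2, 1, 3 } to produce { op0[0], op1[0], op0[1], op1[1] },
i.e. the [0 4 1 5] pattern. */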
3508 static bool
3509 shuffle_interleave_patterns (struct expand_vec_perm_d *d)
3511 machine_mode vmode = d->vmode;
3512 machine_mode sel_mode = related_int_vector_mode (vmode).require ();
3513 poly_int64 vec_len = d->perm.length ();
3514 int n_patterns = d->perm.encoding ().npatterns ();
3516 if (!vec_len.is_constant ())
3517 return false;
3519 if (n_patterns != 2)
3520 return false;
3522 unsigned vlen = vec_len.to_constant ();
3524 if (vlen < 4 || vlen > 64)
3525 return false;
3527 if (d->one_vector_p)
3528 return false;
3530 bool low = true;
3531 if (d->perm.series_p (0, 2, 0, 1)
3532 && d->perm.series_p (1, 2, vlen, 1))
3533 low = true;
3534 else if (d->perm.series_p (0, 2, vlen / 2, 1)
3535 && d->perm.series_p (1, 2, vlen + vlen / 2, 1))
3536 low = false;
3537 else
3538 return false;
3540 vec_perm_builder sel (vlen, 2, 1);
3541 sel.safe_grow (vlen);
3542 int cnt = 0;
3543 for (unsigned i = 0; i < vlen; i += 2)
3545 sel[i] = cnt;
3546 sel[i + 1] = cnt + vlen / 2;
3547 cnt++;
3550 vec_perm_indices indices (sel, 2, vlen);
3552 if (vlen != indices.length ().to_constant ())
3553 return false;
3555 /* Success! */
3556 if (d->testing_p)
3557 return true;
3559 int slide_cnt = vlen / 2;
3560 rtx tmp = gen_reg_rtx (vmode);
3562 if (low)
3564 /* No need for a vector length because we slide up until the
3565 end of OP1 anyway. */
3566 rtx ops[] = {tmp, d->op0, d->op1, gen_int_mode (slide_cnt, Pmode)};
3567 insn_code icode = code_for_pred_slide (UNSPEC_VSLIDEUP, vmode);
3568 emit_vlmax_insn (icode, SLIDEUP_OP_MERGE, ops);
3570 else
3572 rtx ops[] = {tmp, d->op1, d->op0, gen_int_mode (slide_cnt, Pmode)};
3573 insn_code icode = code_for_pred_slide (UNSPEC_VSLIDEDOWN, vmode);
3574 emit_nonvlmax_insn (icode, BINARY_OP_TUMA, ops,
3575 gen_int_mode (slide_cnt, Pmode));
3578 rtx sel_rtx = vec_perm_indices_to_rtx (sel_mode, indices);
3579 emit_vlmax_gather_insn (gen_lowpart (vmode, d->target), tmp, sel_rtx);
3581 return true;
3585 /* Recognize even/odd patterns like [0 2 4 6]. We use two compress
3586 and one slideup. */
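/* E.g. for vlen = 4 and the even case, the mask is { 1, 0, 1, 0 }:
compressing op0 gives { op0[0], op0[2], ... }, compressing op1 gives
{ op1[0], op1[2], ... }, and sliding the latter up by vlen / 2 = 2 yields
{ op0[0], op0[2], op1[0], op1[2] }, i.e. the [0 2 4 6] pattern. */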
3588 static bool
3589 shuffle_even_odd_patterns (struct expand_vec_perm_d *d)
3591 machine_mode vmode = d->vmode;
3592 poly_int64 vec_len = d->perm.length ();
3593 int n_patterns = d->perm.encoding ().npatterns ();
3595 if (n_patterns != 1)
3596 return false;
3598 if (!vec_len.is_constant ())
3599 return false;
3601 int vlen = vec_len.to_constant ();
3602 if (vlen < 4 || vlen > 64)
3603 return false;
3605 if (d->one_vector_p)
3606 return false;
3608 bool even = true;
3609 if (!d->perm.series_p (0, 1, 0, 2))
3611 even = false;
3612 if (!d->perm.series_p (0, 1, 1, 2))
3613 return false;
3616 /* Success! */
3617 if (d->testing_p)
3618 return true;
3620 machine_mode mask_mode = get_mask_mode (vmode);
3621 rvv_builder builder (mask_mode, vlen, 1);
3622 int bit = even ? 0 : 1;
3623 for (int i = 0; i < vlen; i++)
3625 bit ^= 1;
3626 if (bit)
3627 builder.quick_push (CONST1_RTX (BImode));
3628 else
3629 builder.quick_push (CONST0_RTX (BImode));
3631 rtx mask = force_reg (mask_mode, builder.build ());
3633 insn_code icode = code_for_pred_compress (vmode);
3634 rtx ops1[] = {d->target, d->op0, mask};
3635 emit_vlmax_insn (icode, COMPRESS_OP, ops1);
3637 rtx tmp2 = gen_reg_rtx (vmode);
3638 rtx ops2[] = {tmp2, d->op1, mask};
3639 emit_vlmax_insn (icode, COMPRESS_OP, ops2);
3641 rtx ops[] = {d->target, d->target, tmp2, gen_int_mode (vlen / 2, Pmode)};
3642 icode = code_for_pred_slide (UNSPEC_VSLIDEUP, vmode);
3643 emit_vlmax_insn (icode, SLIDEUP_OP_MERGE, ops);
3645 return true;
3648 /* Recognize decompress patterns:
3650 1. VEC_PERM_EXPR op0 and op1
3651 with isel = { 0, nunits, 1, nunits + 1, ... }.
3652 Decompress op0 and op1 vector with the mask = { 0, 1, 0, 1, ... }.
3654 2. VEC_PERM_EXPR op0 and op1
3655 with isel = { 1/2 nunits, 3/2 nunits, 1/2 nunits+1, 3/2 nunits+1,... }.
3656 Slide down op0 and op1 with OFFSET = 1/2 nunits.
3657 Decompress op0 and op1 vector with the mask = { 0, 1, 0, 1, ... }.
3659 static bool
3660 shuffle_decompress_patterns (struct expand_vec_perm_d *d)
3662 poly_uint64 nelt = d->perm.length ();
3663 machine_mode mask_mode = get_mask_mode (d->vmode);
3665 /* For constant size indices, we don't need to handle it here.
3666 Just leave it to vec_perm<mode>. */
3667 if (d->perm.length ().is_constant ())
3668 return false;
3670 poly_uint64 first = d->perm[0];
3671 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
3672 || !d->perm.series_p (0, 2, first, 1)
3673 || !d->perm.series_p (1, 2, first + nelt, 1))
3674 return false;
3676 /* Permuting two SEW8 variable-length vectors needs vrgatherei16.vv.
3677 Otherwise, it could overflow the index range. */
3678 machine_mode sel_mode = related_int_vector_mode (d->vmode).require ();
3679 if (GET_MODE_INNER (d->vmode) == QImode
3680 && !get_vector_mode (HImode, nelt).exists (&sel_mode))
3681 return false;
3683 /* Success! */
3684 if (d->testing_p)
3685 return true;
3687 rtx op0, op1;
3688 if (known_eq (first, 0U))
3690 op0 = d->op0;
3691 op1 = d->op1;
3693 else
3695 op0 = gen_reg_rtx (d->vmode);
3696 op1 = gen_reg_rtx (d->vmode);
3697 insn_code icode = code_for_pred_slide (UNSPEC_VSLIDEDOWN, d->vmode);
3698 rtx ops0[] = {op0, d->op0, gen_int_mode (first, Pmode)};
3699 rtx ops1[] = {op1, d->op1, gen_int_mode (first, Pmode)};
3700 emit_vlmax_insn (icode, BINARY_OP, ops0);
3701 emit_vlmax_insn (icode, BINARY_OP, ops1);
4703   /* Generate the { 0, 1, 0, 1, ... } mask.  */
3704 rtx vid = gen_reg_rtx (sel_mode);
3705 rtx vid_repeat = gen_reg_rtx (sel_mode);
3706 expand_vec_series (vid, const0_rtx, const1_rtx);
3707 rtx and_ops[] = {vid_repeat, vid, const1_rtx};
3708 emit_vlmax_insn (code_for_pred_scalar (AND, sel_mode), BINARY_OP, and_ops);
3709 rtx const_vec = gen_const_vector_dup (sel_mode, 1);
3710 rtx mask = gen_reg_rtx (mask_mode);
3711 expand_vec_cmp (mask, EQ, vid_repeat, const_vec);
3712 emit_vlmax_decompress_insn (d->target, op0, op1, mask);
3713 return true;
3716 static bool
3717 shuffle_bswap_pattern (struct expand_vec_perm_d *d)
3719 HOST_WIDE_INT diff;
3720 unsigned i, size, step;
3722 if (!d->one_vector_p || !d->perm[0].is_constant (&diff) || !diff)
3723 return false;
3725 step = diff + 1;
3726 size = step * GET_MODE_UNIT_BITSIZE (d->vmode);
3728 switch (size)
3730 case 16:
3731 break;
3732 case 32:
3733 case 64:
3734       /* We will have VEC_PERM_EXPR after rtl expand when invoking
3735	  __builtin_bswap.  It will generate about 9 instructions in the
3736	  loop as below, no matter whether it is bswap16, bswap32 or bswap64.
3737 .L2:
3738 1 vle16.v v4,0(a0)
3739 2 vmv.v.x v2,a7
3740 3 vand.vv v2,v6,v2
3741 4 slli a2,a5,1
3742 5 vrgatherei16.vv v1,v4,v2
3743 6 sub a4,a4,a5
3744 7 vse16.v v1,0(a3)
3745 8 add a0,a0,a2
3746 9 add a3,a3,a2
3747 bne a4,zero,.L2
3749	  But for bswap16 we may have an even simpler code gen, which
3750	  has only 7 instructions in the loop as below.
3752 1 vle8.v v2,0(a5)
3753 2 addi a5,a5,32
3754 3 vsrl.vi v4,v2,8
3755 4 vsll.vi v2,v2,8
3756 5 vor.vv v4,v4,v2
3757 6 vse8.v v4,0(a4)
3758 7 addi a4,a4,32
3759 bne a5,a6,.L5
3761	  Unfortunately, the instructions in the loop would grow to 13 and 24
3762	  for bswap32 and bswap64.  Thus, we leverage vrgather (9 insns)
3763	  for both bswap64 and bswap32, but use shift and or (7 insns)
3764	  for bswap16.
3766 default:
3767 return false;
3770 for (i = 0; i < step; i++)
3771 if (!d->perm.series_p (i, step, diff - i, step))
3772 return false;
3774   /* Disable when nunits < 4 since the generic approach tried later
3775      is more profitable for BSWAP.  */
3776 if (!known_gt (GET_MODE_NUNITS (d->vmode), 2))
3777 return false;
3779 if (d->testing_p)
3780 return true;
3782 machine_mode vhi_mode;
3783 poly_uint64 vhi_nunits = exact_div (GET_MODE_NUNITS (d->vmode), 2);
3785 if (!get_vector_mode (HImode, vhi_nunits).exists (&vhi_mode))
3786 return false;
3788 /* Step-1: Move op0 to src with VHI mode. */
3789 rtx src = gen_reg_rtx (vhi_mode);
3790 emit_move_insn (src, gen_lowpart (vhi_mode, d->op0));
3792 /* Step-2: Shift right 8 bits to dest. */
3793 rtx dest = expand_binop (vhi_mode, lshr_optab, src, gen_int_mode (8, Pmode),
3794 NULL_RTX, 0, OPTAB_DIRECT);
3796 /* Step-3: Shift left 8 bits to src. */
3797 src = expand_binop (vhi_mode, ashl_optab, src, gen_int_mode (8, Pmode),
3798 NULL_RTX, 0, OPTAB_DIRECT);
3800 /* Step-4: Logic Or dest and src to dest. */
3801 dest = expand_binop (vhi_mode, ior_optab, dest, src,
3802 NULL_RTX, 0, OPTAB_DIRECT);
3804 /* Step-5: Move src to target with VQI mode. */
3805 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
3807 return true;
3810 /* Recognize patterns like [3 4 5 6] where we combine the last element
3811 of the first vector and the first n - 1 elements of the second vector.
3812 This can be implemented by slides or by extracting and re-inserting
3813 (slide1up) the first vector's last element. */
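   /* For example (illustrative): with nunits = 4 and selector { 3, 4, 5, 6 },
      the slide variant first slides op1 up by one into a temporary and then,
      with VL = 1 and tail undisturbed, slides op0 down by nunits - 1 so that
      element 0 of the result becomes op0[3] while elements 1..3 keep
      op1[0..2].  */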
3815 static bool
3816 shuffle_off_by_one_patterns (struct expand_vec_perm_d *d)
3818 poly_int64 nunits = GET_MODE_NUNITS (d->vmode);
3820 /* Recognize { nunits - 1, nunits, nunits + 1, ... }. */
3821 if (!d->perm.series_p (0, 2, nunits - 1, 2)
3822 || !d->perm.series_p (1, 2, nunits, 2))
3823 return false;
3825   /* Disable when nunits < 4 since the generic approach tried later
3826      is more profitable for indices = { nunits - 1, nunits }.  */
3827 if (!known_gt (nunits, 2))
3828 return false;
3830 /* Success! */
3831 if (d->testing_p)
3832 return true;
3834 int scalar_cost = riscv_register_move_cost (d->vmode, V_REGS, GR_REGS)
3835 + riscv_register_move_cost (d->vmode, GR_REGS, V_REGS) + 2;
3836 int slide_cost = 2;
3838 if (slide_cost < scalar_cost)
3840 /* This variant should always be preferable because we just need two
3841 slides. The extract-variant also requires two slides but additionally
3842 pays the latency for register-file crossing. */
3843 rtx tmp = gen_reg_rtx (d->vmode);
3844 rtx ops[] = {tmp, d->op1, gen_int_mode (1, Pmode)};
3845 insn_code icode = code_for_pred_slide (UNSPEC_VSLIDEUP, d->vmode);
3846 emit_vlmax_insn (icode, BINARY_OP, ops);
3848 rtx ops2[] = {d->target, tmp, d->op0, gen_int_mode (nunits - 1, Pmode)};
3849 icode = code_for_pred_slide (UNSPEC_VSLIDEDOWN, d->vmode);
3850 emit_nonvlmax_insn (icode, BINARY_OP_TUMA, ops2, gen_int_mode (1, Pmode));
3852 else
3854 /* Extract the last element of the first vector. */
3855 scalar_mode smode = GET_MODE_INNER (d->vmode);
3856 rtx tmp = gen_reg_rtx (smode);
3857 emit_vec_extract (tmp, d->op0, gen_int_mode (nunits - 1, Pmode));
3859 /* Insert the scalar into element 0. */
3860 unsigned int unspec
3861 = FLOAT_MODE_P (d->vmode) ? UNSPEC_VFSLIDE1UP : UNSPEC_VSLIDE1UP;
3862 insn_code icode = code_for_pred_slide (unspec, d->vmode);
3863 rtx ops[] = {d->target, d->op1, tmp};
3864 emit_vlmax_insn (icode, BINARY_OP, ops);
3867 return true;
3870 /* This looks for a series pattern in the provided vector permute structure D.
3871 If successful it emits a series insn as well as a gather to implement it.
3872 Return true if successful, false otherwise. */
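   /* For example (illustrative): { 1, 3, 5, 7, ... } is a full series with
      base 1 and step 2, so one vec_series plus one gather suffices.  For
      { 0, 5, 7, 9, ... } only the part starting at element 1 is a series
      (base 5, step 2); we build that series and vslide1up the leading 0
      into it before the gather.  */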
3874 static bool
3875 shuffle_series_patterns (struct expand_vec_perm_d *d)
3877 if (!d->one_vector_p || d->perm.encoding ().npatterns () != 1)
3878 return false;
3880 poly_int64 el1 = d->perm[0];
3881 poly_int64 el2 = d->perm[1];
3882 poly_int64 el3 = d->perm[2];
3884 poly_int64 step1 = el2 - el1;
3885 poly_int64 step2 = el3 - el2;
3887 bool need_insert = false;
3888 bool have_series = false;
3890 /* Check for a full series. */
3891 if (known_ne (step1, 0) && d->perm.series_p (0, 1, el1, step1))
3892 have_series = true;
3894 /* Check for a series starting at the second element. */
3895 else if (known_ne (step2, 0) && d->perm.series_p (1, 1, el2, step2))
3897 have_series = true;
3898 need_insert = true;
3901 if (!have_series)
3902 return false;
3904 /* Disable shuffle if we can't find an appropriate integer index mode for
3905 gather. */
3906 machine_mode sel_mode;
3907 if (!get_gather_index_mode (d).exists (&sel_mode))
3908 return false;
3910 /* Success! */
3911 if (d->testing_p)
3912 return true;
3914 /* Create the series. */
3915 machine_mode eltmode = Pmode;
3916 rtx series = gen_reg_rtx (sel_mode);
3917 expand_vec_series (series, gen_int_mode (need_insert ? el2 : el1, eltmode),
3918 gen_int_mode (need_insert ? step2 : step1, eltmode));
3920 /* Insert the remaining element if necessary. */
3921 if (need_insert)
3923 insn_code icode = code_for_pred_slide (UNSPEC_VSLIDE1UP, sel_mode);
3924 rtx ops[]
3925 = {series, series, gen_int_mode (el1, GET_MODE_INNER (sel_mode))};
3926 emit_vlmax_insn (icode, BINARY_OP, ops);
3929 emit_vlmax_gather_insn (d->target, d->op0, series);
3931 return true;
3934 /* Recognize the pattern that can be shuffled by generic approach. */
3936 static bool
3937 shuffle_generic_patterns (struct expand_vec_perm_d *d)
3939 machine_mode sel_mode;
3941 /* We don't enable SLP for non-power of 2 NPATTERNS. */
3942 if (!pow2p_hwi (d->perm.encoding().npatterns ()))
3943 return false;
3945 /* Disable shuffle if we can't find an appropriate integer index mode for
3946 gather. */
3947 if (!get_gather_index_mode (d).exists (&sel_mode))
3948 return false;
3950 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
3951 poly_uint64 nunits = GET_MODE_NUNITS (sel_mode);
3952 rtx elt;
3954 bool is_simple = d->one_vector_p
3955 || const_vec_duplicate_p (sel, &elt)
3956 || (nunits.is_constant ()
3957 && const_vec_all_in_range_p (sel, 0, nunits - 1));
3959 if (!is_simple && !riscv_two_source_permutes)
3960 return false;
3962 /* Success! */
3963 if (d->testing_p)
3964 return true;
3966   /* Some FIXED-VLMAX/VLS vector permutation situations call the target hook
3967      instead of expanding vec_perm<mode>, so we handle them directly.  */
3968 expand_vec_perm (d->target, d->op0, d->op1, sel);
3969 return true;
3972 /* This function recognizes and supports different permutation patterns
3973    and enables VLA SLP auto-vectorization.  */
3974 static bool
3975 expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
3977 gcc_assert (d->op_mode != E_VOIDmode);
3979 /* The pattern matching functions above are written to look for a small
3980 number to begin the sequence (0, 1, N/2). If we begin with an index
3981 from the second operand, we can swap the operands. */
3982 poly_int64 nelt = d->perm.length ();
3983 if (known_ge (d->perm[0], nelt))
3985 d->perm.rotate_inputs (1);
3986 std::swap (d->op0, d->op1);
3989 if (known_gt (nelt, 1))
3991 if (d->vmode == d->op_mode)
3993 if (shuffle_merge_patterns (d))
3994 return true;
3995 if (shuffle_consecutive_patterns (d))
3996 return true;
3997 if (shuffle_slide_patterns (d))
3998 return true;
3999 if (shuffle_interleave_patterns (d))
4000 return true;
4001 if (shuffle_even_odd_patterns (d))
4002 return true;
4003 if (shuffle_compress_patterns (d))
4004 return true;
4005 if (shuffle_decompress_patterns (d))
4006 return true;
4007 if (shuffle_bswap_pattern (d))
4008 return true;
4009 if (shuffle_off_by_one_patterns (d))
4010 return true;
4011 if (shuffle_series_patterns (d))
4012 return true;
4013 if (shuffle_generic_patterns (d))
4014 return true;
4015 return false;
4017 else
4018 return false;
4020 return false;
4023 /* This function implements TARGET_VECTORIZE_VEC_PERM_CONST by using RVV
4024 * instructions. */
4025 bool
4026 expand_vec_perm_const (machine_mode vmode, machine_mode op_mode, rtx target,
4027 rtx op0, rtx op1, const vec_perm_indices &sel)
4029   /* RVV doesn't have mask-type pack/unpack instructions and we don't use
4030      a mask to do the iteration loop control.  Just disable it directly.  */
4031 if (GET_MODE_CLASS (vmode) == MODE_VECTOR_BOOL)
4032 return false;
4034 struct expand_vec_perm_d d;
4036 /* Check whether the mask can be applied to a single vector. */
4037 if (sel.ninputs () == 1 || (op0 && rtx_equal_p (op0, op1)))
4038 d.one_vector_p = true;
4039 else if (sel.all_from_input_p (0))
4041 d.one_vector_p = true;
4042 op1 = op0;
4044 else if (sel.all_from_input_p (1))
4046 d.one_vector_p = true;
4047 op0 = op1;
4049 else
4050 d.one_vector_p = false;
4052 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
4053 sel.nelts_per_input ());
4054 d.vmode = vmode;
4055 d.op_mode = op_mode;
4056 d.target = target;
4057 d.op0 = op0;
4058 if (op0 == op1)
4059 d.op1 = d.op0;
4060 else
4061 d.op1 = op1;
4062 d.testing_p = !target;
4064 if (!d.testing_p)
4065 return expand_vec_perm_const_1 (&d);
4067 rtx_insn *last = get_last_insn ();
4068 bool ret = expand_vec_perm_const_1 (&d);
4069 gcc_assert (last == get_last_insn ());
4071 return ret;
4074 /* Generate no side effects vsetvl to get the vector length. */
4075 void
4076 expand_select_vl (rtx *ops)
4078 poly_int64 nunits = rtx_to_poly_int64 (ops[2]);
4079 if (CONST_INT_P (ops[1]) && known_le (INTVAL (ops[1]), nunits))
4081       /* If the length is known to be <= VF, we just use the length directly
4082	  instead of using vsetvli.
4084	  E.g. _255 = .SELECT_VL (3, POLY_INT_CST [4, 4]);
4085	  We move 3 into _255 instead of using an explicit vsetvl.  */
4086 emit_move_insn (ops[0], ops[1]);
4087 return;
4089   /* We arbitrarily pick QImode as the inner scalar mode to get a vector mode,
4090      since vsetvl only demands the ratio.  We let the VSETVL pass optimize it.  */
4091 scalar_int_mode mode = QImode;
4092 machine_mode rvv_mode = get_vector_mode (mode, nunits).require ();
4093 emit_insn (gen_no_side_effects_vsetvl_rtx (rvv_mode, ops[0], ops[1]));
4096 /* Return RVV_VUNDEF if the ELSE value is scratch rtx. */
4097 static rtx
4098 get_else_operand (rtx op)
4100 return GET_CODE (op) == SCRATCH ? RVV_VUNDEF (GET_MODE (op)) : op;
4103 /* Expand MASK_LEN_{LOAD,STORE}. */
4104 void
4105 expand_load_store (rtx *ops, bool is_load)
4107 int idx = 2;
4108 rtx mask = ops[idx++];
4109 /* A masked load has a merge/else operand. */
4110 if (is_load)
4111 get_else_operand (ops[idx++]);
4112 rtx len = ops[idx];
4113 machine_mode mode = GET_MODE (ops[0]);
4115 if (is_vlmax_len_p (mode, len))
4117 /* If the length operand is equal to VF, it is VLMAX load/store. */
4118 if (is_load)
4120 rtx m_ops[] = {ops[0], mask, ops[1]};
4121 emit_vlmax_insn (code_for_pred_mov (mode), UNARY_OP_TAMA, m_ops);
4123 else
4125 len = gen_reg_rtx (Pmode);
4126 emit_vlmax_vsetvl (mode, len);
4127 emit_insn (gen_pred_store (mode, ops[0], mask, ops[1], len,
4128 get_avl_type_rtx (VLMAX)));
4131 else
4133 if (!satisfies_constraint_K (len))
4134 len = force_reg (Pmode, len);
4135 if (is_load)
4137 rtx m_ops[] = {ops[0], mask, ops[1]};
4138 emit_nonvlmax_insn (code_for_pred_mov (mode), UNARY_OP_TAMA, m_ops,
4139 len);
4141 else
4142 emit_insn (gen_pred_store (mode, ops[0], mask, ops[1], len,
4143 get_avl_type_rtx (NONVLMAX)));
4147 /* Expand MASK_LEN_STRIDED_LOAD. */
4148 void
4149 expand_strided_load (machine_mode mode, rtx *ops)
4151 rtx v_reg = ops[0];
4152 rtx base = ops[1];
4153 rtx stride = ops[2];
4154 rtx mask = ops[3];
4155 int idx = 4;
4156 get_else_operand (ops[idx++]);
4157 rtx len = ops[idx];
4158 poly_int64 len_val;
4160 insn_code icode = code_for_pred_strided_load (mode);
4161 rtx emit_ops[] = {v_reg, mask, gen_rtx_MEM (mode, base), stride};
4163 if (poly_int_rtx_p (len, &len_val)
4164 && known_eq (len_val, GET_MODE_NUNITS (mode)))
4165 emit_vlmax_insn (icode, BINARY_OP_TAMA, emit_ops);
4166 else
4168 len = satisfies_constraint_K (len) ? len : force_reg (Pmode, len);
4169 emit_nonvlmax_insn (icode, BINARY_OP_TAMA, emit_ops, len);
4173 /* Expand MASK_LEN_STRIDED_STORE. */
4174 void
4175 expand_strided_store (machine_mode mode, rtx *ops)
4177 rtx v_reg = ops[2];
4178 rtx base = ops[0];
4179 rtx stride = ops[1];
4180 rtx mask = ops[3];
4181 rtx len = ops[4];
4182 poly_int64 len_val;
4183 rtx vl_type;
4185 if (poly_int_rtx_p (len, &len_val)
4186 && known_eq (len_val, GET_MODE_NUNITS (mode)))
4188 len = gen_reg_rtx (Pmode);
4189 emit_vlmax_vsetvl (mode, len);
4190 vl_type = get_avl_type_rtx (VLMAX);
4192 else
4194 len = satisfies_constraint_K (len) ? len : force_reg (Pmode, len);
4195 vl_type = get_avl_type_rtx (NONVLMAX);
4198 emit_insn (gen_pred_strided_store (mode, gen_rtx_MEM (mode, base),
4199 mask, stride, v_reg, len, vl_type));
4202 /* Return true if the operation is a floating-point operation that needs FRM.  */
4203 static bool
4204 needs_fp_rounding (unsigned icode, machine_mode mode)
4206 if (!FLOAT_MODE_P (mode))
4207 return false;
4209 return icode != maybe_code_for_pred (SMIN, mode)
4210 && icode != maybe_code_for_pred (UNSPEC_VFMIN, mode)
4211 && icode != maybe_code_for_pred (SMAX, mode)
4212 && icode != maybe_code_for_pred (UNSPEC_VFMAX, mode)
4213 && icode != maybe_code_for_pred (NEG, mode)
4214 && icode != maybe_code_for_pred (ABS, mode)
4215 /* narrower-FP -> FP */
4216 && icode != maybe_code_for_pred_extend (mode)
4217 /* narrower-INT -> FP */
4218 && icode != maybe_code_for_pred_widen (FLOAT, mode)
4219 && icode != maybe_code_for_pred_widen (UNSIGNED_FLOAT, mode)
4220 /* vfsgnj */
4221 && icode != maybe_code_for_pred (UNSPEC_VCOPYSIGN, mode)
4222 && icode != maybe_code_for_pred_mov (mode);
4225 /* Subroutine to expand COND_LEN_* patterns. */
4226 static void
4227 expand_cond_len_op (unsigned icode, insn_flags op_type, rtx *ops, rtx len)
4229 rtx dest = ops[0];
4230 rtx mask = ops[1];
4231 machine_mode mode = GET_MODE (dest);
4232 machine_mode mask_mode = GET_MODE (mask);
4233 bool is_dummy_mask = rtx_equal_p (mask, CONSTM1_RTX (mask_mode));
4234 bool is_vlmax_len = is_vlmax_len_p (mode, len);
4236 unsigned insn_flags = HAS_DEST_P | HAS_MASK_P | HAS_MERGE_P | op_type;
4237 /* FIXME: We don't support simplification of COND_LEN_NEG (..., dummy len,
4238 dummy mask) into NEG_EXPR in GIMPLE FOLD yet. So, we do such
4239 simplification in RISC-V backend and may do that in middle-end in the
4240 future. */
4241 if (is_dummy_mask && is_vlmax_len)
4242 insn_flags |= TDEFAULT_POLICY_P | MDEFAULT_POLICY_P;
4243 else if (is_dummy_mask)
4244 insn_flags |= TU_POLICY_P | MDEFAULT_POLICY_P;
4245 else if (is_vlmax_len)
4246 insn_flags |= TDEFAULT_POLICY_P | MU_POLICY_P;
4247 else
4248 insn_flags |= TU_POLICY_P | MU_POLICY_P;
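  /* In other words: a real (non all-ones) mask requires the mask-undisturbed
     policy and a partial length requires the tail-undisturbed policy, so that
     inactive and tail elements keep the merge operand's values; otherwise the
     default policies are used.  */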
4250 if (needs_fp_rounding (icode, mode))
4251 insn_flags |= FRM_DYN_P;
4253 if (is_vlmax_len)
4254 emit_vlmax_insn (icode, insn_flags, ops);
4255 else
4256 emit_nonvlmax_insn (icode, insn_flags, ops, len);
4259 /* Expand unary ops COND_LEN_*. */
4260 void
4261 expand_cond_len_unop (unsigned icode, rtx *ops)
4263 rtx dest = ops[0];
4264 rtx mask = ops[1];
4265 rtx src = ops[2];
4266 rtx merge = get_else_operand (ops[3]);
4267 rtx len = ops[4];
4269 rtx cond_ops[] = {dest, mask, merge, src};
4270 expand_cond_len_op (icode, UNARY_OP_P, cond_ops, len);
4273 /* Expand unary ops COND_*. */
4274 void
4275 expand_cond_unop (unsigned icode, rtx *ops)
4277 rtx dest = ops[0];
4278 rtx mask = ops[1];
4279 rtx src = ops[2];
4280 rtx merge = get_else_operand (ops[3]);
4281 rtx len = gen_int_mode (GET_MODE_NUNITS (GET_MODE (dest)), Pmode);
4283 rtx cond_ops[] = {dest, mask, merge, src};
4284 expand_cond_len_op (icode, UNARY_OP_P, cond_ops, len);
4287 /* Expand binary ops COND_LEN_*. */
4288 void
4289 expand_cond_len_binop (unsigned icode, rtx *ops)
4291 rtx dest = ops[0];
4292 rtx mask = ops[1];
4293 rtx src1 = ops[2];
4294 rtx src2 = ops[3];
4295 rtx merge = get_else_operand (ops[4]);
4296 rtx len = ops[5];
4298 rtx cond_ops[] = {dest, mask, merge, src1, src2};
4299 expand_cond_len_op (icode, BINARY_OP_P, cond_ops, len);
4302 /* Expand binary ops COND_*. */
4303 void
4304 expand_cond_binop (unsigned icode, rtx *ops)
4306 rtx dest = ops[0];
4307 rtx mask = ops[1];
4308 rtx src1 = ops[2];
4309 rtx src2 = ops[3];
4310 rtx merge = get_else_operand (ops[4]);
4311 rtx len = gen_int_mode (GET_MODE_NUNITS (GET_MODE (dest)), Pmode);
4313 rtx cond_ops[] = {dest, mask, merge, src1, src2};
4314 expand_cond_len_op (icode, BINARY_OP_P, cond_ops, len);
4317 /* Prepare insn_code for gather_load/scatter_store according to
4318 the vector mode and index mode. */
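/* For example (illustrative): an indexed load of SImode elements with HImode
   offsets has dst EEW 32 = 2 * src EEW 16 and therefore selects the x2
   "greater EEW" indexed-load pattern; equal EEWs select the same-EEW pattern
   and wider offsets select a "smaller EEW" one.  */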
4319 static insn_code
4320 prepare_gather_scatter (machine_mode vec_mode, machine_mode idx_mode,
4321 bool is_load)
4323 if (!is_load)
4324 return code_for_pred_indexed_store (UNSPEC_UNORDERED, vec_mode, idx_mode);
4325 else
4327 unsigned src_eew_bitsize = GET_MODE_BITSIZE (GET_MODE_INNER (idx_mode));
4328 unsigned dst_eew_bitsize = GET_MODE_BITSIZE (GET_MODE_INNER (vec_mode));
4329 if (dst_eew_bitsize == src_eew_bitsize)
4330 return code_for_pred_indexed_load_same_eew (UNSPEC_UNORDERED, vec_mode);
4331 else if (dst_eew_bitsize > src_eew_bitsize)
4333 unsigned factor = dst_eew_bitsize / src_eew_bitsize;
4334 switch (factor)
4336 case 2:
4337 return code_for_pred_indexed_load_x2_greater_eew (
4338 UNSPEC_UNORDERED, vec_mode);
4339 case 4:
4340 return code_for_pred_indexed_load_x4_greater_eew (
4341 UNSPEC_UNORDERED, vec_mode);
4342 case 8:
4343 return code_for_pred_indexed_load_x8_greater_eew (
4344 UNSPEC_UNORDERED, vec_mode);
4345 default:
4346 gcc_unreachable ();
4349 else
4351 unsigned factor = src_eew_bitsize / dst_eew_bitsize;
4352 switch (factor)
4354 case 2:
4355 return code_for_pred_indexed_load_x2_smaller_eew (
4356 UNSPEC_UNORDERED, vec_mode);
4357 case 4:
4358 return code_for_pred_indexed_load_x4_smaller_eew (
4359 UNSPEC_UNORDERED, vec_mode);
4360 case 8:
4361 return code_for_pred_indexed_load_x8_smaller_eew (
4362 UNSPEC_UNORDERED, vec_mode);
4363 default:
4364 gcc_unreachable ();
4370 /* Expand LEN_MASK_{GATHER_LOAD,SCATTER_STORE}. */
4371 void
4372 expand_gather_scatter (rtx *ops, bool is_load)
4374 rtx ptr, vec_offset, vec_reg;
4375 bool zero_extend_p;
4376 int shift;
4377 rtx mask = ops[5];
4378 rtx len = ops[6];
4379 if (is_load)
4380 len = ops[7];
4381 if (is_load)
4383 vec_reg = ops[0];
4384 ptr = ops[1];
4385 vec_offset = ops[2];
4386 zero_extend_p = INTVAL (ops[3]);
4387 shift = exact_log2 (INTVAL (ops[4]));
4389 else
4391 vec_reg = ops[4];
4392 ptr = ops[0];
4393 vec_offset = ops[1];
4394 zero_extend_p = INTVAL (ops[2]);
4395 shift = exact_log2 (INTVAL (ops[3]));
4398 machine_mode vec_mode = GET_MODE (vec_reg);
4399 machine_mode idx_mode = GET_MODE (vec_offset);
4400 scalar_mode inner_idx_mode = GET_MODE_INNER (idx_mode);
4401 unsigned inner_offsize = GET_MODE_BITSIZE (inner_idx_mode);
4402 poly_int64 nunits = GET_MODE_NUNITS (vec_mode);
4403 bool is_vlmax = is_vlmax_len_p (vec_mode, len);
4405 bool use_widening_shift = false;
4407   /* Extend the offset elements to the address width.  */
4408 if (inner_offsize < BITS_PER_WORD)
4410 use_widening_shift = TARGET_ZVBB && zero_extend_p && shift == 1;
4411 /* 7.2. Vector Load/Store Addressing Modes.
4412 If the vector offset elements are narrower than XLEN, they are
4413 zero-extended to XLEN before adding to the ptr effective address. If
4414 the vector offset elements are wider than XLEN, the least-significant
4415 XLEN bits are used in the address calculation. An implementation must
4416 raise an illegal instruction exception if the EEW is not supported for
4417 offset elements.
4419 RVV spec only refers to the shift == 0 case. */
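      /* For example (illustrative, RV64): a zero-extended HImode offset with
	 a nonzero shift is first widened to SImode (twice the offset width),
	 while a sign-extended one is widened to XLEN.  Without Zvbb the
	 widening is done by an explicit vector extend; with Zvbb, zero
	 extension and shift == 1, the later vwsll does the widening and the
	 shift in one instruction.  */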
4420 if (!zero_extend_p || shift)
4422 if (zero_extend_p)
4423 inner_idx_mode
4424 = int_mode_for_size (inner_offsize * 2, 0).require ();
4425 else
4426 inner_idx_mode = int_mode_for_size (BITS_PER_WORD, 0).require ();
4427 machine_mode new_idx_mode
4428 = get_vector_mode (inner_idx_mode, nunits).require ();
4429 if (!use_widening_shift)
4431 rtx tmp = gen_reg_rtx (new_idx_mode);
4432 emit_insn (gen_extend_insn (tmp, vec_offset, new_idx_mode, idx_mode,
4433 zero_extend_p ? true : false));
4434 vec_offset = tmp;
4436 idx_mode = new_idx_mode;
4440 if (shift)
4442 rtx tmp;
4443 if (!use_widening_shift)
4444 tmp = expand_binop (idx_mode, ashl_optab, vec_offset,
4445 gen_int_mode (shift, Pmode), NULL_RTX, 0,
4446 OPTAB_DIRECT);
4447 else
4449 tmp = gen_reg_rtx (idx_mode);
4450 insn_code icode = code_for_pred_vwsll_scalar (idx_mode);
4451 rtx ops[] = {tmp, vec_offset, const1_rtx};
4452 emit_vlmax_insn (icode, BINARY_OP, ops);
4455 vec_offset = tmp;
4458 insn_code icode = prepare_gather_scatter (vec_mode, idx_mode, is_load);
4459 if (is_vlmax)
4461 if (is_load)
4463 rtx load_ops[]
4464 = {vec_reg, mask, ptr, vec_offset};
4465 emit_vlmax_insn (icode, BINARY_OP_TAMA, load_ops);
4467 else
4469 rtx store_ops[] = {mask, ptr, vec_offset, vec_reg};
4470 emit_vlmax_insn (icode, SCATTER_OP_M, store_ops);
4473 else
4475 if (is_load)
4477 rtx load_ops[]
4478 = {vec_reg, mask, ptr, vec_offset};
4479 emit_nonvlmax_insn (icode, BINARY_OP_TAMA, load_ops, len);
4481 else
4483 rtx store_ops[] = {mask, ptr, vec_offset, vec_reg};
4484 emit_nonvlmax_insn (icode, SCATTER_OP_M, store_ops, len);
4489 /* Expand COND_LEN_*. */
4490 void
4491 expand_cond_len_ternop (unsigned icode, rtx *ops)
4493 rtx dest = ops[0];
4494 rtx mask = ops[1];
4495 rtx src1 = ops[2];
4496 rtx src2 = ops[3];
4497 rtx src3 = ops[4];
4498 rtx merge = get_else_operand (ops[5]);
4499 rtx len = ops[6];
4501 rtx cond_ops[] = {dest, mask, src1, src2, src3, merge};
4502 expand_cond_len_op (icode, TERNARY_OP_P, cond_ops, len);
4505 /* Expand COND_*. */
4506 void
4507 expand_cond_ternop (unsigned icode, rtx *ops)
4509 rtx dest = ops[0];
4510 rtx mask = ops[1];
4511 rtx src1 = ops[2];
4512 rtx src2 = ops[3];
4513 rtx src3 = ops[4];
4514 rtx merge = get_else_operand (ops[5]);
4515 rtx len = gen_int_mode (GET_MODE_NUNITS (GET_MODE (dest)), Pmode);
4517 rtx cond_ops[] = {dest, mask, src1, src2, src3, merge};
4518 expand_cond_len_op (icode, TERNARY_OP_P, cond_ops, len);
4521 /* Expand reduction operations.
4522 Case 1: ops = {scalar_dest, vector_src}
4523 Case 2: ops = {scalar_dest, vector_src, mask, vl}
4525 void
4526 expand_reduction (unsigned unspec, unsigned unspec_for_vl0_safe,
4527 unsigned insn_flags, rtx *ops, rtx init)
4529 rtx scalar_dest = ops[0];
4530 rtx vector_src = ops[1];
4531 machine_mode vmode = GET_MODE (vector_src);
4532 machine_mode vel_mode = GET_MODE (scalar_dest);
4533 machine_mode m1_mode = get_m1_mode (vel_mode).require ();
4534 rtx vl_op = NULL_RTX;
4535 bool need_vl0_safe = false;
4536 if (need_mask_operand_p (insn_flags))
4538 vl_op = ops[3];
4539 need_vl0_safe = !CONST_INT_P (vl_op) && !CONST_POLY_INT_P (vl_op);
4542 rtx m1_tmp = gen_reg_rtx (m1_mode);
4543 rtx scalar_move_ops[] = {m1_tmp, init};
4544 insn_code icode = code_for_pred_broadcast (m1_mode);
4545 if (need_mask_operand_p (insn_flags))
4547 if (need_vl0_safe)
4548 emit_nonvlmax_insn (icode, SCALAR_MOVE_OP, scalar_move_ops, const1_rtx);
4549 else
4550 emit_nonvlmax_insn (icode, SCALAR_MOVE_OP, scalar_move_ops, vl_op);
4552 else
4553 emit_vlmax_insn (icode, SCALAR_MOVE_OP, scalar_move_ops);
4555 rtx m1_tmp2 = gen_reg_rtx (m1_mode);
4556 rtx reduc_ops[] = {m1_tmp2, vector_src, m1_tmp};
4558 if (need_vl0_safe)
4559 icode = code_for_pred (unspec_for_vl0_safe, vmode);
4560 else
4561 icode = code_for_pred (unspec, vmode);
4563 if (need_mask_operand_p (insn_flags))
4565 rtx mask_len_reduc_ops[] = {m1_tmp2, ops[2], vector_src, m1_tmp};
4566 emit_nonvlmax_insn (icode, insn_flags, mask_len_reduc_ops, vl_op);
4568 else
4569 emit_vlmax_insn (icode, insn_flags, reduc_ops);
4571 emit_insn (gen_pred_extract_first (m1_mode, scalar_dest, m1_tmp2));
4574 /* Prepare ops for ternary operations.
4575 It can be called before or after RA. */
4576 void
4577 prepare_ternary_operands (rtx *ops)
4579 machine_mode mode = GET_MODE (ops[0]);
4581 if (!rtx_equal_p (ops[5], RVV_VUNDEF (mode))
4582 && (VECTOR_MODE_P (GET_MODE (ops[2]))
4583 && !rtx_equal_p (ops[2], ops[5]))
4584 && !rtx_equal_p (ops[3], ops[5])
4585 && !rtx_equal_p (ops[4], ops[5]))
4587       /* RA will fail to find a vector REG and report an ICE, so we pre-merge
4588	  the ops for LMUL = 8.  */
4589 if (satisfies_constraint_Wc1 (ops[1]))
4591 emit_move_insn (ops[0], ops[5]);
4592 emit_insn (gen_pred_mov (mode, ops[0], ops[1], ops[0], ops[4], ops[6],
4593 ops[7], ops[8], ops[9]));
4595 else
4596 emit_insn (gen_pred_merge (mode, ops[0], RVV_VUNDEF (mode), ops[5],
4597 ops[4], ops[1], ops[6], ops[7], ops[9]));
4598 ops[5] = ops[4] = ops[0];
4600 else
4602 /* Swap the multiplication ops if the fallback value is the
4603 second of the two. */
4604 if (rtx_equal_p (ops[3], ops[5]))
4605 std::swap (ops[2], ops[3]);
4607 /* TODO: ??? Maybe we could support splitting FMA (a, 4, b)
4608 into PLUS (ASHIFT (a, 2), b) according to uarchs. */
4610 gcc_assert (rtx_equal_p (ops[5], RVV_VUNDEF (mode))
4611 || rtx_equal_p (ops[5], ops[2]) || rtx_equal_p (ops[5], ops[4]));
4614 /* Expand VEC_MASK_LEN_{LOAD_LANES,STORE_LANES}. */
4615 void
4616 expand_lanes_load_store (rtx *ops, bool is_load)
4618 rtx mask = ops[2];
4619 rtx len = ops[3];
4620 if (is_load)
4621 len = ops[4];
4622 rtx addr = is_load ? XEXP (ops[1], 0) : XEXP (ops[0], 0);
4623 rtx reg = is_load ? ops[0] : ops[1];
4624 machine_mode mode = GET_MODE (ops[0]);
4626 if (is_vlmax_len_p (mode, len))
4628 /* If the length operand is equal to VF, it is VLMAX load/store. */
4629 if (is_load)
4631 rtx m_ops[] = {reg, mask, addr};
4632 emit_vlmax_insn (code_for_pred_unit_strided_load (mode), UNARY_OP_TAMA,
4633 m_ops);
4635 else
4637 len = gen_reg_rtx (Pmode);
4638 emit_vlmax_vsetvl (mode, len);
4639 emit_insn (gen_pred_unit_strided_store (mode, mask, addr, reg, len,
4640 get_avl_type_rtx (VLMAX)));
4643 else
4645 if (!satisfies_constraint_K (len))
4646 len = force_reg (Pmode, len);
4647 if (is_load)
4649 rtx m_ops[] = {reg, mask, addr};
4650 emit_nonvlmax_insn (code_for_pred_unit_strided_load (mode),
4651 UNARY_OP_TAMA, m_ops, len);
4653 else
4654 emit_insn (gen_pred_unit_strided_store (mode, mask, addr, reg, len,
4655 get_avl_type_rtx (NONVLMAX)));
4659 /* Expand LEN_FOLD_EXTRACT_LAST. */
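/* Illustrative sketch of the expansion: a mask popcount (vcpop.m) counts the
   active mask bits (within LEN if given); if the count is zero we branch to
   the default value, otherwise we vcompress the active elements to the front,
   slide the compressed vector down by count - 1 and extract element 0 with
   v(f)mv.[xf].s.  */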
4660 void
4661 expand_fold_extract_last (rtx *ops)
4663 rtx dst = ops[0];
4664 rtx default_value = ops[1];
4665 rtx mask = ops[2];
4666 rtx anchor = gen_reg_rtx (Pmode);
4667 rtx index = gen_reg_rtx (Pmode);
4668 rtx vect = ops[3];
4669 rtx else_label = gen_label_rtx ();
4670 rtx end_label = gen_label_rtx ();
4671 rtx len = ops[4];
4672 machine_mode mode = GET_MODE (vect);
4673 machine_mode mask_mode = GET_MODE (mask);
4674 rtx compress_vect = gen_reg_rtx (mode);
4675 rtx slide_vect = gen_reg_rtx (mode);
4676 insn_code icode;
4678 if (is_vlmax_len_p (mode, len))
4679 len = NULL_RTX;
4681   /* Calculate the number of 1 bits in the mask.  */
4682 rtx cpop_ops[] = {anchor, mask};
4683 if (len)
4684 emit_nonvlmax_insn (code_for_pred_popcount (mask_mode, Pmode), CPOP_OP,
4685 cpop_ops, len);
4686 else
4687 emit_vlmax_insn (code_for_pred_popcount (mask_mode, Pmode), CPOP_OP,
4688 cpop_ops);
4690 riscv_expand_conditional_branch (else_label, EQ, anchor, const0_rtx);
4691 emit_insn (gen_rtx_SET (index, gen_rtx_PLUS (Pmode, anchor, constm1_rtx)));
4692 /* Compress the vector. */
4693 icode = code_for_pred_compress (mode);
4694 rtx compress_ops[] = {compress_vect, vect, mask};
4695 if (len)
4696 emit_nonvlmax_insn (icode, COMPRESS_OP, compress_ops, len);
4697 else
4698 emit_vlmax_insn (icode, COMPRESS_OP, compress_ops);
4699 /* Emit the slide down to index 0 in a new vector. */
4700 rtx slide_ops[] = {slide_vect, compress_vect, index};
4701 icode = code_for_pred_slide (UNSPEC_VSLIDEDOWN, mode);
4702 if (len)
4703 emit_nonvlmax_insn (icode, BINARY_OP, slide_ops, len);
4704 else
4705 emit_vlmax_insn (icode, BINARY_OP, slide_ops);
4706 /* Emit v(f)mv.[xf].s. */
4707 emit_insn (gen_pred_extract_first (mode, dst, slide_vect));
4709 emit_jump_insn (gen_jump (end_label));
4710 emit_barrier ();
4711 emit_label (else_label);
4712 emit_move_insn (dst, default_value);
4713 emit_label (end_label);
4716 /* Return true if the LMUL of the comparison mode is less than or equal to one.  */
4717 bool
4718 cmp_lmul_le_one (machine_mode mode)
4720 if (riscv_v_ext_vector_mode_p (mode))
4721 return known_le (GET_MODE_SIZE (mode), BYTES_PER_RISCV_VECTOR);
4722 else if (riscv_v_ext_vls_mode_p (mode))
4723 return known_le (GET_MODE_BITSIZE (mode), TARGET_MIN_VLEN);
4724 return false;
4727 /* Return true if the LMUL of the comparison mode is greater than one.  */
4728 bool
4729 cmp_lmul_gt_one (machine_mode mode)
4731 if (riscv_v_ext_vector_mode_p (mode))
4732 return known_gt (GET_MODE_SIZE (mode), BYTES_PER_RISCV_VECTOR);
4733 else if (riscv_v_ext_vls_mode_p (mode))
4734 return known_gt (GET_MODE_BITSIZE (mode), TARGET_MIN_VLEN);
4735 return false;
4738 /* Return true if the VLS mode is legal.  There are 2 cases here.
4740    1. Enable VLS modes for VLA vectorization since the fixed-length VLMAX mode
4741       is the highest priority choice and should not conflict with VLS modes.
4742    2. Enable VLS modes for some cases in fixed-vlmax, i.e. when the bitsize of
4743       the VLS mode is smaller than the minimal VLA mode.
4745    Take vlen = 2048 as an example for case 2.
4747    Note: the table below is based on vlen = 2048.
4748 +----------------------------------------------------+----------------------+
4749 | VLS mode | VLA mode |
4750 +----------------------------------------------------+----------------------+
4751 | Name | Precision | Inner Precision | Enabled | Min mode | Min bits |
4752 +------------+-----------+-----------------+---------+-----------+----------+
4753 | V1BI | 1 | 1 | Yes | RVVMF64BI | 32 |
4754 | V2BI | 2 | 1 | Yes | RVVMF64BI | 32 |
4755 | V4BI | 4 | 1 | Yes | RVVMF64BI | 32 |
4756 | V8BI | 8 | 1 | Yes | RVVMF64BI | 32 |
4757 | V16BI | 16 | 1 | Yes | RVVMF64BI | 32 |
4758 | V32BI | 32 | 1 | NO | RVVMF64BI | 32 |
4759 | V64BI | 64 | 1 | NO | RVVMF64BI | 32 |
4760 | ... | ... | ... | ... | RVVMF64BI | 32 |
4761 | V4096BI | 4096 | 1 | NO | RVVMF64BI | 32 |
4762 +------------+-----------+-----------------+---------+-----------+----------+
4763 | V1QI | 8 | 8 | Yes | RVVMF8QI | 256 |
4764 | V2QI | 16 | 8 | Yes | RVVMF8QI | 256 |
4765 | V4QI | 32 | 8 | Yes | RVVMF8QI | 256 |
4766 | V8QI | 64 | 8 | Yes | RVVMF8QI | 256 |
4767 | V16QI | 128 | 8 | Yes | RVVMF8QI | 256 |
4768 | V32QI | 256 | 8 | NO | RVVMF8QI | 256 |
4769 | V64QI | 512 | 8 | NO | RVVMF8QI | 256 |
4770 | ... | ... | .. | ... | RVVMF8QI | 256 |
4771 | V4096QI | 32768 | 8 | NO | RVVMF8QI | 256 |
4772 +------------+-----------+-----------------+---------+-----------+----------+
4773 | V1HI | 16 | 16 | Yes | RVVMF4HI | 512 |
4774 | V2HI | 32 | 16 | Yes | RVVMF4HI | 512 |
4775 | V4HI | 64 | 16 | Yes | RVVMF4HI | 512 |
4776 | V8HI | 128 | 16 | Yes | RVVMF4HI | 512 |
4777 | V16HI | 256 | 16 | Yes | RVVMF4HI | 512 |
4778 | V32HI | 512 | 16 | NO | RVVMF4HI | 512 |
4779 | V64HI | 1024 | 16 | NO | RVVMF4HI | 512 |
4780 | ... | ... | .. | ... | RVVMF4HI | 512 |
4781 | V2048HI | 32768 | 16 | NO | RVVMF4HI | 512 |
4782 +------------+-----------+-----------------+---------+-----------+----------+
4783 | V1SI/SF | 32 | 32 | Yes | RVVMF2SI | 1024 |
4784 | V2SI/SF | 64 | 32 | Yes | RVVMF2SI | 1024 |
4785 | V4SI/SF | 128 | 32 | Yes | RVVMF2SI | 1024 |
4786 | V8SI/SF | 256 | 32 | Yes | RVVMF2SI | 1024 |
4787 | V16SI/SF | 512 | 32 | Yes | RVVMF2SI | 1024 |
4788 | V32SI/SF | 1024 | 32 | NO | RVVMF2SI | 1024 |
4789 | V64SI/SF | 2048 | 32 | NO | RVVMF2SI | 1024 |
4790 | ... | ... | .. | ... | RVVMF2SI | 1024 |
4791 | V1024SI/SF | 32768 | 32 | NO | RVVMF2SI | 1024 |
4792 +------------+-----------+-----------------+---------+-----------+----------+
4793 | V1DI/DF | 64 | 64 | Yes | RVVM1DI | 2048 |
4794 | V2DI/DF | 128 | 64 | Yes | RVVM1DI | 2048 |
4795 | V4DI/DF | 256 | 64 | Yes | RVVM1DI | 2048 |
4796 | V8DI/DF | 512 | 64 | Yes | RVVM1DI | 2048 |
4797 | V16DI/DF | 1024 | 64 | Yes | RVVM1DI | 2048 |
4798 | V32DI/DF | 2048 | 64 | NO | RVVM1DI | 2048 |
4799 | V64DI/DF | 4096 | 64 | NO | RVVM1DI | 2048 |
4800 | ... | ... | .. | ... | RVVM1DI | 2048 |
4801 | V512DI/DF | 32768 | 64 | NO | RVVM1DI | 2048 |
4802 +------------+-----------+-----------------+---------+-----------+----------+
4804    Then we can derive the condition for a VLS mode in fixed-vlmax, i.e.:
4805      PRECISION (VLSmode) < VLEN / (64 / PRECISION (VLS_inner_mode)).  */
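/* For example (a worked instance of the condition above, vlen = 2048):
   V16QI has PRECISION 128 and 2048 / (64 / 8) = 256, so 128 < 256 and the
   mode is enabled, while V32QI (PRECISION 256) is not, matching the table
   above.  */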
4806 bool
4807 vls_mode_valid_p (machine_mode vls_mode)
4809 if (!TARGET_VECTOR || TARGET_XTHEADVECTOR)
4810 return false;
4812 if (rvv_vector_bits == RVV_VECTOR_BITS_SCALABLE)
4814 if (GET_MODE_CLASS (vls_mode) != MODE_VECTOR_BOOL
4815 && !ordered_p (TARGET_MAX_LMUL * BITS_PER_RISCV_VECTOR,
4816 GET_MODE_PRECISION (vls_mode)))
4817       /* We enable VLS modes which are aligned with TARGET_MAX_LMUL and
4818	  BITS_PER_RISCV_VECTOR.
4820	  E.g. when TARGET_MAX_LMUL = 1 and BITS_PER_RISCV_VECTOR = (128, 128),
4821	  we enable VLS modes that have a fixed size <= 128 bits.  Since ordered_p
4822	  is false between VLA modes with size = (128, 128) bits and VLS modes
4823	  with size = 128 bits, we would otherwise end up with multiple ICEs in
4824	  middle-end generic code.  */
4825 return false;
4826 return true;
4829 if (rvv_vector_bits == RVV_VECTOR_BITS_ZVL)
4831 machine_mode inner_mode = GET_MODE_INNER (vls_mode);
4832 int precision = GET_MODE_PRECISION (inner_mode).to_constant ();
4833 int min_vlmax_bitsize = TARGET_MIN_VLEN / (64 / precision);
4835 return GET_MODE_PRECISION (vls_mode).to_constant () < min_vlmax_bitsize;
4838 return false;
4841 /* We don't have to convert a floating-point value to an integer when
4842    its fractional part is zero.  Thus, there is a limit for half, single
4843    and double precision floating point: values greater than or equal to
4844    the limit have no fractional part.
4846 1. Half floating point.
4847 +-----------+---------------+
4848 | float | binary layout |
4849 +-----------+---------------+
4850 | 1023.5 | 0x63ff |
4851 +-----------+---------------+
4852 | 1024.0 | 0x6400 |
4853 +-----------+---------------+
4854 | 1025.0 | 0x6401 |
4855 +-----------+---------------+
4856 | ... | ... |
4858    All half-precision floating-point values greater than or equal to
4859    1024 will be unchanged by ceil.
4861 2. Single floating point.
4862 +-----------+---------------+
4863 | float | binary layout |
4864 +-----------+---------------+
4865 | 8388607.5 | 0x4affffff |
4866 +-----------+---------------+
4867 | 8388608.0 | 0x4b000000 |
4868 +-----------+---------------+
4869 | 8388609.0 | 0x4b000001 |
4870 +-----------+---------------+
4871 | ... | ... |
4873    All single-precision floating-point values greater than or equal to
4874    8388608 will be unchanged by ceil.
4876 3. Double floating point.
4877 +--------------------+--------------------+
4878 | float | binary layout |
4879 +--------------------+--------------------+
4880 | 4503599627370495.5 | 0X432fffffffffffff |
4881 +--------------------+--------------------+
4882 | 4503599627370496.0 | 0X4330000000000000 |
4883 +--------------------+--------------------+
4884    | 4503599627370497.0 | 0X4330000000000001 |
4885 +--------------------+--------------------+
4886 | ... | ... |
4888    All double-precision floating-point values greater than or equal to
4889    4503599627370496 will be unchanged by ceil.
4892 get_fp_rounding_coefficient (machine_mode inner_mode)
4894 REAL_VALUE_TYPE real;
4896 if (inner_mode == E_HFmode)
4897 real_from_integer (&real, inner_mode, 1024, SIGNED);
4898 else if (inner_mode == E_SFmode)
4899 real_from_integer (&real, inner_mode, 8388608, SIGNED);
4900 else if (inner_mode == E_DFmode)
4901 real_from_integer (&real, inner_mode, 4503599627370496, SIGNED);
4902 else
4903 gcc_unreachable ();
4905 return const_double_from_real_value (real, inner_mode);
4908 static rtx
4909 emit_vec_float_cmp_mask (rtx fp_vector, rtx_code code, rtx fp_scalar,
4910 machine_mode vec_fp_mode)
4912 /* Step-1: Prepare the scalar float compare register. */
4913 rtx fp_reg = gen_reg_rtx (GET_MODE_INNER (vec_fp_mode));
4914 emit_insn (gen_move_insn (fp_reg, fp_scalar));
4916 /* Step-2: Generate the mask. */
4917 machine_mode mask_mode = get_mask_mode (vec_fp_mode);
4918 rtx mask = gen_reg_rtx (mask_mode);
4919 rtx cmp = gen_rtx_fmt_ee (code, mask_mode, fp_vector, fp_reg);
4920 rtx cmp_ops[] = {mask, cmp, fp_vector, fp_reg};
4921 insn_code icode = code_for_pred_cmp_scalar (vec_fp_mode);
4922 emit_vlmax_insn (icode, COMPARE_OP, cmp_ops);
4924 return mask;
4927 static void
4928 emit_vec_copysign (rtx op_dest, rtx op_src_0, rtx op_src_1,
4929 machine_mode vec_mode)
4931 rtx sgnj_ops[] = {op_dest, op_src_0, op_src_1};
4932 insn_code icode = code_for_pred (UNSPEC_VCOPYSIGN, vec_mode);
4934 emit_vlmax_insn (icode, BINARY_OP, sgnj_ops);
4937 static void
4938 emit_vec_abs (rtx op_dest, rtx op_src, machine_mode vec_mode)
4940 rtx abs_ops[] = {op_dest, op_src};
4941 insn_code icode = code_for_pred (ABS, vec_mode);
4943 emit_vlmax_insn (icode, UNARY_OP, abs_ops);
4946 static void
4947 emit_vec_cvt_x_f (rtx op_dest, rtx op_src, rtx mask,
4948 insn_type type, machine_mode vec_mode)
4950 insn_code icode = code_for_pred_fcvt_x_f (UNSPEC_VFCVT, vec_mode);
4952 if (type & USE_VUNDEF_MERGE_P)
4954 rtx cvt_x_ops[] = {op_dest, mask, op_src};
4955 emit_vlmax_insn (icode, type, cvt_x_ops);
4957 else
4959 rtx cvt_x_ops[] = {op_dest, mask, op_dest, op_src};
4960 emit_vlmax_insn (icode, type, cvt_x_ops);
4964 static void
4965 emit_vec_cvt_x_f (rtx op_dest, rtx op_src, insn_type type,
4966 machine_mode vec_mode)
4968 rtx ops[] = {op_dest, op_src};
4969 insn_code icode = code_for_pred_fcvt_x_f (UNSPEC_VFCVT, vec_mode);
4971 emit_vlmax_insn (icode, type, ops);
4974 static void
4975 emit_vec_narrow_cvt_x_f (rtx op_dest, rtx op_src, insn_type type,
4976 machine_mode vec_mode)
4978 rtx ops[] = {op_dest, op_src};
4979 insn_code icode = code_for_pred_narrow_fcvt_x_f (UNSPEC_VFCVT, vec_mode);
4981 emit_vlmax_insn (icode, type, ops);
4984 static void
4985 emit_vec_widen_cvt_x_f (rtx op_dest, rtx op_src, insn_type type,
4986 machine_mode vec_mode)
4988 rtx ops[] = {op_dest, op_src};
4989 insn_code icode = code_for_pred_widen_fcvt_x_f (UNSPEC_VFCVT, vec_mode);
4991 emit_vlmax_insn (icode, type, ops);
4994 static void
4995 emit_vec_widen_cvt_f_f (rtx op_dest, rtx op_src, insn_type type,
4996 machine_mode vec_mode)
4998 rtx ops[] = {op_dest, op_src};
4999 insn_code icode = code_for_pred_extend (vec_mode);
5001 emit_vlmax_insn (icode, type, ops);
5004 static void
5005 emit_vec_cvt_f_x (rtx op_dest, rtx op_src, rtx mask,
5006 insn_type type, machine_mode vec_mode)
5008 rtx cvt_fp_ops[] = {op_dest, mask, op_dest, op_src};
5009 insn_code icode = code_for_pred (FLOAT, vec_mode);
5011 emit_vlmax_insn (icode, type, cvt_fp_ops);
5014 static void
5015 emit_vec_cvt_x_f_rtz (rtx op_dest, rtx op_src, rtx mask,
5016 insn_type type, machine_mode vec_mode)
5018 insn_code icode = code_for_pred (FIX, vec_mode);
5020 if (type & USE_VUNDEF_MERGE_P)
5022 rtx cvt_x_ops[] = {op_dest, mask, op_src};
5023 emit_vlmax_insn (icode, type, cvt_x_ops);
5025 else
5027 rtx cvt_x_ops[] = {op_dest, mask, op_dest, op_src};
5028 emit_vlmax_insn (icode, type, cvt_x_ops);
5032 static void
5033 emit_vec_binary_alu (rtx op_dest, rtx op_1, rtx op_2, enum rtx_code rcode,
5034 machine_mode vec_mode)
5036 rtx ops[] = {op_dest, op_1, op_2};
5037 insn_code icode = code_for_pred (rcode, vec_mode);
5039 emit_vlmax_insn (icode, BINARY_OP, ops);
5042 void
5043 expand_vec_ceil (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
5044 machine_mode vec_int_mode)
5046 /* Step-1: Get the abs float value for mask generation. */
5047 emit_vec_abs (op_0, op_1, vec_fp_mode);
5049 /* Step-2: Generate the mask on const fp. */
5050 rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode));
5051 rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode);
5053 /* Step-3: Convert to integer on mask, with rounding up (aka ceil). */
5054 rtx tmp = gen_reg_rtx (vec_int_mode);
5055 emit_vec_cvt_x_f (tmp, op_1, mask, UNARY_OP_TAMA_FRM_RUP, vec_fp_mode);
5057 /* Step-4: Convert to floating-point on mask for the final result.
5058 To avoid unnecessary frm register access, we use RUP here and it will
5059 never do the rounding up because the tmp rtx comes from the float
5060 to int conversion. */
5061 emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_RUP, vec_fp_mode);
5063 /* Step-5: Retrieve the sign bit for -0.0. */
5064 emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode);
5067 void
5068 expand_vec_floor (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
5069 machine_mode vec_int_mode)
5071 /* Step-1: Get the abs float value for mask generation. */
5072 emit_vec_abs (op_0, op_1, vec_fp_mode);
5074 /* Step-2: Generate the mask on const fp. */
5075 rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode));
5076 rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode);
5078 /* Step-3: Convert to integer on mask, with rounding down (aka floor). */
5079 rtx tmp = gen_reg_rtx (vec_int_mode);
5080 emit_vec_cvt_x_f (tmp, op_1, mask, UNARY_OP_TAMA_FRM_RDN, vec_fp_mode);
5082 /* Step-4: Convert to floating-point on mask for the floor result. */
5083 emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_RDN, vec_fp_mode);
5085 /* Step-5: Retrieve the sign bit for -0.0. */
5086 emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode);
5089 void
5090 expand_vec_nearbyint (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
5091 machine_mode vec_int_mode)
5093 /* Step-1: Get the abs float value for mask generation. */
5094 emit_vec_abs (op_0, op_1, vec_fp_mode);
5096 /* Step-2: Generate the mask on const fp. */
5097 rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode));
5098 rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode);
5100   /* Step-3: Back up the FP exception flags; nearbyint never raises exceptions.  */
5101 rtx fflags = gen_reg_rtx (SImode);
5102 emit_insn (gen_riscv_frflags (fflags));
5104   /* Step-4: Convert to integer on mask, with the dynamic rounding mode (aka nearbyint).  */
5105 rtx tmp = gen_reg_rtx (vec_int_mode);
5106 emit_vec_cvt_x_f (tmp, op_1, mask, UNARY_OP_TAMA_FRM_DYN, vec_fp_mode);
5108 /* Step-5: Convert to floating-point on mask for the nearbyint result. */
5109 emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_DYN, vec_fp_mode);
5111 /* Step-6: Restore FP exception flags. */
5112 emit_insn (gen_riscv_fsflags (fflags));
5114 /* Step-7: Retrieve the sign bit for -0.0. */
5115 emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode);
5118 void
5119 expand_vec_rint (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
5120 machine_mode vec_int_mode)
5122 /* Step-1: Get the abs float value for mask generation. */
5123 emit_vec_abs (op_0, op_1, vec_fp_mode);
5125 /* Step-2: Generate the mask on const fp. */
5126 rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode));
5127 rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode);
5129 /* Step-3: Convert to integer on mask, with dyn rounding (aka rint). */
5130 rtx tmp = gen_reg_rtx (vec_int_mode);
5131 emit_vec_cvt_x_f (tmp, op_1, mask, UNARY_OP_TAMA_FRM_DYN, vec_fp_mode);
5133 /* Step-4: Convert to floating-point on mask for the rint result. */
5134 emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_DYN, vec_fp_mode);
5136 /* Step-5: Retrieve the sign bit for -0.0. */
5137 emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode);
5140 void
5141 expand_vec_round (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
5142 machine_mode vec_int_mode)
5144 /* Step-1: Get the abs float value for mask generation. */
5145 emit_vec_abs (op_0, op_1, vec_fp_mode);
5147 /* Step-2: Generate the mask on const fp. */
5148 rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode));
5149 rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode);
5151 /* Step-3: Convert to integer on mask, rounding to nearest (aka round). */
5152 rtx tmp = gen_reg_rtx (vec_int_mode);
5153 emit_vec_cvt_x_f (tmp, op_1, mask, UNARY_OP_TAMA_FRM_RMM, vec_fp_mode);
5155 /* Step-4: Convert to floating-point on mask for the round result. */
5156 emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_RMM, vec_fp_mode);
5158 /* Step-5: Retrieve the sign bit for -0.0. */
5159 emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode);
5162 void
5163 expand_vec_trunc (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
5164 machine_mode vec_int_mode)
5166 /* Step-1: Get the abs float value for mask generation. */
5167 emit_vec_abs (op_0, op_1, vec_fp_mode);
5169 /* Step-2: Generate the mask on const fp. */
5170 rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode));
5171 rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode);
5173 /* Step-3: Convert to integer on mask, rounding to zero (aka truncate). */
5174 rtx tmp = gen_reg_rtx (vec_int_mode);
5175 emit_vec_cvt_x_f_rtz (tmp, op_1, mask, UNARY_OP_TAMA, vec_fp_mode);
5177   /* Step-4: Convert to floating-point on mask for the truncation result.  */
5178 emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_DYN, vec_fp_mode);
5180 /* Step-5: Retrieve the sign bit for -0.0. */
5181 emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode);
5184 void
5185 expand_vec_roundeven (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
5186 machine_mode vec_int_mode)
5188 /* Step-1: Get the abs float value for mask generation. */
5189 emit_vec_abs (op_0, op_1, vec_fp_mode);
5191 /* Step-2: Generate the mask on const fp. */
5192 rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode));
5193 rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode);
5195 /* Step-3: Convert to integer on mask, rounding to nearest, ties to even. */
5196 rtx tmp = gen_reg_rtx (vec_int_mode);
5197 emit_vec_cvt_x_f (tmp, op_1, mask, UNARY_OP_TAMA_FRM_RNE, vec_fp_mode);
5199   /* Step-4: Convert to floating-point on mask for the roundeven result.  */
5200 emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_RNE, vec_fp_mode);
5202 /* Step-5: Retrieve the sign bit for -0.0. */
5203 emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode);
5206 /* Handle the rounding conversion from floating-point to int/long/long long.  */
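/* For example (illustrative): DFmode -> SImode uses the narrowing convert,
   SFmode -> DImode and HFmode -> SImode use the widening one, and
   HFmode -> DImode first widens HF -> SF through the bridge mode
   (vec_bridge_mode) and then widens SF -> DI.  */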
5207 static void
5208 emit_vec_rounding_to_integer (rtx op_0, rtx op_1, insn_type type,
5209 machine_mode vec_fp_mode,
5210 machine_mode vec_int_mode,
5211 machine_mode vec_bridge_mode = E_VOIDmode)
5213 poly_uint16 vec_fp_size = GET_MODE_SIZE (vec_fp_mode);
5214 poly_uint16 vec_int_size = GET_MODE_SIZE (vec_int_mode);
5216 if (known_eq (vec_fp_size, vec_int_size)) /* SF => SI, DF => DI. */
5217 emit_vec_cvt_x_f (op_0, op_1, type, vec_fp_mode);
5218 else if (maybe_eq (vec_fp_size, vec_int_size * 2)) /* DF => SI. */
5219 emit_vec_narrow_cvt_x_f (op_0, op_1, type, vec_fp_mode);
5220 else if (maybe_eq (vec_fp_size * 2, vec_int_size)) /* SF => DI, HF => SI. */
5221 emit_vec_widen_cvt_x_f (op_0, op_1, type, vec_int_mode);
5222 else if (maybe_eq (vec_fp_size * 4, vec_int_size)) /* HF => DI. */
5224 gcc_assert (vec_bridge_mode != E_VOIDmode);
5226 rtx op_sf = gen_reg_rtx (vec_bridge_mode);
5228 /* Step-1: HF => SF, no rounding here. */
5229 emit_vec_widen_cvt_f_f (op_sf, op_1, UNARY_OP, vec_bridge_mode);
5230 /* Step-2: SF => DI. */
5231 emit_vec_widen_cvt_x_f (op_0, op_sf, type, vec_int_mode);
5233 else
5234 gcc_unreachable ();
5237 void
5238 expand_vec_lrint (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
5239 machine_mode vec_int_mode, machine_mode vec_bridge_mode)
5241 emit_vec_rounding_to_integer (op_0, op_1, UNARY_OP_FRM_DYN, vec_fp_mode,
5242 vec_int_mode, vec_bridge_mode);
5245 void
5246 expand_vec_lround (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
5247 machine_mode vec_int_mode, machine_mode vec_bridge_mode)
5249 emit_vec_rounding_to_integer (op_0, op_1, UNARY_OP_FRM_RMM, vec_fp_mode,
5250 vec_int_mode, vec_bridge_mode);
5253 void
5254 expand_vec_lceil (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
5255 machine_mode vec_int_mode)
5257 emit_vec_rounding_to_integer (op_0, op_1, UNARY_OP_FRM_RUP, vec_fp_mode,
5258 vec_int_mode);
5261 void
5262 expand_vec_lfloor (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
5263 machine_mode vec_int_mode)
5265 emit_vec_rounding_to_integer (op_0, op_1, UNARY_OP_FRM_RDN, vec_fp_mode,
5266 vec_int_mode);
5269 /* Expand the standard name usadd<mode>3 for vector modes.  We can leverage
5270    the vector fixed-point single-width saturating add directly.  */
5272 void
5273 expand_vec_usadd (rtx op_0, rtx op_1, rtx op_2, machine_mode vec_mode)
5275 emit_vec_binary_alu (op_0, op_1, op_2, US_PLUS, vec_mode);
5278 /* Expand the standard name ssadd<mode>3 for vector modes.  We can leverage
5279    the vector fixed-point single-width saturating add directly.  */
5281 void
5282 expand_vec_ssadd (rtx op_0, rtx op_1, rtx op_2, machine_mode vec_mode)
5284 emit_vec_binary_alu (op_0, op_1, op_2, SS_PLUS, vec_mode);
5287 /* Expand the standard name ussub<mode>3 for vector modes.  We can leverage
5288    the vector fixed-point single-width saturating subtract directly.  */
5290 void
5291 expand_vec_ussub (rtx op_0, rtx op_1, rtx op_2, machine_mode vec_mode)
5293 emit_vec_binary_alu (op_0, op_1, op_2, US_MINUS, vec_mode);
5296 /* Expand the standard name sssub<mode>3 for vector modes.  We can leverage
5297    the vector fixed-point single-width saturating subtract directly.  */
5299 void
5300 expand_vec_sssub (rtx op_0, rtx op_1, rtx op_2, machine_mode vec_mode)
5302 emit_vec_binary_alu (op_0, op_1, op_2, SS_MINUS, vec_mode);
5305 /* Expand the standard name ustrunc<m><n>2 for double vector mode, like
5306    DI => SI.  We can leverage the vector narrowing fixed-point clip
5307    directly.  */
5309 void
5310 expand_vec_double_ustrunc (rtx op_0, rtx op_1, machine_mode vec_mode)
5312 insn_code icode;
5313 rtx zero = CONST0_RTX (Xmode);
5314 enum unspec unspec = UNSPEC_VNCLIPU;
5315 rtx ops[] = {op_0, op_1, zero};
5317 icode = code_for_pred_narrow_clip_scalar (unspec, vec_mode);
5318 emit_vlmax_insn (icode, BINARY_OP_VXRM_RNU, ops);
5321 /* Expand the standard name sstrunc<m><n>2 for double vector mode, like
5322    DI => SI.  We can leverage the vector narrowing fixed-point clip
5323    directly.  */
5325 void
5326 expand_vec_double_sstrunc (rtx op_0, rtx op_1, machine_mode vec_mode)
5328 insn_code icode;
5329 rtx zero = CONST0_RTX (Xmode);
5330 enum unspec unspec = UNSPEC_VNCLIP;
5331 rtx ops[] = {op_0, op_1, zero};
5333 icode = code_for_pred_narrow_clip_scalar (unspec, vec_mode);
5334 emit_vlmax_insn (icode, BINARY_OP_VXRM_RNU, ops);
5337 /* Expand the standard name ustrunc<m><n>2 for quad vector mode, like
5338    DI => HI.  We can leverage the vector narrowing fixed-point clip
5339    directly.  */
5341 void
5342 expand_vec_quad_ustrunc (rtx op_0, rtx op_1, machine_mode vec_mode,
5343 machine_mode double_mode)
5345 rtx double_rtx = gen_reg_rtx (double_mode);
5347 expand_vec_double_ustrunc (double_rtx, op_1, vec_mode);
5348 expand_vec_double_ustrunc (op_0, double_rtx, double_mode);
5351 /* Expand the standard name sstrunc<m><n>2 for quad vector mode, like
5352    DI => HI.  We can leverage the vector narrowing fixed-point clip
5353    directly.  */
5355 void
5356 expand_vec_quad_sstrunc (rtx op_0, rtx op_1, machine_mode vec_mode,
5357 machine_mode double_mode)
5359 rtx double_rtx = gen_reg_rtx (double_mode);
5361 expand_vec_double_sstrunc (double_rtx, op_1, vec_mode);
5362 expand_vec_double_sstrunc (op_0, double_rtx, double_mode);
5365 /* Expand the standard name ustrunc<m><n>2 for oct vector mode, like
5366    DI => QI.  We can leverage the vector narrowing fixed-point clip
5367    directly.  */
5369 void
5370 expand_vec_oct_ustrunc (rtx op_0, rtx op_1, machine_mode vec_mode,
5371 machine_mode double_mode, machine_mode quad_mode)
5373 rtx double_rtx = gen_reg_rtx (double_mode);
5374 rtx quad_rtx = gen_reg_rtx (quad_mode);
5376 expand_vec_double_ustrunc (double_rtx, op_1, vec_mode);
5377 expand_vec_double_ustrunc (quad_rtx, double_rtx, double_mode);
5378 expand_vec_double_ustrunc (op_0, quad_rtx, quad_mode);
5381 /* Expand the standard name sstrunc<m><n>2 for oct vector mode, like
5382    DI => QI.  We can leverage the vector narrowing fixed-point clip
5383    directly.  */
5385 void
5386 expand_vec_oct_sstrunc (rtx op_0, rtx op_1, machine_mode vec_mode,
5387 machine_mode double_mode, machine_mode quad_mode)
5389 rtx double_rtx = gen_reg_rtx (double_mode);
5390 rtx quad_rtx = gen_reg_rtx (quad_mode);
5392 expand_vec_double_sstrunc (double_rtx, op_1, vec_mode);
5393 expand_vec_double_sstrunc (quad_rtx, double_rtx, double_mode);
5394 expand_vec_double_sstrunc (op_0, quad_rtx, quad_mode);
5397 /* Vectorize popcount by the Wilkes-Wheeler-Gill algorithm that libgcc uses as
5398 well. */
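/* For example (illustrative, one 64-bit element with src = 0xff):
   x1 = 0xff - (0x7f & 0x55...55) = 0xaa, x2 = 0x22 + 0x22 = 0x44,
   x3 = (0x44 + 0x04) & 0x0f...0f = 0x08, and
   (0x08 * 0x0101010101010101) >> 56 = 8, the popcount.  */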
5399 void
5400 expand_popcount (rtx *ops)
5402 rtx dst = ops[0];
5403 rtx src = ops[1];
5404 machine_mode mode = GET_MODE (dst);
5405 scalar_mode imode = GET_MODE_INNER (mode);
5406 static const uint64_t m5 = 0x5555555555555555ULL;
5407 static const uint64_t m3 = 0x3333333333333333ULL;
5408 static const uint64_t mf = 0x0F0F0F0F0F0F0F0FULL;
5409 static const uint64_t m1 = 0x0101010101010101ULL;
5411 rtx x1 = gen_reg_rtx (mode);
5412 rtx x2 = gen_reg_rtx (mode);
5413 rtx x3 = gen_reg_rtx (mode);
5414 rtx x4 = gen_reg_rtx (mode);
5416   /* x1 = src - ((src >> 1) & 0x5555555555555555ULL);  */
5417 rtx shift1 = expand_binop (mode, lshr_optab, src, GEN_INT (1), NULL, true,
5418 OPTAB_DIRECT);
5420 rtx and1 = gen_reg_rtx (mode);
5421 rtx ops1[] = {and1, shift1, gen_int_mode (m5, imode)};
5422 emit_vlmax_insn (code_for_pred_scalar (AND, mode), riscv_vector::BINARY_OP,
5423 ops1);
5425 x1 = expand_binop (mode, sub_optab, src, and1, NULL, true, OPTAB_DIRECT);
5427 /* x2 = (x1 & 0x3333333333333333ULL) + ((x1 >> 2) & 0x3333333333333333ULL);
5429 rtx and2 = gen_reg_rtx (mode);
5430 rtx ops2[] = {and2, x1, gen_int_mode (m3, imode)};
5431 emit_vlmax_insn (code_for_pred_scalar (AND, mode), riscv_vector::BINARY_OP,
5432 ops2);
5434 rtx shift2 = expand_binop (mode, lshr_optab, x1, GEN_INT (2), NULL, true,
5435 OPTAB_DIRECT);
5437 rtx and22 = gen_reg_rtx (mode);
5438 rtx ops22[] = {and22, shift2, gen_int_mode (m3, imode)};
5439 emit_vlmax_insn (code_for_pred_scalar (AND, mode), riscv_vector::BINARY_OP,
5440 ops22);
5442 x2 = expand_binop (mode, add_optab, and2, and22, NULL, true, OPTAB_DIRECT);
5444 /* x3 = (x2 + (x2 >> 4)) & 0x0f0f0f0f0f0f0f0fULL; */
5445 rtx shift3 = expand_binop (mode, lshr_optab, x2, GEN_INT (4), NULL, true,
5446 OPTAB_DIRECT);
5448 rtx plus3
5449 = expand_binop (mode, add_optab, x2, shift3, NULL, true, OPTAB_DIRECT);
5451 rtx ops3[] = {x3, plus3, gen_int_mode (mf, imode)};
5452 emit_vlmax_insn (code_for_pred_scalar (AND, mode), riscv_vector::BINARY_OP,
5453 ops3);
5455 /* dest = (x3 * 0x0101010101010101ULL) >> 56; */
5456 rtx mul4 = gen_reg_rtx (mode);
5457 rtx ops4[] = {mul4, x3, gen_int_mode (m1, imode)};
5458 emit_vlmax_insn (code_for_pred_scalar (MULT, mode), riscv_vector::BINARY_OP,
5459 ops4);
5461 x4 = expand_binop (mode, lshr_optab, mul4,
5462 GEN_INT (GET_MODE_BITSIZE (imode) - 8), NULL, true,
5463 OPTAB_DIRECT);
5465 emit_move_insn (dst, x4);
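
/* For reference (illustrative only), the scalar form of the reduction
   emitted above, written for a 64-bit element:

     x -= (x >> 1) & 0x5555555555555555ULL;
     x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL);
     x = (x + (x >> 4)) & 0x0F0F0F0F0F0F0F0FULL;
     return (x * 0x0101010101010101ULL) >> 56;

   Each line corresponds to one of the x1/x2/x3/x4 steps, with the shifts,
   ANDs and the final multiply performed element-wise on the vector.  */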
/* Return true if the AVL type of RINSN is VLMAX.  */
bool
vlmax_avl_type_p (rtx_insn *rinsn)
{
  extract_insn_cached (rinsn);
  int index = get_attr_avl_type_idx (rinsn);
  if (index == INVALID_ATTRIBUTE)
    return false;

  gcc_assert (index < recog_data.n_operands);

  rtx avl_type = recog_data.operand[index];
  return INTVAL (avl_type) == VLMAX;
}
/* Return true if RINSN is an RVV instruction that depends on the VL global
   status register.  */
bool
has_vl_op (rtx_insn *rinsn)
{
  return recog_memoized (rinsn) >= 0 && get_attr_has_vl_op (rinsn);
}
/* Get the default tail policy.  */
static bool
get_default_ta ()
{
  /* For an instruction that doesn't require TA, we still need a default value
     to emit a vsetvl.  We pick the default value according to the preferred
     tail policy.  */
  return (bool) (get_prefer_tail_policy () & 0x1
		 || (get_prefer_tail_policy () >> 1 & 0x1));
}
/* Helper function to get the TA operand.  */
bool
tail_agnostic_p (rtx_insn *rinsn)
{
  /* If it doesn't have TA, we return agnostic by default.  */
  extract_insn_cached (rinsn);
  int ta = get_attr_ta (rinsn);
  return ta == INVALID_ATTRIBUTE ? get_default_ta () : IS_AGNOSTIC (ta);
}
/* Change the insn and assert that the change always succeeds.  */
void
validate_change_or_fail (rtx object, rtx *loc, rtx new_rtx, bool in_group)
{
  bool change_p = validate_change (object, loc, new_rtx, in_group);
  gcc_assert (change_p);
}
/* Return true if the AVL type of RINSN is NONVLMAX.  */
bool
nonvlmax_avl_type_p (rtx_insn *rinsn)
{
  extract_insn_cached (rinsn);
  int index = get_attr_avl_type_idx (rinsn);
  if (index == INVALID_ATTRIBUTE)
    return false;

  gcc_assert (index < recog_data.n_operands);

  rtx avl_type = recog_data.operand[index];
  return INTVAL (avl_type) == NONVLMAX;
}
/* Return true if X is the RVV VLMAX AVL.  */
bool
vlmax_avl_p (rtx x)
{
  return x && rtx_equal_p (x, RVV_VLMAX);
}
/* Helper function to get the SEW operand.  We always have a SEW value for
   every RVV instruction that has a VTYPE OP.  */
uint8_t
get_sew (rtx_insn *rinsn)
{
  return get_attr_sew (rinsn);
}
/* Helper function to get the VLMUL operand.  We always have a VLMUL value for
   every RVV instruction that has a VTYPE OP.  */
enum vlmul_type
get_vlmul (rtx_insn *rinsn)
{
  return (enum vlmul_type) get_attr_vlmul (rinsn);
}
/* Count the number of occurrences of REGNO in RINSN.  */
int
count_regno_occurrences (rtx_insn *rinsn, unsigned int regno)
{
  int count = 0;
  extract_insn (rinsn);
  for (int i = 0; i < recog_data.n_operands; i++)
    if (refers_to_regno_p (regno, recog_data.operand[i]))
      count++;
  return count;
}
/* Return true if OP can be broadcast directly.  */
bool
can_be_broadcasted_p (rtx op)
{
  machine_mode mode = GET_MODE (op);
  /* We don't allow the RA (register allocation) reload to generate
     (vec_duplicate:DI reg) on an RV32 system, whereas we do allow
     (vec_duplicate:DI mem).  */
  if (!can_create_pseudo_p () && !FLOAT_MODE_P (mode)
      && maybe_gt (GET_MODE_SIZE (mode), GET_MODE_SIZE (Pmode))
      && !satisfies_constraint_Wdm (op))
    return false;

  if (satisfies_constraint_K (op) || register_operand (op, mode)
      || satisfies_constraint_Wdm (op) || rtx_equal_p (op, CONST0_RTX (mode)))
    return true;

  return can_create_pseudo_p () && nonmemory_operand (op, mode);
}
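
/* Extract the element (or subvector) of SRC at position INDEX into TARGET
   using the vec_extract optab.  */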
void
emit_vec_extract (rtx target, rtx src, rtx index)
{
  machine_mode vmode = GET_MODE (src);
  machine_mode smode = GET_MODE (target);
  class expand_operand ops[3];
  enum insn_code icode
    = convert_optab_handler (vec_extract_optab, vmode, smode);
  gcc_assert (icode != CODE_FOR_nothing);
  create_output_operand (&ops[0], target, smode);
  ops[0].target = 1;
  create_input_operand (&ops[1], src, vmode);

  poly_int64 val;
  if (poly_int_rtx_p (index, &val))
    create_integer_operand (&ops[2], val);
  else
    create_input_operand (&ops[2], index, Pmode);

  expand_insn (icode, 3, ops);
  if (ops[0].value != target)
    emit_move_insn (target, ops[0].value);
}
/* Return true if MODE is a valid offset mode for gather/scatter
   autovectorization.  */
bool
gather_scatter_valid_offset_p (machine_mode mode)
{
  /* If the element size of the offset mode is already >= Pmode size,
     we don't need any extension.  */
  if (known_ge (GET_MODE_SIZE (GET_MODE_INNER (mode)), UNITS_PER_WORD))
    return true;

  /* Since we will very likely have to extend the offset mode into a vector
     of Pmode elements, disable gather/scatter autovectorization if no such
     vector mode exists.  */
  if (!get_vector_mode (Pmode, GET_MODE_NUNITS (mode)).exists ())
    return false;
  return true;
}
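
/* For example (illustrative): on RV64 a vector of SImode offsets is only
   considered valid if a vector of Pmode (DImode) elements with the same
   number of units exists, since the narrower offsets would need to be
   extended before use.  */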
/* Implement TARGET_ESTIMATED_POLY_VALUE.
   Look into the tuning structure for an estimate.
   KIND specifies the type of requested estimate: min, max or likely.
   For cores with a known VLA width all three estimates are the same.
   For generic VLA tuning we want to distinguish the maximum estimate from
   the minimum and likely ones.
   The likely estimate is the same as the minimum in that case to give a
   conservative behavior of auto-vectorizing with VLA when it is a win
   even for VLA vectorization.
   When VLA width information is available VAL.coeffs[1] is multiplied by
   the number of VLA chunks over the initial VLS bits.  */
HOST_WIDE_INT
estimated_poly_value (poly_int64 val, unsigned int kind)
{
  unsigned int width_source
    = BITS_PER_RISCV_VECTOR.is_constant ()
	? (unsigned int) BITS_PER_RISCV_VECTOR.to_constant ()
	: (unsigned int) RVV_VECTOR_BITS_SCALABLE;

  /* If there is no core-specific information then the minimum and likely
     values are based on TARGET_MIN_VLEN vectors and the maximum is based on
     the architectural maximum of 65536 bits.  */
  unsigned int min_vlen_bytes = TARGET_MIN_VLEN / 8 - 1;
  if (width_source == RVV_VECTOR_BITS_SCALABLE)
    switch (kind)
      {
      case POLY_VALUE_MIN:
      case POLY_VALUE_LIKELY:
	return val.coeffs[0];

      case POLY_VALUE_MAX:
	return val.coeffs[0] + val.coeffs[1] * min_vlen_bytes;
      }

  /* Allow BITS_PER_RISCV_VECTOR to be a bitmask of different VLs, treating
     the lowest as likely.  This could be made more general if future -mtune
     options need it to be.  */
  if (kind == POLY_VALUE_MAX)
    width_source = 1 << floor_log2 (width_source);
  else
    width_source = least_bit_hwi (width_source);

  /* If the core provides width information, use that.  */
  HOST_WIDE_INT over_min_vlen = width_source - TARGET_MIN_VLEN;
  return val.coeffs[0] + val.coeffs[1] * over_min_vlen / TARGET_MIN_VLEN;
}
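
/* Worked example (illustrative only): with TARGET_MIN_VLEN == 128 and a
   tuning structure reporting a fixed 256-bit vector width, a poly_int64 of
   {4, 4} (i.e. 4 + 4 * x, where x counts the extra minimum-size chunks)
   estimates to 4 + 4 * (256 - 128) / 128 = 8 for every KIND, since a known
   VLA width makes the min, likely and max estimates identical.  */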
/* Return true if it is a whole register-to-register move.  */
bool
whole_reg_to_reg_move_p (rtx *ops, machine_mode mode, int avl_type_index)
{
  /* An operation is a whole-register move if either
     (1) its avl_type operand equals VLMAX, or
     (2) its vl operand equals the number of units of its mode.  */
  if (register_operand (ops[0], mode)
      && register_operand (ops[3], mode)
      && satisfies_constraint_vu (ops[2])
      && satisfies_constraint_Wc1 (ops[1]))
    {
      if (INTVAL (ops[avl_type_index]) == VLMAX)
	return true;
      /* The AVL propagation pass will transform a FIXED-VLMAX operation with
	 NUNITS < 32 into a NON-VLMAX one with LEN = NUNITS.  */
      else if (CONST_INT_P (ops[4])
	       && known_eq (INTVAL (ops[4]), GET_MODE_NUNITS (mode)))
	return true;
    }
  return false;
}
/* Return true if we can transform vmv.v.x/vfmv.v.f to vmv.s.x/vfmv.s.f.  */
bool
splat_to_scalar_move_p (rtx *ops)
{
  return satisfies_constraint_Wc1 (ops[1])
	 && satisfies_constraint_vu (ops[2])
	 && !MEM_P (ops[3])
	 && satisfies_constraint_k01 (ops[4])
	 && INTVAL (ops[7]) == NONVLMAX
	 && known_ge (GET_MODE_SIZE (Pmode), GET_MODE_SIZE (GET_MODE (ops[3])));
}

} // namespace riscv_vector