gcc/config/riscv/riscv-v.cc
1 /* Subroutines used for code generation for RISC-V 'V' Extension for
2 GNU compiler.
3 Copyright (C) 2022-2025 Free Software Foundation, Inc.
4 Contributed by Juzhe Zhong (juzhe.zhong@rivai.ai), RiVAI Technologies Ltd.
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it
9 under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 3, or (at your option)
11 any later version.
13 GCC is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #define IN_TARGET_CODE 1
24 /* We have a maximum of 11 operands for RVV instruction patterns according to
25 the vector.md. */
26 #define RVV_INSN_OPERANDS_MAX 11
28 #include "config.h"
29 #include "system.h"
30 #include "coretypes.h"
31 #include "tm.h"
32 #include "backend.h"
33 #include "rtl.h"
34 #include "insn-config.h"
35 #include "insn-attr.h"
36 #include "recog.h"
37 #include "alias.h"
38 #include "tree.h"
39 #include "stringpool.h"
40 #include "attribs.h"
41 #include "explow.h"
42 #include "memmodel.h"
43 #include "emit-rtl.h"
44 #include "tm_p.h"
45 #include "target.h"
46 #include "targhooks.h"
47 #include "expr.h"
48 #include "optabs.h"
49 #include "tm-constrs.h"
50 #include "rtx-vector-builder.h"
51 #include "targhooks.h"
52 #include "predict.h"
53 #include "errors.h"
54 #include "riscv-v.h"
56 using namespace riscv_vector;
58 namespace riscv_vector {
60 /* Return true if NUNITS <=31 so that we can use immediate AVL in vsetivli. */
61 bool
62 imm_avl_p (machine_mode mode)
64 poly_uint64 nunits = GET_MODE_NUNITS (mode);
66 return nunits.is_constant ()
67 /* The vsetivli can only hold register 0~31. */
68 ? (IN_RANGE (nunits.to_constant (), 0, 31))
69 /* Only allowed in VLS-VLMAX mode. */
70 : false;
73 /* Return true if LEN is equal to NUNITS, i.e. the VLMAX length (which may be outside the range [0, 31]). */
74 static bool
75 is_vlmax_len_p (machine_mode mode, rtx len)
77 poly_int64 value;
78 return poly_int_rtx_p (len, &value)
79 && known_eq (value, GET_MODE_NUNITS (mode));
82 /* Helper functions for insn_flags and insn_types. */
84 /* Return true if the caller needs to pass a mask operand for the insn
85 pattern with INSN_FLAGS. */
87 static bool
88 need_mask_operand_p (unsigned insn_flags)
90 return (insn_flags & HAS_MASK_P)
91 && !(insn_flags & (USE_ONE_TRUE_MASK_P | USE_ALL_TRUES_MASK_P));
94 template <int MAX_OPERANDS> class insn_expander
96 public:
97 insn_expander () = delete;
99 insn_expander (unsigned insn_flags, bool vlmax_p)
100 : m_insn_flags (insn_flags), m_opno (0), m_vlmax_p (vlmax_p),
101 m_vl_op (NULL_RTX)
103 check_insn_flags ();
106 void check_insn_flags () const
108 if (m_insn_flags & USE_ONE_TRUE_MASK_P)
109 /* USE_ONE_TRUE_MASK_P is dependent on HAS_MASK_P. */
110 gcc_assert ((m_insn_flags & HAS_MASK_P));
112 if (m_insn_flags & USE_ALL_TRUES_MASK_P)
113 /* USE_ALL_TRUES_MASK_P is dependent on HAS_MASK_P. */
114 gcc_assert ((m_insn_flags & HAS_MASK_P));
116 /* USE_ONE_TRUE_MASK_P and USE_ALL_TRUES_MASK_P are mutually exclusive. */
117 gcc_assert (!((m_insn_flags & USE_ONE_TRUE_MASK_P)
118 && (m_insn_flags & USE_ALL_TRUES_MASK_P)));
120 if (m_insn_flags & USE_VUNDEF_MERGE_P)
121 /* USE_VUNDEF_MERGE_P is dependent on HAS_MERGE_P. */
122 gcc_assert ((m_insn_flags & HAS_MERGE_P));
124 /* TU_POLICY_P and TDEFAULT_POLICY_P are mutually exclusive. */
125 gcc_assert (
126 !((m_insn_flags & TU_POLICY_P) && (m_insn_flags & TDEFAULT_POLICY_P)));
128 /* MU_POLICY_P and MDEFAULT_POLICY_P are mutually exclusive. */
129 gcc_assert (
130 !((m_insn_flags & MU_POLICY_P) && (m_insn_flags & MDEFAULT_POLICY_P)));
132 /* NULLARY_OP_P, UNARY_OP_P, BINARY_OP_P, TERNARY_OP_P are mutually
133 exclusive. */
134 gcc_assert (
135 !((m_insn_flags & NULLARY_OP_P)
136 && ((m_insn_flags & UNARY_OP_P) || (m_insn_flags & BINARY_OP_P)
137 || (m_insn_flags & TERNARY_OP_P))));
138 gcc_assert (
139 !((m_insn_flags & UNARY_OP_P)
140 && ((m_insn_flags & NULLARY_OP_P) || (m_insn_flags & BINARY_OP_P)
141 || (m_insn_flags & TERNARY_OP_P))));
142 gcc_assert (
143 !((m_insn_flags & BINARY_OP_P)
144 && ((m_insn_flags & NULLARY_OP_P) || (m_insn_flags & UNARY_OP_P)
145 || (m_insn_flags & TERNARY_OP_P))));
146 gcc_assert (
147 !((m_insn_flags & TERNARY_OP_P)
148 && ((m_insn_flags & NULLARY_OP_P) || (m_insn_flags & UNARY_OP_P)
149 || (m_insn_flags & BINARY_OP_P))));
152 void set_vl (rtx vl) { m_vl_op = vl; }
154 void add_output_operand (rtx x, machine_mode mode)
156 create_output_operand (&m_ops[m_opno++], x, mode);
157 gcc_assert (m_opno <= MAX_OPERANDS);
159 void add_input_operand (rtx x, machine_mode mode)
161 create_input_operand (&m_ops[m_opno++], x, mode);
162 gcc_assert (m_opno <= MAX_OPERANDS);
164 void add_all_one_mask_operand (machine_mode mask_mode)
166 add_input_operand (CONSTM1_RTX (mask_mode), mask_mode);
168 void add_first_one_true_mask_operand (machine_mode mask_mode)
170 add_input_operand (gen_scalar_move_mask (mask_mode), mask_mode);
172 void add_vundef_operand (machine_mode dest_mode)
174 add_input_operand (RVV_VUNDEF (dest_mode), dest_mode);
176 void add_policy_operand ()
178 if (m_insn_flags & TU_POLICY_P)
180 rtx tail_policy_rtx = gen_int_mode (TAIL_UNDISTURBED, Pmode);
181 add_input_operand (tail_policy_rtx, Pmode);
183 else if (m_insn_flags & TDEFAULT_POLICY_P)
185 rtx tail_policy_rtx = gen_int_mode (get_prefer_tail_policy (), Pmode);
186 add_input_operand (tail_policy_rtx, Pmode);
189 if (m_insn_flags & MU_POLICY_P)
191 rtx mask_policy_rtx = gen_int_mode (MASK_UNDISTURBED, Pmode);
192 add_input_operand (mask_policy_rtx, Pmode);
194 else if (m_insn_flags & MDEFAULT_POLICY_P)
196 rtx mask_policy_rtx = gen_int_mode (get_prefer_mask_policy (), Pmode);
197 add_input_operand (mask_policy_rtx, Pmode);
200 void add_avl_type_operand (avl_type type)
202 add_input_operand (gen_int_mode (type, Pmode), Pmode);
205 void
206 add_rounding_mode_operand (enum floating_point_rounding_mode rounding_mode)
208 rtx frm_rtx = gen_int_mode (rounding_mode, Pmode);
209 add_input_operand (frm_rtx, Pmode);
212 void
213 add_rounding_mode_operand (enum fixed_point_rounding_mode rounding_mode)
215 rtx frm_rtx = gen_int_mode (rounding_mode, Pmode);
216 add_input_operand (frm_rtx, Pmode);
219 /* Return the vtype mode based on insn_flags.
220 The vtype mode is the mode that the vsetvl insn sets. */
221 machine_mode
222 get_vtype_mode (rtx *ops)
224 machine_mode vtype_mode;
225 if (m_insn_flags & VTYPE_MODE_FROM_OP1_P)
226 vtype_mode = GET_MODE (ops[1]);
227 else
228 vtype_mode = GET_MODE (ops[0]);
229 return vtype_mode;
232 void emit_insn (enum insn_code icode, rtx *ops)
234 int opno = 0;
235 int num_ops;
236 /* True if any operand is a memory operand. */
237 bool any_mem_p = false;
239 machine_mode vtype_mode = get_vtype_mode (ops);
240 machine_mode mask_mode = get_mask_mode (vtype_mode);
242 /* Add dest operand. */
243 if (m_insn_flags & HAS_DEST_P)
245 rtx op = ops[opno++];
246 any_mem_p |= MEM_P (op);
247 add_output_operand (op, GET_MODE (op));
250 /* Add mask operand. */
251 if (m_insn_flags & USE_ONE_TRUE_MASK_P)
252 add_first_one_true_mask_operand (mask_mode);
253 else if (m_insn_flags & USE_ALL_TRUES_MASK_P)
254 add_all_one_mask_operand (mask_mode);
255 else if (m_insn_flags & HAS_MASK_P)
257 machine_mode mode = insn_data[(int) icode].operand[m_opno].mode;
258 gcc_assert (mode != VOIDmode);
259 add_input_operand (ops[opno++], mode);
262 /* Add merge operand. */
263 if (m_insn_flags & USE_VUNDEF_MERGE_P)
264 /* Same as dest operand. */
265 add_vundef_operand (GET_MODE (ops[0]));
266 else if (m_insn_flags & HAS_MERGE_P)
268 machine_mode mode = insn_data[(int) icode].operand[m_opno].mode;
269 gcc_assert (mode != VOIDmode);
270 add_input_operand (ops[opno++], mode);
273 if (m_insn_flags & NULLARY_OP_P)
274 num_ops = 0;
275 else if (m_insn_flags & UNARY_OP_P)
276 num_ops = 1;
277 else if (m_insn_flags & BINARY_OP_P)
278 num_ops = 2;
279 else if (m_insn_flags & TERNARY_OP_P)
280 num_ops = 3;
281 else
282 gcc_unreachable ();
284 /* Add the remaining operands. */
285 for (; num_ops; num_ops--, opno++)
287 any_mem_p |= MEM_P (ops[opno]);
288 machine_mode mode = insn_data[(int) icode].operand[m_opno].mode;
289 /* create_input_operand doesn't allow VOIDmode.
290 According to vector.md, we may have some patterns that do not have
291 an explicit machine mode specifying the operand. Such operands are
292 always Pmode. */
293 if (mode == VOIDmode)
294 mode = Pmode;
296 /* Early assertion ensures same mode since maybe_legitimize_operand
297 will check this. */
298 machine_mode required_mode = GET_MODE (ops[opno]);
299 if (required_mode != VOIDmode && required_mode != mode)
300 internal_error ("expected mode %s for operand %d of "
301 "insn %s but got mode %s.\n",
302 GET_MODE_NAME (mode),
303 opno,
304 insn_data[(int) icode].name,
305 GET_MODE_NAME (required_mode));
307 add_input_operand (ops[opno], mode);
310 /* Add vl operand. */
311 rtx len = m_vl_op;
312 bool vls_p = false;
313 if (m_vlmax_p)
315 if (riscv_v_ext_vls_mode_p (vtype_mode))
317 /* VLS modes always set VSETVL by
318 "vsetvl zero, rs1/imm". */
319 poly_uint64 nunits = GET_MODE_NUNITS (vtype_mode);
320 len = gen_int_mode (nunits, Pmode);
321 vls_p = true;
323 else if (can_create_pseudo_p ())
325 len = gen_reg_rtx (Pmode);
326 emit_vlmax_vsetvl (vtype_mode, len);
330 gcc_assert (len != NULL_RTX);
331 add_input_operand (len, Pmode);
333 /* Add tail and mask policy operands. */
334 add_policy_operand ();
336 /* Add avl_type operand. */
337 add_avl_type_operand (
338 vls_p ? avl_type::VLS
339 : (m_vlmax_p ? avl_type::VLMAX : avl_type::NONVLMAX));
341 /* Add rounding mode operand. */
342 if (m_insn_flags & FRM_DYN_P)
343 add_rounding_mode_operand (FRM_DYN);
344 else if (m_insn_flags & FRM_RUP_P)
345 add_rounding_mode_operand (FRM_RUP);
346 else if (m_insn_flags & FRM_RDN_P)
347 add_rounding_mode_operand (FRM_RDN);
348 else if (m_insn_flags & FRM_RMM_P)
349 add_rounding_mode_operand (FRM_RMM);
350 else if (m_insn_flags & FRM_RNE_P)
351 add_rounding_mode_operand (FRM_RNE);
352 else if (m_insn_flags & VXRM_RNU_P)
353 add_rounding_mode_operand (VXRM_RNU);
354 else if (m_insn_flags & VXRM_RDN_P)
355 add_rounding_mode_operand (VXRM_RDN);
358 if (insn_data[(int) icode].n_operands != m_opno)
359 internal_error ("invalid number of operands for insn %s, "
360 "expected %d but got %d.\n",
361 insn_data[(int) icode].name,
362 insn_data[(int) icode].n_operands, m_opno);
364 expand (icode, any_mem_p);
367 void expand (enum insn_code icode, bool temporary_volatile_p = false)
369 if (temporary_volatile_p)
371 temporary_volatile_ok v (true);
372 expand_insn (icode, m_opno, m_ops);
374 else
375 expand_insn (icode, m_opno, m_ops);
378 private:
379 unsigned m_insn_flags;
380 int m_opno;
381 bool m_vlmax_p;
382 rtx m_vl_op;
383 expand_operand m_ops[MAX_OPERANDS];
386 /* Emit an RVV insn with a vector length that equals the number of units of the
387 vector mode. For VLA modes this corresponds to VLMAX.
389 Unless the vector length can be encoded in the vsetivli instruction, this
390 function must only be used as long as we can create pseudo registers. This is
391 because it will set a pseudo register to VLMAX using vsetvl and use this as
392 the definition of the vector length. */
393 void
394 emit_vlmax_insn (unsigned icode, unsigned insn_flags, rtx *ops)
396 insn_expander<RVV_INSN_OPERANDS_MAX> e (insn_flags, true);
397 gcc_assert (can_create_pseudo_p () || imm_avl_p (e.get_vtype_mode (ops)));
399 e.emit_insn ((enum insn_code) icode, ops);
402 /* Like emit_vlmax_insn but must only be used when we cannot create pseudo
403 registers anymore. This function, however, takes a predefined vector length
404 from the value in VL. */
405 void
406 emit_vlmax_insn_lra (unsigned icode, unsigned insn_flags, rtx *ops, rtx vl)
408 gcc_assert (!can_create_pseudo_p ());
409 machine_mode mode = GET_MODE (ops[0]);
411 if (imm_avl_p (mode))
413 /* Even though VL is a real hard register already allocated (we
414 are post-RA now), we still benefit from emitting
415 vsetivli zero, imm instead of vsetvli VL, zero, since it
416 gives us more flexibility in post-RA instruction scheduling. */
417 insn_expander<RVV_INSN_OPERANDS_MAX> e (insn_flags, false);
418 e.set_vl (gen_int_mode (GET_MODE_NUNITS (mode), Pmode));
419 e.emit_insn ((enum insn_code) icode, ops);
421 else
423 insn_expander<RVV_INSN_OPERANDS_MAX> e (insn_flags, true);
424 e.set_vl (vl);
425 e.emit_insn ((enum insn_code) icode, ops);
429 /* Emit an RVV insn with a predefined vector length. Contrary to
430 emit_vlmax_insn the instruction's vector length is not deduced from its mode
431 but taken from the value in VL. */
432 void
433 emit_nonvlmax_insn (unsigned icode, unsigned insn_flags, rtx *ops, rtx vl)
435 insn_expander<RVV_INSN_OPERANDS_MAX> e (insn_flags, false);
436 e.set_vl (vl);
437 e.emit_insn ((enum insn_code) icode, ops);
440 /* Return true if the vector can be duplicated via a super element which is
441 the fusion of consecutive elements.
443 E.g. v = { a, b, a, b }: super element = ab, so v = { ab, ab }. */
444 bool
445 rvv_builder::can_duplicate_repeating_sequence_p ()
447 poly_uint64 new_size = exact_div (full_nelts (), npatterns ());
448 unsigned int new_inner_size = m_inner_bits_size * npatterns ();
449 if (m_inner_mode == Pmode
450 || !int_mode_for_size (new_inner_size, 0).exists (&m_new_inner_mode)
451 || GET_MODE_SIZE (m_new_inner_mode) > UNITS_PER_WORD
452 || !get_vector_mode (m_new_inner_mode, new_size).exists (&m_new_mode))
453 return false;
454 return repeating_sequence_p (0, encoded_nelts (), npatterns ());
457 /* Return true if the vector is a simple sequence with one pattern and all
458 elements the same. */
459 bool
460 rvv_builder::is_repeating_sequence ()
462 if (npatterns () > 1)
463 return false;
464 return repeating_sequence_p (0, encoded_nelts (), 1);
467 /* Return true if it is a repeating sequence for which the
468 merge approach has better codegen than the default
469 approach (slide1down).
471 Sequence A:
472 {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b}
474 nelts = 16
475 npatterns = 2
477 for merging a we need mask 101010....
478 for merging b we need mask 010101....
480 For each element in the npatterns, we need to build a mask in a scalar register.
481 Typically we need 3 instructions (aka COST = 3), which consist of 2 scalar
482 instructions and 1 scalar move to the v0 register. Finally we need a vector
483 merge to merge them.
485 lui a5, #imm
486 add a5, #imm
487 vmov.s.x v0, a5
488 vmerge.vxm v9, v9, a1, v0
490 So the overall (roughly) COST of Sequence A = (3 + 1) * npatterns = 8.
491 If we use slide1down, the COST = nelts = 16 > 8 (COST of merge).
492 So return true in this case as it is profitable.
494 Sequence B:
495 {a, b, c, d, e, f, g, h, a, b, c, d, e, f, g, h}
497 nelts = 16
498 npatterns = 8
500 COST of merge approach = (3 + 1) * npatterns = 32
501 COST of slide1down approach = nelts = 16
502 Return false in this case as the merge approach is NOT profitable. */
504 bool
505 rvv_builder::repeating_sequence_use_merge_profitable_p ()
507 if (inner_bytes_size () > UNITS_PER_WORD)
508 return false;
510 unsigned int nelts = full_nelts ().to_constant ();
512 if (!repeating_sequence_p (0, encoded_nelts (), npatterns ()))
513 return false;
515 unsigned int merge_cost = 1;
516 unsigned int build_merge_mask_cost = 3;
517 unsigned int slide1down_cost = nelts;
519 return (build_merge_mask_cost + merge_cost) * npatterns () < slide1down_cost;
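/* A standalone sketch (not part of this file; the name and helper are made
   up for illustration) of the cost comparison described above: each pattern
   needs roughly 3 instructions to build its mask plus 1 vmerge, while the
   slide1down approach costs one instruction per element.  */

static bool
sketch_merge_cheaper_than_slide1down (unsigned npatterns, unsigned nelts)
{
  const unsigned build_merge_mask_cost = 3; /* e.g. lui + addi + vmv.s.x  */
  const unsigned merge_cost = 1;            /* vmerge.vxm  */
  const unsigned slide1down_cost = nelts;   /* one vslide1down per element  */
  return (build_merge_mask_cost + merge_cost) * npatterns < slide1down_cost;
}

/* Sequence A above: npatterns = 2, nelts = 16 -> 8 < 16, merge wins.
   Sequence B above: npatterns = 8, nelts = 16 -> 32 < 16 fails, slide1down wins.  */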
522 /* Return true if it's worthwhile to use slideup to combine 2 vectors. */
523 bool
524 rvv_builder::combine_sequence_use_slideup_profitable_p ()
526 int nelts = full_nelts ().to_constant ();
527 int leading_ndups = this->count_dups (0, nelts - 1, 1);
528 int trailing_ndups = this->count_dups (nelts - 1, -1, -1);
530 /* ??? The current heuristic is to combine 2 vectors
531 by slideup when:
532 1. the # of leading identical elements equals the # of trailing identical elements.
533 2. both of the above are equal to nelts / 2.
534 Otherwise, it is not profitable. */
535 return leading_ndups == trailing_ndups && trailing_ndups == nelts / 2;
538 /* Return true if it's worthwhile to use merge to combine a vector with a scalar. */
539 bool
540 rvv_builder::combine_sequence_use_merge_profitable_p ()
542 int nelts = full_nelts ().to_constant ();
543 int leading_ndups = this->count_dups (0, nelts - 1, 1);
544 int trailing_ndups = this->count_dups (nelts - 1, -1, -1);
545 int nregs = riscv_get_v_regno_alignment (int_mode ());
547 if (leading_ndups + trailing_ndups != nelts)
548 return false;
550 /* If the number of leading elements is > 255, which exceeds the maximum
551 value of QImode, we will need to use HImode. */
552 machine_mode mode;
553 if (leading_ndups > 255 || nregs > 2)
555 if (!get_vector_mode (HImode, nelts).exists (&mode))
556 return false;
557 /* We will need one more AVL/VL toggling vsetvl instruction. */
558 return leading_ndups > 4 && trailing_ndups > 4;
561 /* { a, a, a, b, b, ... , b } and { b, b, b, a, a, ... , a }
562 consume 3 slide instructions. */
563 return leading_ndups > 3 && trailing_ndups > 3;
566 /* Merge the repeating sequence into a single element and return the RTX. */
568 rvv_builder::get_merged_repeating_sequence ()
570 scalar_int_mode mode = Pmode;
571 rtx target = gen_reg_rtx (mode);
572 emit_move_insn (target, const0_rtx);
573 rtx imm = gen_int_mode ((1ULL << m_inner_bits_size) - 1, mode);
574 /* { a, b, a, b }: Generate duplicate element = b << bits | a. */
575 for (unsigned int i = 0; i < npatterns (); i++)
577 unsigned int loc = m_inner_bits_size * i;
578 rtx shift = gen_int_mode (loc, mode);
579 rtx ele = gen_lowpart (mode, elt (i));
580 rtx tmp = expand_simple_binop (mode, AND, ele, imm, NULL_RTX, false,
581 OPTAB_DIRECT);
582 rtx tmp2 = expand_simple_binop (mode, ASHIFT, tmp, shift, NULL_RTX, false,
583 OPTAB_DIRECT);
584 rtx tmp3 = expand_simple_binop (mode, IOR, tmp2, target, NULL_RTX, false,
585 OPTAB_DIRECT);
586 emit_move_insn (target, tmp3);
588 if (GET_MODE_SIZE (m_new_inner_mode) < UNITS_PER_WORD)
589 return gen_lowpart (m_new_inner_mode, target);
590 return target;
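/* A standalone sketch (not part of this file; the name is made up) of the
   packing step above: fuse NPATTERNS inner elements of INNER_BITS bits each
   into one wide scalar, with element 0 in the least significant bits, i.e.
   merged = ... | b << bits | a for the { a, b } example.  */

#include <cstdint>

static uint64_t
sketch_merge_repeating_sequence (const uint64_t *elts, unsigned npatterns,
                                 unsigned inner_bits)
{
  uint64_t merged = 0;
  uint64_t mask = inner_bits >= 64 ? ~0ULL : (1ULL << inner_bits) - 1;
  for (unsigned i = 0; i < npatterns; i++)
    merged |= (elts[i] & mask) << (inner_bits * i);
  return merged;
}

/* E.g. elts = { 0, 1, 2, 3, 4, 5, 6, 7 } with inner_bits = 8 gives
   0x0706050403020100, the value broadcast in the example further below.  */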
593 /* Get the mask for merge approach.
595 Consider the following case:
596 {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b}
597 To merge "a", the mask should be 1010....
598 To merge "b", the mask should be 0101....
601 rvv_builder::get_merge_scalar_mask (unsigned int index_in_pattern,
602 machine_mode inner_mode) const
604 unsigned HOST_WIDE_INT mask = 0;
605 unsigned HOST_WIDE_INT base_mask = (1ULL << index_in_pattern);
606 /* Here we construct a mask pattern that will later be broadcast
607 to a vector register. The maximum broadcast size for vmv.v.x/vmv.s.x
608 is determined by the length of a vector element (ELEN) and not by
609 XLEN so make sure we do not exceed it. One example is -march=zve32*
610 which mandates ELEN == 32 but can be combined with -march=rv64
611 with XLEN == 64. */
612 unsigned int elen = TARGET_VECTOR_ELEN_64 ? 64 : 32;
614 gcc_assert (elen % npatterns () == 0);
616 int limit = elen / npatterns ();
618 for (int i = 0; i < limit; i++)
619 mask |= base_mask << (i * npatterns ());
621 return gen_int_mode (mask, inner_mode);
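/* A standalone sketch (not part of this file; the name is made up) of the
   mask construction above: set bit INDEX_IN_PATTERN in every group of
   NPATTERNS bits, limited to ELEN bits so the scalar fits in one vector
   element.  */

#include <cstdint>

static uint64_t
sketch_merge_scalar_mask (unsigned index_in_pattern, unsigned npatterns,
                          unsigned elen)
{
  uint64_t mask = 0;
  uint64_t base_mask = 1ULL << index_in_pattern;
  for (unsigned i = 0; i < elen / npatterns; i++)
    mask |= base_mask << (i * npatterns);
  return mask;
}

/* For the { a, b, a, b, ... } example above with npatterns = 2 and elen = 32,
   index 0 sets bits 0, 2, 4, ... (the mask selecting "a") and index 1 sets
   bits 1, 3, 5, ... (the mask selecting "b").  */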
624 /* Return true if the variable-length vector is single step.
625 Single step means the steps of all patterns in NPATTERNS are equal.
626 Consider the following cases:
628 CASE 1: NPATTERNS = 2, NELTS_PER_PATTERN = 3.
629 { 0, 2, 2, 4, 4, 6, ... }
630 First pattern: step1 = 2 - 0 = 2
631 step2 = 4 - 2 = 2
632 Second pattern: step1 = 4 - 2 = 2
633 step2 = 6 - 4 = 2
634 Since all steps of NPATTERNS are equal step = 2.
635 Return true in this case.
637 CASE 2: NPATTERNS = 2, NELTS_PER_PATTERN = 3.
638 { 0, 1, 2, 4, 4, 7, ... }
639 First pattern: step1 = 2 - 0 = 2
640 step2 = 4 - 2 = 2
641 Second pattern: step1 = 4 - 1 = 3
642 step2 = 7 - 4 = 3
643 Since not all steps are equal, return false. */
644 bool
645 rvv_builder::single_step_npatterns_p () const
647 if (nelts_per_pattern () != 3)
648 return false;
650 poly_int64 step
651 = rtx_to_poly_int64 (elt (npatterns ())) - rtx_to_poly_int64 (elt (0));
652 for (unsigned int i = 0; i < npatterns (); i++)
654 poly_int64 ele0 = rtx_to_poly_int64 (elt (i));
655 poly_int64 ele1 = rtx_to_poly_int64 (elt (npatterns () + i));
656 poly_int64 ele2 = rtx_to_poly_int64 (elt (npatterns () * 2 + i));
657 poly_int64 diff1 = ele1 - ele0;
658 poly_int64 diff2 = ele2 - ele1;
659 if (maybe_ne (step, diff1) || maybe_ne (step, diff2))
660 return false;
662 return true;
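/* A standalone sketch (not part of this file; the name is made up) of the
   check above on a plain integer encoding: ELTS holds NPATTERNS * 3 encoded
   elements and every pattern must step by the same amount as pattern 0.  */

static bool
sketch_single_step_npatterns_p (const long *elts, unsigned npatterns)
{
  long step = elts[npatterns] - elts[0];
  for (unsigned i = 0; i < npatterns; i++)
    {
      long diff1 = elts[npatterns + i] - elts[i];
      long diff2 = elts[2 * npatterns + i] - elts[npatterns + i];
      if (diff1 != step || diff2 != step)
        return false;
    }
  return true;
}

/* CASE 1 above: { 0, 2, 2, 4, 4, 6 } with npatterns = 2 -> true (step = 2).
   CASE 2 above: { 0, 1, 2, 4, 4, 7 } with npatterns = 2 -> false.  */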
665 /* Return true if the diff between the const vector and the vid sequence
666 is repeated. The diff means the const vector - vid.
667 For example, consider the cases below:
668 CASE 1:
669 CONST VECTOR: {3, 2, 1, 0, 7, 6, 5, 4, ... }
670 VID : {0, 1, 2, 3, 4, 5, 6, 7, ... }
671 DIFF(MINUS) : {3, 1,-1,-3, 3, 1,-1,-3, ... }
672 The diff sequence {3, 1,-1,-3} is repeated within the npatterns, so
673 we return TRUE for case 1.
675 CASE 2:
676 CONST VECTOR: {-4, 4,-3, 5,-2, 6,-1, 7, ...}
677 VID : { 0, 1, 2, 3, 4, 5, 6, 7, ... }
678 DIFF(MINUS) : {-4, 3,-5, 2,-6, 1,-7, 0, ... }
679 The diff sequence {-4, 3} is not repeated within the npatterns, so
680 we return FALSE for case 2. */
681 bool
682 rvv_builder::npatterns_vid_diff_repeated_p () const
684 if (nelts_per_pattern () != 3)
685 return false;
686 else if (npatterns () == 0)
687 return false;
689 for (unsigned i = 0; i < npatterns (); i++)
691 poly_int64 diff_0 = rtx_to_poly_int64 (elt (i)) - i;
692 poly_int64 diff_1
693 = rtx_to_poly_int64 (elt (npatterns () + i)) - npatterns () - i;
695 if (maybe_ne (diff_0, diff_1))
696 return false;
699 return true;
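/* A standalone sketch (not part of this file; the name is made up) of the
   diff check above on a plain integer encoding: element minus its index
   (vid) must be identical between the first and second group of NPATTERNS
   encoded elements.  */

static bool
sketch_vid_diff_repeated_p (const long *elts, unsigned npatterns)
{
  for (unsigned i = 0; i < npatterns; i++)
    {
      long diff_0 = elts[i] - (long) i;
      long diff_1 = elts[npatterns + i] - (long) (npatterns + i);
      if (diff_0 != diff_1)
        return false;
    }
  return true;
}

/* CASE 1 above: { 3, 2, 1, 0, 7, 6, 5, 4 } with npatterns = 4 -> true, since
   the diff { 3, 1, -1, -3 } repeats.
   CASE 2 above: { -4, 4, -3, 5 } with npatterns = 2 -> false.  */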
702 /* Return true if the permutation consists of two
703 interleaved patterns with a constant step each.
704 TODO: We currently only support NPATTERNS = 2. */
705 bool
706 rvv_builder::interleaved_stepped_npatterns_p () const
708 if (npatterns () != 2 || nelts_per_pattern () != 3)
709 return false;
710 for (unsigned int i = 0; i < npatterns (); i++)
712 poly_int64 ele0 = rtx_to_poly_int64 (elt (i));
713 poly_int64 ele1 = rtx_to_poly_int64 (elt (npatterns () + i));
714 poly_int64 ele2 = rtx_to_poly_int64 (elt (npatterns () * 2 + i));
715 poly_int64 diff1 = ele1 - ele0;
716 poly_int64 diff2 = ele2 - ele1;
717 if (maybe_ne (diff1, diff2))
718 return false;
720 return true;
723 /* Return true if all elements of NPATTERNS are equal.
725 E.g. NPATTERNS = 4:
726 { 2, 2, 2, 2, 4, 4, 4, 4, 8, 8, 8, 8, 16, 16, 16, 16, ... }
727 E.g. NPATTERNS = 8:
728 { 2, 2, 2, 2, 2, 2, 2, 2, 8, 8, 8, 8, 8, 8, 8, 8, ... }
729 We only check whether ele[0] ~ ele[NPATTERNS - 1] are the same.
730 We don't need to check elements[n] with n >= NPATTERNS since
731 they don't belong to the same pattern. */
733 bool
734 rvv_builder::npatterns_all_equal_p () const
736 poly_int64 ele0 = rtx_to_poly_int64 (elt (0));
737 for (unsigned int i = 1; i < npatterns (); i++)
739 poly_int64 ele = rtx_to_poly_int64 (elt (i));
740 if (!known_eq (ele, ele0))
741 return false;
743 return true;
746 static unsigned
747 get_sew (machine_mode mode)
749 unsigned int sew = GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
751 : GET_MODE_BITSIZE (GET_MODE_INNER (mode));
752 return sew;
755 /* Return true if X is a const_vector whose elements are all the same and
756 lie in the range between MINVAL and MAXVAL. */
757 bool
758 const_vec_all_same_in_range_p (rtx x, HOST_WIDE_INT minval,
759 HOST_WIDE_INT maxval)
761 rtx elt;
762 return (const_vec_duplicate_p (x, &elt) && CONST_INT_P (elt)
763 && IN_RANGE (INTVAL (elt), minval, maxval));
766 /* Return true if VEC is a constant in which every element is in the range
767 [MINVAL, MAXVAL]. The elements do not need to have the same value.
769 This function also exists in aarch64, we may unify it in middle-end in the
770 future. */
772 static bool
773 const_vec_all_in_range_p (rtx vec, poly_int64 minval, poly_int64 maxval)
775 if (!CONST_VECTOR_P (vec)
776 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
777 return false;
779 int nunits;
780 if (!CONST_VECTOR_STEPPED_P (vec))
781 nunits = const_vector_encoded_nelts (vec);
782 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
783 return false;
785 for (int i = 0; i < nunits; i++)
787 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
788 poly_int64 value;
789 if (!poly_int_rtx_p (vec_elem, &value)
790 || maybe_lt (value, minval)
791 || maybe_gt (value, maxval))
792 return false;
794 return true;
797 /* Return true if the vector's elements are all duplicates of an integer
798 in the range -16 ~ 15 or of the floating-point value 0.0. */
800 bool
801 valid_vec_immediate_p (rtx x)
803 return (satisfies_constraint_vi (x) || satisfies_constraint_Wc0 (x));
806 /* Return a const vector of VAL. The VAL can be either const_int or
807 const_poly_int. */
809 static rtx
810 gen_const_vector_dup (machine_mode mode, poly_int64 val)
812 scalar_mode smode = GET_MODE_INNER (mode);
813 rtx c = gen_int_mode (val, smode);
814 if (!val.is_constant () && GET_MODE_SIZE (smode) > GET_MODE_SIZE (Pmode))
816 /* When VAL is const_poly_int value, we need to explicitly broadcast
817 it into a vector using RVV broadcast instruction. */
818 return expand_vector_broadcast (mode, c);
820 return gen_const_vec_duplicate (mode, c);
823 /* Emit a vlmax vsetvl instruction. This should only be used when
824 optimization is disabled or after the vsetvl insertion pass. */
825 void
826 emit_hard_vlmax_vsetvl (machine_mode vmode, rtx vl)
828 unsigned int sew = get_sew (vmode);
829 emit_insn (gen_vsetvl (Pmode, vl, RVV_VLMAX, gen_int_mode (sew, Pmode),
830 gen_int_mode (get_vlmul (vmode), Pmode), const0_rtx,
831 const0_rtx));
834 void
835 emit_vlmax_vsetvl (machine_mode vmode, rtx vl)
837 unsigned int sew = get_sew (vmode);
838 enum vlmul_type vlmul = get_vlmul (vmode);
839 unsigned int ratio = calculate_ratio (sew, vlmul);
841 if (!optimize)
842 emit_hard_vlmax_vsetvl (vmode, vl);
843 else
844 emit_insn (gen_vlmax_avl (Pmode, vl, gen_int_mode (ratio, Pmode)));
847 /* Calculate SEW/LMUL ratio. */
848 unsigned int
849 calculate_ratio (unsigned int sew, enum vlmul_type vlmul)
851 unsigned int ratio;
852 switch (vlmul)
854 case LMUL_1:
855 ratio = sew;
856 break;
857 case LMUL_2:
858 ratio = sew / 2;
859 break;
860 case LMUL_4:
861 ratio = sew / 4;
862 break;
863 case LMUL_8:
864 ratio = sew / 8;
865 break;
866 case LMUL_F8:
867 ratio = sew * 8;
868 break;
869 case LMUL_F4:
870 ratio = sew * 4;
871 break;
872 case LMUL_F2:
873 ratio = sew * 2;
874 break;
875 default:
876 gcc_unreachable ();
878 return ratio;
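/* A standalone sketch (not part of this file; the name is made up) of the
   SEW/LMUL ratio above, with LMUL expressed in eighths so that fractional
   values need no special casing: ratio = SEW / LMUL = SEW * 8 / eighths.  */

static unsigned
sketch_sew_lmul_ratio (unsigned sew, unsigned lmul_in_eighths)
{
  return sew * 8 / lmul_in_eighths;
}

/* E.g. SEW = 32, LMUL = 1   (8 eighths)  -> 32, matching the LMUL_1 case.
        SEW = 32, LMUL = 4   (32 eighths) -> 8,  matching the LMUL_4 case.
        SEW = 64, LMUL = 1/2 (4 eighths)  -> 128, matching the LMUL_F2 case.  */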
881 /* SCALABLE means that the vector length is agnostic (run-time invariant and
882 compile-time unknown). ZVL means that the vector length is specific
883 (compile-time known from -march options like zvl*b). Both SCALABLE and ZVL
884 do auto-vectorization using the VLMAX vsetvl configuration. */
885 static bool
886 autovec_use_vlmax_p (void)
888 return rvv_vector_bits == RVV_VECTOR_BITS_SCALABLE
889 || rvv_vector_bits == RVV_VECTOR_BITS_ZVL;
892 /* This function emits a VLMAX vrgather instruction. Emit vrgather.vx/vi when
893 SEL is a const duplicate vector. Otherwise, emit vrgather.vv. */
894 static void
895 emit_vlmax_gather_insn (rtx target, rtx op, rtx sel)
897 rtx elt;
898 insn_code icode;
899 machine_mode data_mode = GET_MODE (target);
900 machine_mode sel_mode = GET_MODE (sel);
901 if (const_vec_duplicate_p (sel, &elt))
903 icode = code_for_pred_gather_scalar (data_mode);
904 sel = elt;
906 else if (maybe_ne (GET_MODE_SIZE (data_mode), GET_MODE_SIZE (sel_mode)))
907 icode = code_for_pred_gatherei16 (data_mode);
908 else
909 icode = code_for_pred_gather (data_mode);
910 rtx ops[] = {target, op, sel};
911 emit_vlmax_insn (icode, BINARY_OP, ops);
914 static void
915 emit_vlmax_masked_gather_mu_insn (rtx target, rtx op, rtx sel, rtx mask)
917 rtx elt;
918 insn_code icode;
919 machine_mode data_mode = GET_MODE (target);
920 machine_mode sel_mode = GET_MODE (sel);
921 if (const_vec_duplicate_p (sel, &elt))
923 icode = code_for_pred_gather_scalar (data_mode);
924 sel = elt;
926 else if (maybe_ne (GET_MODE_SIZE (data_mode), GET_MODE_SIZE (sel_mode)))
927 icode = code_for_pred_gatherei16 (data_mode);
928 else
929 icode = code_for_pred_gather (data_mode);
930 rtx ops[] = {target, mask, target, op, sel};
931 emit_vlmax_insn (icode, BINARY_OP_TAMU, ops);
934 /* According to RVV ISA spec (16.5.1. Synthesizing vdecompress):
935 https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc
937 There is no inverse vdecompress provided, as this operation can be readily
938 synthesized using iota and a masked vrgather:
940 Desired functionality of 'vdecompress'
941 7 6 5 4 3 2 1 0 # vid
943 e d c b a # packed vector of 5 elements
944 1 0 0 1 1 1 0 1 # mask vector of 8 elements
945 p q r s t u v w # destination register before vdecompress
947 e q r d c b v a # result of vdecompress
948 # v0 holds mask
949 # v1 holds packed data
950 # v11 holds input expanded vector and result
951 viota.m v10, v0 # Calc iota from mask in v0
952 vrgather.vv v11, v1, v10, v0.t # Expand into destination
953 p q r s t u v w # v11 destination register
954 e d c b a # v1 source vector
955 1 0 0 1 1 1 0 1 # v0 mask vector
957 4 4 4 3 2 1 1 0 # v10 result of viota.m
958 e q r d c b v a # v11 destination after vrgather using viota.m under mask
960 static void
961 emit_vlmax_decompress_insn (rtx target, rtx op0, rtx op1, rtx mask)
963 machine_mode data_mode = GET_MODE (target);
964 machine_mode sel_mode = related_int_vector_mode (data_mode).require ();
965 if (GET_MODE_INNER (data_mode) == QImode)
966 sel_mode = get_vector_mode (HImode, GET_MODE_NUNITS (data_mode)).require ();
968 rtx sel = gen_reg_rtx (sel_mode);
969 rtx iota_ops[] = {sel, mask};
970 emit_vlmax_insn (code_for_pred_iota (sel_mode), UNARY_OP, iota_ops);
971 emit_vlmax_gather_insn (target, op0, sel);
972 emit_vlmax_masked_gather_mu_insn (target, op1, sel, mask);
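/* A standalone scalar model (not part of this file; the name is made up) of
   the vdecompress synthesis above: viota.m yields, for each element index,
   the number of set mask bits below it; the masked vrgather then pulls
   packed element iota[i] into each active lane and leaves inactive lanes
   untouched.  */

static void
sketch_vdecompress (long *dest, const long *packed, const bool *mask,
                    unsigned nelts)
{
  unsigned iota = 0;                   /* running result of viota.m  */
  for (unsigned i = 0; i < nelts; i++)
    {
      if (mask[i])
        dest[i] = packed[iota];        /* vrgather.vv ..., v0.t on active lanes  */
      iota += mask[i] ? 1 : 0;
    }
}

/* With the example above read element 0 first, mask = { 1, 0, 1, 1, 1, 0, 0, 1 },
   packed = { a, b, c, d, e } and dest = { w, v, u, t, s, r, q, p }, the model
   produces { a, v, b, c, d, r, q, e }, i.e. "e q r d c b v a" when printed
   from element 7 down to element 0 as in the table above.  */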
975 /* Emit merge instruction. */
977 static machine_mode
978 get_repeating_sequence_dup_machine_mode (const rvv_builder &builder,
979 machine_mode mask_bit_mode)
981 unsigned mask_precision = GET_MODE_PRECISION (mask_bit_mode).to_constant ();
982 unsigned mask_scalar_size = mask_precision > builder.inner_bits_size ()
983 ? builder.inner_bits_size () : mask_precision;
985 scalar_mode inner_mode;
986 unsigned minimal_bits_size;
988 switch (mask_scalar_size)
990 case 8:
991 inner_mode = QImode;
992 minimal_bits_size = TARGET_MIN_VLEN / 8; /* AKA RVVMF8. */
993 break;
994 case 16:
995 inner_mode = HImode;
996 minimal_bits_size = TARGET_MIN_VLEN / 4; /* AKA RVVMF4. */
997 break;
998 case 32:
999 inner_mode = SImode;
1000 minimal_bits_size = TARGET_MIN_VLEN / 2; /* AKA RVVMF2. */
1001 break;
1002 case 64:
1003 inner_mode = DImode;
1004 minimal_bits_size = TARGET_MIN_VLEN / 1; /* AKA RVVM1. */
1005 break;
1006 default:
1007 gcc_unreachable ();
1008 break;
1011 gcc_assert (mask_precision % mask_scalar_size == 0);
1013 uint64_t dup_nunit = mask_precision > mask_scalar_size
1014 ? mask_precision / mask_scalar_size : minimal_bits_size / mask_scalar_size;
1016 return get_vector_mode (inner_mode, dup_nunit).require ();
1019 /* Expand a series const vector. If VID is NULL_RTX, we use a vid.v
1020 instruction to generate the sequence for VID:
1022 VID = { 0, 1, 2, 3, ... }
1024 Otherwise, we use the VID argument directly. */
1026 void
1027 expand_vec_series (rtx dest, rtx base, rtx step, rtx vid)
1029 machine_mode mode = GET_MODE (dest);
1030 poly_int64 nunits_m1 = GET_MODE_NUNITS (mode) - 1;
1031 poly_int64 value;
1032 rtx result = register_operand (dest, mode) ? dest : gen_reg_rtx (mode);
1034 /* VECT_IV = BASE + I * STEP. */
1036 /* Step 1: Generate I = { 0, 1, 2, ... } by vid.v. */
1037 bool reverse_p = !vid && rtx_equal_p (step, constm1_rtx)
1038 && poly_int_rtx_p (base, &value)
1039 && known_eq (nunits_m1, value);
1040 if (!vid)
1042 vid = gen_reg_rtx (mode);
1043 rtx op[] = {vid};
1044 emit_vlmax_insn (code_for_pred_series (mode), NULLARY_OP, op);
1047 rtx step_adj;
1048 if (reverse_p)
1050 /* Special case:
1051 {nunits - 1, nunits - 2, ... , 0}.
1052 nunits can be either const_int or const_poly_int.
1054 Code sequence:
1055 vid.v v
1056 vrsub nunits - 1, v. */
1057 rtx ops[]
1058 = {result, vid, gen_int_mode (nunits_m1, GET_MODE_INNER (mode))};
1059 insn_code icode = code_for_pred_sub_reverse_scalar (mode);
1060 emit_vlmax_insn (icode, BINARY_OP, ops);
1062 else
1064 /* Step 2: Generate I * STEP.
1065 - STEP is 1, we don't emit any instructions.
1066 - STEP is power of 2, we use vsll.vi/vsll.vx.
1067 - STEP is non-power of 2, we use vmul.vx. */
1068 if (rtx_equal_p (step, const1_rtx))
1069 step_adj = vid;
1070 else
1072 step_adj = gen_reg_rtx (mode);
1073 if (CONST_INT_P (step) && pow2p_hwi (INTVAL (step)))
1075 /* Emit logical left shift operation. */
1076 int shift = exact_log2 (INTVAL (step));
1077 rtx shift_amount = gen_int_mode (shift, Pmode);
1078 insn_code icode = code_for_pred_scalar (ASHIFT, mode);
1079 rtx ops[] = {step_adj, vid, shift_amount};
1080 emit_vlmax_insn (icode, BINARY_OP, ops);
1082 else
1084 insn_code icode = code_for_pred_scalar (MULT, mode);
1085 rtx ops[] = {step_adj, vid, step};
1086 emit_vlmax_insn (icode, BINARY_OP, ops);
1090 /* Step 3: Generate BASE + I * STEP.
1091 - BASE is 0, use result of vid.
1092 - BASE is not 0, we use vadd.vx/vadd.vi. */
1093 if (rtx_equal_p (base, const0_rtx))
1094 emit_move_insn (result, step_adj);
1095 else
1097 insn_code icode = code_for_pred_scalar (PLUS, mode);
1098 rtx ops[] = {result, step_adj, base};
1099 emit_vlmax_insn (icode, BINARY_OP, ops);
1103 if (result != dest)
1104 emit_move_insn (dest, result);
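/* A standalone scalar model (not part of this file; the name is made up) of
   the expansion above: result[i] = BASE + i * STEP, where the vid sequence
   plays the role of i.  The reverse special case { nunits - 1, ..., 1, 0 }
   corresponds to BASE = nunits - 1 and STEP = -1, handled with vrsub.  */

static void
sketch_vec_series (long *result, unsigned nunits, long base, long step)
{
  for (unsigned i = 0; i < nunits; i++)
    result[i] = base + (long) i * step;  /* vid.v, then vsll/vmul and vadd  */
}

/* E.g. base = 0, step = 2  -> { 0, 2, 4, 6, ... } (vid.v plus vsll.vi by 1).
        base = 3, step = -1 -> { 3, 2, 1, 0 } for nunits = 4, the vrsub case.  */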
1107 /* Subroutine of riscv_vector_expand_vector_init.
1108 Works as follows:
1109 (a) Initialize TARGET by broadcasting element 0 of BUILDER.
1110 (b) Skip the leading elements of BUILDER that are the same as
1111 element 0.
1112 (c) Insert the remaining elements in order into TARGET using vslide1down. */
1114 static void
1115 expand_vector_init_insert_elems (rtx target, const rvv_builder &builder,
1116 int nelts_reqd)
1118 machine_mode mode = GET_MODE (target);
1119 rtx dup = expand_vector_broadcast (mode, builder.elt (0));
1120 emit_move_insn (target, dup);
1121 int ndups = builder.count_dups (0, nelts_reqd - 1, 1);
1122 for (int i = ndups; i < nelts_reqd; i++)
1124 unsigned int unspec
1125 = FLOAT_MODE_P (mode) ? UNSPEC_VFSLIDE1DOWN : UNSPEC_VSLIDE1DOWN;
1126 insn_code icode = code_for_pred_slide (unspec, mode);
1127 rtx ops[] = {target, target, builder.elt (i)};
1128 emit_vlmax_insn (icode, BINARY_OP, ops);
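/* A standalone scalar model (not part of this file; the name is made up) of
   the sequence above: start from a broadcast of element 0, then each
   vslide1down shifts every element one position toward index 0 and writes
   the new scalar into the last element.  Inserting elements NDUPS .. NELTS-1
   this way reproduces the original vector.  */

static void
sketch_init_by_slide1down (long *vec, const long *elts, unsigned nelts,
                           unsigned ndups)
{
  for (unsigned i = 0; i < nelts; i++)
    vec[i] = elts[0];                  /* broadcast of element 0  */
  for (unsigned i = ndups; i < nelts; i++)
    {
      for (unsigned j = 0; j + 1 < nelts; j++)
        vec[j] = vec[j + 1];           /* vslide1down body  */
      vec[nelts - 1] = elts[i];        /* scalar lands in the last element  */
    }
}

/* E.g. elts = { 5, 5, 5, 7, 9 }: ndups = 3, so after the broadcast only two
   slide1down steps (with 7 and 9) are needed.  */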
1132 /* Subroutine of expand_vec_init to handle the case
1133 when all trailing elements of the builder are the same.
1134 This works as follows:
1135 (a) Use the expand_insn interface to broadcast the last vector element into TARGET.
1136 (b) Insert the remaining elements into TARGET using vslide1up.
1138 ??? The heuristic used is to do the above if the number of identical trailing
1139 elements is greater than leading_ndups, loosely based on the
1140 heuristic from mostly_zeros_p. May need fine-tuning. */
1142 static bool
1143 expand_vector_init_trailing_same_elem (rtx target,
1144 const rtx_vector_builder &builder,
1145 int nelts_reqd)
1147 int leading_ndups = builder.count_dups (0, nelts_reqd - 1, 1);
1148 int trailing_ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
1149 machine_mode mode = GET_MODE (target);
1151 if (trailing_ndups > leading_ndups)
1153 rtx dup = expand_vector_broadcast (mode, builder.elt (nelts_reqd - 1));
1154 for (int i = nelts_reqd - trailing_ndups - 1; i >= 0; i--)
1156 unsigned int unspec
1157 = FLOAT_MODE_P (mode) ? UNSPEC_VFSLIDE1UP : UNSPEC_VSLIDE1UP;
1158 insn_code icode = code_for_pred_slide (unspec, mode);
1159 rtx tmp = gen_reg_rtx (mode);
1160 rtx ops[] = {tmp, dup, builder.elt (i)};
1161 emit_vlmax_insn (icode, BINARY_OP, ops);
1162 /* slide1up needs source and dest to be different REGs. */
1163 dup = tmp;
1166 emit_move_insn (target, dup);
1167 return true;
1170 return false;
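/* A standalone scalar model (not part of this file; the name is made up) of
   the slide1up variant above: broadcast the last element, then each
   vslide1up shifts every element one position up and writes the new scalar
   into element 0, so only the leading NELTS - TRAILING_NDUPS elements need
   to be inserted.  */

static void
sketch_init_by_slide1up (long *vec, const long *elts, unsigned nelts,
                         unsigned trailing_ndups)
{
  for (unsigned i = 0; i < nelts; i++)
    vec[i] = elts[nelts - 1];          /* broadcast of the last element  */
  for (int i = (int) nelts - (int) trailing_ndups - 1; i >= 0; i--)
    {
      for (unsigned j = nelts - 1; j > 0; j--)
        vec[j] = vec[j - 1];           /* vslide1up body  */
      vec[0] = elts[i];                /* scalar lands in element 0  */
    }
}

/* E.g. elts = { 1, 2, 9, 9, 9 }: trailing_ndups = 3, so two slide1up steps
   (with 2, then 1) after the broadcast reproduce the vector.  */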
1173 static void
1174 expand_const_vector (rtx target, rtx src)
1176 machine_mode mode = GET_MODE (target);
1177 rtx result = register_operand (target, mode) ? target : gen_reg_rtx (mode);
1178 rtx elt;
1179 if (const_vec_duplicate_p (src, &elt))
1181 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
1183 gcc_assert (rtx_equal_p (elt, const0_rtx)
1184 || rtx_equal_p (elt, const1_rtx));
1185 rtx ops[] = {result, src};
1186 emit_vlmax_insn (code_for_pred_mov (mode), UNARY_MASK_OP, ops);
1188 /* Element in range -16 ~ 15 integer or 0.0 floating-point,
1189 we use vmv.v.i instruction. */
1190 else if (valid_vec_immediate_p (src))
1192 rtx ops[] = {result, src};
1193 emit_vlmax_insn (code_for_pred_mov (mode), UNARY_OP, ops);
1195 else
1197 /* Emit the vec_duplicate<mode> split pattern before RA so that
1198 we have a better optimization opportunity in LICM,
1199 which will hoist vmv.v.x outside the loop, and in fwprop && combine,
1200 which will transform a 'vv' into a 'vx' instruction.
1202 The reason we don't emit the vec_duplicate<mode> split pattern during
1203 RA is that the split stage after RA is too late to generate an
1204 RVV instruction which needs an additional register (we can't
1205 allocate a new register after RA) for the VL operand of the vsetvl
1206 instruction (vsetvl a5, zero). */
1207 if (lra_in_progress)
1209 rtx ops[] = {result, elt};
1210 emit_vlmax_insn (code_for_pred_broadcast (mode), UNARY_OP, ops);
1212 else
1214 struct expand_operand ops[2];
1215 enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
1216 gcc_assert (icode != CODE_FOR_nothing);
1217 create_output_operand (&ops[0], result, mode);
1218 create_input_operand (&ops[1], elt, GET_MODE_INNER (mode));
1219 expand_insn (icode, 2, ops);
1220 result = ops[0].value;
1224 if (result != target)
1225 emit_move_insn (target, result);
1226 return;
1229 /* Support scalable const series vector. */
1230 rtx base, step;
1231 if (const_vec_series_p (src, &base, &step))
1233 expand_vec_series (result, base, step);
1235 if (result != target)
1236 emit_move_insn (target, result);
1237 return;
1240 /* Handle variable-length vector. */
1241 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
1242 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
1243 rvv_builder builder (mode, npatterns, nelts_per_pattern);
1244 for (unsigned int i = 0; i < nelts_per_pattern; i++)
1246 for (unsigned int j = 0; j < npatterns; j++)
1247 builder.quick_push (CONST_VECTOR_ELT (src, i * npatterns + j));
1249 builder.finalize ();
1251 if (CONST_VECTOR_DUPLICATE_P (src))
1253 /* Handle the case of a repeating sequence with NELTS_PER_PATTERN = 1.
1254 E.g. NPATTERNS = 4, v = { 0, 2, 6, 7, ... }
1255 NPATTERNS = 8, v = { 0, 2, 6, 7, 19, 20, 8, 7 ... }
1256 The elements within NPATTERNS are not necessarily regular. */
1257 if (builder.can_duplicate_repeating_sequence_p ())
1259 /* We handle the case where we can find a vector container to hold
1260 an element of bitsize = NPATTERNS * ele_bitsize.
1262 NPATTERNS = 8, element width = 8
1263 v = { 0, 1, 2, 3, 4, 5, 6, 7, ... }
1264 In this case, we can combine NPATTERNS elements into a larger
1265 element. Use element width = 64 and broadcast a vector with
1266 all elements equal to 0x0706050403020100. */
1267 rtx ele = builder.get_merged_repeating_sequence ();
1268 rtx dup = expand_vector_broadcast (builder.new_mode (), ele);
1269 emit_move_insn (result, gen_lowpart (mode, dup));
1271 else
1273 /* We handle the case where we can't find a vector container to hold
1274 an element of bitsize = NPATTERNS * ele_bitsize.
1276 NPATTERNS = 8, element width = 16
1277 v = { 0, 1, 2, 3, 4, 5, 6, 7, ... }
1278 Since NPATTERNS * element width = 128, we can't find a container
1279 to hold it.
1281 In this case, we use NPATTERNS merge operations to generate such
1282 a vector. */
1283 unsigned int nbits = npatterns - 1;
1285 /* Generate vid = { 0, 1, 2, 3, 4, 5, 6, 7, ... }. */
1286 rtx vid = gen_reg_rtx (builder.int_mode ());
1287 rtx op[] = {vid};
1288 emit_vlmax_insn (code_for_pred_series (builder.int_mode ()),
1289 NULLARY_OP, op);
1291 /* Generate vid_repeat = { 0, 1, ... nbits, ... } */
1292 rtx vid_repeat = gen_reg_rtx (builder.int_mode ());
1293 rtx and_ops[] = {vid_repeat, vid,
1294 gen_int_mode (nbits, builder.inner_int_mode ())};
1295 emit_vlmax_insn (code_for_pred_scalar (AND, builder.int_mode ()),
1296 BINARY_OP, and_ops);
1298 rtx tmp1 = gen_reg_rtx (builder.mode ());
1299 rtx dup_ops[] = {tmp1, builder.elt (0)};
1300 emit_vlmax_insn (code_for_pred_broadcast (builder.mode ()), UNARY_OP,
1301 dup_ops);
1302 for (unsigned int i = 1; i < builder.npatterns (); i++)
1304 /* Generate mask according to i. */
1305 rtx mask = gen_reg_rtx (builder.mask_mode ());
1306 rtx const_vec = gen_const_vector_dup (builder.int_mode (), i);
1307 expand_vec_cmp (mask, EQ, vid_repeat, const_vec);
1309 /* Merge scalar to each i. */
1310 rtx tmp2 = gen_reg_rtx (builder.mode ());
1311 rtx merge_ops[] = {tmp2, tmp1, builder.elt (i), mask};
1312 insn_code icode = code_for_pred_merge_scalar (builder.mode ());
1313 emit_vlmax_insn (icode, MERGE_OP, merge_ops);
1314 tmp1 = tmp2;
1316 emit_move_insn (result, tmp1);
1319 else if (CONST_VECTOR_STEPPED_P (src))
1321 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
1322 if (builder.single_step_npatterns_p ())
1324 /* Describe the case by choosing NPATTERNS = 4 as an example. */
1325 insn_code icode;
1327 /* Step 1: Generate vid = { 0, 1, 2, 3, 4, 5, 6, 7, ... }. */
1328 rtx vid = gen_reg_rtx (builder.mode ());
1329 rtx vid_ops[] = {vid};
1330 icode = code_for_pred_series (builder.mode ());
1331 emit_vlmax_insn (icode, NULLARY_OP, vid_ops);
1333 if (builder.npatterns_all_equal_p ())
1335 /* Generate the variable-length vector following this rule:
1336 { a, a, a + step, a + step, a + step * 2, a + step * 2, ...}
1337 E.g. { 0, 0, 8, 8, 16, 16, ... } */
1339 /* We want to create a pattern where value[idx] = floor (idx /
1340 NPATTERNS). As NPATTERNS is always a power of two we can
1341 rewrite this as = idx & -NPATTERNS. */
1342 /* Step 2: VID AND -NPATTERNS:
1343 { 0&-4, 1&-4, 2&-4, 3 &-4, 4 &-4, 5 &-4, 6 &-4, 7 &-4, ... }
1345 rtx imm
1346 = gen_int_mode (-builder.npatterns (), builder.inner_mode ());
1347 rtx tmp1 = gen_reg_rtx (builder.mode ());
1348 rtx and_ops[] = {tmp1, vid, imm};
1349 icode = code_for_pred_scalar (AND, builder.mode ());
1350 emit_vlmax_insn (icode, BINARY_OP, and_ops);
1352 /* Step 3: Convert to step size 1. */
1353 rtx tmp2 = gen_reg_rtx (builder.mode ());
1354 /* log2 (npatterns) to get the shift amount to convert
1355 Eg. { 0, 0, 0, 0, 4, 4, ... }
1356 into { 0, 0, 0, 0, 1, 1, ... }. */
1357 HOST_WIDE_INT shift_amt = exact_log2 (builder.npatterns ()) ;
1358 rtx shift = gen_int_mode (shift_amt, builder.inner_mode ());
1359 rtx shift_ops[] = {tmp2, tmp1, shift};
1360 icode = code_for_pred_scalar (ASHIFTRT, builder.mode ());
1361 emit_vlmax_insn (icode, BINARY_OP, shift_ops);
1363 /* Step 4: Multiply to step size n. */
1364 HOST_WIDE_INT step_size =
1365 INTVAL (builder.elt (builder.npatterns ()))
1366 - INTVAL (builder.elt (0));
1367 rtx tmp3 = gen_reg_rtx (builder.mode ());
1368 if (pow2p_hwi (step_size))
1370 /* Power of 2 can be handled with a left shift. */
1371 HOST_WIDE_INT shift = exact_log2 (step_size);
1372 rtx shift_amount = gen_int_mode (shift, Pmode);
1373 insn_code icode = code_for_pred_scalar (ASHIFT, mode);
1374 rtx ops[] = {tmp3, tmp2, shift_amount};
1375 emit_vlmax_insn (icode, BINARY_OP, ops);
1377 else
1379 rtx mult_amt = gen_int_mode (step_size, builder.inner_mode ());
1380 insn_code icode = code_for_pred_scalar (MULT, builder.mode ());
1381 rtx ops[] = {tmp3, tmp2, mult_amt};
1382 emit_vlmax_insn (icode, BINARY_OP, ops);
1385 /* Step 5: Add starting value to all elements. */
1386 HOST_WIDE_INT init_val = INTVAL (builder.elt (0));
1387 if (init_val == 0)
1388 emit_move_insn (result, tmp3);
1389 else
1391 rtx dup = gen_const_vector_dup (builder.mode (), init_val);
1392 rtx add_ops[] = {result, tmp3, dup};
1393 icode = code_for_pred (PLUS, builder.mode ());
1394 emit_vlmax_insn (icode, BINARY_OP, add_ops);
1397 else
1399 /* Generate the variable-length vector following this rule:
1400 { a, b, a + step, b + step, a + step*2, b + step*2, ... } */
1402 if (builder.npatterns_vid_diff_repeated_p ())
1404 /* Case 1: For example as below:
1405 {3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8... }
1406 We have 3 - 0 = 3 equals 7 - 4 = 3, the sequence is
1407 repeated as below after minus vid.
1408 {3, 1, -1, -3, 3, 1, -1, -3...}
1409 Then we can simplify the diff code gen to at most
1410 npatterns(). */
1411 rvv_builder v (builder.mode (), builder.npatterns (), 1);
1413 /* Step 1: Generate diff = TARGET - VID. */
1414 for (unsigned int i = 0; i < v.npatterns (); ++i)
1416 poly_int64 diff = rtx_to_poly_int64 (builder.elt (i)) - i;
1417 v.quick_push (gen_int_mode (diff, v.inner_mode ()));
1420 /* Step 2: Generate result = VID + diff. */
1421 rtx vec = v.build ();
1422 rtx add_ops[] = {result, vid, vec};
1423 emit_vlmax_insn (code_for_pred (PLUS, builder.mode ()),
1424 BINARY_OP, add_ops);
1426 else
1428 /* Case 2: For example as below:
1429 { -4, 4, -4 + 1, 4 + 1, -4 + 2, 4 + 2, -4 + 3, 4 + 3, ... }
1431 rvv_builder v (builder.mode (), builder.npatterns (), 1);
1433 /* Step 1: Generate { a, b, a, b, ... } */
1434 for (unsigned int i = 0; i < v.npatterns (); ++i)
1435 v.quick_push (builder.elt (i));
1436 rtx new_base = v.build ();
1438 /* Step 2: Generate tmp1 = VID >> LOG2 (NPATTERNS).  */
1439 rtx shift_count
1440 = gen_int_mode (exact_log2 (builder.npatterns ()),
1441 builder.inner_mode ());
1442 rtx tmp1 = gen_reg_rtx (builder.mode ());
1443 rtx shift_ops[] = {tmp1, vid, shift_count};
1444 emit_vlmax_insn (code_for_pred_scalar
1445 (LSHIFTRT, builder.mode ()), BINARY_OP,
1446 shift_ops);
1448 /* Step 3: Generate tmp2 = tmp1 * step.  */
1449 rtx tmp2 = gen_reg_rtx (builder.mode ());
1450 rtx step
1451 = simplify_binary_operation (MINUS, builder.inner_mode (),
1452 builder.elt (v.npatterns()),
1453 builder.elt (0));
1454 expand_vec_series (tmp2, const0_rtx, step, tmp1);
1456 /* Step 4: Generate result = tmp2 + new_base.  */
1457 rtx add_ops[] = {result, tmp2, new_base};
1458 emit_vlmax_insn (code_for_pred (PLUS, builder.mode ()),
1459 BINARY_OP, add_ops);
1463 else if (builder.interleaved_stepped_npatterns_p ())
1465 rtx base1 = builder.elt (0);
1466 rtx base2 = builder.elt (1);
1467 poly_int64 step1
1468 = rtx_to_poly_int64 (builder.elt (builder.npatterns ()))
1469 - rtx_to_poly_int64 (base1);
1470 poly_int64 step2
1471 = rtx_to_poly_int64 (builder.elt (builder.npatterns () + 1))
1472 - rtx_to_poly_int64 (base2);
1474 /* For { 1, 0, 2, 0, ... , n - 1, 0 }, we can use larger EEW
1475 integer vector mode to generate such vector efficiently.
1477 E.g. EEW = 16, { 2, 0, 4, 0, ... }
1479 can be interpreted into:
1481 EEW = 32, { 2, 4, ... }.
1483 This only works as long as the larger type does not overflow
1484 as we can't guarantee a zero value for each second element
1485 of the sequence with smaller EEW.
1486 ??? For now we assume that no overflow happens with positive
1487 steps and forbid negative steps altogether. */
1488 unsigned int new_smode_bitsize = builder.inner_bits_size () * 2;
1489 scalar_int_mode new_smode;
1490 machine_mode new_mode;
1491 poly_uint64 new_nunits
1492 = exact_div (GET_MODE_NUNITS (builder.mode ()), 2);
1493 if (known_ge (step1, 0) && known_ge (step2, 0)
1494 && int_mode_for_size (new_smode_bitsize, 0).exists (&new_smode)
1495 && get_vector_mode (new_smode, new_nunits).exists (&new_mode))
1497 rtx tmp1 = gen_reg_rtx (new_mode);
1498 base1 = gen_int_mode (rtx_to_poly_int64 (base1), new_smode);
1499 expand_vec_series (tmp1, base1, gen_int_mode (step1, new_smode));
1501 if (rtx_equal_p (base2, const0_rtx) && known_eq (step2, 0))
1502 /* { 1, 0, 2, 0, ... }. */
1503 emit_move_insn (result, gen_lowpart (mode, tmp1));
1504 else if (known_eq (step2, 0))
1506 /* { 1, 1, 2, 1, ... }. */
1507 rtx scalar = expand_simple_binop (
1508 Xmode, ASHIFT,
1509 gen_int_mode (rtx_to_poly_int64 (base2), Xmode),
1510 gen_int_mode (builder.inner_bits_size (), Xmode),
1511 NULL_RTX, false, OPTAB_DIRECT);
1512 scalar = simplify_gen_subreg (new_smode, scalar, Xmode, 0);
1513 rtx tmp2 = gen_reg_rtx (new_mode);
1514 rtx ior_ops[] = {tmp2, tmp1, scalar};
1515 emit_vlmax_insn (code_for_pred_scalar (IOR, new_mode),
1516 BINARY_OP, ior_ops);
1517 emit_move_insn (result, gen_lowpart (mode, tmp2));
1519 else
1521 /* { 1, 3, 2, 6, ... }. */
1522 rtx tmp2 = gen_reg_rtx (new_mode);
1523 base2 = gen_int_mode (rtx_to_poly_int64 (base2), new_smode);
1524 expand_vec_series (tmp2, base2,
1525 gen_int_mode (step2, new_smode));
1526 rtx shifted_tmp2 = expand_simple_binop (
1527 new_mode, ASHIFT, tmp2,
1528 gen_int_mode (builder.inner_bits_size (), Pmode), NULL_RTX,
1529 false, OPTAB_DIRECT);
1530 rtx tmp3 = gen_reg_rtx (new_mode);
1531 rtx ior_ops[] = {tmp3, tmp1, shifted_tmp2};
1532 emit_vlmax_insn (code_for_pred (IOR, new_mode), BINARY_OP,
1533 ior_ops);
1534 emit_move_insn (result, gen_lowpart (mode, tmp3));
1537 else
1539 rtx vid = gen_reg_rtx (mode);
1540 expand_vec_series (vid, const0_rtx, const1_rtx);
1541 /* Transform into { 0, 0, 1, 1, 2, 2, ... }. */
1542 rtx shifted_vid
1543 = expand_simple_binop (mode, LSHIFTRT, vid, const1_rtx,
1544 NULL_RTX, false, OPTAB_DIRECT);
1545 rtx tmp1 = gen_reg_rtx (mode);
1546 rtx tmp2 = gen_reg_rtx (mode);
1547 expand_vec_series (tmp1, base1,
1548 gen_int_mode (step1, builder.inner_mode ()),
1549 shifted_vid);
1550 expand_vec_series (tmp2, base2,
1551 gen_int_mode (step2, builder.inner_mode ()),
1552 shifted_vid);
1554 /* Transform into { 0, 1, 0, 1, 0, 1, ... }. */
1555 rtx and_vid = gen_reg_rtx (mode);
1556 rtx and_ops[] = {and_vid, vid, const1_rtx};
1557 emit_vlmax_insn (code_for_pred_scalar (AND, mode), BINARY_OP,
1558 and_ops);
1559 rtx mask = gen_reg_rtx (builder.mask_mode ());
1560 expand_vec_cmp (mask, EQ, and_vid, CONST1_RTX (mode));
1562 rtx ops[] = {result, tmp1, tmp2, mask};
1563 emit_vlmax_insn (code_for_pred_merge (mode), MERGE_OP, ops);
1566 else
1567 /* TODO: We will enable more variable-length vectors in the future. */
1568 gcc_unreachable ();
1570 else
1571 gcc_unreachable ();
1573 if (result != target)
1574 emit_move_insn (target, result);
1577 /* Get the frm mode from the given CONST_INT rtx; the default mode is
1578 FRM_DYN. */
1579 enum floating_point_rounding_mode
1580 get_frm_mode (rtx operand)
1582 gcc_assert (CONST_INT_P (operand));
1584 switch (INTVAL (operand))
1586 case FRM_RNE:
1587 return FRM_RNE;
1588 case FRM_RTZ:
1589 return FRM_RTZ;
1590 case FRM_RDN:
1591 return FRM_RDN;
1592 case FRM_RUP:
1593 return FRM_RUP;
1594 case FRM_RMM:
1595 return FRM_RMM;
1596 case FRM_DYN:
1597 return FRM_DYN;
1598 default:
1599 gcc_unreachable ();
1602 gcc_unreachable ();
1605 /* Expand a pre-RA RVV data move from SRC to DEST.
1606 It expands moves for RVV fractional vector modes.
1607 Return true if the move has already been emitted. */
1608 bool
1609 legitimize_move (rtx dest, rtx *srcp)
1611 rtx src = *srcp;
1612 machine_mode mode = GET_MODE (dest);
1613 if (CONST_VECTOR_P (src))
1615 expand_const_vector (dest, src);
1616 return true;
1619 if (riscv_v_ext_vls_mode_p (mode))
1621 if (GET_MODE_NUNITS (mode).to_constant () <= 31)
1623 /* For NUNITS <= 31 VLS modes, we don't need to extract
1624 scalar registers, so we apply the naive (set (op0) (op1)) pattern. */
1625 if (can_create_pseudo_p ())
1627 /* Need to force register if mem <- !reg. */
1628 if (MEM_P (dest) && !REG_P (src))
1629 *srcp = force_reg (mode, src);
1631 return false;
1634 else if (GET_MODE_NUNITS (mode).to_constant () > 31 && lra_in_progress)
1636 emit_insn (gen_mov_lra (mode, Pmode, dest, src));
1637 return true;
1640 else
1642 /* In order to decrease the memory traffic, we don't use whole register
1643 * load/store for LMUL less than 1 and mask modes, so those cases
1644 * require one extra general purpose register, but that's not allowed during
1645 * the LRA process, so we have a special move pattern used for LRA, which
1646 * defers the expansion until after LRA. */
1647 if ((known_lt (GET_MODE_SIZE (mode), BYTES_PER_RISCV_VECTOR)
1648 || GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
1649 && lra_in_progress)
1651 emit_insn (gen_mov_lra (mode, Pmode, dest, src));
1652 return true;
1655 if (known_ge (GET_MODE_SIZE (mode), BYTES_PER_RISCV_VECTOR)
1656 && GET_MODE_CLASS (mode) != MODE_VECTOR_BOOL)
1658 /* Need to force register if mem <- !reg. */
1659 if (MEM_P (dest) && !REG_P (src))
1660 *srcp = force_reg (mode, src);
1662 return false;
1666 if (register_operand (src, mode) && register_operand (dest, mode))
1668 emit_insn (gen_rtx_SET (dest, src));
1669 return true;
1672 unsigned insn_flags
1673 = GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL ? UNARY_MASK_OP : UNARY_OP;
1674 if (!register_operand (src, mode) && !register_operand (dest, mode))
1676 rtx tmp = gen_reg_rtx (mode);
1677 if (MEM_P (src))
1679 rtx ops[] = {tmp, src};
1680 emit_vlmax_insn (code_for_pred_mov (mode), insn_flags, ops);
1682 else
1683 emit_move_insn (tmp, src);
1684 src = tmp;
1687 if (satisfies_constraint_vu (src))
1688 return false;
1690 rtx ops[] = {dest, src};
1691 emit_vlmax_insn (code_for_pred_mov (mode), insn_flags, ops);
1692 return true;
1695 /* VTYPE information for machine_mode. */
1696 struct mode_vtype_group
1698 enum vlmul_type vlmul[NUM_MACHINE_MODES];
1699 uint8_t ratio[NUM_MACHINE_MODES];
1700 machine_mode subpart_mode[NUM_MACHINE_MODES];
1701 uint8_t nf[NUM_MACHINE_MODES];
1702 mode_vtype_group ()
1704 #define ENTRY(MODE, REQUIREMENT, VLMUL, RATIO) \
1705 vlmul[MODE##mode] = VLMUL; \
1706 ratio[MODE##mode] = RATIO;
1707 #define TUPLE_ENTRY(MODE, REQUIREMENT, SUBPART_MODE, NF, VLMUL, RATIO) \
1708 subpart_mode[MODE##mode] = SUBPART_MODE##mode; \
1709 nf[MODE##mode] = NF; \
1710 vlmul[MODE##mode] = VLMUL; \
1711 ratio[MODE##mode] = RATIO;
1712 #include "riscv-vector-switch.def"
1713 #undef ENTRY
1714 #undef TUPLE_ENTRY
1718 static mode_vtype_group mode_vtype_infos;
1720 /* Get vlmul field value by comparing LMUL with BYTES_PER_RISCV_VECTOR. */
1721 enum vlmul_type
1722 get_vlmul (machine_mode mode)
1724 /* For VLS modes, the vlmul should be dynamically
1725 calculated since we need to adjust VLMUL according
1726 to TARGET_MIN_VLEN. */
1727 if (riscv_v_ext_vls_mode_p (mode))
1729 int size = GET_MODE_BITSIZE (mode).to_constant ();
1730 int inner_size = GET_MODE_BITSIZE (GET_MODE_INNER (mode));
1731 if (size < TARGET_MIN_VLEN)
1733 int factor = TARGET_MIN_VLEN / size;
1734 if (inner_size == 8)
1735 factor = MIN (factor, 8);
1736 else if (inner_size == 16)
1737 factor = MIN (factor, 4);
1738 else if (inner_size == 32)
1739 factor = MIN (factor, 2);
1740 else if (inner_size == 64)
1741 factor = MIN (factor, 1);
1742 else
1743 gcc_unreachable ();
1745 switch (factor)
1747 case 1:
1748 return LMUL_1;
1749 case 2:
1750 return LMUL_F2;
1751 case 4:
1752 return LMUL_F4;
1753 case 8:
1754 return LMUL_F8;
1756 default:
1757 gcc_unreachable ();
1760 else
1762 int factor = size / TARGET_MIN_VLEN;
1763 switch (factor)
1765 case 1:
1766 return LMUL_1;
1767 case 2:
1768 return LMUL_2;
1769 case 4:
1770 return LMUL_4;
1771 case 8:
1772 return LMUL_8;
1774 default:
1775 gcc_unreachable ();
1779 return mode_vtype_infos.vlmul[mode];
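/* A standalone sketch (not part of this file; the name is made up, and it
   assumes all sizes are powers of two and ELEN == 64, like the code above)
   of the VLS-mode LMUL selection: return LMUL as a signed power-of-two
   exponent, so LMUL_F2 is -1, LMUL_1 is 0, LMUL_2 is 1, and so on.  */

static int
sketch_vls_lmul_log2 (unsigned mode_bits, unsigned inner_bits,
                      unsigned min_vlen)
{
  if (mode_bits >= min_vlen)
    /* Whole multiples of a vector register: LMUL = size / VLEN.  */
    return __builtin_ctz (mode_bits / min_vlen);
  /* Fractional LMUL = 1 / factor, clamped so that SEW / LMUL <= 64,
     i.e. factor <= 64 / inner_bits.  */
  unsigned factor = min_vlen / mode_bits;
  unsigned max_factor = 64 / inner_bits;
  if (factor > max_factor)
    factor = max_factor;
  return -(int) __builtin_ctz (factor);
}

/* E.g. min_vlen = 128: a 32-bit mode with 8-bit elements gives factor 4,
   i.e. LMUL_F4; a 256-bit mode gives LMUL_2.  */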
1782 /* Return the VLMAX rtx of vector mode MODE. */
1784 get_vlmax_rtx (machine_mode mode)
1786 gcc_assert (riscv_v_ext_vector_mode_p (mode));
1787 return gen_int_mode (GET_MODE_NUNITS (mode), Pmode);
1790 /* Return the NF value of the corresponding mode. */
1791 unsigned int
1792 get_nf (machine_mode mode)
1794 /* We don't allow non-tuple modes to go through this function. */
1795 gcc_assert (riscv_v_ext_tuple_mode_p (mode));
1796 return mode_vtype_infos.nf[mode];
1799 /* Return the subpart mode of the tuple mode. For RVVM2x2SImode,
1800 the subpart mode is RVVM2SImode. This will help to build
1801 array/struct type in builtins. */
1802 machine_mode
1803 get_subpart_mode (machine_mode mode)
1805 /* We don't allow non-tuple modes to go through this function. */
1806 gcc_assert (riscv_v_ext_tuple_mode_p (mode));
1807 return mode_vtype_infos.subpart_mode[mode];
1810 /* Get ratio according to machine mode. */
1811 unsigned int
1812 get_ratio (machine_mode mode)
1814 if (riscv_v_ext_vls_mode_p (mode))
1816 unsigned int sew = get_sew (mode);
1817 vlmul_type vlmul = get_vlmul (mode);
1818 switch (vlmul)
1820 case LMUL_1:
1821 return sew;
1822 case LMUL_2:
1823 return sew / 2;
1824 case LMUL_4:
1825 return sew / 4;
1826 case LMUL_8:
1827 return sew / 8;
1828 case LMUL_F8:
1829 return sew * 8;
1830 case LMUL_F4:
1831 return sew * 4;
1832 case LMUL_F2:
1833 return sew * 2;
1835 default:
1836 gcc_unreachable ();
1839 return mode_vtype_infos.ratio[mode];
1842 /* Get ta according to operand[tail_op_idx]. */
1844 get_ta (rtx ta)
1846 if (INTVAL (ta) == TAIL_ANY)
1847 return INVALID_ATTRIBUTE;
1848 return INTVAL (ta);
1851 /* Get ma according to operand[mask_op_idx]. */
1853 get_ma (rtx ma)
1855 if (INTVAL (ma) == MASK_ANY)
1856 return INVALID_ATTRIBUTE;
1857 return INTVAL (ma);
1860 /* Get prefer tail policy. */
1861 enum tail_policy
1862 get_prefer_tail_policy ()
1864 /* TODO: By default, we choose TAIL_ANY, which allows the
1865 compiler to pick either agnostic or undisturbed. Maybe we
1866 will add a compile option like -mprefer=agnostic to set
1867 this value in the future. */
1868 return TAIL_ANY;
1871 /* Get prefer mask policy. */
1872 enum mask_policy
1873 get_prefer_mask_policy ()
1875 /* TODO: By default, we choose MASK_ANY, which allows the
1876 compiler to pick either agnostic or undisturbed. Maybe we
1877 will add a compile option like -mprefer=agnostic to set
1878 this value in the future. */
1879 return MASK_ANY;
1882 /* Get avl_type rtx. */
1884 get_avl_type_rtx (enum avl_type type)
1886 return gen_int_mode (type, Pmode);
1889 /* Return the appropriate mask mode for MODE. */
1891 machine_mode
1892 get_mask_mode (machine_mode mode)
1894 poly_int64 nunits = GET_MODE_NUNITS (mode);
1895 if (riscv_v_ext_tuple_mode_p (mode))
1897 unsigned int nf = get_nf (mode);
1898 nunits = exact_div (nunits, nf);
1900 return get_vector_mode (BImode, nunits).require ();
1903 /* Return the appropriate LMUL mode for MODE. */
1905 opt_machine_mode
1906 get_lmul_mode (scalar_mode mode, int lmul)
1908 poly_uint64 lmul_nunits;
1909 unsigned int bytes = GET_MODE_SIZE (mode);
1910 if (multiple_p (BYTES_PER_RISCV_VECTOR * lmul, bytes, &lmul_nunits))
1911 return get_vector_mode (mode, lmul_nunits);
1912 return E_VOIDmode;
1915 /* Return the appropriate M1 mode for MODE. */
1917 static opt_machine_mode
1918 get_m1_mode (machine_mode mode)
1920 scalar_mode smode = GET_MODE_INNER (mode);
1921 unsigned int bytes = GET_MODE_SIZE (smode);
1922 poly_uint64 m1_nunits = exact_div (BYTES_PER_RISCV_VECTOR, bytes);
1923 return get_vector_mode (smode, m1_nunits);
1926 /* Return the RVV vector mode that has NUNITS elements of mode INNER_MODE.
1927 This function is not only used by builtins, but also will be used by
1928 auto-vectorization in the future. */
1929 opt_machine_mode
1930 get_vector_mode (scalar_mode inner_mode, poly_uint64 nunits)
1932 enum mode_class mclass;
1933 if (inner_mode == E_BImode)
1934 mclass = MODE_VECTOR_BOOL;
1935 else if (FLOAT_MODE_P (inner_mode))
1936 mclass = MODE_VECTOR_FLOAT;
1937 else
1938 mclass = MODE_VECTOR_INT;
1939 machine_mode mode;
1940 FOR_EACH_MODE_IN_CLASS (mode, mclass)
1941 if (inner_mode == GET_MODE_INNER (mode)
1942 && known_eq (nunits, GET_MODE_NUNITS (mode))
1943 && (riscv_v_ext_vector_mode_p (mode)
1944 || riscv_v_ext_vls_mode_p (mode)))
1945 return mode;
1946 return opt_machine_mode ();
1949 /* Return the RVV tuple mode if we can find the legal tuple mode for the
1950 corresponding subpart mode and NF. */
1951 opt_machine_mode
1952 get_tuple_mode (machine_mode subpart_mode, unsigned int nf)
1954 poly_uint64 nunits = GET_MODE_NUNITS (subpart_mode) * nf;
1955 scalar_mode inner_mode = GET_MODE_INNER (subpart_mode);
1956 enum mode_class mclass = GET_MODE_CLASS (subpart_mode);
1957 machine_mode mode;
1958 FOR_EACH_MODE_IN_CLASS (mode, mclass)
1959 if (inner_mode == GET_MODE_INNER (mode)
1960 && known_eq (nunits, GET_MODE_NUNITS (mode))
1961 && riscv_v_ext_tuple_mode_p (mode)
1962 && get_subpart_mode (mode) == subpart_mode)
1963 return mode;
1964 return opt_machine_mode ();
1967 bool
1968 simm5_p (rtx x)
1970 if (!CONST_INT_P (x))
1971 return false;
1972 return IN_RANGE (INTVAL (x), -16, 15);
1975 bool
1976 neg_simm5_p (rtx x)
1978 if (!CONST_INT_P (x))
1979 return false;
1980 return IN_RANGE (INTVAL (x), -15, 16);
1983 bool
1984 has_vi_variant_p (rtx_code code, rtx x)
1986 switch (code)
1988 case PLUS:
1989 case AND:
1990 case IOR:
1991 case XOR:
1992 case SS_PLUS:
1993 case US_PLUS:
1994 case EQ:
1995 case NE:
1996 case LE:
1997 case LEU:
1998 case GT:
1999 case GTU:
2000 return simm5_p (x);
2002 case LT:
2003 case LTU:
2004 case GE:
2005 case GEU:
2006 case MINUS:
2007 case SS_MINUS:
2008 return neg_simm5_p (x);
2010 default:
2011 return false;
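/* Helper for .vx patterns with a 64-bit SEW scalar operand. Legitimize
*SCALAR_OP so the caller can still emit the scalar form and return false,
or, when the scalar cannot be held in a GPR (!TARGET_64BIT and the value
is neither zero nor a sign-extendable immediate), broadcast it into a
temporary vector, emit the whole operation via EMIT_VECTOR_FUNC and
return true. */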
2015 bool
2016 sew64_scalar_helper (rtx *operands, rtx *scalar_op, rtx vl,
2017 machine_mode vector_mode, bool has_vi_variant_p,
2018 void (*emit_vector_func) (rtx *, rtx), enum avl_type type)
2020 machine_mode scalar_mode = GET_MODE_INNER (vector_mode);
2021 if (has_vi_variant_p)
2023 *scalar_op = force_reg (scalar_mode, *scalar_op);
2024 return false;
2027 if (TARGET_64BIT)
2029 if (!rtx_equal_p (*scalar_op, const0_rtx))
2030 *scalar_op = force_reg (scalar_mode, *scalar_op);
2031 return false;
2034 if (immediate_operand (*scalar_op, Pmode))
2036 if (!rtx_equal_p (*scalar_op, const0_rtx))
2037 *scalar_op = force_reg (Pmode, *scalar_op);
2039 *scalar_op = gen_rtx_SIGN_EXTEND (scalar_mode, *scalar_op);
2040 return false;
2043 if (CONST_INT_P (*scalar_op))
2045 if (maybe_gt (GET_MODE_SIZE (scalar_mode), GET_MODE_SIZE (Pmode)))
2046 *scalar_op = force_const_mem (scalar_mode, *scalar_op);
2047 else
2048 *scalar_op = force_reg (scalar_mode, *scalar_op);
2051 rtx tmp = gen_reg_rtx (vector_mode);
2052 rtx ops[] = {tmp, *scalar_op};
2053 if (type == VLMAX)
2054 emit_vlmax_insn (code_for_pred_broadcast (vector_mode), UNARY_OP, ops);
2055 else
2056 emit_nonvlmax_insn (code_for_pred_broadcast (vector_mode), UNARY_OP, ops,
2057 vl);
2058 emit_vector_func (operands, tmp);
2060 return true;
2063 /* Get a mask where only element 0 is set, i.e. { 1, 0, 0, ..., 0 }. */
2065 gen_scalar_move_mask (machine_mode mode)
2067 rtx_vector_builder builder (mode, 1, 2);
2068 builder.quick_push (const1_rtx);
2069 builder.quick_push (const0_rtx);
2070 return builder.build ();
2073 static unsigned
2074 compute_vlmax (unsigned vector_bits, unsigned elt_size, unsigned min_size)
2076 // Original equation:
2077 // VLMAX = (VectorBits / EltSize) * LMUL
2078 // where LMUL = MinSize / TARGET_MIN_VLEN
2079 // The following equations have been reordered to prevent loss of precision
2080 // when calculating fractional LMUL.
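// Worked example (illustrative values): vector_bits = 256, elt_size = 32 and
// min_size = TARGET_MIN_VLEN * 2 (LMUL = 2) give
// VLMAX = ((256 / 32) * (TARGET_MIN_VLEN * 2)) / TARGET_MIN_VLEN = 16.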
2081 return ((vector_bits / elt_size) * min_size) / TARGET_MIN_VLEN;
2084 static unsigned
2085 get_unknown_min_value (machine_mode mode)
2087 enum vlmul_type vlmul = get_vlmul (mode);
2088 switch (vlmul)
2090 case LMUL_1:
2091 return TARGET_MIN_VLEN;
2092 case LMUL_2:
2093 return TARGET_MIN_VLEN * 2;
2094 case LMUL_4:
2095 return TARGET_MIN_VLEN * 4;
2096 case LMUL_8:
2097 return TARGET_MIN_VLEN * 8;
2098 default:
2099 gcc_unreachable ();
2103 static rtx
2104 force_vector_length_operand (rtx vl)
2106 if (CONST_INT_P (vl) && !satisfies_constraint_K (vl))
2107 return force_reg (Pmode, vl);
2108 return vl;
2112 gen_no_side_effects_vsetvl_rtx (machine_mode vmode, rtx vl, rtx avl)
2114 unsigned int sew = get_sew (vmode);
2115 rtx tail_policy = gen_int_mode (get_prefer_tail_policy (), Pmode);
2116 rtx mask_policy = gen_int_mode (get_prefer_mask_policy (), Pmode);
2117 return gen_vsetvl_no_side_effects (Pmode, vl, avl, gen_int_mode (sew, Pmode),
2118 gen_int_mode (get_vlmul (vmode), Pmode),
2119 tail_policy, mask_policy);
2122 /* Get the VL * 2 rtx. */
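/* This is used when an SEW64 operation on !TARGET_64BIT is emulated by two
SEW32 operations on DEMOTE_MODE: the demoted mode has twice as many
elements, so it needs twice the requested AVL. */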
2123 static rtx
2124 get_vl_x2_rtx (rtx avl, machine_mode mode, machine_mode demote_mode)
2126 rtx i32vl = NULL_RTX;
2127 if (CONST_INT_P (avl))
2129 unsigned elt_size = GET_MODE_BITSIZE (GET_MODE_INNER (mode));
2130 unsigned min_size = get_unknown_min_value (mode);
2131 unsigned vlen_max = RVV_65536;
2132 unsigned vlmax_max = compute_vlmax (vlen_max, elt_size, min_size);
2133 unsigned vlen_min = TARGET_MIN_VLEN;
2134 unsigned vlmax_min = compute_vlmax (vlen_min, elt_size, min_size);
2136 unsigned HOST_WIDE_INT avl_int = INTVAL (avl);
2137 if (avl_int <= vlmax_min)
2138 i32vl = gen_int_mode (2 * avl_int, Pmode);
2139 else if (avl_int >= 2 * vlmax_max)
2141 // Just set i32vl to VLMAX in this situation
2142 i32vl = gen_reg_rtx (Pmode);
2143 emit_insn (
2144 gen_no_side_effects_vsetvl_rtx (demote_mode, i32vl, RVV_VLMAX));
2146 else
2148 // For AVL between (MinVLMAX, 2 * MaxVLMAX), the actual working vl
2149 // depends on the hardware implementation,
2150 // so let the following code handle it.
2153 if (!i32vl)
2155 // Use a vsetvli instruction to get the actually used length, which is
2156 // related to the hardware implementation.
2157 rtx i64vl = gen_reg_rtx (Pmode);
2158 emit_insn (
2159 gen_no_side_effects_vsetvl_rtx (mode, i64vl, force_reg (Pmode, avl)));
2160 // Scale by 2 for the 32-bit length.
2161 i32vl = gen_reg_rtx (Pmode);
2162 emit_insn (
2163 gen_rtx_SET (i32vl, gen_rtx_ASHIFT (Pmode, i64vl, const1_rtx)));
2166 return force_vector_length_operand (i32vl);
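/* Helper for vslide1up/vslide1down with a 64-bit SEW scalar on
!TARGET_64BIT. Return false if the normal pattern can be used after
legitimizing the operands. Otherwise split the 64-bit scalar into two
32-bit halves and emit two slide1 instructions in the demoted DEMOTE_MODE
with a doubled vector length, then return true. */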
2169 bool
2170 slide1_sew64_helper (int unspec, machine_mode mode, machine_mode demote_mode,
2171 machine_mode demote_mask_mode, rtx *ops)
2173 rtx scalar_op = ops[4];
2174 rtx avl = ops[5];
2175 machine_mode scalar_mode = GET_MODE_INNER (mode);
2176 if (rtx_equal_p (scalar_op, const0_rtx))
2178 ops[5] = force_vector_length_operand (ops[5]);
2179 return false;
2182 if (TARGET_64BIT)
2184 ops[4] = force_reg (scalar_mode, scalar_op);
2185 ops[5] = force_vector_length_operand (ops[5]);
2186 return false;
2189 if (immediate_operand (scalar_op, Pmode))
2191 ops[4] = gen_rtx_SIGN_EXTEND (scalar_mode, force_reg (Pmode, scalar_op));
2192 ops[5] = force_vector_length_operand (ops[5]);
2193 return false;
2196 if (CONST_INT_P (scalar_op))
2197 scalar_op = force_reg (scalar_mode, scalar_op);
2199 rtx vl_x2 = get_vl_x2_rtx (avl, mode, demote_mode);
2201 rtx demote_scalar_op1, demote_scalar_op2;
2202 if (unspec == UNSPEC_VSLIDE1UP)
2204 demote_scalar_op1 = gen_highpart (Pmode, scalar_op);
2205 demote_scalar_op2 = gen_lowpart (Pmode, scalar_op);
2207 else
2209 demote_scalar_op1 = gen_lowpart (Pmode, scalar_op);
2210 demote_scalar_op2 = gen_highpart (Pmode, scalar_op);
2213 rtx temp = gen_reg_rtx (demote_mode);
2214 rtx ta = gen_int_mode (get_prefer_tail_policy (), Pmode);
2215 rtx ma = gen_int_mode (get_prefer_mask_policy (), Pmode);
2216 rtx merge = RVV_VUNDEF (demote_mode);
2217 /* Handle vslide1<ud>_tu. */
2218 if (register_operand (ops[2], mode)
2219 && rtx_equal_p (ops[1], CONSTM1_RTX (GET_MODE (ops[1]))))
2221 merge = gen_lowpart (demote_mode, ops[2]);
2222 ta = ops[6];
2223 ma = ops[7];
2226 emit_insn (gen_pred_slide (unspec, demote_mode, temp,
2227 CONSTM1_RTX (demote_mask_mode), merge,
2228 gen_lowpart (demote_mode, ops[3]),
2229 demote_scalar_op1, vl_x2, ta, ma, ops[8]));
2230 emit_insn (gen_pred_slide (unspec, demote_mode,
2231 gen_lowpart (demote_mode, ops[0]),
2232 CONSTM1_RTX (demote_mask_mode), merge, temp,
2233 demote_scalar_op2, vl_x2, ta, ma, ops[8]));
2235 if (!rtx_equal_p (ops[1], CONSTM1_RTX (GET_MODE (ops[1])))
2236 && !rtx_equal_p (ops[2], RVV_VUNDEF (GET_MODE (ops[2]))))
2237 emit_insn (gen_pred_merge (mode, ops[0], ops[2], ops[2], ops[0], ops[1],
2238 force_vector_length_operand (ops[5]), ops[6],
2239 ops[8]));
2240 return true;
2244 gen_avl_for_scalar_move (rtx avl)
2246 /* AVL for a scalar move behaves differently for 0 and for values larger than 0. */
2247 if (CONST_INT_P (avl))
2249 /* So we could just set AVL to 1 for any constant other than 0. */
2250 if (rtx_equal_p (avl, const0_rtx))
2251 return const0_rtx;
2252 else
2253 return const1_rtx;
2255 else
2257 /* For a non-constant value, we set any non-zero value to 1 by
2258 `sgtu new_avl,input_avl,zero` + `vsetvli`. */
2259 rtx tmp = gen_reg_rtx (Pmode);
2260 emit_insn (
2261 gen_rtx_SET (tmp, gen_rtx_fmt_ee (GTU, Pmode, avl, const0_rtx)));
2262 return tmp;
2266 /* Expand data movement for tuple modes. */
2267 void
2268 expand_tuple_move (rtx *ops)
2270 unsigned int i;
2271 machine_mode tuple_mode = GET_MODE (ops[0]);
2272 machine_mode subpart_mode = get_subpart_mode (tuple_mode);
2273 poly_int64 subpart_size = GET_MODE_SIZE (subpart_mode);
2274 unsigned int nf = get_nf (tuple_mode);
2275 bool fractional_p = known_lt (subpart_size, BYTES_PER_RISCV_VECTOR);
2277 if (REG_P (ops[0]) && CONST_VECTOR_P (ops[1]))
2279 rtx val;
2280 gcc_assert (can_create_pseudo_p ()
2281 && const_vec_duplicate_p (ops[1], &val));
2282 for (i = 0; i < nf; ++i)
2284 poly_int64 offset = i * subpart_size;
2285 rtx subreg
2286 = simplify_gen_subreg (subpart_mode, ops[0], tuple_mode, offset);
2287 rtx dup = gen_const_vec_duplicate (subpart_mode, val);
2288 emit_move_insn (subreg, dup);
2291 else if (REG_P (ops[0]) && REG_P (ops[1]))
2293 for (i = 0; i < nf; ++i)
2295 int index = i;
2297 /* Take NF = 2 and LMUL = 1 for example:
2299 - move v8 to v9:
2300 vmv1r v10,v9
2301 vmv1r v9,v8
2303 - move v8 to v7:
2304 vmv1r v7,v8
2305 vmv1r v8,v9 */
2306 if (REGNO (ops[0]) > REGNO (ops[1]))
2307 index = nf - 1 - i;
2308 poly_int64 offset = index * subpart_size;
2309 rtx dst_subreg
2310 = simplify_gen_subreg (subpart_mode, ops[0], tuple_mode, offset);
2311 rtx src_subreg
2312 = simplify_gen_subreg (subpart_mode, ops[1], tuple_mode, offset);
2313 emit_insn (gen_rtx_SET (dst_subreg, src_subreg));
2316 else
2318 /* Expand tuple memory data movement. */
2319 gcc_assert (MEM_P (ops[0]) || MEM_P (ops[1]));
2320 rtx offset = gen_int_mode (subpart_size, Pmode);
2321 if (!subpart_size.is_constant ())
2323 emit_move_insn (ops[2], gen_int_mode (BYTES_PER_RISCV_VECTOR, Pmode));
2324 if (fractional_p)
2326 unsigned int factor
2327 = exact_div (BYTES_PER_RISCV_VECTOR, subpart_size)
2328 .to_constant ();
2329 rtx pat
2330 = gen_rtx_ASHIFTRT (Pmode, ops[2],
2331 gen_int_mode (exact_log2 (factor), Pmode));
2332 emit_insn (gen_rtx_SET (ops[2], pat));
2335 if (known_gt (subpart_size, BYTES_PER_RISCV_VECTOR))
2337 unsigned int factor
2338 = exact_div (subpart_size, BYTES_PER_RISCV_VECTOR)
2339 .to_constant ();
2340 rtx pat
2341 = gen_rtx_ASHIFT (Pmode, ops[2],
2342 gen_int_mode (exact_log2 (factor), Pmode));
2343 emit_insn (gen_rtx_SET (ops[2], pat));
2345 offset = ops[2];
2348 /* Non-fractional LMUL has whole register moves that don't require a
2349 vsetvl for VLMAX. */
2350 if (fractional_p)
2351 emit_vlmax_vsetvl (subpart_mode, ops[4]);
2352 if (MEM_P (ops[1]))
2354 /* Load operations. */
2355 emit_move_insn (ops[3], XEXP (ops[1], 0));
2356 for (i = 0; i < nf; i++)
2358 rtx subreg = simplify_gen_subreg (subpart_mode, ops[0],
2359 tuple_mode, i * subpart_size);
2360 if (i != 0)
2362 rtx new_addr = gen_rtx_PLUS (Pmode, ops[3], offset);
2363 emit_insn (gen_rtx_SET (ops[3], new_addr));
2365 rtx mem = gen_rtx_MEM (subpart_mode, ops[3]);
2367 if (fractional_p)
2369 rtx operands[] = {subreg, mem};
2370 emit_vlmax_insn_lra (code_for_pred_mov (subpart_mode),
2371 UNARY_OP, operands, ops[4]);
2373 else
2374 emit_move_insn (subreg, mem);
2377 else
2379 /* Store operations. */
2380 emit_move_insn (ops[3], XEXP (ops[0], 0));
2381 for (i = 0; i < nf; i++)
2383 rtx subreg = simplify_gen_subreg (subpart_mode, ops[1],
2384 tuple_mode, i * subpart_size);
2385 if (i != 0)
2387 rtx new_addr = gen_rtx_PLUS (Pmode, ops[3], offset);
2388 emit_insn (gen_rtx_SET (ops[3], new_addr));
2390 rtx mem = gen_rtx_MEM (subpart_mode, ops[3]);
2392 if (fractional_p)
2394 rtx operands[] = {mem, subreg};
2395 emit_vlmax_insn_lra (code_for_pred_mov (subpart_mode),
2396 UNARY_OP, operands, ops[4]);
2398 else
2399 emit_move_insn (mem, subreg);
2405 /* Return the vectorization machine mode for RVV according to LMUL. */
2406 machine_mode
2407 preferred_simd_mode (scalar_mode mode)
2409 if (autovec_use_vlmax_p ())
2411 /* We use LMUL = 1 as the base byte size, which is BYTES_PER_RISCV_VECTOR,
2412 and rvv_max_lmul as the multiplication factor to calculate NUNITS and
2413 get the auto-vectorization mode. */
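/* E.g. with the maximum LMUL set to 8, an SImode element gives
nunits = (BYTES_PER_RISCV_VECTOR * 8) / 4, which selects an LMUL = 8
SImode vector mode such as RVVM8SI when it exists. */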
2414 poly_uint64 nunits;
2415 poly_uint64 vector_size = BYTES_PER_RISCV_VECTOR * TARGET_MAX_LMUL;
2416 poly_uint64 scalar_size = GET_MODE_SIZE (mode);
2417 /* Disable vectorization when we can't find an RVV mode for it.
2418 E.g. -march=rv64gc_zve32x doesn't have a vector mode to vectorize
2419 a double (DFmode) type. */
2420 if (!multiple_p (vector_size, scalar_size, &nunits))
2421 return word_mode;
2422 machine_mode rvv_mode;
2423 if (get_vector_mode (mode, nunits).exists (&rvv_mode))
2424 return rvv_mode;
2426 return word_mode;
2429 /* Use merge approach to initialize the vector with repeating sequence.
2430 v = {a, b, a, b, a, b, a, b}.
2432 v = broadcast (a).
2433 mask = 0b01010101....
2434 v = merge (v, b, mask)
2436 static void
2437 expand_vector_init_merge_repeating_sequence (rtx target,
2438 const rvv_builder &builder)
2440 /* We can't use BIT mode (BI) directly to generate mask = 0b01010...
2441 since we don't have such an instruction in RVV.
2442 Instead, we should use an INT mode (QI/HI/SI/DI) with an integer move
2443 instruction to generate the mask data we want. */
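/* For instance, for v = { a, b, a, b, ... } with 16 elements of 16-bit SEW,
the second pattern selects the odd elements, so the scalar mask value is
0b1010101010101010; since 16 <= the 16-bit inner size it is written with a
single vmv.s.x below, otherwise it is chunked and broadcast with vmv.v.x. */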
2444 machine_mode mask_bit_mode = get_mask_mode (builder.mode ());
2445 machine_mode mask_int_mode
2446 = get_repeating_sequence_dup_machine_mode (builder, mask_bit_mode);
2447 uint64_t full_nelts = builder.full_nelts ().to_constant ();
2449 /* Step 1: Broadcast the first pattern. */
2450 rtx ops[] = {target, force_reg (builder.inner_mode (), builder.elt (0))};
2451 emit_vlmax_insn (code_for_pred_broadcast (builder.mode ()),
2452 UNARY_OP, ops);
2453 /* Step 2: Merge the remaining iterations of the pattern. */
2454 for (unsigned int i = 1; i < builder.npatterns (); i++)
2456 /* Step 2-1: Generate mask register v0 for each merge. */
2457 rtx merge_mask
2458 = builder.get_merge_scalar_mask (i, GET_MODE_INNER (mask_int_mode));
2459 rtx mask = gen_reg_rtx (mask_bit_mode);
2460 rtx dup = gen_reg_rtx (mask_int_mode);
2462 if (full_nelts <= builder.inner_bits_size ()) /* vmv.s.x. */
2464 rtx ops[] = {dup, merge_mask};
2465 emit_nonvlmax_insn (code_for_pred_broadcast (GET_MODE (dup)),
2466 SCALAR_MOVE_OP, ops, CONST1_RTX (Pmode));
2468 else /* vmv.v.x. */
2470 rtx ops[] = {dup,
2471 force_reg (GET_MODE_INNER (mask_int_mode), merge_mask)};
2472 rtx vl = gen_int_mode (CEIL (full_nelts, builder.inner_bits_size ()),
2473 Pmode);
2474 emit_nonvlmax_insn (code_for_pred_broadcast (mask_int_mode), UNARY_OP,
2475 ops, vl);
2478 emit_move_insn (mask, gen_lowpart (mask_bit_mode, dup));
2480 /* Step 2-2: Merge pattern according to the mask. */
2481 rtx ops[] = {target, target, builder.elt (i), mask};
2482 emit_vlmax_insn (code_for_pred_merge_scalar (GET_MODE (target)),
2483 MERGE_OP, ops);
2487 /* Use slideup approach to combine the vectors.
2488 v = {a, a, a, a, b, b, b, b}
2490 First:
2491 v1 = {a, a, a, a, a, a, a, a}
2492 v2 = {b, b, b, b, b, b, b, b}
2493 v = slideup (v1, v2, nelt / 2)
2495 static void
2496 expand_vector_init_slideup_combine_sequence (rtx target,
2497 const rvv_builder &builder)
2499 machine_mode mode = GET_MODE (target);
2500 int nelts = builder.full_nelts ().to_constant ();
2501 rtx first_elt = builder.elt (0);
2502 rtx last_elt = builder.elt (nelts - 1);
2503 rtx low = expand_vector_broadcast (mode, first_elt);
2504 rtx high = expand_vector_broadcast (mode, last_elt);
2505 insn_code icode = code_for_pred_slide (UNSPEC_VSLIDEUP, mode);
2506 rtx ops[] = {target, low, high, gen_int_mode (nelts / 2, Pmode)};
2507 emit_vlmax_insn (icode, SLIDEUP_OP_MERGE, ops);
2510 /* Use merge approach to merge a scalar into a vector.
2511 v = {a, a, a, a, a, a, b, b}
2513 v1 = {a, a, a, a, a, a, a, a}
2514 scalar = b
2515 mask = {0, 0, 0, 0, 0, 0, 1, 1}
2517 static void
2518 expand_vector_init_merge_combine_sequence (rtx target,
2519 const rvv_builder &builder)
2521 machine_mode mode = GET_MODE (target);
2522 machine_mode imode = builder.int_mode ();
2523 machine_mode mmode = builder.mask_mode ();
2524 int nelts = builder.full_nelts ().to_constant ();
2525 int leading_ndups = builder.count_dups (0, nelts - 1, 1);
2526 if ((leading_ndups > 255 && GET_MODE_INNER (imode) == QImode)
2527 || riscv_get_v_regno_alignment (imode) > 1)
2528 imode = get_vector_mode (HImode, nelts).require ();
2530 /* Generate vid = { 0, 1, 2, ..., n }. */
2531 rtx vid = gen_reg_rtx (imode);
2532 expand_vec_series (vid, const0_rtx, const1_rtx);
2534 /* Generate mask. */
2535 rtx mask = gen_reg_rtx (mmode);
2536 insn_code icode = code_for_pred_cmp_scalar (imode);
2537 rtx index = gen_int_mode (leading_ndups - 1, builder.inner_int_mode ());
2538 rtx dup_rtx = gen_rtx_VEC_DUPLICATE (imode, index);
2539 /* vmsgtu.vi/vmsgtu.vx. */
2540 rtx cmp = gen_rtx_fmt_ee (GTU, mmode, vid, dup_rtx);
2541 rtx sel = builder.elt (nelts - 1);
2542 rtx mask_ops[] = {mask, cmp, vid, index};
2543 emit_vlmax_insn (icode, COMPARE_OP, mask_ops);
2545 /* Duplicate the first elements. */
2546 rtx dup = expand_vector_broadcast (mode, builder.elt (0));
2547 /* Merge scalar into vector according to mask. */
2548 rtx merge_ops[] = {target, dup, sel, mask};
2549 icode = code_for_pred_merge_scalar (mode);
2550 emit_vlmax_insn (icode, MERGE_OP, merge_ops);
2553 /* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
2555 void
2556 expand_vec_init (rtx target, rtx vals)
2558 machine_mode mode = GET_MODE (target);
2559 int nelts = XVECLEN (vals, 0);
2561 rvv_builder v (mode, nelts, 1);
2562 for (int i = 0; i < nelts; i++)
2563 v.quick_push (XVECEXP (vals, 0, i));
2564 v.finalize ();
2566 /* If the sequence is v = { a, a, a, a } just broadcast an element. */
2567 if (v.is_repeating_sequence ())
2569 machine_mode mode = GET_MODE (target);
2570 rtx dup = expand_vector_broadcast (mode, v.elt (0));
2571 emit_move_insn (target, dup);
2572 return;
2575 if (nelts > 3)
2577 /* Case 1: Convert v = { a, b, a, b } into v = { ab, ab }. */
2578 if (v.can_duplicate_repeating_sequence_p ())
2580 rtx ele = v.get_merged_repeating_sequence ();
2581 rtx dup = expand_vector_broadcast (v.new_mode (), ele);
2582 emit_move_insn (target, gen_lowpart (mode, dup));
2583 return;
2586 /* Case 2: Optimize repeating sequence cases that Case 1 cannot
2587 handle and where it is profitable. For example:
2588 ELEMENT BITSIZE = 64.
2589 v = {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b}.
2590 We can't find a vector mode for "ab" which will be combined into
2591 128-bit element to duplicate. */
2592 if (v.repeating_sequence_use_merge_profitable_p ())
2594 expand_vector_init_merge_repeating_sequence (target, v);
2595 return;
2598 /* Case 3: Optimize combine sequence.
2599 E.g. v = {a, a, a, a, a, a, a, a, b, b, b, b, b, b, b, b}.
2600 We can combine:
2601 v1 = {a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a}.
2603 v2 = {b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b}.
2604 by slideup. */
2605 if (v.combine_sequence_use_slideup_profitable_p ())
2607 expand_vector_init_slideup_combine_sequence (target, v);
2608 return;
2611 /* Case 4: Optimize combine sequence.
2612 E.g. v = {a, a, a, a, a, a, a, a, a, a, a, b, b, b, b, b}.
2614 Generate vector:
2615 v = {a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a}.
2617 Generate mask:
2618 mask = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1}.
2620 Merge b into v by mask:
2621 v = {a, a, a, a, a, a, a, a, a, a, a, b, b, b, b, b}. */
2622 if (v.combine_sequence_use_merge_profitable_p ())
2624 expand_vector_init_merge_combine_sequence (target, v);
2625 return;
2629 /* Optimize trailing same elements sequence:
2630 v = {y, y2, y3, y4, y5, x, x, x, x, x, x, x, x, x, x, x}; */
2631 if (!expand_vector_init_trailing_same_elem (target, v, nelts))
2632 /* Handle the common situation by vslide1down. This function can handle any
2633 situation of vec_init<mode>. Only the cases that are not optimized above
2634 will fall through here. */
2635 expand_vector_init_insert_elems (target, v, nelts);
2638 /* Get insn code for corresponding comparison. */
2640 static insn_code
2641 get_cmp_insn_code (rtx_code code, machine_mode mode)
2643 insn_code icode;
2644 switch (code)
2646 case EQ:
2647 case NE:
2648 case LE:
2649 case LEU:
2650 case GT:
2651 case GTU:
2652 case LTGT:
2653 icode = code_for_pred_cmp (mode);
2654 break;
2655 case LT:
2656 case LTU:
2657 case GE:
2658 case GEU:
2659 if (FLOAT_MODE_P (mode))
2660 icode = code_for_pred_cmp (mode);
2661 else
2662 icode = code_for_pred_ltge (mode);
2663 break;
2664 default:
2665 gcc_unreachable ();
2667 return icode;
2670 /* This hook gives the vectorizer more vector mode options. We want it to not
2671 only try modes with the maximum number of units a full vector can hold but
2672 for example also half the number of units for a smaller elements size.
2673 Such vectors can be promoted to a full vector of widened elements
2674 (still with the same number of elements, essentially vectorizing at a
2675 fixed number of units rather than a fixed number of bytes). */
2676 unsigned int
2677 autovectorize_vector_modes (vector_modes *modes, bool)
2679 if (autovec_use_vlmax_p ())
2681 poly_uint64 full_size = BYTES_PER_RISCV_VECTOR * TARGET_MAX_LMUL;
2683 /* Start with a RVV<LMUL>QImode where LMUL is the number of units that
2684 fit a whole vector.
2685 Then try LMUL = nunits / 2, nunits / 4 and nunits / 8 which
2686 is guided by the extensions we have available (vf2, vf4 and vf8).
2688 - full_size: Try using full vectors for all element types.
2689 - full_size / 2:
2690 Try using 16-bit containers for 8-bit elements and full vectors
2691 for wider elements.
2692 - full_size / 4:
2693 Try using 32-bit containers for 8-bit and 16-bit elements and
2694 full vectors for wider elements.
2695 - full_size / 8:
2696 Try using 64-bit containers for all element types. */
2697 static const int rvv_factors[] = {1, 2, 4, 8, 16, 32, 64};
2698 for (unsigned int i = 0; i < sizeof (rvv_factors) / sizeof (int); i++)
2700 poly_uint64 units;
2701 machine_mode mode;
2702 if (can_div_trunc_p (full_size, rvv_factors[i], &units)
2703 && get_vector_mode (QImode, units).exists (&mode))
2704 modes->safe_push (mode);
2707 /* Push all VLSmodes according to TARGET_MIN_VLEN. */
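/* E.g. with illustrative TARGET_MIN_VLEN = 128 and TARGET_MAX_LMUL = 8,
base_size = 128 and the loop below tries V128QI, V64QI, V32QI, ... down
to V1QI, pushing each one that is a valid VLS mode. */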
2708 unsigned int i = 0;
2709 unsigned int base_size = TARGET_MIN_VLEN * TARGET_MAX_LMUL / 8;
2710 unsigned int size = base_size;
2711 machine_mode mode;
2712 while (size > 0 && get_vector_mode (QImode, size).exists (&mode))
2714 if (vls_mode_valid_p (mode))
2715 modes->safe_push (mode);
2717 i++;
2718 size = base_size / (1U << i);
2720 /* Enable LOOP_VINFO comparison in COST model. */
2721 return VECT_COMPARE_COSTS;
2724 /* Return true if we can find the related MODE according to default LMUL. */
2725 static bool
2726 can_find_related_mode_p (machine_mode vector_mode, scalar_mode element_mode,
2727 poly_uint64 *nunits)
2729 if (!autovec_use_vlmax_p ())
2730 return false;
2731 if (riscv_v_ext_vector_mode_p (vector_mode)
2732 && multiple_p (BYTES_PER_RISCV_VECTOR * TARGET_MAX_LMUL,
2733 GET_MODE_SIZE (element_mode), nunits))
2734 return true;
2735 if (riscv_v_ext_vls_mode_p (vector_mode)
2736 && multiple_p (TARGET_MIN_VLEN * TARGET_MAX_LMUL,
2737 GET_MODE_SIZE (element_mode), nunits))
2738 return true;
2739 return false;
2742 /* If the given VECTOR_MODE is an RVV mode, first get the largest number
2743 of units that fit into a full vector at the given ELEMENT_MODE.
2744 We will have the vectorizer call us with a successively decreasing
2745 number of units (as specified in autovectorize_vector_modes).
2746 The starting mode is always the one specified by preferred_simd_mode. */
2747 opt_machine_mode
2748 vectorize_related_mode (machine_mode vector_mode, scalar_mode element_mode,
2749 poly_uint64 nunits)
2751 /* TODO: We will support RVV VLS auto-vectorization mode in the future. */
2752 poly_uint64 min_units;
2753 if (can_find_related_mode_p (vector_mode, element_mode, &min_units))
2755 machine_mode rvv_mode;
2756 if (maybe_ne (nunits, 0U))
2758 /* If we were given a number of units NUNITS, try to find an
2759 RVV vector mode of inner mode ELEMENT_MODE with the same
2760 number of units. */
2761 if (multiple_p (min_units, nunits)
2762 && get_vector_mode (element_mode, nunits).exists (&rvv_mode))
2763 return rvv_mode;
2765 else
2767 /* Look for a vector mode with the same number of units as the
2768 VECTOR_MODE we were given. We keep track of the minimum
2769 number of units so far which determines the smallest necessary
2770 but largest possible, suitable mode for vectorization. */
2771 min_units = ordered_min (min_units, GET_MODE_SIZE (vector_mode));
2772 if (get_vector_mode (element_mode, min_units).exists (&rvv_mode))
2773 return rvv_mode;
2777 return default_vectorize_related_mode (vector_mode, element_mode, nunits);
2780 /* Expand an RVV comparison. */
2782 void
2783 expand_vec_cmp (rtx target, rtx_code code, rtx op0, rtx op1, rtx mask,
2784 rtx maskoff)
2786 machine_mode mask_mode = GET_MODE (target);
2787 machine_mode data_mode = GET_MODE (op0);
2788 insn_code icode = get_cmp_insn_code (code, data_mode);
2790 if (code == LTGT)
2792 rtx lt = gen_reg_rtx (mask_mode);
2793 rtx gt = gen_reg_rtx (mask_mode);
2794 expand_vec_cmp (lt, LT, op0, op1, mask, maskoff);
2795 expand_vec_cmp (gt, GT, op0, op1, mask, maskoff);
2796 icode = code_for_pred (IOR, mask_mode);
2797 rtx ops[] = {target, lt, gt};
2798 emit_vlmax_insn (icode, BINARY_MASK_OP, ops);
2799 return;
2802 rtx cmp = gen_rtx_fmt_ee (code, mask_mode, op0, op1);
2803 if (!mask && !maskoff)
2805 rtx ops[] = {target, cmp, op0, op1};
2806 emit_vlmax_insn (icode, COMPARE_OP, ops);
2808 else
2810 rtx ops[] = {target, mask, maskoff, cmp, op0, op1};
2811 emit_vlmax_insn (icode, COMPARE_OP_MU, ops);
2815 /* Expand an RVV floating-point comparison:
2817 If CAN_INVERT_P is true, the caller can also handle inverted results;
2818 return true if the result is in fact inverted. */
2820 bool
2821 expand_vec_cmp_float (rtx target, rtx_code code, rtx op0, rtx op1,
2822 bool can_invert_p)
2824 machine_mode mask_mode = GET_MODE (target);
2825 machine_mode data_mode = GET_MODE (op0);
2827 /* If can_invert_p = true:
2828 It suffices to implement a u>= b as !(a < b) but with the NaNs masked off:
2830 vmfeq.vv v0, va, va
2831 vmfeq.vv v1, vb, vb
2832 vmand.mm v0, v0, v1
2833 vmflt.vv v0, va, vb, v0.t
2834 vmnot.m v0, v0
2836 And, if !HONOR_SNANS, then you can remove the vmand.mm by masking the
2837 second vmfeq.vv:
2839 vmfeq.vv v0, va, va
2840 vmfeq.vv v0, vb, vb, v0.t
2841 vmflt.vv v0, va, vb, v0.t
2842 vmnot.m v0, v0
2844 If can_invert_p = false:
2846 # Example of implementing isgreater()
2847 vmfeq.vv v0, va, va # Only set where A is not NaN.
2848 vmfeq.vv v1, vb, vb # Only set where B is not NaN.
2849 vmand.mm v0, v0, v1 # Only set where A and B are ordered,
2850 vmfgt.vv v0, va, vb, v0.t # so only set flags on ordered values.
2853 rtx eq0 = gen_reg_rtx (mask_mode);
2854 rtx eq1 = gen_reg_rtx (mask_mode);
2855 switch (code)
2857 case EQ:
2858 case NE:
2859 case LT:
2860 case LE:
2861 case GT:
2862 case GE:
2863 case LTGT:
2864 /* There is native support for the comparison. */
2865 expand_vec_cmp (target, code, op0, op1);
2866 return false;
2867 case UNEQ:
2868 case ORDERED:
2869 case UNORDERED:
2870 case UNLT:
2871 case UNLE:
2872 case UNGT:
2873 case UNGE:
2874 /* vmfeq.vv v0, va, va */
2875 expand_vec_cmp (eq0, EQ, op0, op0);
2876 if (HONOR_SNANS (data_mode))
2879 vmfeq.vv v1, vb, vb
2880 vmand.mm v0, v0, v1
2882 expand_vec_cmp (eq1, EQ, op1, op1);
2883 insn_code icode = code_for_pred (AND, mask_mode);
2884 rtx ops[] = {eq0, eq0, eq1};
2885 emit_vlmax_insn (icode, BINARY_MASK_OP, ops);
2887 else
2889 /* vmfeq.vv v0, vb, vb, v0.t */
2890 expand_vec_cmp (eq0, EQ, op1, op1, eq0, eq0);
2892 break;
2893 default:
2894 gcc_unreachable ();
2897 if (code == ORDERED)
2899 emit_move_insn (target, eq0);
2900 return false;
2903 /* There is native support for the inverse comparison. */
2904 code = reverse_condition_maybe_unordered (code);
2905 if (code == ORDERED)
2906 emit_move_insn (target, eq0);
2907 else
2908 expand_vec_cmp (eq0, code, op0, op1, eq0, eq0);
2910 if (can_invert_p)
2912 emit_move_insn (target, eq0);
2913 return true;
2916 /* We use one_cmpl<mode>2 so that the combine pass can combine mask
2917 instructions into vmand.mm/vmnor.mm/vmnand.mm/vmxnor.mm. */
2918 emit_insn (gen_rtx_SET (target, gen_rtx_NOT (mask_mode, eq0)));
2919 return false;
2922 /* Modulo all SEL indices to ensure they are all in the range [0, MAX_SEL].
2923 MAX_SEL is nunits - 1 if rtx_equal_p (op0, op1). Otherwise, it is
2924 2 * nunits - 1. */
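/* E.g. with nunits = 4 and op0 != op1, MAX_SEL = 7 and a selector of
{ 9, 2, 12, 5 } becomes { 1, 2, 4, 5 } after the AND below; the AND acts
as a modulo because the number of units is a power of two, so MAX_SEL is
an all-ones bit mask. */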
2925 static rtx
2926 modulo_sel_indices (rtx op0, rtx op1, rtx sel)
2928 rtx sel_mod;
2929 machine_mode sel_mode = GET_MODE (sel);
2930 poly_uint64 nunits = GET_MODE_NUNITS (sel_mode);
2931 poly_uint64 max_sel = rtx_equal_p (op0, op1) ? nunits - 1 : 2 * nunits - 1;
2932 /* If SEL is a variable-length CONST_VECTOR, we don't need to modulo it.
2933 Likewise, if SEL is constant-length with all indices within [0, MAX_SEL],
2934 there is no need to modulo the indices. */
2935 if (CONST_VECTOR_P (sel)
2936 && (!nunits.is_constant () || const_vec_all_in_range_p (sel, 0, max_sel)))
2937 sel_mod = sel;
2938 else
2940 rtx mod = gen_const_vector_dup (sel_mode, max_sel);
2941 sel_mod
2942 = expand_simple_binop (sel_mode, AND, sel, mod, NULL, 0, OPTAB_DIRECT);
2944 return sel_mod;
2947 /* Implement vec_perm<mode>. */
2949 void
2950 expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
2952 machine_mode data_mode = GET_MODE (target);
2953 machine_mode sel_mode = GET_MODE (sel);
2954 poly_uint64 nunits = GET_MODE_NUNITS (sel_mode);
2956 /* Check if the sel only references the first values vector. If each select
2957 index is in the range [0, nunits - 1], a single vrgather instruction is
2958 enough. Since we will use vrgatherei16.vv for variable-length vectors,
2959 it is never out of range and we don't need to modulo the index. */
2960 if (nunits.is_constant () && const_vec_all_in_range_p (sel, 0, nunits - 1))
2962 emit_vlmax_gather_insn (target, op0, sel);
2963 return;
2966 /* Check if all the indices are same. */
2967 rtx elt;
2968 if (const_vec_duplicate_p (sel, &elt))
2970 poly_uint64 value = rtx_to_poly_int64 (elt);
2971 rtx op = op0;
2972 if (maybe_gt (value, nunits - 1))
2974 sel = gen_const_vector_dup (sel_mode, value - nunits);
2975 op = op1;
2977 emit_vlmax_gather_insn (target, op, sel);
2980 /* Note: vec_perm indices are supposed to wrap when they go beyond the
2981 size of the two value vectors, i.e. the upper bits of the indices
2982 are effectively ignored. RVV vrgather instead produces 0 for any
2983 out-of-range indices, so we need to modulo all the vec_perm indices
2984 to ensure they are all in range of [0, nunits - 1] when op0 == op1
2985 or all in range of [0, 2 * nunits - 1] when op0 != op1. */
2986 rtx sel_mod = modulo_sel_indices (op0, op1, sel);
2988 /* Check if the two values vectors are the same. */
2989 if (rtx_equal_p (op0, op1))
2991 emit_vlmax_gather_insn (target, op0, sel_mod);
2992 return;
2995 /* The following sequence handles the case of
2996 __builtin_shufflevector (vec1, vec2, index...) where the index can be any
2997 value in the range [0, 2 * nunits - 1]. */
2998 machine_mode mask_mode;
2999 mask_mode = get_mask_mode (data_mode);
3000 rtx mask = gen_reg_rtx (mask_mode);
3001 rtx max_sel = gen_const_vector_dup (sel_mode, nunits);
3003 /* Step 1: generate a mask that should select everything >= nunits into the
3004 * mask. */
3005 expand_vec_cmp (mask, GEU, sel_mod, max_sel);
3007 /* Step 2: gather the op0 values indexed by sel into target;
3008 we don't need to care about the result of the elements
3009 whose index >= nunits. */
3010 emit_vlmax_gather_insn (target, op0, sel_mod);
3012 /* Step 3: shift the range from (nunits, max_of_mode] to
3013 [0, max_of_mode - nunits]. */
3014 rtx tmp = gen_reg_rtx (sel_mode);
3015 rtx ops[] = {tmp, sel_mod, max_sel};
3016 emit_vlmax_insn (code_for_pred (MINUS, sel_mode), BINARY_OP, ops);
3018 /* Step 4: gather those into the previously masked-out elements
3019 of target. */
3020 emit_vlmax_masked_gather_mu_insn (target, op1, tmp, mask);
3023 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST for RVV. */
3025 /* vec_perm support. */
3027 struct expand_vec_perm_d
3029 rtx target, op0, op1;
3030 vec_perm_indices perm;
3031 machine_mode vmode;
3032 machine_mode op_mode;
3033 bool one_vector_p;
3034 bool testing_p;
3037 /* Return the appropriate index mode for gather instructions. */
3038 opt_machine_mode
3039 get_gather_index_mode (struct expand_vec_perm_d *d)
3041 machine_mode sel_mode = related_int_vector_mode (d->vmode).require ();
3042 poly_uint64 nunits = GET_MODE_NUNITS (d->vmode);
3044 if (GET_MODE_INNER (d->vmode) == QImode)
3046 if (nunits.is_constant ())
3048 /* If the index is an LMUL8 CONST_VECTOR and any element value
3049 exceeds the range of 0 ~ 255, forbid such a permutation
3050 since we would need a vector HI mode to hold such indices and
3051 we don't have it. */
3052 if (!d->perm.all_in_range_p (0, 255)
3053 && !get_vector_mode (HImode, nunits).exists (&sel_mode))
3054 return opt_machine_mode ();
3056 else
3058 /* Permuting two SEW8 variable-length vectors needs vrgatherei16.vv.
3059 Otherwise, it could overflow the index range. */
3060 if (!get_vector_mode (HImode, nunits).exists (&sel_mode))
3061 return opt_machine_mode ();
3064 else if (riscv_get_v_regno_alignment (sel_mode) > 1
3065 && GET_MODE_INNER (sel_mode) != HImode)
3066 sel_mode = get_vector_mode (HImode, nunits).require ();
3067 return sel_mode;
3070 /* Recognize the patterns where we can use a merge operation to shuffle the
3071 vectors. The value of each element (index i) in the selector can only be
3072 either i or nunits + i. We will check that the pattern is actually monotonic.
3074 E.g.
3075 v = VEC_PERM_EXPR (v0, v1, selector),
3076 selector = { 0, nunits + 1, 2, nunits + 3, 4, nunits + 5, ... }
3078 We can transform such pattern into:
3080 v = vcond_mask (v0, v1, mask),
3081 mask = { 0, 1, 0, 1, 0, 1, ... }. */
3083 static bool
3084 shuffle_merge_patterns (struct expand_vec_perm_d *d)
3086 machine_mode vmode = d->vmode;
3087 machine_mode sel_mode = related_int_vector_mode (vmode).require ();
3088 int n_patterns = d->perm.encoding ().npatterns ();
3089 poly_int64 vec_len = d->perm.length ();
3091 for (int i = 0; i < n_patterns; ++i)
3092 if (!known_eq (d->perm[i], i) && !known_eq (d->perm[i], vec_len + i))
3093 return false;
3095 /* Check that the pattern is monotonic here; otherwise, return false. */
3096 for (int i = n_patterns; i < n_patterns * 2; i++)
3097 if (!d->perm.series_p (i, n_patterns, i, n_patterns)
3098 && !d->perm.series_p (i, n_patterns, vec_len + i, n_patterns))
3099 return false;
3101 /* We need to use a precomputed mask for such a situation, and such a mask
3102 can only be computed for modes whose size is known at compile time. */
3103 bool indices_fit_selector_p
3104 = GET_MODE_BITSIZE (GET_MODE_INNER (vmode)) > 8 || known_lt (vec_len, 256);
3105 if (!indices_fit_selector_p && !vec_len.is_constant ())
3106 return false;
3108 if (d->testing_p)
3109 return true;
3111 machine_mode mask_mode = get_mask_mode (vmode);
3112 rtx mask = gen_reg_rtx (mask_mode);
3114 if (indices_fit_selector_p && vec_len.is_constant ())
3116 /* For a constant vector length we can generate the needed mask at
3117 compile time and load it as mask at runtime.
3118 This saves a compare at runtime. */
3119 rtx_vector_builder sel (mask_mode, d->perm.encoding ().npatterns (),
3120 d->perm.encoding ().nelts_per_pattern ());
3121 unsigned int encoded_nelts = sel.encoded_nelts ();
3122 for (unsigned int i = 0; i < encoded_nelts; i++)
3123 sel.quick_push (gen_int_mode (d->perm[i].to_constant ()
3124 < vec_len.to_constant (),
3125 GET_MODE_INNER (mask_mode)));
3126 mask = sel.build ();
3128 else if (indices_fit_selector_p)
3130 /* For a dynamic vector length < 256 we keep the permutation
3131 indices in the literal pool, load it at runtime and create the
3132 mask by selecting either OP0 or OP1 by
3134 INDICES < NUNITS ? 1 : 0. */
3135 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
3136 rtx x = gen_int_mode (vec_len, GET_MODE_INNER (sel_mode));
3137 insn_code icode = code_for_pred_cmp_scalar (sel_mode);
3138 rtx cmp = gen_rtx_fmt_ee (LTU, mask_mode, sel, x);
3139 rtx ops[] = {mask, cmp, sel, x};
3140 emit_vlmax_insn (icode, COMPARE_OP, ops);
3142 else
3144 /* For EEW8, NUNITS may be larger than 255, so we can't use vmsltu
3145 directly to generate the selector mask; instead, we can only use a
3146 precomputed mask.
3148 E.g. for selector = <0, 257, 2, 259> on an EEW8 vector with NUNITS = 256,
3149 we don't have a QImode scalar register to hold values larger than 255.
3150 We also cannot hold that in a vector QImode register if LMUL = 8, and,
3151 since there is no larger HI mode vector we cannot create a larger
3152 selector.
3154 As the mask is a simple {0, 1, ...} pattern and the length is known we
3155 can store it in a scalar register and broadcast it to a mask register.
3157 gcc_assert (vec_len.is_constant ());
3158 int size = CEIL (GET_MODE_NUNITS (mask_mode).to_constant (), 8);
3159 machine_mode mode = get_vector_mode (QImode, size).require ();
3160 rtx tmp = gen_reg_rtx (mode);
3161 rvv_builder v (mode, 1, size);
3162 for (int i = 0; i < vec_len.to_constant () / 8; i++)
3164 uint8_t value = 0;
3165 for (int j = 0; j < 8; j++)
3167 int index = i * 8 + j;
3168 if (known_lt (d->perm[index], 256))
3169 value |= 1 << j;
3171 v.quick_push (gen_int_mode (value, QImode));
3173 emit_move_insn (tmp, v.build ());
3174 emit_move_insn (mask, gen_lowpart (mask_mode, tmp));
3177 /* TARGET = MASK ? OP0 : OP1. */
3178 /* swap op0 and op1 since the order is opposite to pred_merge. */
3179 rtx ops2[] = {d->target, d->op1, d->op0, mask};
3180 emit_vlmax_insn (code_for_pred_merge (vmode), MERGE_OP, ops2);
3181 return true;
3184 /* Recognize consecutive index patterns where we can use a single
3185 vrgather.v[x|i] to shuffle the vectors.
3187 e.g. short[8] = VEC_PERM_EXPR <a, a, {0,1,0,1,0,1,0,1}>
3188 Use SEW = 32 and index = 0 with vrgather.vi to get the result. */
3189 static bool
3190 shuffle_consecutive_patterns (struct expand_vec_perm_d *d)
3192 machine_mode vmode = d->vmode;
3193 scalar_mode smode = GET_MODE_INNER (vmode);
3194 poly_int64 vec_len = d->perm.length ();
3195 HOST_WIDE_INT elt;
3197 if (!vec_len.is_constant () || !d->perm[0].is_constant (&elt))
3198 return false;
3199 int vlen = vec_len.to_constant ();
3201 /* Compute the last element index of consecutive pattern from the leading
3202 consecutive elements. */
3203 int last_consecutive_idx = -1;
3204 int consecutive_num = -1;
3205 for (int i = 1; i < vlen; i++)
3207 if (maybe_ne (d->perm[i], d->perm[i - 1] + 1))
3208 break;
3209 last_consecutive_idx = i;
3210 consecutive_num = last_consecutive_idx + 1;
3213 int new_vlen = vlen / consecutive_num;
3214 if (last_consecutive_idx < 0 || consecutive_num == vlen
3215 || !pow2p_hwi (consecutive_num) || !pow2p_hwi (new_vlen))
3216 return false;
3217 /* VEC_PERM <..., (index, index + 1, ... index + consecutive_num - 1)>.
3218 All elements index, index + 1, ... index + consecutive_num - 1 should
3219 be located in the same vector. */
3220 if (maybe_ge (d->perm[0], vec_len)
3221 != maybe_ge (d->perm[last_consecutive_idx], vec_len))
3222 return false;
3223 /* If a vector has 8 elements, we allow optimizations on consecutive
3224 patterns, e.g. <0, 1, 2, 3, 0, 1, 2, 3> or <4, 5, 6, 7, 4, 5, 6, 7>.
3225 Other patterns like <2, 3, 4, 5, 2, 3, 4, 5> are not feasible patterns
3226 to optimize. */
3227 if (d->perm[0].to_constant () % consecutive_num != 0)
3228 return false;
3229 unsigned int container_bits = consecutive_num * GET_MODE_BITSIZE (smode);
3230 if (container_bits > 64)
3231 return false;
3232 else if (container_bits == 64)
3234 if (!TARGET_VECTOR_ELEN_64)
3235 return false;
3236 else if (FLOAT_MODE_P (smode) && !TARGET_VECTOR_ELEN_FP_64)
3237 return false;
3240 /* Check the rest of elements are the same consecutive pattern. */
3241 for (int i = consecutive_num; i < vlen; i++)
3242 if (maybe_ne (d->perm[i], d->perm[i % consecutive_num]))
3243 return false;
3245 if (FLOAT_MODE_P (smode))
3246 smode = float_mode_for_size (container_bits).require ();
3247 else
3248 smode = int_mode_for_size (container_bits, 0).require ();
3249 if (!get_vector_mode (smode, new_vlen).exists (&vmode))
3250 return false;
3251 machine_mode sel_mode = related_int_vector_mode (vmode).require ();
3253 /* Success! */
3254 if (d->testing_p)
3255 return true;
3257 int index = elt / consecutive_num;
3258 if (index >= new_vlen)
3259 index = index - new_vlen;
3260 rtx sel = gen_const_vector_dup (sel_mode, index);
3261 rtx op = elt >= vlen ? d->op0 : d->op1;
3262 emit_vlmax_gather_insn (gen_lowpart (vmode, d->target),
3263 gen_lowpart (vmode, op), sel);
3264 return true;
3267 /* Recognize the patterns where we can use a compress operation to shuffle
3268 the vectors. The perm selector of the compress pattern is divided into 2 parts:
3269 the first part consists of index numbers < NUNITS,
3270 the second part is the last N consecutive index numbers >= NUNITS.
3272 E.g.
3273 v = VEC_PERM_EXPR (v0, v1, selector),
3274 selector = { 0, 2, 6, 7 }
3276 We can transform such pattern into:
3278 op1 = vcompress (op0, mask)
3279 mask = { 1, 0, 1, 0 }
3280 v = op1. */
3282 static bool
3283 shuffle_compress_patterns (struct expand_vec_perm_d *d)
3285 machine_mode vmode = d->vmode;
3286 poly_int64 vec_len = d->perm.length ();
3288 if (!vec_len.is_constant ())
3289 return false;
3291 int vlen = vec_len.to_constant ();
3293 /* The compress pattern is not worthwhile when there are fewer than 4
3294 elements, and we can't modulo indices for the compress pattern. */
3295 if (known_ge (d->perm[vlen - 1], vlen * 2) || vlen < 4)
3296 return false;
3298 /* Compress pattern doesn't work for one vector. */
3299 if (d->one_vector_p)
3300 return false;
3302 /* The compress point is the point at which all selector values with index
3303 i >= compress point form a consecutive increasing series and each such
3304 selector value is >= NUNITS. In this case, we can compress all elements
3305 with i < compress point into op1. */
3306 int compress_point = -1;
3307 for (int i = 0; i < vlen; i++)
3309 if (compress_point < 0 && known_ge (d->perm[i], vec_len))
3311 compress_point = i;
3312 break;
3316 /* We don't apply compress approach if we can't find the compress point. */
3317 if (compress_point < 0)
3318 return false;
3320 /* We can only apply the compress approach when all index values from 0 to
3321 the compress point are increasing. */
3322 for (int i = 1; i < compress_point; i++)
3323 if (maybe_le (d->perm[i], d->perm[i - 1]))
3324 return false;
3326 /* It must be a consecutively increasing series from the compress point. */
3327 for (int i = 1 + compress_point; i < vlen; i++)
3328 if (maybe_ne (d->perm[i], d->perm[i - 1] + 1))
3329 return false;
3331 /* Success! */
3332 if (d->testing_p)
3333 return true;
3335 /* Check whether we need to slide up op1 to apply the compress approach.
3337 E.g. for index = { 0, 2, 6, 7 }, d->perm[vlen - 1] = 7, which
3338 is 2 * NUNITS - 1, so we don't need to slide up.
3340 For index = { 0, 2, 5, 6 }, we need to slide op1 up before
3341 we apply the compress approach. */
3342 bool need_slideup_p = maybe_ne (d->perm[vlen - 1], 2 * vec_len - 1)
3343 && !const_vec_duplicate_p (d->op1);
3345 /* If we leave it to be handled directly by the general gather,
3346 the code sequence will be:
3347 VECTOR LOAD selector
3348 GEU mask, selector, NUNITS
3349 GATHER dest, op0, selector
3350 SUB selector, selector, NUNITS
3351 GATHER dest, op1, selector, mask
3352 Each ALU operation is considered as COST = 1 and VECTOR LOAD is considered
3353 as COST = 4. So, we consider the general gather handling COST = 9.
3354 TODO: This cost is not accurate, we can adjust it by tune info. */
3355 int general_cost = 9;
3357 /* If we can use compress approach, the code sequence will be:
3358 MASK LOAD mask
3359 COMPRESS op1, op0, mask
3360 If it needs slide up, it will be:
3361 MASK LOAD mask
3362 SLIDEUP op1
3363 COMPRESS op1, op0, mask
3364 By default, mask load COST = 2.
3365 TODO: This cost is not accurate, we can adjust it by tune info. */
3366 int compress_cost = 4;
3368 if (general_cost <= compress_cost)
3369 return false;
3371 /* Build a mask that is true for each op0 element selected before the compress point. */
3372 machine_mode mask_mode = get_mask_mode (vmode);
3373 rvv_builder builder (mask_mode, vlen, 1);
3374 for (int i = 0; i < vlen; i++)
3376 bool is_compress_index = false;
3377 for (int j = 0; j < compress_point; j++)
3379 if (known_eq (d->perm[j], i))
3381 is_compress_index = true;
3382 break;
3385 if (is_compress_index)
3386 builder.quick_push (CONST1_RTX (BImode));
3387 else
3388 builder.quick_push (CONST0_RTX (BImode));
3390 rtx mask = force_reg (mask_mode, builder.build ());
3392 rtx merge = d->op1;
3393 if (need_slideup_p)
3395 int slideup_cnt = vlen - (d->perm[vlen - 1].to_constant () % vlen) - 1;
3396 merge = gen_reg_rtx (vmode);
3397 rtx ops[] = {merge, d->op1, gen_int_mode (slideup_cnt, Pmode)};
3398 insn_code icode = code_for_pred_slide (UNSPEC_VSLIDEUP, vmode);
3399 emit_vlmax_insn (icode, BINARY_OP, ops);
3402 insn_code icode = code_for_pred_compress (vmode);
3403 rtx ops[] = {d->target, merge, d->op0, mask};
3404 emit_nonvlmax_insn (icode, COMPRESS_OP_MERGE, ops,
3405 gen_int_mode (vlen, Pmode));
3406 return true;
3409 /* Recognize patterns like [4 5 6 7 12 13 14 15] where either the lower
3410 or the higher parts of both vectors are combined into one. */
3412 static bool
3413 shuffle_slide_patterns (struct expand_vec_perm_d *d)
3415 machine_mode vmode = d->vmode;
3416 poly_int64 vec_len = d->perm.length ();
3418 if (!vec_len.is_constant ())
3419 return false;
3421 int vlen = vec_len.to_constant ();
3422 if (vlen < 4)
3423 return false;
3425 if (d->one_vector_p)
3426 return false;
3428 /* For a slideup OP0 can stay, for a slidedown OP1 can.
3429 The former requires that the first element of the permutation
3430 is the first element of OP0, the latter that the last permutation
3431 element is the last element of OP1. */
3432 bool slideup = false;
3433 bool slidedown = false;
3435 /* For a slideup the permutation must start at OP0's first element. */
3436 if (known_eq (d->perm[0], 0))
3437 slideup = true;
3439 /* For a slidedown the permutation must end at OP1's last element. */
3440 if (known_eq (d->perm[vlen - 1], 2 * vlen - 1))
3441 slidedown = true;
3443 if (slideup && slidedown)
3444 return false;
3446 if (!slideup && !slidedown)
3447 return false;
3449 /* Check for a monotonic sequence with one pivot. */
3450 int pivot = -1;
3451 for (int i = 0; i < vlen; i++)
3453 if (pivot == -1 && known_ge (d->perm[i], vec_len))
3454 pivot = i;
3455 if (i > 0 && i != pivot
3456 && maybe_ne (d->perm[i], d->perm[i - 1] + 1))
3457 return false;
3460 if (pivot == -1)
3461 return false;
3463 /* For a slideup OP1's part (to be slid up) must be a low part,
3464 i.e. starting with its first element. */
3465 if (slideup && maybe_ne (d->perm[pivot], vlen))
3466 return false;
3468 /* For a slidedown OP0's part (to be slid down) must be a high part,
3469 i.e. ending with its last element. */
3470 if (slidedown && maybe_ne (d->perm[pivot - 1], vlen - 1))
3471 return false;
3473 /* Success! */
3474 if (d->testing_p)
3475 return true;
3477 /* PIVOT is the start of the lower/higher part of OP1 or OP2.
3478 For a slideup it indicates how many elements of OP1 to
3479 skip/slide over. For a slidedown it indicates how long
3480 OP1's high part is, while VLEN - PIVOT is the amount to slide. */
3481 int slide_cnt = slideup ? pivot : vlen - pivot;
3482 insn_code icode;
3483 if (slideup)
3485 /* No need for a vector length because we slide up until the
3486 end of OP1 anyway. */
3487 rtx ops[] = {d->target, d->op0, d->op1, gen_int_mode (slide_cnt, Pmode)};
3488 icode = code_for_pred_slide (UNSPEC_VSLIDEUP, vmode);
3489 emit_vlmax_insn (icode, SLIDEUP_OP_MERGE, ops);
3491 else
3493 /* Here we need a length because we slide to the beginning of OP1
3494 leaving the remaining elements undisturbed. */
3495 int len = pivot;
3496 rtx ops[] = {d->target, d->op1, d->op0,
3497 gen_int_mode (slide_cnt, Pmode)};
3498 icode = code_for_pred_slide (UNSPEC_VSLIDEDOWN, vmode);
3499 emit_nonvlmax_insn (icode, BINARY_OP_TUMA, ops,
3500 gen_int_mode (len, Pmode));
3503 return true;
3506 /* Recognize interleaving patterns like [0 4 1 5]. */
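/* E.g. for vlen = 4 and the "low" variant: slide op1 up by vlen / 2 = 2 so
that tmp = { op0[0], op0[1], op1[0], op1[1] }, then vrgather tmp with the
selector { 0, 2, 1, 3 } to produce { op0[0], op1[0], op0[1], op1[1] },
i.e. the [0 4 1 5] pattern. */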
3508 static bool
3509 shuffle_interleave_patterns (struct expand_vec_perm_d *d)
3511 machine_mode vmode = d->vmode;
3512 machine_mode sel_mode = related_int_vector_mode (vmode).require ();
3513 poly_int64 vec_len = d->perm.length ();
3514 int n_patterns = d->perm.encoding ().npatterns ();
3516 if (!vec_len.is_constant ())
3517 return false;
3519 if (n_patterns != 2)
3520 return false;
3522 unsigned vlen = vec_len.to_constant ();
3524 if (vlen < 4 || vlen > 64)
3525 return false;
3527 if (d->one_vector_p)
3528 return false;
3530 bool low = true;
3531 if (d->perm.series_p (0, 2, 0, 1)
3532 && d->perm.series_p (1, 2, vlen, 1))
3533 low = true;
3534 else if (d->perm.series_p (0, 2, vlen / 2, 1)
3535 && d->perm.series_p (1, 2, vlen + vlen / 2, 1))
3536 low = false;
3537 else
3538 return false;
3540 vec_perm_builder sel (vlen, 2, 1);
3541 sel.safe_grow (vlen);
3542 int cnt = 0;
3543 for (unsigned i = 0; i < vlen; i += 2)
3545 sel[i] = cnt;
3546 sel[i + 1] = cnt + vlen / 2;
3547 cnt++;
3550 vec_perm_indices indices (sel, 2, vlen);
3552 if (vlen != indices.length ().to_constant ())
3553 return false;
3555 /* Success! */
3556 if (d->testing_p)
3557 return true;
3559 int slide_cnt = vlen / 2;
3560 rtx tmp = gen_reg_rtx (vmode);
3562 if (low)
3564 /* No need for a vector length because we slide up until the
3565 end of OP1 anyway. */
3566 rtx ops[] = {tmp, d->op0, d->op1, gen_int_mode (slide_cnt, Pmode)};
3567 insn_code icode = code_for_pred_slide (UNSPEC_VSLIDEUP, vmode);
3568 emit_vlmax_insn (icode, SLIDEUP_OP_MERGE, ops);
3570 else
3572 rtx ops[] = {tmp, d->op1, d->op0, gen_int_mode (slide_cnt, Pmode)};
3573 insn_code icode = code_for_pred_slide (UNSPEC_VSLIDEDOWN, vmode);
3574 emit_nonvlmax_insn (icode, BINARY_OP_TUMA, ops,
3575 gen_int_mode (slide_cnt, Pmode));
3578 rtx sel_rtx = vec_perm_indices_to_rtx (sel_mode, indices);
3579 emit_vlmax_gather_insn (gen_lowpart (vmode, d->target), tmp, sel_rtx);
3581 return true;
3585 /* Recognize even/odd patterns like [0 2 4 6]. We use two compress
3586 and one slideup. */
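/* E.g. for vlen = 4 and the even case, the mask is { 1, 0, 1, 0 }:
compressing op0 gives { op0[0], op0[2], ... }, compressing op1 gives
{ op1[0], op1[2], ... }, and sliding the latter up by vlen / 2 = 2 yields
{ op0[0], op0[2], op1[0], op1[2] }, i.e. the [0 2 4 6] pattern. */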
3588 static bool
3589 shuffle_even_odd_patterns (struct expand_vec_perm_d *d)
3591 machine_mode vmode = d->vmode;
3592 poly_int64 vec_len = d->perm.length ();
3593 int n_patterns = d->perm.encoding ().npatterns ();
3595 if (n_patterns != 1)
3596 return false;
3598 if (!vec_len.is_constant ())
3599 return false;
3601 int vlen = vec_len.to_constant ();
3602 if (vlen < 4 || vlen > 64)
3603 return false;
3605 if (d->one_vector_p)
3606 return false;
3608 bool even = true;
3609 if (!d->perm.series_p (0, 1, 0, 2))
3611 even = false;
3612 if (!d->perm.series_p (0, 1, 1, 2))
3613 return false;
3616 /* Success! */
3617 if (d->testing_p)
3618 return true;
3620 machine_mode mask_mode = get_mask_mode (vmode);
3621 rvv_builder builder (mask_mode, vlen, 1);
3622 int bit = even ? 0 : 1;
3623 for (int i = 0; i < vlen; i++)
3625 bit ^= 1;
3626 if (bit)
3627 builder.quick_push (CONST1_RTX (BImode));
3628 else
3629 builder.quick_push (CONST0_RTX (BImode));
3631 rtx mask = force_reg (mask_mode, builder.build ());
3633 insn_code icode = code_for_pred_compress (vmode);
3634 rtx ops1[] = {d->target, d->op0, mask};
3635 emit_vlmax_insn (icode, COMPRESS_OP, ops1);
3637 rtx tmp2 = gen_reg_rtx (vmode);
3638 rtx ops2[] = {tmp2, d->op1, mask};
3639 emit_vlmax_insn (icode, COMPRESS_OP, ops2);
3641 rtx ops[] = {d->target, d->target, tmp2, gen_int_mode (vlen / 2, Pmode)};
3642 icode = code_for_pred_slide (UNSPEC_VSLIDEUP, vmode);
3643 emit_vlmax_insn (icode, SLIDEUP_OP_MERGE, ops);
3645 return true;
3648 /* Recognize decompress patterns:
3650 1. VEC_PERM_EXPR op0 and op1
3651 with isel = { 0, nunits, 1, nunits + 1, ... }.
3652 Decompress op0 and op1 vector with the mask = { 0, 1, 0, 1, ... }.
3654 2. VEC_PERM_EXPR op0 and op1
3655 with isel = { 1/2 nunits, 3/2 nunits, 1/2 nunits+1, 3/2 nunits+1,... }.
3656 Slide down op0 and op1 with OFFSET = 1/2 nunits.
3657 Decompress op0 and op1 vector with the mask = { 0, 1, 0, 1, ... }.
3659 static bool
3660 shuffle_decompress_patterns (struct expand_vec_perm_d *d)
3662 poly_uint64 nelt = d->perm.length ();
3663 machine_mode mask_mode = get_mask_mode (d->vmode);
3665 /* For constant size indices, we don't need to handle it here.
3666 Just leave it to vec_perm<mode>. */
3667 if (d->perm.length ().is_constant ())
3668 return false;
3670 poly_uint64 first = d->perm[0];
3671 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
3672 || !d->perm.series_p (0, 2, first, 1)
3673 || !d->perm.series_p (1, 2, first + nelt, 1))
3674 return false;
3676 /* Permuting two SEW8 variable-length vectors needs vrgatherei16.vv.
3677 Otherwise, it could overflow the index range. */
3678 machine_mode sel_mode = related_int_vector_mode (d->vmode).require ();
3679 if (GET_MODE_INNER (d->vmode) == QImode
3680 && !get_vector_mode (HImode, nelt).exists (&sel_mode))
3681 return false;
3683 /* Success! */
3684 if (d->testing_p)
3685 return true;
3687 rtx op0, op1;
3688 if (known_eq (first, 0U))
3690 op0 = d->op0;
3691 op1 = d->op1;
3693 else
3695 op0 = gen_reg_rtx (d->vmode);
3696 op1 = gen_reg_rtx (d->vmode);
3697 insn_code icode = code_for_pred_slide (UNSPEC_VSLIDEDOWN, d->vmode);
3698 rtx ops0[] = {op0, d->op0, gen_int_mode (first, Pmode)};
3699 rtx ops1[] = {op1, d->op1, gen_int_mode (first, Pmode)};
3700 emit_vlmax_insn (icode, BINARY_OP, ops0);
3701 emit_vlmax_insn (icode, BINARY_OP, ops1);
4703   /* Generate the { 0, 1, 0, 1, ... } mask.  */
3704 rtx vid = gen_reg_rtx (sel_mode);
3705 rtx vid_repeat = gen_reg_rtx (sel_mode);
3706 expand_vec_series (vid, const0_rtx, const1_rtx);
3707 rtx and_ops[] = {vid_repeat, vid, const1_rtx};
3708 emit_vlmax_insn (code_for_pred_scalar (AND, sel_mode), BINARY_OP, and_ops);
3709 rtx const_vec = gen_const_vector_dup (sel_mode, 1);
3710 rtx mask = gen_reg_rtx (mask_mode);
3711 expand_vec_cmp (mask, EQ, vid_repeat, const_vec);
3712 emit_vlmax_decompress_insn (d->target, op0, op1, mask);
3713 return true;
3716 static bool
3717 shuffle_bswap_pattern (struct expand_vec_perm_d *d)
3719 HOST_WIDE_INT diff;
3720 unsigned i, size, step;
3722 if (!d->one_vector_p || !d->perm[0].is_constant (&diff) || !diff)
3723 return false;
3725 step = diff + 1;
3726 size = step * GET_MODE_UNIT_BITSIZE (d->vmode);
3728 switch (size)
3730 case 16:
3731 break;
3732 case 32:
3733 case 64:
3734       /* We will have VEC_PERM_EXPR after rtl expand when invoking
3735	  __builtin_bswap.  It will generate about 9 instructions in the
3736	  loop as below, no matter whether it is bswap16, bswap32 or bswap64.
3737 .L2:
3738 1 vle16.v v4,0(a0)
3739 2 vmv.v.x v2,a7
3740 3 vand.vv v2,v6,v2
3741 4 slli a2,a5,1
3742 5 vrgatherei16.vv v1,v4,v2
3743 6 sub a4,a4,a5
3744 7 vse16.v v1,0(a3)
3745 8 add a0,a0,a2
3746 9 add a3,a3,a2
3747 bne a4,zero,.L2
3749	  But for bswap16 we may have an even simpler code gen, which
3750	  has only 7 instructions in the loop as below.
3752 1 vle8.v v2,0(a5)
3753 2 addi a5,a5,32
3754 3 vsrl.vi v4,v2,8
3755 4 vsll.vi v2,v2,8
3756 5 vor.vv v4,v4,v2
3757 6 vse8.v v4,0(a4)
3758 7 addi a4,a4,32
3759 bne a5,a6,.L5
3761	  Unfortunately, the instructions in the loop would grow to 13 and 24
3762	  for bswap32 and bswap64.  Thus, we leverage vrgather (9 insns)
3763	  for both bswap64 and bswap32, but use shift and or (7 insns)
3764	  for bswap16.
3766 default:
3767 return false;
3770 for (i = 0; i < step; i++)
3771 if (!d->perm.series_p (i, step, diff - i, step))
3772 return false;
3774   /* Disable when nunits < 4 since the generic approach tried later
3775      is more profitable for BSWAP.  */
3776 if (!known_gt (GET_MODE_NUNITS (d->vmode), 2))
3777 return false;
3779 if (d->testing_p)
3780 return true;
3782 machine_mode vhi_mode;
3783 poly_uint64 vhi_nunits = exact_div (GET_MODE_NUNITS (d->vmode), 2);
3785 if (!get_vector_mode (HImode, vhi_nunits).exists (&vhi_mode))
3786 return false;
3788 /* Step-1: Move op0 to src with VHI mode. */
3789 rtx src = gen_reg_rtx (vhi_mode);
3790 emit_move_insn (src, gen_lowpart (vhi_mode, d->op0));
3792 /* Step-2: Shift right 8 bits to dest. */
3793 rtx dest = expand_binop (vhi_mode, lshr_optab, src, gen_int_mode (8, Pmode),
3794 NULL_RTX, 0, OPTAB_DIRECT);
3796 /* Step-3: Shift left 8 bits to src. */
3797 src = expand_binop (vhi_mode, ashl_optab, src, gen_int_mode (8, Pmode),
3798 NULL_RTX, 0, OPTAB_DIRECT);
3800 /* Step-4: Logic Or dest and src to dest. */
3801 dest = expand_binop (vhi_mode, ior_optab, dest, src,
3802 NULL_RTX, 0, OPTAB_DIRECT);
3804 /* Step-5: Move src to target with VQI mode. */
3805 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
3807 return true;
3810 /* Recognize patterns like [3 4 5 6] where we combine the last element
3811 of the first vector and the first n - 1 elements of the second vector.
3812 This can be implemented by slides or by extracting and re-inserting
3813 (slide1up) the first vector's last element. */
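   /* For example (illustrative): with nunits = 4 and selector { 3, 4, 5, 6 },
      the slide variant first slides op1 up by one into a temporary and then,
      with VL = 1 and tail undisturbed, slides op0 down by nunits - 1 so that
      element 0 of the result becomes op0[3] while elements 1..3 keep
      op1[0..2].  */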
3815 static bool
3816 shuffle_off_by_one_patterns (struct expand_vec_perm_d *d)
3818 poly_int64 nunits = GET_MODE_NUNITS (d->vmode);
3820 /* Recognize { nunits - 1, nunits, nunits + 1, ... }. */
3821 if (!d->perm.series_p (0, 2, nunits - 1, 2)
3822 || !d->perm.series_p (1, 2, nunits, 2))
3823 return false;
3825   /* Disable when nunits < 4 since the generic approach tried later
3826      is more profitable for indices = { nunits - 1, nunits }.  */
3827 if (!known_gt (nunits, 2))
3828 return false;
3830 /* Success! */
3831 if (d->testing_p)
3832 return true;
3834 int scalar_cost = riscv_register_move_cost (d->vmode, V_REGS, GR_REGS)
3835 + riscv_register_move_cost (d->vmode, GR_REGS, V_REGS) + 2;
3836 int slide_cost = 2;
3838 if (slide_cost < scalar_cost)
3840 /* This variant should always be preferable because we just need two
3841 slides. The extract-variant also requires two slides but additionally
3842 pays the latency for register-file crossing. */
3843 rtx tmp = gen_reg_rtx (d->vmode);
3844 rtx ops[] = {tmp, d->op1, gen_int_mode (1, Pmode)};
3845 insn_code icode = code_for_pred_slide (UNSPEC_VSLIDEUP, d->vmode);
3846 emit_vlmax_insn (icode, BINARY_OP, ops);
3848 rtx ops2[] = {d->target, tmp, d->op0, gen_int_mode (nunits - 1, Pmode)};
3849 icode = code_for_pred_slide (UNSPEC_VSLIDEDOWN, d->vmode);
3850 emit_nonvlmax_insn (icode, BINARY_OP_TUMA, ops2, gen_int_mode (1, Pmode));
3852 else
3854 /* Extract the last element of the first vector. */
3855 scalar_mode smode = GET_MODE_INNER (d->vmode);
3856 rtx tmp = gen_reg_rtx (smode);
3857 emit_vec_extract (tmp, d->op0, gen_int_mode (nunits - 1, Pmode));
3859 /* Insert the scalar into element 0. */
3860 unsigned int unspec
3861 = FLOAT_MODE_P (d->vmode) ? UNSPEC_VFSLIDE1UP : UNSPEC_VSLIDE1UP;
3862 insn_code icode = code_for_pred_slide (unspec, d->vmode);
3863 rtx ops[] = {d->target, d->op1, tmp};
3864 emit_vlmax_insn (icode, BINARY_OP, ops);
3867 return true;
3870 /* This looks for a series pattern in the provided vector permute structure D.
3871 If successful it emits a series insn as well as a gather to implement it.
3872 Return true if successful, false otherwise. */
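   /* For example (illustrative): { 1, 3, 5, 7, ... } is a full series with
      base 1 and step 2, so one vec_series plus one gather suffices.  For
      { 0, 5, 7, 9, ... } only the part starting at element 1 is a series
      (base 5, step 2); we build that series and vslide1up the leading 0
      into it before the gather.  */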
3874 static bool
3875 shuffle_series_patterns (struct expand_vec_perm_d *d)
3877 if (!d->one_vector_p || d->perm.encoding ().npatterns () != 1)
3878 return false;
3880 poly_int64 el1 = d->perm[0];
3881 poly_int64 el2 = d->perm[1];
3882 poly_int64 el3 = d->perm[2];
3884 poly_int64 step1 = el2 - el1;
3885 poly_int64 step2 = el3 - el2;
3887 bool need_insert = false;
3888 bool have_series = false;
3890 /* Check for a full series. */
3891 if (known_ne (step1, 0) && d->perm.series_p (0, 1, el1, step1))
3892 have_series = true;
3894 /* Check for a series starting at the second element. */
3895 else if (known_ne (step2, 0) && d->perm.series_p (1, 1, el2, step2))
3897 have_series = true;
3898 need_insert = true;
3901 if (!have_series)
3902 return false;
3904 /* Disable shuffle if we can't find an appropriate integer index mode for
3905 gather. */
3906 machine_mode sel_mode;
3907 if (!get_gather_index_mode (d).exists (&sel_mode))
3908 return false;
3910 /* Success! */
3911 if (d->testing_p)
3912 return true;
3914 /* Create the series. */
3915 machine_mode eltmode = Pmode;
3916 rtx series = gen_reg_rtx (sel_mode);
3917 expand_vec_series (series, gen_int_mode (need_insert ? el2 : el1, eltmode),
3918 gen_int_mode (need_insert ? step2 : step1, eltmode));
3920 /* Insert the remaining element if necessary. */
3921 if (need_insert)
3923 insn_code icode = code_for_pred_slide (UNSPEC_VSLIDE1UP, sel_mode);
3924 rtx ops[]
3925 = {series, series, gen_int_mode (el1, GET_MODE_INNER (sel_mode))};
3926 emit_vlmax_insn (icode, BINARY_OP, ops);
3929 emit_vlmax_gather_insn (d->target, d->op0, series);
3931 return true;
3934 /* Recognize the pattern that can be shuffled by generic approach. */
3936 static bool
3937 shuffle_generic_patterns (struct expand_vec_perm_d *d)
3939 machine_mode sel_mode;
3941 /* We don't enable SLP for non-power of 2 NPATTERNS. */
3942 if (!pow2p_hwi (d->perm.encoding().npatterns ()))
3943 return false;
3945 /* Disable shuffle if we can't find an appropriate integer index mode for
3946 gather. */
3947 if (!get_gather_index_mode (d).exists (&sel_mode))
3948 return false;
3950 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
3951 poly_uint64 nunits = GET_MODE_NUNITS (sel_mode);
3952 rtx elt;
3954 bool is_simple = d->one_vector_p
3955 || const_vec_duplicate_p (sel, &elt)
3956 || (nunits.is_constant ()
3957 && const_vec_all_in_range_p (sel, 0, nunits - 1));
3959 if (!is_simple && !riscv_two_source_permutes)
3960 return false;
3962 /* Success! */
3963 if (d->testing_p)
3964 return true;
3966   /* Some FIXED-VLMAX/VLS vector permutation situations call the target hook
3967      instead of expanding vec_perm<mode>, so we handle them directly.  */
3968 expand_vec_perm (d->target, d->op0, d->op1, sel);
3969 return true;
3972 /* This function recognizes and supports different permutation patterns
3973    and enables VLA SLP auto-vectorization.  */
3974 static bool
3975 expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
3977 gcc_assert (d->op_mode != E_VOIDmode);
3979 /* The pattern matching functions above are written to look for a small
3980 number to begin the sequence (0, 1, N/2). If we begin with an index
3981 from the second operand, we can swap the operands. */
3982 poly_int64 nelt = d->perm.length ();
3983 if (known_ge (d->perm[0], nelt))
3985 d->perm.rotate_inputs (1);
3986 std::swap (d->op0, d->op1);
3989 if (known_gt (nelt, 1))
3991 if (d->vmode == d->op_mode)
3993 if (shuffle_merge_patterns (d))
3994 return true;
3995 if (shuffle_consecutive_patterns (d))
3996 return true;
3997 if (shuffle_slide_patterns (d))
3998 return true;
3999 if (shuffle_interleave_patterns (d))
4000 return true;
4001 if (shuffle_even_odd_patterns (d))
4002 return true;
4003 if (shuffle_compress_patterns (d))
4004 return true;
4005 if (shuffle_decompress_patterns (d))
4006 return true;
4007 if (shuffle_bswap_pattern (d))
4008 return true;
4009 if (shuffle_off_by_one_patterns (d))
4010 return true;
4011 if (shuffle_series_patterns (d))
4012 return true;
4013 if (shuffle_generic_patterns (d))
4014 return true;
4015 return false;
4017 else
4018 return false;
4020 return false;
4023 /* This function implements TARGET_VECTORIZE_VEC_PERM_CONST by using RVV
4024 * instructions. */
4025 bool
4026 expand_vec_perm_const (machine_mode vmode, machine_mode op_mode, rtx target,
4027 rtx op0, rtx op1, const vec_perm_indices &sel)
4029   /* RVV doesn't have mask-type pack/unpack instructions and we don't use
4030      a mask to do the iteration loop control.  Just disable it directly.  */
4031 if (GET_MODE_CLASS (vmode) == MODE_VECTOR_BOOL)
4032 return false;
4034 struct expand_vec_perm_d d;
4036 /* Check whether the mask can be applied to a single vector. */
4037 if (sel.ninputs () == 1 || (op0 && rtx_equal_p (op0, op1)))
4038 d.one_vector_p = true;
4039 else if (sel.all_from_input_p (0))
4041 d.one_vector_p = true;
4042 op1 = op0;
4044 else if (sel.all_from_input_p (1))
4046 d.one_vector_p = true;
4047 op0 = op1;
4049 else
4050 d.one_vector_p = false;
4052 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
4053 sel.nelts_per_input ());
4054 d.vmode = vmode;
4055 d.op_mode = op_mode;
4056 d.target = target;
4057 d.op0 = op0;
4058 if (op0 == op1)
4059 d.op1 = d.op0;
4060 else
4061 d.op1 = op1;
4062 d.testing_p = !target;
4064 if (!d.testing_p)
4065 return expand_vec_perm_const_1 (&d);
4067 rtx_insn *last = get_last_insn ();
4068 bool ret = expand_vec_perm_const_1 (&d);
4069 gcc_assert (last == get_last_insn ());
4071 return ret;
4074 /* Generate no side effects vsetvl to get the vector length. */
4075 void
4076 expand_select_vl (rtx *ops)
4078 poly_int64 nunits = rtx_to_poly_int64 (ops[2]);
4079 if (CONST_INT_P (ops[1]) && known_le (INTVAL (ops[1]), nunits))
4081       /* If the length is known to be <= VF, we just use the length directly
4082	  instead of using vsetvli.
4084	  E.g. _255 = .SELECT_VL (3, POLY_INT_CST [4, 4]);
4085	  We move 3 into _255 instead of using an explicit vsetvl.  */
4086 emit_move_insn (ops[0], ops[1]);
4087 return;
4089   /* We arbitrarily pick QImode as the inner scalar mode to get a vector mode,
4090      since vsetvl only demands the ratio.  We let the VSETVL pass optimize it.  */
4091 scalar_int_mode mode = QImode;
4092 machine_mode rvv_mode = get_vector_mode (mode, nunits).require ();
4093 emit_insn (gen_no_side_effects_vsetvl_rtx (rvv_mode, ops[0], ops[1]));
4096 /* Return RVV_VUNDEF if the ELSE value is scratch rtx. */
4097 static rtx
4098 get_else_operand (rtx op)
4100 return GET_CODE (op) == SCRATCH ? RVV_VUNDEF (GET_MODE (op)) : op;
4103 /* Expand MASK_LEN_{LOAD,STORE}. */
4104 void
4105 expand_load_store (rtx *ops, bool is_load)
4107 int idx = 2;
4108 rtx mask = ops[idx++];
4109 /* A masked load has a merge/else operand. */
4110 if (is_load)
4111 get_else_operand (ops[idx++]);
4112 rtx len = ops[idx];
4113 machine_mode mode = GET_MODE (ops[0]);
4115 if (is_vlmax_len_p (mode, len))
4117 /* If the length operand is equal to VF, it is VLMAX load/store. */
4118 if (is_load)
4120 rtx m_ops[] = {ops[0], mask, ops[1]};
4121 emit_vlmax_insn (code_for_pred_mov (mode), UNARY_OP_TAMA, m_ops);
4123 else
4125 len = gen_reg_rtx (Pmode);
4126 emit_vlmax_vsetvl (mode, len);
4127 emit_insn (gen_pred_store (mode, ops[0], mask, ops[1], len,
4128 get_avl_type_rtx (VLMAX)));
4131 else
4133 if (!satisfies_constraint_K (len))
4134 len = force_reg (Pmode, len);
4135 if (is_load)
4137 rtx m_ops[] = {ops[0], mask, ops[1]};
4138 emit_nonvlmax_insn (code_for_pred_mov (mode), UNARY_OP_TAMA, m_ops,
4139 len);
4141 else
4142 emit_insn (gen_pred_store (mode, ops[0], mask, ops[1], len,
4143 get_avl_type_rtx (NONVLMAX)));
4147 /* Expand MASK_LEN_STRIDED_LOAD. */
4148 void
4149 expand_strided_load (machine_mode mode, rtx *ops)
4151 rtx v_reg = ops[0];
4152 rtx base = ops[1];
4153 rtx stride = ops[2];
4154 rtx mask = ops[3];
4155 int idx = 4;
4156 get_else_operand (ops[idx++]);
4157 rtx len = ops[idx];
4158 poly_int64 len_val;
4160 insn_code icode = code_for_pred_strided_load (mode);
4161 rtx emit_ops[] = {v_reg, mask, gen_rtx_MEM (mode, base), stride};
4163 if (poly_int_rtx_p (len, &len_val)
4164 && known_eq (len_val, GET_MODE_NUNITS (mode)))
4165 emit_vlmax_insn (icode, BINARY_OP_TAMA, emit_ops);
4166 else
4168 len = satisfies_constraint_K (len) ? len : force_reg (Pmode, len);
4169 emit_nonvlmax_insn (icode, BINARY_OP_TAMA, emit_ops, len);
4173 /* Expand MASK_LEN_STRIDED_STORE. */
4174 void
4175 expand_strided_store (machine_mode mode, rtx *ops)
4177 rtx v_reg = ops[2];
4178 rtx base = ops[0];
4179 rtx stride = ops[1];
4180 rtx mask = ops[3];
4181 rtx len = ops[4];
4182 poly_int64 len_val;
4183 rtx vl_type;
4185 if (poly_int_rtx_p (len, &len_val)
4186 && known_eq (len_val, GET_MODE_NUNITS (mode)))
4188 len = gen_reg_rtx (Pmode);
4189 emit_vlmax_vsetvl (mode, len);
4190 vl_type = get_avl_type_rtx (VLMAX);
4192 else
4194 len = satisfies_constraint_K (len) ? len : force_reg (Pmode, len);
4195 vl_type = get_avl_type_rtx (NONVLMAX);
4198 emit_insn (gen_pred_strided_store (mode, gen_rtx_MEM (mode, base),
4199 mask, stride, v_reg, len, vl_type));
4202 /* Return true if the operation is a floating-point operation that needs FRM.  */
4203 static bool
4204 needs_fp_rounding (unsigned icode, machine_mode mode)
4206 if (!FLOAT_MODE_P (mode))
4207 return false;
4209 return icode != maybe_code_for_pred (SMIN, mode)
4210 && icode != maybe_code_for_pred (UNSPEC_VFMIN, mode)
4211 && icode != maybe_code_for_pred (SMAX, mode)
4212 && icode != maybe_code_for_pred (UNSPEC_VFMAX, mode)
4213 && icode != maybe_code_for_pred (NEG, mode)
4214 && icode != maybe_code_for_pred (ABS, mode)
4215 /* narrower-FP -> FP */
4216 && icode != maybe_code_for_pred_extend (mode)
4217 /* narrower-INT -> FP */
4218 && icode != maybe_code_for_pred_widen (FLOAT, mode)
4219 && icode != maybe_code_for_pred_widen (UNSIGNED_FLOAT, mode)
4220 /* vfsgnj */
4221 && icode != maybe_code_for_pred (UNSPEC_VCOPYSIGN, mode)
4222 && icode != maybe_code_for_pred_mov (mode);
4225 /* Subroutine to expand COND_LEN_* patterns. */
4226 static void
4227 expand_cond_len_op (unsigned icode, insn_flags op_type, rtx *ops, rtx len)
4229 rtx dest = ops[0];
4230 rtx mask = ops[1];
4231 machine_mode mode = GET_MODE (dest);
4232 machine_mode mask_mode = GET_MODE (mask);
4233 bool is_dummy_mask = rtx_equal_p (mask, CONSTM1_RTX (mask_mode));
4234 bool is_vlmax_len = is_vlmax_len_p (mode, len);
4236 unsigned insn_flags = HAS_DEST_P | HAS_MASK_P | HAS_MERGE_P | op_type;
4237 /* FIXME: We don't support simplification of COND_LEN_NEG (..., dummy len,
4238 dummy mask) into NEG_EXPR in GIMPLE FOLD yet. So, we do such
4239 simplification in RISC-V backend and may do that in middle-end in the
4240 future. */
4241 if (is_dummy_mask && is_vlmax_len)
4242 insn_flags |= TDEFAULT_POLICY_P | MDEFAULT_POLICY_P;
4243 else if (is_dummy_mask)
4244 insn_flags |= TU_POLICY_P | MDEFAULT_POLICY_P;
4245 else if (is_vlmax_len)
4246 insn_flags |= TDEFAULT_POLICY_P | MU_POLICY_P;
4247 else
4248 insn_flags |= TU_POLICY_P | MU_POLICY_P;
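  /* In other words: a real (non all-ones) mask requires the mask-undisturbed
     policy and a partial length requires the tail-undisturbed policy, so that
     inactive and tail elements keep the merge operand's values; otherwise the
     default policies are used.  */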
4250 if (needs_fp_rounding (icode, mode))
4251 insn_flags |= FRM_DYN_P;
4253 if (is_vlmax_len)
4254 emit_vlmax_insn (icode, insn_flags, ops);
4255 else
4256 emit_nonvlmax_insn (icode, insn_flags, ops, len);
4259 /* Expand unary ops COND_LEN_*. */
4260 void
4261 expand_cond_len_unop (unsigned icode, rtx *ops)
4263 rtx dest = ops[0];
4264 rtx mask = ops[1];
4265 rtx src = ops[2];
4266 rtx merge = get_else_operand (ops[3]);
4267 rtx len = ops[4];
4269 rtx cond_ops[] = {dest, mask, merge, src};
4270 expand_cond_len_op (icode, UNARY_OP_P, cond_ops, len);
4273 /* Expand unary ops COND_*. */
4274 void
4275 expand_cond_unop (unsigned icode, rtx *ops)
4277 rtx dest = ops[0];
4278 rtx mask = ops[1];
4279 rtx src = ops[2];
4280 rtx merge = get_else_operand (ops[3]);
4281 rtx len = gen_int_mode (GET_MODE_NUNITS (GET_MODE (dest)), Pmode);
4283 rtx cond_ops[] = {dest, mask, merge, src};
4284 expand_cond_len_op (icode, UNARY_OP_P, cond_ops, len);
4287 /* Expand binary ops COND_LEN_*. */
4288 void
4289 expand_cond_len_binop (unsigned icode, rtx *ops)
4291 rtx dest = ops[0];
4292 rtx mask = ops[1];
4293 rtx src1 = ops[2];
4294 rtx src2 = ops[3];
4295 rtx merge = get_else_operand (ops[4]);
4296 rtx len = ops[5];
4298 rtx cond_ops[] = {dest, mask, merge, src1, src2};
4299 expand_cond_len_op (icode, BINARY_OP_P, cond_ops, len);
4302 /* Expand binary ops COND_*. */
4303 void
4304 expand_cond_binop (unsigned icode, rtx *ops)
4306 rtx dest = ops[0];
4307 rtx mask = ops[1];
4308 rtx src1 = ops[2];
4309 rtx src2 = ops[3];
4310 rtx merge = get_else_operand (ops[4]);
4311 rtx len = gen_int_mode (GET_MODE_NUNITS (GET_MODE (dest)), Pmode);
4313 rtx cond_ops[] = {dest, mask, merge, src1, src2};
4314 expand_cond_len_op (icode, BINARY_OP_P, cond_ops, len);
4317 /* Prepare insn_code for gather_load/scatter_store according to
4318 the vector mode and index mode. */
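/* For example (illustrative): an indexed load of SImode elements with HImode
   offsets has dst EEW 32 = 2 * src EEW 16 and therefore selects the x2
   "greater EEW" indexed-load pattern; equal EEWs select the same-EEW pattern
   and wider offsets select a "smaller EEW" one.  */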
4319 static insn_code
4320 prepare_gather_scatter (machine_mode vec_mode, machine_mode idx_mode,
4321 bool is_load)
4323 if (!is_load)
4324 return code_for_pred_indexed_store (UNSPEC_UNORDERED, vec_mode, idx_mode);
4325 else
4327 unsigned src_eew_bitsize = GET_MODE_BITSIZE (GET_MODE_INNER (idx_mode));
4328 unsigned dst_eew_bitsize = GET_MODE_BITSIZE (GET_MODE_INNER (vec_mode));
4329 if (dst_eew_bitsize == src_eew_bitsize)
4330 return code_for_pred_indexed_load_same_eew (UNSPEC_UNORDERED, vec_mode);
4331 else if (dst_eew_bitsize > src_eew_bitsize)
4333 unsigned factor = dst_eew_bitsize / src_eew_bitsize;
4334 switch (factor)
4336 case 2:
4337 return code_for_pred_indexed_load_x2_greater_eew (
4338 UNSPEC_UNORDERED, vec_mode);
4339 case 4:
4340 return code_for_pred_indexed_load_x4_greater_eew (
4341 UNSPEC_UNORDERED, vec_mode);
4342 case 8:
4343 return code_for_pred_indexed_load_x8_greater_eew (
4344 UNSPEC_UNORDERED, vec_mode);
4345 default:
4346 gcc_unreachable ();
4349 else
4351 unsigned factor = src_eew_bitsize / dst_eew_bitsize;
4352 switch (factor)
4354 case 2:
4355 return code_for_pred_indexed_load_x2_smaller_eew (
4356 UNSPEC_UNORDERED, vec_mode);
4357 case 4:
4358 return code_for_pred_indexed_load_x4_smaller_eew (
4359 UNSPEC_UNORDERED, vec_mode);
4360 case 8:
4361 return code_for_pred_indexed_load_x8_smaller_eew (
4362 UNSPEC_UNORDERED, vec_mode);
4363 default:
4364 gcc_unreachable ();
4370 /* Expand LEN_MASK_{GATHER_LOAD,SCATTER_STORE}. */
4371 void
4372 expand_gather_scatter (rtx *ops, bool is_load)
4374 rtx ptr, vec_offset, vec_reg;
4375 bool zero_extend_p;
4376 int shift;
4377 rtx mask = ops[5];
4378 rtx len = ops[6];
4379 if (is_load)
4380 len = ops[7];
4381 if (is_load)
4383 vec_reg = ops[0];
4384 ptr = ops[1];
4385 vec_offset = ops[2];
4386 zero_extend_p = INTVAL (ops[3]);
4387 shift = exact_log2 (INTVAL (ops[4]));
4389 else
4391 vec_reg = ops[4];
4392 ptr = ops[0];
4393 vec_offset = ops[1];
4394 zero_extend_p = INTVAL (ops[2]);
4395 shift = exact_log2 (INTVAL (ops[3]));
4398 machine_mode vec_mode = GET_MODE (vec_reg);
4399 machine_mode idx_mode = GET_MODE (vec_offset);
4400 scalar_mode inner_idx_mode = GET_MODE_INNER (idx_mode);
4401 unsigned inner_offsize = GET_MODE_BITSIZE (inner_idx_mode);
4402 poly_int64 nunits = GET_MODE_NUNITS (vec_mode);
4403 bool is_vlmax = is_vlmax_len_p (vec_mode, len);
4405 bool use_widening_shift = false;
4407   /* Extend the offset elements to the address width.  */
4408 if (inner_offsize < BITS_PER_WORD)
4410 use_widening_shift = TARGET_ZVBB && zero_extend_p && shift == 1;
4411 /* 7.2. Vector Load/Store Addressing Modes.
4412 If the vector offset elements are narrower than XLEN, they are
4413 zero-extended to XLEN before adding to the ptr effective address. If
4414 the vector offset elements are wider than XLEN, the least-significant
4415 XLEN bits are used in the address calculation. An implementation must
4416 raise an illegal instruction exception if the EEW is not supported for
4417 offset elements.
4419 RVV spec only refers to the shift == 0 case. */
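      /* For example (illustrative, RV64): a zero-extended HImode offset with
	 a nonzero shift is first widened to SImode (twice the offset width),
	 while a sign-extended one is widened to XLEN.  Without Zvbb the
	 widening is done by an explicit vector extend; with Zvbb, zero
	 extension and shift == 1, the later vwsll does the widening and the
	 shift in one instruction.  */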
4420 if (!zero_extend_p || shift)
4422 if (zero_extend_p)
4423 inner_idx_mode
4424 = int_mode_for_size (inner_offsize * 2, 0).require ();
4425 else
4426 inner_idx_mode = int_mode_for_size (BITS_PER_WORD, 0).require ();
4427 machine_mode new_idx_mode
4428 = get_vector_mode (inner_idx_mode, nunits).require ();
4429 if (!use_widening_shift)
4431 rtx tmp = gen_reg_rtx (new_idx_mode);
4432 emit_insn (gen_extend_insn (tmp, vec_offset, new_idx_mode, idx_mode,
4433 zero_extend_p ? true : false));
4434 vec_offset = tmp;
4436 idx_mode = new_idx_mode;
4440 if (shift)
4442 rtx tmp;
4443 if (!use_widening_shift)
4444 tmp = expand_binop (idx_mode, ashl_optab, vec_offset,
4445 gen_int_mode (shift, Pmode), NULL_RTX, 0,
4446 OPTAB_DIRECT);
4447 else
4449 tmp = gen_reg_rtx (idx_mode);
4450 insn_code icode = code_for_pred_vwsll_scalar (idx_mode);
4451 rtx ops[] = {tmp, vec_offset, const1_rtx};
4452 emit_vlmax_insn (icode, BINARY_OP, ops);
4455 vec_offset = tmp;
4458 insn_code icode = prepare_gather_scatter (vec_mode, idx_mode, is_load);
4459 if (is_vlmax)
4461 if (is_load)
4463 rtx load_ops[]
4464 = {vec_reg, mask, ptr, vec_offset};
4465 emit_vlmax_insn (icode, BINARY_OP_TAMA, load_ops);
4467 else
4469 rtx store_ops[] = {mask, ptr, vec_offset, vec_reg};
4470 emit_vlmax_insn (icode, SCATTER_OP_M, store_ops);
4473 else
4475 if (is_load)
4477 rtx load_ops[]
4478 = {vec_reg, mask, ptr, vec_offset};
4479 emit_nonvlmax_insn (icode, BINARY_OP_TAMA, load_ops, len);
4481 else
4483 rtx store_ops[] = {mask, ptr, vec_offset, vec_reg};
4484 emit_nonvlmax_insn (icode, SCATTER_OP_M, store_ops, len);
4489 /* Expand COND_LEN_*. */
4490 void
4491 expand_cond_len_ternop (unsigned icode, rtx *ops)
4493 rtx dest = ops[0];
4494 rtx mask = ops[1];
4495 rtx src1 = ops[2];
4496 rtx src2 = ops[3];
4497 rtx src3 = ops[4];
4498 rtx merge = get_else_operand (ops[5]);
4499 rtx len = ops[6];
4501 rtx cond_ops[] = {dest, mask, src1, src2, src3, merge};
4502 expand_cond_len_op (icode, TERNARY_OP_P, cond_ops, len);
4505 /* Expand COND_*. */
4506 void
4507 expand_cond_ternop (unsigned icode, rtx *ops)
4509 rtx dest = ops[0];
4510 rtx mask = ops[1];
4511 rtx src1 = ops[2];
4512 rtx src2 = ops[3];
4513 rtx src3 = ops[4];
4514 rtx merge = get_else_operand (ops[5]);
4515 rtx len = gen_int_mode (GET_MODE_NUNITS (GET_MODE (dest)), Pmode);
4517 rtx cond_ops[] = {dest, mask, src1, src2, src3, merge};
4518 expand_cond_len_op (icode, TERNARY_OP_P, cond_ops, len);
4521 /* Expand reduction operations.
4522 Case 1: ops = {scalar_dest, vector_src}
4523 Case 2: ops = {scalar_dest, vector_src, mask, vl}
4525 void
4526 expand_reduction (unsigned unspec, unsigned unspec_for_vl0_safe,
4527 unsigned insn_flags, rtx *ops, rtx init)
4529 rtx scalar_dest = ops[0];
4530 rtx vector_src = ops[1];
4531 machine_mode vmode = GET_MODE (vector_src);
4532 machine_mode vel_mode = GET_MODE (scalar_dest);
4533 machine_mode m1_mode = get_m1_mode (vel_mode).require ();
4534 rtx vl_op = NULL_RTX;
4535 bool need_vl0_safe = false;
4536 if (need_mask_operand_p (insn_flags))
4538 vl_op = ops[3];
4539 need_vl0_safe = !CONST_INT_P (vl_op) && !CONST_POLY_INT_P (vl_op);
4542 rtx m1_tmp = gen_reg_rtx (m1_mode);
4543 rtx scalar_move_ops[] = {m1_tmp, init};
4544 insn_code icode = code_for_pred_broadcast (m1_mode);
4545 if (need_mask_operand_p (insn_flags))
4547 if (need_vl0_safe)
4548 emit_nonvlmax_insn (icode, SCALAR_MOVE_OP, scalar_move_ops, const1_rtx);
4549 else
4550 emit_nonvlmax_insn (icode, SCALAR_MOVE_OP, scalar_move_ops, vl_op);
4552 else
4553 emit_vlmax_insn (icode, SCALAR_MOVE_OP, scalar_move_ops);
4555 rtx m1_tmp2 = gen_reg_rtx (m1_mode);
4556 rtx reduc_ops[] = {m1_tmp2, vector_src, m1_tmp};
4558 if (need_vl0_safe)
4559 icode = code_for_pred (unspec_for_vl0_safe, vmode);
4560 else
4561 icode = code_for_pred (unspec, vmode);
4563 if (need_mask_operand_p (insn_flags))
4565 rtx mask_len_reduc_ops[] = {m1_tmp2, ops[2], vector_src, m1_tmp};
4566 emit_nonvlmax_insn (icode, insn_flags, mask_len_reduc_ops, vl_op);
4568 else
4569 emit_vlmax_insn (icode, insn_flags, reduc_ops);
4571 emit_insn (gen_pred_extract_first (m1_mode, scalar_dest, m1_tmp2));
4574 /* Prepare ops for ternary operations.
4575 It can be called before or after RA. */
4576 void
4577 prepare_ternary_operands (rtx *ops)
4579 machine_mode mode = GET_MODE (ops[0]);
4581 if (!rtx_equal_p (ops[5], RVV_VUNDEF (mode))
4582 && (VECTOR_MODE_P (GET_MODE (ops[2]))
4583 && !rtx_equal_p (ops[2], ops[5]))
4584 && !rtx_equal_p (ops[3], ops[5])
4585 && !rtx_equal_p (ops[4], ops[5]))
4587       /* RA will fail to find a vector REG and report an ICE, so we pre-merge
4588	  the ops for LMUL = 8.  */
4589 if (satisfies_constraint_Wc1 (ops[1]))
4591 emit_move_insn (ops[0], ops[5]);
4592 emit_insn (gen_pred_mov (mode, ops[0], ops[1], ops[0], ops[4], ops[6],
4593 ops[7], ops[8], ops[9]));
4595 else
4596 emit_insn (gen_pred_merge (mode, ops[0], RVV_VUNDEF (mode), ops[5],
4597 ops[4], ops[1], ops[6], ops[7], ops[9]));
4598 ops[5] = ops[4] = ops[0];
4600 else
4602 /* Swap the multiplication ops if the fallback value is the
4603 second of the two. */
4604 if (rtx_equal_p (ops[3], ops[5]))
4605 std::swap (ops[2], ops[3]);
4607 /* TODO: ??? Maybe we could support splitting FMA (a, 4, b)
4608 into PLUS (ASHIFT (a, 2), b) according to uarchs. */
4610 gcc_assert (rtx_equal_p (ops[5], RVV_VUNDEF (mode))
4611 || rtx_equal_p (ops[5], ops[2]) || rtx_equal_p (ops[5], ops[4]));
4614 /* Expand VEC_MASK_LEN_{LOAD_LANES,STORE_LANES}. */
4615 void
4616 expand_lanes_load_store (rtx *ops, bool is_load)
4618 rtx mask = ops[2];
4619 rtx len = ops[3];
4620 if (is_load)
4621 len = ops[4];
4622 rtx addr = is_load ? XEXP (ops[1], 0) : XEXP (ops[0], 0);
4623 rtx reg = is_load ? ops[0] : ops[1];
4624 machine_mode mode = GET_MODE (ops[0]);
4626 if (is_vlmax_len_p (mode, len))
4628 /* If the length operand is equal to VF, it is VLMAX load/store. */
4629 if (is_load)
4631 rtx m_ops[] = {reg, mask, addr};
4632 emit_vlmax_insn (code_for_pred_unit_strided_load (mode), UNARY_OP_TAMA,
4633 m_ops);
4635 else
4637 len = gen_reg_rtx (Pmode);
4638 emit_vlmax_vsetvl (mode, len);
4639 emit_insn (gen_pred_unit_strided_store (mode, mask, addr, reg, len,
4640 get_avl_type_rtx (VLMAX)));
4643 else
4645 if (!satisfies_constraint_K (len))
4646 len = force_reg (Pmode, len);
4647 if (is_load)
4649 rtx m_ops[] = {reg, mask, addr};
4650 emit_nonvlmax_insn (code_for_pred_unit_strided_load (mode),
4651 UNARY_OP_TAMA, m_ops, len);
4653 else
4654 emit_insn (gen_pred_unit_strided_store (mode, mask, addr, reg, len,
4655 get_avl_type_rtx (NONVLMAX)));
4659 /* Expand LEN_FOLD_EXTRACT_LAST. */
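/* Illustrative sketch of the expansion: a mask popcount (vcpop.m) counts the
   active mask bits (within LEN if given); if the count is zero we branch to
   the default value, otherwise we vcompress the active elements to the front,
   slide the compressed vector down by count - 1 and extract element 0 with
   v(f)mv.[xf].s.  */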
4660 void
4661 expand_fold_extract_last (rtx *ops)
4663 rtx dst = ops[0];
4664 rtx default_value = ops[1];
4665 rtx mask = ops[2];
4666 rtx anchor = gen_reg_rtx (Pmode);
4667 rtx index = gen_reg_rtx (Pmode);
4668 rtx vect = ops[3];
4669 rtx else_label = gen_label_rtx ();
4670 rtx end_label = gen_label_rtx ();
4671 rtx len = ops[4];
4672 machine_mode mode = GET_MODE (vect);
4673 machine_mode mask_mode = GET_MODE (mask);
4674 rtx compress_vect = gen_reg_rtx (mode);
4675 rtx slide_vect = gen_reg_rtx (mode);
4676 insn_code icode;
4678 if (is_vlmax_len_p (mode, len))
4679 len = NULL_RTX;
4681   /* Calculate the number of 1 bits in the mask.  */
4682 rtx cpop_ops[] = {anchor, mask};
4683 if (len)
4684 emit_nonvlmax_insn (code_for_pred_popcount (mask_mode, Pmode), CPOP_OP,
4685 cpop_ops, len);
4686 else
4687 emit_vlmax_insn (code_for_pred_popcount (mask_mode, Pmode), CPOP_OP,
4688 cpop_ops);
4690 riscv_expand_conditional_branch (else_label, EQ, anchor, const0_rtx);
4691 emit_insn (gen_rtx_SET (index, gen_rtx_PLUS (Pmode, anchor, constm1_rtx)));
4692 /* Compress the vector. */
4693 icode = code_for_pred_compress (mode);
4694 rtx compress_ops[] = {compress_vect, vect, mask};
4695 if (len)
4696 emit_nonvlmax_insn (icode, COMPRESS_OP, compress_ops, len);
4697 else
4698 emit_vlmax_insn (icode, COMPRESS_OP, compress_ops);
4699 /* Emit the slide down to index 0 in a new vector. */
4700 rtx slide_ops[] = {slide_vect, compress_vect, index};
4701 icode = code_for_pred_slide (UNSPEC_VSLIDEDOWN, mode);
4702 if (len)
4703 emit_nonvlmax_insn (icode, BINARY_OP, slide_ops, len);
4704 else
4705 emit_vlmax_insn (icode, BINARY_OP, slide_ops);
4706 /* Emit v(f)mv.[xf].s. */
4707 emit_insn (gen_pred_extract_first (mode, dst, slide_vect));
4709 emit_jump_insn (gen_jump (end_label));
4710 emit_barrier ();
4711 emit_label (else_label);
4712 emit_move_insn (dst, default_value);
4713 emit_label (end_label);
4716 /* Return true if the LMUL of the comparison mode is less than or equal to one.  */
4717 bool
4718 cmp_lmul_le_one (machine_mode mode)
4720 if (riscv_v_ext_vector_mode_p (mode))
4721 return known_le (GET_MODE_SIZE (mode), BYTES_PER_RISCV_VECTOR);
4722 else if (riscv_v_ext_vls_mode_p (mode))
4723 return known_le (GET_MODE_BITSIZE (mode), TARGET_MIN_VLEN);
4724 return false;
4727 /* Return true if the LMUL of the comparison mode is greater than one.  */
4728 bool
4729 cmp_lmul_gt_one (machine_mode mode)
4731 if (riscv_v_ext_vector_mode_p (mode))
4732 return known_gt (GET_MODE_SIZE (mode), BYTES_PER_RISCV_VECTOR);
4733 else if (riscv_v_ext_vls_mode_p (mode))
4734 return known_gt (GET_MODE_BITSIZE (mode), TARGET_MIN_VLEN);
4735 return false;
4738 /* Return true if the VLS mode is legal.  There are 2 cases here.
4740    1. Enable VLS modes for VLA vectorization since the fixed-length VLMAX mode
4741       is the highest priority choice and should not conflict with VLS modes.
4742    2. Enable VLS modes for some cases in fixed-vlmax, i.e. when the bitsize of
4743       the VLS mode is smaller than the minimal VLA mode.
4745    Take vlen = 2048 as an example for case 2.
4747    Note: the table below is based on vlen = 2048.
4748 +----------------------------------------------------+----------------------+
4749 | VLS mode | VLA mode |
4750 +----------------------------------------------------+----------------------+
4751 | Name | Precision | Inner Precision | Enabled | Min mode | Min bits |
4752 +------------+-----------+-----------------+---------+-----------+----------+
4753 | V1BI | 1 | 1 | Yes | RVVMF64BI | 32 |
4754 | V2BI | 2 | 1 | Yes | RVVMF64BI | 32 |
4755 | V4BI | 4 | 1 | Yes | RVVMF64BI | 32 |
4756 | V8BI | 8 | 1 | Yes | RVVMF64BI | 32 |
4757 | V16BI | 16 | 1 | Yes | RVVMF64BI | 32 |
4758 | V32BI | 32 | 1 | NO | RVVMF64BI | 32 |
4759 | V64BI | 64 | 1 | NO | RVVMF64BI | 32 |
4760 | ... | ... | ... | ... | RVVMF64BI | 32 |
4761 | V4096BI | 4096 | 1 | NO | RVVMF64BI | 32 |
4762 +------------+-----------+-----------------+---------+-----------+----------+
4763 | V1QI | 8 | 8 | Yes | RVVMF8QI | 256 |
4764 | V2QI | 16 | 8 | Yes | RVVMF8QI | 256 |
4765 | V4QI | 32 | 8 | Yes | RVVMF8QI | 256 |
4766 | V8QI | 64 | 8 | Yes | RVVMF8QI | 256 |
4767 | V16QI | 128 | 8 | Yes | RVVMF8QI | 256 |
4768 | V32QI | 256 | 8 | NO | RVVMF8QI | 256 |
4769 | V64QI | 512 | 8 | NO | RVVMF8QI | 256 |
4770 | ... | ... | .. | ... | RVVMF8QI | 256 |
4771 | V4096QI | 32768 | 8 | NO | RVVMF8QI | 256 |
4772 +------------+-----------+-----------------+---------+-----------+----------+
4773 | V1HI | 16 | 16 | Yes | RVVMF4HI | 512 |
4774 | V2HI | 32 | 16 | Yes | RVVMF4HI | 512 |
4775 | V4HI | 64 | 16 | Yes | RVVMF4HI | 512 |
4776 | V8HI | 128 | 16 | Yes | RVVMF4HI | 512 |
4777 | V16HI | 256 | 16 | Yes | RVVMF4HI | 512 |
4778 | V32HI | 512 | 16 | NO | RVVMF4HI | 512 |
4779 | V64HI | 1024 | 16 | NO | RVVMF4HI | 512 |
4780 | ... | ... | .. | ... | RVVMF4HI | 512 |
4781 | V2048HI | 32768 | 16 | NO | RVVMF4HI | 512 |
4782 +------------+-----------+-----------------+---------+-----------+----------+
4783 | V1SI/SF | 32 | 32 | Yes | RVVMF2SI | 1024 |
4784 | V2SI/SF | 64 | 32 | Yes | RVVMF2SI | 1024 |
4785 | V4SI/SF | 128 | 32 | Yes | RVVMF2SI | 1024 |
4786 | V8SI/SF | 256 | 32 | Yes | RVVMF2SI | 1024 |
4787 | V16SI/SF | 512 | 32 | Yes | RVVMF2SI | 1024 |
4788 | V32SI/SF | 1024 | 32 | NO | RVVMF2SI | 1024 |
4789 | V64SI/SF | 2048 | 32 | NO | RVVMF2SI | 1024 |
4790 | ... | ... | .. | ... | RVVMF2SI | 1024 |
4791 | V1024SI/SF | 32768 | 32 | NO | RVVMF2SI | 1024 |
4792 +------------+-----------+-----------------+---------+-----------+----------+
4793 | V1DI/DF | 64 | 64 | Yes | RVVM1DI | 2048 |
4794 | V2DI/DF | 128 | 64 | Yes | RVVM1DI | 2048 |
4795 | V4DI/DF | 256 | 64 | Yes | RVVM1DI | 2048 |
4796 | V8DI/DF | 512 | 64 | Yes | RVVM1DI | 2048 |
4797 | V16DI/DF | 1024 | 64 | Yes | RVVM1DI | 2048 |
4798 | V32DI/DF | 2048 | 64 | NO | RVVM1DI | 2048 |
4799 | V64DI/DF | 4096 | 64 | NO | RVVM1DI | 2048 |
4800 | ... | ... | .. | ... | RVVM1DI | 2048 |
4801 | V512DI/DF | 32768 | 64 | NO | RVVM1DI | 2048 |
4802 +------------+-----------+-----------------+---------+-----------+----------+
4804    Then we can derive the condition for a VLS mode in fixed-vlmax, i.e.:
4805      PRECISION (VLSmode) < VLEN / (64 / PRECISION (VLS_inner_mode)).  */
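/* For example (a worked instance of the condition above, vlen = 2048):
   V16QI has PRECISION 128 and 2048 / (64 / 8) = 256, so 128 < 256 and the
   mode is enabled, while V32QI (PRECISION 256) is not, matching the table
   above.  */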
4806 bool
4807 vls_mode_valid_p (machine_mode vls_mode)
4809 if (!TARGET_VECTOR || TARGET_XTHEADVECTOR)
4810 return false;
4812 if (rvv_vector_bits == RVV_VECTOR_BITS_SCALABLE)
4814 if (GET_MODE_CLASS (vls_mode) != MODE_VECTOR_BOOL
4815 && !ordered_p (TARGET_MAX_LMUL * BITS_PER_RISCV_VECTOR,
4816 GET_MODE_PRECISION (vls_mode)))
4817       /* We enable VLS modes which are aligned with TARGET_MAX_LMUL and
4818	  BITS_PER_RISCV_VECTOR.
4820	  E.g. when TARGET_MAX_LMUL = 1 and BITS_PER_RISCV_VECTOR = (128, 128),
4821	  we enable VLS modes that have a fixed size <= 128 bits.  Since ordered_p
4822	  is false between VLA modes with size = (128, 128) bits and VLS modes
4823	  with size = 128 bits, we would otherwise end up with multiple ICEs in
4824	  middle-end generic code.  */
4825 return false;
4826 return true;
4829 if (rvv_vector_bits == RVV_VECTOR_BITS_ZVL)
4831 machine_mode inner_mode = GET_MODE_INNER (vls_mode);
4832 int precision = GET_MODE_PRECISION (inner_mode).to_constant ();
4833 int min_vlmax_bitsize = TARGET_MIN_VLEN / (64 / precision);
4835 return GET_MODE_PRECISION (vls_mode).to_constant () < min_vlmax_bitsize;
4838 return false;
4841 /* We don't have to convert a floating-point value to an integer when
4842    its fractional part is zero.  Thus, there is a limit for half, single
4843    and double precision floating point: values greater than or equal to
4844    the limit have no fractional part.
4846 1. Half floating point.
4847 +-----------+---------------+
4848 | float | binary layout |
4849 +-----------+---------------+
4850 | 1023.5 | 0x63ff |
4851 +-----------+---------------+
4852 | 1024.0 | 0x6400 |
4853 +-----------+---------------+
4854 | 1025.0 | 0x6401 |
4855 +-----------+---------------+
4856 | ... | ... |
4858    All half-precision floating-point values greater than or equal to
4859    1024 will be unchanged by ceil.
4861 2. Single floating point.
4862 +-----------+---------------+
4863 | float | binary layout |
4864 +-----------+---------------+
4865 | 8388607.5 | 0x4affffff |
4866 +-----------+---------------+
4867 | 8388608.0 | 0x4b000000 |
4868 +-----------+---------------+
4869 | 8388609.0 | 0x4b000001 |
4870 +-----------+---------------+
4871 | ... | ... |
4873    All single-precision floating-point values greater than or equal to
4874    8388608 will be unchanged by ceil.
4876 3. Double floating point.
4877 +--------------------+--------------------+
4878 | float | binary layout |
4879 +--------------------+--------------------+
4880 | 4503599627370495.5 | 0X432fffffffffffff |
4881 +--------------------+--------------------+
4882 | 4503599627370496.0 | 0X4330000000000000 |
4883 +--------------------+--------------------+
4884    | 4503599627370497.0 | 0X4330000000000001 |
4885 +--------------------+--------------------+
4886 | ... | ... |
4888    All double-precision floating-point values greater than or equal to
4889    4503599627370496 will be unchanged by ceil.
4892 get_fp_rounding_coefficient (machine_mode inner_mode)
4894 REAL_VALUE_TYPE real;
4896 if (inner_mode == E_HFmode)
4897 real_from_integer (&real, inner_mode, 1024, SIGNED);
4898 else if (inner_mode == E_SFmode)
4899 real_from_integer (&real, inner_mode, 8388608, SIGNED);
4900 else if (inner_mode == E_DFmode)
4901 real_from_integer (&real, inner_mode, 4503599627370496, SIGNED);
4902 else
4903 gcc_unreachable ();
4905 return const_double_from_real_value (real, inner_mode);
4908 static rtx
4909 emit_vec_float_cmp_mask (rtx fp_vector, rtx_code code, rtx fp_scalar,
4910 machine_mode vec_fp_mode)
4912 /* Step-1: Prepare the scalar float compare register. */
4913 rtx fp_reg = gen_reg_rtx (GET_MODE_INNER (vec_fp_mode));
4914 emit_insn (gen_move_insn (fp_reg, fp_scalar));
4916 /* Step-2: Generate the mask. */
4917 machine_mode mask_mode = get_mask_mode (vec_fp_mode);
4918 rtx mask = gen_reg_rtx (mask_mode);
4919 rtx cmp = gen_rtx_fmt_ee (code, mask_mode, fp_vector, fp_reg);
4920 rtx cmp_ops[] = {mask, cmp, fp_vector, fp_reg};
4921 insn_code icode = code_for_pred_cmp_scalar (vec_fp_mode);
4922 emit_vlmax_insn (icode, COMPARE_OP, cmp_ops);
4924 return mask;
4927 static void
4928 emit_vec_copysign (rtx op_dest, rtx op_src_0, rtx op_src_1,
4929 machine_mode vec_mode)
4931 rtx sgnj_ops[] = {op_dest, op_src_0, op_src_1};
4932 insn_code icode = code_for_pred (UNSPEC_VCOPYSIGN, vec_mode);
4934 emit_vlmax_insn (icode, BINARY_OP, sgnj_ops);
4937 static void
4938 emit_vec_abs (rtx op_dest, rtx op_src, machine_mode vec_mode)
4940 rtx abs_ops[] = {op_dest, op_src};
4941 insn_code icode = code_for_pred (ABS, vec_mode);
4943 emit_vlmax_insn (icode, UNARY_OP, abs_ops);
4946 static void
4947 emit_vec_cvt_x_f (rtx op_dest, rtx op_src, rtx mask,
4948 insn_type type, machine_mode vec_mode)
4950 insn_code icode = code_for_pred_fcvt_x_f (UNSPEC_VFCVT, vec_mode);
4952 if (type & USE_VUNDEF_MERGE_P)
4954 rtx cvt_x_ops[] = {op_dest, mask, op_src};
4955 emit_vlmax_insn (icode, type, cvt_x_ops);
4957 else
4959 rtx cvt_x_ops[] = {op_dest, mask, op_dest, op_src};
4960 emit_vlmax_insn (icode, type, cvt_x_ops);
4964 static void
4965 emit_vec_cvt_x_f (rtx op_dest, rtx op_src, insn_type type,
4966 machine_mode vec_mode)
4968 rtx ops[] = {op_dest, op_src};
4969 insn_code icode = code_for_pred_fcvt_x_f (UNSPEC_VFCVT, vec_mode);
4971 emit_vlmax_insn (icode, type, ops);
4974 static void
4975 emit_vec_narrow_cvt_x_f (rtx op_dest, rtx op_src, insn_type type,
4976 machine_mode vec_mode)
4978 rtx ops[] = {op_dest, op_src};
4979 insn_code icode = code_for_pred_narrow_fcvt_x_f (UNSPEC_VFCVT, vec_mode);
4981 emit_vlmax_insn (icode, type, ops);
4984 static void
4985 emit_vec_widen_cvt_x_f (rtx op_dest, rtx op_src, insn_type type,
4986 machine_mode vec_mode)
4988 rtx ops[] = {op_dest, op_src};
4989 insn_code icode = code_for_pred_widen_fcvt_x_f (UNSPEC_VFCVT, vec_mode);
4991 emit_vlmax_insn (icode, type, ops);
4994 static void
4995 emit_vec_widen_cvt_f_f (rtx op_dest, rtx op_src, insn_type type,
4996 machine_mode vec_mode)
4998 rtx ops[] = {op_dest, op_src};
4999 insn_code icode = code_for_pred_extend (vec_mode);
5001 emit_vlmax_insn (icode, type, ops);
5004 static void
5005 emit_vec_cvt_f_x (rtx op_dest, rtx op_src, rtx mask,
5006 insn_type type, machine_mode vec_mode)
5008 rtx cvt_fp_ops[] = {op_dest, mask, op_dest, op_src};
5009 insn_code icode = code_for_pred (FLOAT, vec_mode);
5011 emit_vlmax_insn (icode, type, cvt_fp_ops);
5014 static void
5015 emit_vec_cvt_x_f_rtz (rtx op_dest, rtx op_src, rtx mask,
5016 insn_type type, machine_mode vec_mode)
5018 insn_code icode = code_for_pred (FIX, vec_mode);
5020 if (type & USE_VUNDEF_MERGE_P)
5022 rtx cvt_x_ops[] = {op_dest, mask, op_src};
5023 emit_vlmax_insn (icode, type, cvt_x_ops);
5025 else
5027 rtx cvt_x_ops[] = {op_dest, mask, op_dest, op_src};
5028 emit_vlmax_insn (icode, type, cvt_x_ops);
5032 static void
5033 emit_vec_binary_alu (rtx op_dest, rtx op_1, rtx op_2, enum rtx_code rcode,
5034 machine_mode vec_mode)
5036 rtx ops[] = {op_dest, op_1, op_2};
5037 insn_code icode = code_for_pred (rcode, vec_mode);
5039 emit_vlmax_insn (icode, BINARY_OP, ops);
5042 void
5043 expand_vec_ceil (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
5044 machine_mode vec_int_mode)
5046 /* Step-1: Get the abs float value for mask generation. */
5047 emit_vec_abs (op_0, op_1, vec_fp_mode);
5049 /* Step-2: Generate the mask on const fp. */
5050 rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode));
5051 rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode);
5053 /* Step-3: Convert to integer on mask, with rounding up (aka ceil). */
5054 rtx tmp = gen_reg_rtx (vec_int_mode);
5055 emit_vec_cvt_x_f (tmp, op_1, mask, UNARY_OP_TAMA_FRM_RUP, vec_fp_mode);
5057 /* Step-4: Convert to floating-point on mask for the final result.
5058 To avoid unnecessary frm register access, we use RUP here and it will
5059 never do the rounding up because the tmp rtx comes from the float
5060 to int conversion. */
5061 emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_RUP, vec_fp_mode);
5063 /* Step-5: Retrieve the sign bit for -0.0. */
5064 emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode);
5067 void
5068 expand_vec_floor (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
5069 machine_mode vec_int_mode)
5071 /* Step-1: Get the abs float value for mask generation. */
5072 emit_vec_abs (op_0, op_1, vec_fp_mode);
5074 /* Step-2: Generate the mask on const fp. */
5075 rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode));
5076 rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode);
5078 /* Step-3: Convert to integer on mask, with rounding down (aka floor). */
5079 rtx tmp = gen_reg_rtx (vec_int_mode);
5080 emit_vec_cvt_x_f (tmp, op_1, mask, UNARY_OP_TAMA_FRM_RDN, vec_fp_mode);
5082 /* Step-4: Convert to floating-point on mask for the floor result. */
5083 emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_RDN, vec_fp_mode);
5085 /* Step-5: Retrieve the sign bit for -0.0. */
5086 emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode);
5089 void
5090 expand_vec_nearbyint (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
5091 machine_mode vec_int_mode)
5093 /* Step-1: Get the abs float value for mask generation. */
5094 emit_vec_abs (op_0, op_1, vec_fp_mode);
5096 /* Step-2: Generate the mask on const fp. */
5097 rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode));
5098 rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode);
5100   /* Step-3: Back up the FP exception flags; nearbyint never raises exceptions.  */
5101 rtx fflags = gen_reg_rtx (SImode);
5102 emit_insn (gen_riscv_frflags (fflags));
5104   /* Step-4: Convert to integer on mask, with the dynamic rounding mode (aka nearbyint).  */
5105 rtx tmp = gen_reg_rtx (vec_int_mode);
5106 emit_vec_cvt_x_f (tmp, op_1, mask, UNARY_OP_TAMA_FRM_DYN, vec_fp_mode);
5108 /* Step-5: Convert to floating-point on mask for the nearbyint result. */
5109 emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_DYN, vec_fp_mode);
5111 /* Step-6: Restore FP exception flags. */
5112 emit_insn (gen_riscv_fsflags (fflags));
5114 /* Step-7: Retrieve the sign bit for -0.0. */
5115 emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode);
5118 void
5119 expand_vec_rint (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
5120 machine_mode vec_int_mode)
5122 /* Step-1: Get the abs float value for mask generation. */
5123 emit_vec_abs (op_0, op_1, vec_fp_mode);
5125 /* Step-2: Generate the mask on const fp. */
5126 rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode));
5127 rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode);
5129 /* Step-3: Convert to integer on mask, with dyn rounding (aka rint). */
5130 rtx tmp = gen_reg_rtx (vec_int_mode);
5131 emit_vec_cvt_x_f (tmp, op_1, mask, UNARY_OP_TAMA_FRM_DYN, vec_fp_mode);
5133 /* Step-4: Convert to floating-point on mask for the rint result. */
5134 emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_DYN, vec_fp_mode);
5136 /* Step-5: Retrieve the sign bit for -0.0. */
5137 emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode);
5140 void
5141 expand_vec_round (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
5142 machine_mode vec_int_mode)
5144 /* Step-1: Get the abs float value for mask generation. */
5145 emit_vec_abs (op_0, op_1, vec_fp_mode);
5147 /* Step-2: Generate the mask on const fp. */
5148 rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode));
5149 rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode);
5151 /* Step-3: Convert to integer on mask, rounding to nearest (aka round). */
5152 rtx tmp = gen_reg_rtx (vec_int_mode);
5153 emit_vec_cvt_x_f (tmp, op_1, mask, UNARY_OP_TAMA_FRM_RMM, vec_fp_mode);
5155 /* Step-4: Convert to floating-point on mask for the round result. */
5156 emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_RMM, vec_fp_mode);
5158 /* Step-5: Retrieve the sign bit for -0.0. */
5159 emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode);
5162 void
5163 expand_vec_trunc (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
5164 machine_mode vec_int_mode)
5166 /* Step-1: Get the abs float value for mask generation. */
5167 emit_vec_abs (op_0, op_1, vec_fp_mode);
5169 /* Step-2: Generate the mask on const fp. */
5170 rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode));
5171 rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode);
5173 /* Step-3: Convert to integer on mask, rounding to zero (aka truncate). */
5174 rtx tmp = gen_reg_rtx (vec_int_mode);
5175 emit_vec_cvt_x_f_rtz (tmp, op_1, mask, UNARY_OP_TAMA, vec_fp_mode);
5177   /* Step-4: Convert to floating-point on mask for the truncation result.  */
5178 emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_DYN, vec_fp_mode);
5180 /* Step-5: Retrieve the sign bit for -0.0. */
5181 emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode);
5184 void
5185 expand_vec_roundeven (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
5186 machine_mode vec_int_mode)
5188 /* Step-1: Get the abs float value for mask generation. */
5189 emit_vec_abs (op_0, op_1, vec_fp_mode);
5191 /* Step-2: Generate the mask on const fp. */
5192 rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode));
5193 rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode);
5195 /* Step-3: Convert to integer on mask, rounding to nearest, ties to even. */
5196 rtx tmp = gen_reg_rtx (vec_int_mode);
5197 emit_vec_cvt_x_f (tmp, op_1, mask, UNARY_OP_TAMA_FRM_RNE, vec_fp_mode);
5199   /* Step-4: Convert to floating-point on mask for the roundeven result.  */
5200 emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_RNE, vec_fp_mode);
5202 /* Step-5: Retrieve the sign bit for -0.0. */
5203 emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode);
5206 /* Handle the rounding conversion from floating-point to int/long/long long.  */
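/* For example (illustrative): DFmode -> SImode uses the narrowing convert,
   SFmode -> DImode and HFmode -> SImode use the widening one, and
   HFmode -> DImode first widens HF -> SF through the bridge mode
   (vec_bridge_mode) and then widens SF -> DI.  */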
5207 static void
5208 emit_vec_rounding_to_integer (rtx op_0, rtx op_1, insn_type type,
5209 machine_mode vec_fp_mode,
5210 machine_mode vec_int_mode,
5211 machine_mode vec_bridge_mode = E_VOIDmode)
5213 poly_uint16 vec_fp_size = GET_MODE_SIZE (vec_fp_mode);
5214 poly_uint16 vec_int_size = GET_MODE_SIZE (vec_int_mode);
5216 if (known_eq (vec_fp_size, vec_int_size)) /* SF => SI, DF => DI. */
5217 emit_vec_cvt_x_f (op_0, op_1, type, vec_fp_mode);
5218 else if (maybe_eq (vec_fp_size, vec_int_size * 2)) /* DF => SI. */
5219 emit_vec_narrow_cvt_x_f (op_0, op_1, type, vec_fp_mode);
5220 else if (maybe_eq (vec_fp_size * 2, vec_int_size)) /* SF => DI, HF => SI. */
5221 emit_vec_widen_cvt_x_f (op_0, op_1, type, vec_int_mode);
5222 else if (maybe_eq (vec_fp_size * 4, vec_int_size)) /* HF => DI. */
5224 gcc_assert (vec_bridge_mode != E_VOIDmode);
5226 rtx op_sf = gen_reg_rtx (vec_bridge_mode);
5228 /* Step-1: HF => SF, no rounding here. */
5229 emit_vec_widen_cvt_f_f (op_sf, op_1, UNARY_OP, vec_bridge_mode);
5230 /* Step-2: SF => DI. */
5231 emit_vec_widen_cvt_x_f (op_0, op_sf, type, vec_int_mode);
5233 else
5234 gcc_unreachable ();
5237 void
5238 expand_vec_lrint (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
5239 machine_mode vec_int_mode, machine_mode vec_bridge_mode)
5241 emit_vec_rounding_to_integer (op_0, op_1, UNARY_OP_FRM_DYN, vec_fp_mode,
5242 vec_int_mode, vec_bridge_mode);
5245 void
5246 expand_vec_lround (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
5247 machine_mode vec_int_mode, machine_mode vec_bridge_mode)
5249 emit_vec_rounding_to_integer (op_0, op_1, UNARY_OP_FRM_RMM, vec_fp_mode,
5250 vec_int_mode, vec_bridge_mode);
5253 void
5254 expand_vec_lceil (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
5255 machine_mode vec_int_mode)
5257 emit_vec_rounding_to_integer (op_0, op_1, UNARY_OP_FRM_RUP, vec_fp_mode,
5258 vec_int_mode);
5261 void
5262 expand_vec_lfloor (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
5263 machine_mode vec_int_mode)
5265 emit_vec_rounding_to_integer (op_0, op_1, UNARY_OP_FRM_RDN, vec_fp_mode,
5266 vec_int_mode);
5269 /* Expand the standard name usadd<mode>3 for vector modes.  We can leverage
5270    the vector fixed-point single-width saturating add directly.  */
5272 void
5273 expand_vec_usadd (rtx op_0, rtx op_1, rtx op_2, machine_mode vec_mode)
5275 emit_vec_binary_alu (op_0, op_1, op_2, US_PLUS, vec_mode);
5278 /* Expand the standard name ssadd<mode>3 for vector modes.  We can leverage
5279    the vector fixed-point single-width saturating add directly.  */
5281 void
5282 expand_vec_ssadd (rtx op_0, rtx op_1, rtx op_2, machine_mode vec_mode)
5284 emit_vec_binary_alu (op_0, op_1, op_2, SS_PLUS, vec_mode);
5287 /* Expand the standard name ussub<mode>3 for vector modes.  We can leverage
5288    the vector fixed-point single-width saturating subtract directly.  */
5290 void
5291 expand_vec_ussub (rtx op_0, rtx op_1, rtx op_2, machine_mode vec_mode)
5293 emit_vec_binary_alu (op_0, op_1, op_2, US_MINUS, vec_mode);
5296 /* Expand the standard name sssub<mode>3 for vector modes.  We can leverage
5297    the vector fixed-point single-width saturating subtract directly.  */
5299 void
5300 expand_vec_sssub (rtx op_0, rtx op_1, rtx op_2, machine_mode vec_mode)
5302 emit_vec_binary_alu (op_0, op_1, op_2, SS_MINUS, vec_mode);
5305 /* Expand the standard name ustrunc<m><n>2 for double vector mode, like
5306    DI => SI.  We can leverage the vector narrowing fixed-point clip
5307    directly.  */
5309 void
5310 expand_vec_double_ustrunc (rtx op_0, rtx op_1, machine_mode vec_mode)
5312 insn_code icode;
5313 rtx zero = CONST0_RTX (Xmode);
5314 enum unspec unspec = UNSPEC_VNCLIPU;
5315 rtx ops[] = {op_0, op_1, zero};
5317 icode = code_for_pred_narrow_clip_scalar (unspec, vec_mode);
5318 emit_vlmax_insn (icode, BINARY_OP_VXRM_RNU, ops);
5321 /* Expand the standard name sstrunc<m><n>2 for double vector mode, like
5322    DI => SI.  We can leverage the vector narrowing fixed-point clip
5323    directly.  */
5325 void
5326 expand_vec_double_sstrunc (rtx op_0, rtx op_1, machine_mode vec_mode)
5328 insn_code icode;
5329 rtx zero = CONST0_RTX (Xmode);
5330 enum unspec unspec = UNSPEC_VNCLIP;
5331 rtx ops[] = {op_0, op_1, zero};
5333 icode = code_for_pred_narrow_clip_scalar (unspec, vec_mode);
5334 emit_vlmax_insn (icode, BINARY_OP_VXRM_RNU, ops);
5337 /* Expand the standard name ustrunc<m><n>2 for quad vector mode, like
5338    DI => HI.  We can leverage the vector narrowing fixed-point clip
5339    directly.  */
5341 void
5342 expand_vec_quad_ustrunc (rtx op_0, rtx op_1, machine_mode vec_mode,
5343 machine_mode double_mode)
5345 rtx double_rtx = gen_reg_rtx (double_mode);
5347 expand_vec_double_ustrunc (double_rtx, op_1, vec_mode);
5348 expand_vec_double_ustrunc (op_0, double_rtx, double_mode);
5351 /* Expand the standard name sstrunc<m><n>2 for quad vector mode, like
5352    DI => HI.  We can leverage the vector narrowing fixed-point clip
5353    directly.  */
5355 void
5356 expand_vec_quad_sstrunc (rtx op_0, rtx op_1, machine_mode vec_mode,
5357 machine_mode double_mode)
5359 rtx double_rtx = gen_reg_rtx (double_mode);
5361 expand_vec_double_sstrunc (double_rtx, op_1, vec_mode);
5362 expand_vec_double_sstrunc (op_0, double_rtx, double_mode);
5365 /* Expand the standard name ustrunc<m><n>2 for oct vector mode, like
5366    DI => QI.  We can leverage the vector narrowing fixed-point clip
5367    directly.  */
5369 void
5370 expand_vec_oct_ustrunc (rtx op_0, rtx op_1, machine_mode vec_mode,
5371 machine_mode double_mode, machine_mode quad_mode)
5373 rtx double_rtx = gen_reg_rtx (double_mode);
5374 rtx quad_rtx = gen_reg_rtx (quad_mode);
5376 expand_vec_double_ustrunc (double_rtx, op_1, vec_mode);
5377 expand_vec_double_ustrunc (quad_rtx, double_rtx, double_mode);
5378 expand_vec_double_ustrunc (op_0, quad_rtx, quad_mode);
5381 /* Expand the standard name sstrunc<m><n>2 for oct vector mode, like
5382    DI => QI.  We can leverage the vector narrowing fixed-point clip
5383    directly.  */
5385 void
5386 expand_vec_oct_sstrunc (rtx op_0, rtx op_1, machine_mode vec_mode,
5387 machine_mode double_mode, machine_mode quad_mode)
5389 rtx double_rtx = gen_reg_rtx (double_mode);
5390 rtx quad_rtx = gen_reg_rtx (quad_mode);
5392 expand_vec_double_sstrunc (double_rtx, op_1, vec_mode);
5393 expand_vec_double_sstrunc (quad_rtx, double_rtx, double_mode);
5394 expand_vec_double_sstrunc (op_0, quad_rtx, quad_mode);
5397 /* Vectorize popcount by the Wilkes-Wheeler-Gill algorithm that libgcc uses as
5398 well. */
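/* For example (illustrative, one 64-bit element with src = 0xff):
   x1 = 0xff - (0x7f & 0x55...55) = 0xaa, x2 = 0x22 + 0x22 = 0x44,
   x3 = (0x44 + 0x04) & 0x0f...0f = 0x08, and
   (0x08 * 0x0101010101010101) >> 56 = 8, the popcount.  */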
5399 void
5400 expand_popcount (rtx *ops)
5402 rtx dst = ops[0];
5403 rtx src = ops[1];
5404 machine_mode mode = GET_MODE (dst);
5405 scalar_mode imode = GET_MODE_INNER (mode);
5406 static const uint64_t m5 = 0x5555555555555555ULL;
5407 static const uint64_t m3 = 0x3333333333333333ULL;
5408 static const uint64_t mf = 0x0F0F0F0F0F0F0F0FULL;
5409 static const uint64_t m1 = 0x0101010101010101ULL;
5411 rtx x1 = gen_reg_rtx (mode);
5412 rtx x2 = gen_reg_rtx (mode);
5413 rtx x3 = gen_reg_rtx (mode);
5414 rtx x4 = gen_reg_rtx (mode);
5416   /* x1 = src - ((src >> 1) & 0x5555555555555555ULL);  */
5417 rtx shift1 = expand_binop (mode, lshr_optab, src, GEN_INT (1), NULL, true,
5418 OPTAB_DIRECT);
5420 rtx and1 = gen_reg_rtx (mode);
5421 rtx ops1[] = {and1, shift1, gen_int_mode (m5, imode)};
5422 emit_vlmax_insn (code_for_pred_scalar (AND, mode), riscv_vector::BINARY_OP,
5423 ops1);
5425 x1 = expand_binop (mode, sub_optab, src, and1, NULL, true, OPTAB_DIRECT);
5427 /* x2 = (x1 & 0x3333333333333333ULL) + ((x1 >> 2) & 0x3333333333333333ULL);
5429 rtx and2 = gen_reg_rtx (mode);
5430 rtx ops2[] = {and2, x1, gen_int_mode (m3, imode)};
5431 emit_vlmax_insn (code_for_pred_scalar (AND, mode), riscv_vector::BINARY_OP,
5432 ops2);
5434 rtx shift2 = expand_binop (mode, lshr_optab, x1, GEN_INT (2), NULL, true,
5435 OPTAB_DIRECT);
5437 rtx and22 = gen_reg_rtx (mode);
5438 rtx ops22[] = {and22, shift2, gen_int_mode (m3, imode)};
5439 emit_vlmax_insn (code_for_pred_scalar (AND, mode), riscv_vector::BINARY_OP,
5440 ops22);
5442 x2 = expand_binop (mode, add_optab, and2, and22, NULL, true, OPTAB_DIRECT);
5444 /* x3 = (x2 + (x2 >> 4)) & 0x0f0f0f0f0f0f0f0fULL; */
5445 rtx shift3 = expand_binop (mode, lshr_optab, x2, GEN_INT (4), NULL, true,
5446 OPTAB_DIRECT);
5448 rtx plus3
5449 = expand_binop (mode, add_optab, x2, shift3, NULL, true, OPTAB_DIRECT);
5451 rtx ops3[] = {x3, plus3, gen_int_mode (mf, imode)};
5452 emit_vlmax_insn (code_for_pred_scalar (AND, mode), riscv_vector::BINARY_OP,
5453 ops3);
5455 /* dest = (x3 * 0x0101010101010101ULL) >> 56; */
5456 rtx mul4 = gen_reg_rtx (mode);
5457 rtx ops4[] = {mul4, x3, gen_int_mode (m1, imode)};
5458 emit_vlmax_insn (code_for_pred_scalar (MULT, mode), riscv_vector::BINARY_OP,
5459 ops4);
5461 x4 = expand_binop (mode, lshr_optab, mul4,
5462 GEN_INT (GET_MODE_BITSIZE (imode) - 8), NULL, true,
5463 OPTAB_DIRECT);
5465 emit_move_insn (dst, x4);
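
/* For reference (illustrative only), the scalar form of the reduction
   emitted above, written for a 64-bit element:

     x -= (x >> 1) & 0x5555555555555555ULL;
     x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL);
     x = (x + (x >> 4)) & 0x0F0F0F0F0F0F0F0FULL;
     return (x * 0x0101010101010101ULL) >> 56;

   Each line corresponds to one of the x1/x2/x3/x4 steps, with the shifts,
   ANDs and the final multiply performed element-wise on the vector.  */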
/* Return true if the AVL type of RINSN is VLMAX.  */
bool
vlmax_avl_type_p (rtx_insn *rinsn)
{
  extract_insn_cached (rinsn);
  int index = get_attr_avl_type_idx (rinsn);
  if (index == INVALID_ATTRIBUTE)
    return false;

  gcc_assert (index < recog_data.n_operands);

  rtx avl_type = recog_data.operand[index];
  return INTVAL (avl_type) == VLMAX;
}
/* Return true if RINSN is an RVV instruction that depends on the VL global
   status register.  */
bool
has_vl_op (rtx_insn *rinsn)
{
  return recog_memoized (rinsn) >= 0 && get_attr_has_vl_op (rinsn);
}
/* Get the default tail policy.  */
static bool
get_default_ta ()
{
  /* For an instruction that doesn't require TA, we still need a default value
     to emit a vsetvl.  We pick the default value according to the preferred
     tail policy.  */
  return (bool) (get_prefer_tail_policy () & 0x1
		 || (get_prefer_tail_policy () >> 1 & 0x1));
}
/* Helper function to get the TA operand.  */
bool
tail_agnostic_p (rtx_insn *rinsn)
{
  /* If it doesn't have TA, we return agnostic by default.  */
  extract_insn_cached (rinsn);
  int ta = get_attr_ta (rinsn);
  return ta == INVALID_ATTRIBUTE ? get_default_ta () : IS_AGNOSTIC (ta);
}
/* Change the insn and assert that the change always succeeds.  */
void
validate_change_or_fail (rtx object, rtx *loc, rtx new_rtx, bool in_group)
{
  bool change_p = validate_change (object, loc, new_rtx, in_group);
  gcc_assert (change_p);
}
/* Return true if the AVL type of RINSN is NONVLMAX.  */
bool
nonvlmax_avl_type_p (rtx_insn *rinsn)
{
  extract_insn_cached (rinsn);
  int index = get_attr_avl_type_idx (rinsn);
  if (index == INVALID_ATTRIBUTE)
    return false;

  gcc_assert (index < recog_data.n_operands);

  rtx avl_type = recog_data.operand[index];
  return INTVAL (avl_type) == NONVLMAX;
}
/* Return true if X is the RVV VLMAX AVL.  */
bool
vlmax_avl_p (rtx x)
{
  return x && rtx_equal_p (x, RVV_VLMAX);
}
/* Helper function to get the SEW operand.  We always have a SEW value for
   every RVV instruction that has a VTYPE OP.  */
uint8_t
get_sew (rtx_insn *rinsn)
{
  return get_attr_sew (rinsn);
}
/* Helper function to get the VLMUL operand.  We always have a VLMUL value for
   every RVV instruction that has a VTYPE OP.  */
enum vlmul_type
get_vlmul (rtx_insn *rinsn)
{
  return (enum vlmul_type) get_attr_vlmul (rinsn);
}
/* Count the number of occurrences of REGNO in RINSN.  */
int
count_regno_occurrences (rtx_insn *rinsn, unsigned int regno)
{
  int count = 0;
  extract_insn (rinsn);
  for (int i = 0; i < recog_data.n_operands; i++)
    if (refers_to_regno_p (regno, recog_data.operand[i]))
      count++;
  return count;
}
/* Return true if OP can be broadcast directly.  */
bool
can_be_broadcasted_p (rtx op)
{
  machine_mode mode = GET_MODE (op);
  /* We don't allow the RA (register allocation) reload to generate
     (vec_duplicate:DI reg) on an RV32 system, whereas we do allow
     (vec_duplicate:DI mem).  */
  if (!can_create_pseudo_p () && !FLOAT_MODE_P (mode)
      && maybe_gt (GET_MODE_SIZE (mode), GET_MODE_SIZE (Pmode))
      && !satisfies_constraint_Wdm (op))
    return false;

  if (satisfies_constraint_K (op) || register_operand (op, mode)
      || satisfies_constraint_Wdm (op) || rtx_equal_p (op, CONST0_RTX (mode)))
    return true;

  return can_create_pseudo_p () && nonmemory_operand (op, mode);
}
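
/* Extract the element (or subvector) of SRC at position INDEX into TARGET
   using the vec_extract optab.  */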
void
emit_vec_extract (rtx target, rtx src, rtx index)
{
  machine_mode vmode = GET_MODE (src);
  machine_mode smode = GET_MODE (target);
  class expand_operand ops[3];
  enum insn_code icode
    = convert_optab_handler (vec_extract_optab, vmode, smode);
  gcc_assert (icode != CODE_FOR_nothing);
  create_output_operand (&ops[0], target, smode);
  ops[0].target = 1;
  create_input_operand (&ops[1], src, vmode);

  poly_int64 val;
  if (poly_int_rtx_p (index, &val))
    create_integer_operand (&ops[2], val);
  else
    create_input_operand (&ops[2], index, Pmode);

  expand_insn (icode, 3, ops);
  if (ops[0].value != target)
    emit_move_insn (target, ops[0].value);
}
/* Return true if MODE is a valid offset mode for gather/scatter
   autovectorization.  */
bool
gather_scatter_valid_offset_p (machine_mode mode)
{
  /* If the element size of the offset mode is already >= Pmode size,
     we don't need any extension.  */
  if (known_ge (GET_MODE_SIZE (GET_MODE_INNER (mode)), UNITS_PER_WORD))
    return true;

  /* Since we will very likely have to extend the offset mode into a vector
     of Pmode elements, disable gather/scatter autovectorization if no such
     vector mode exists.  */
  if (!get_vector_mode (Pmode, GET_MODE_NUNITS (mode)).exists ())
    return false;
  return true;
}
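
/* For example (illustrative): on RV64 a vector of SImode offsets is only
   considered valid if a vector of Pmode (DImode) elements with the same
   number of units exists, since the narrower offsets would need to be
   extended before use.  */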
/* Implement TARGET_ESTIMATED_POLY_VALUE.
   Look into the tuning structure for an estimate.
   KIND specifies the type of requested estimate: min, max or likely.
   For cores with a known VLA width all three estimates are the same.
   For generic VLA tuning we want to distinguish the maximum estimate from
   the minimum and likely ones.
   The likely estimate is the same as the minimum in that case to give a
   conservative behavior of auto-vectorizing with VLA when it is a win
   even for VLA vectorization.
   When VLA width information is available VAL.coeffs[1] is multiplied by
   the number of VLA chunks over the initial VLS bits.  */
HOST_WIDE_INT
estimated_poly_value (poly_int64 val, unsigned int kind)
{
  unsigned int width_source
    = BITS_PER_RISCV_VECTOR.is_constant ()
	? (unsigned int) BITS_PER_RISCV_VECTOR.to_constant ()
	: (unsigned int) RVV_VECTOR_BITS_SCALABLE;

  /* If there is no core-specific information then the minimum and likely
     values are based on TARGET_MIN_VLEN vectors and the maximum is based on
     the architectural maximum of 65536 bits.  */
  unsigned int min_vlen_bytes = TARGET_MIN_VLEN / 8 - 1;
  if (width_source == RVV_VECTOR_BITS_SCALABLE)
    switch (kind)
      {
      case POLY_VALUE_MIN:
      case POLY_VALUE_LIKELY:
	return val.coeffs[0];

      case POLY_VALUE_MAX:
	return val.coeffs[0] + val.coeffs[1] * min_vlen_bytes;
      }

  /* Allow BITS_PER_RISCV_VECTOR to be a bitmask of different VLs, treating
     the lowest as likely.  This could be made more general if future -mtune
     options need it to be.  */
  if (kind == POLY_VALUE_MAX)
    width_source = 1 << floor_log2 (width_source);
  else
    width_source = least_bit_hwi (width_source);

  /* If the core provides width information, use that.  */
  HOST_WIDE_INT over_min_vlen = width_source - TARGET_MIN_VLEN;
  return val.coeffs[0] + val.coeffs[1] * over_min_vlen / TARGET_MIN_VLEN;
}
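
/* Worked example (illustrative only): with TARGET_MIN_VLEN == 128 and a
   tuning structure reporting a fixed 256-bit vector width, a poly_int64 of
   {4, 4} (i.e. 4 + 4 * x, where x counts the extra minimum-size chunks)
   estimates to 4 + 4 * (256 - 128) / 128 = 8 for every KIND, since a known
   VLA width makes the min, likely and max estimates identical.  */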
/* Return true if it is a whole register-to-register move.  */
bool
whole_reg_to_reg_move_p (rtx *ops, machine_mode mode, int avl_type_index)
{
  /* An operation is a whole-register move if either
     (1) its avl_type operand equals VLMAX, or
     (2) its vl operand equals the number of units of its mode.  */
  if (register_operand (ops[0], mode)
      && register_operand (ops[3], mode)
      && satisfies_constraint_vu (ops[2])
      && satisfies_constraint_Wc1 (ops[1]))
    {
      if (INTVAL (ops[avl_type_index]) == VLMAX)
	return true;
      /* The AVL propagation pass will transform a FIXED-VLMAX operation with
	 NUNITS < 32 into a NON-VLMAX one with LEN = NUNITS.  */
      else if (CONST_INT_P (ops[4])
	       && known_eq (INTVAL (ops[4]), GET_MODE_NUNITS (mode)))
	return true;
    }
  return false;
}
/* Return true if we can transform vmv.v.x/vfmv.v.f to vmv.s.x/vfmv.s.f.  */
bool
splat_to_scalar_move_p (rtx *ops)
{
  return satisfies_constraint_Wc1 (ops[1])
	 && satisfies_constraint_vu (ops[2])
	 && !MEM_P (ops[3])
	 && satisfies_constraint_k01 (ops[4])
	 && INTVAL (ops[7]) == NONVLMAX
	 && known_ge (GET_MODE_SIZE (Pmode), GET_MODE_SIZE (GET_MODE (ops[3])));
}

} // namespace riscv_vector