/* Subroutines used for code generation for RISC-V 'V' Extension for
   GNU compiler.
   Copyright (C) 2022-2025 Free Software Foundation, Inc.
   Contributed by Juzhe Zhong (juzhe.zhong@rivai.ai), RiVAI Technologies Ltd.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.

GCC is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
General Public License for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#define IN_TARGET_CODE 1

/* We have a maximum of 11 operands for RVV instruction patterns according to
   vector.md.  */
#define RVV_INSN_OPERANDS_MAX 11

#include "coretypes.h"
#include "insn-config.h"
#include "insn-attr.h"
#include "stringpool.h"
#include "targhooks.h"
#include "tm-constrs.h"
#include "rtx-vector-builder.h"
#include "targhooks.h"
using namespace riscv_vector;

namespace riscv_vector {
/* Return true if NUNITS <= 31 so that we can use immediate AVL in vsetivli.  */
static bool
imm_avl_p (machine_mode mode)
{
  poly_uint64 nunits = GET_MODE_NUNITS (mode);

  return nunits.is_constant ()
	   /* The vsetivli can only hold register 0~31.  */
	   ? (IN_RANGE (nunits.to_constant (), 0, 31))
	   /* Only allowed in VLS-VLMAX mode.  */
	   : false;
}
/* Return true if LEN is equal to NUNITS, which is out of the range [0, 31].  */
static bool
is_vlmax_len_p (machine_mode mode, rtx len)
{
  poly_int64 value;
  return poly_int_rtx_p (len, &value)
	 && known_eq (value, GET_MODE_NUNITS (mode));
}
/* Helper functions for insn_flags && insn_types.  */

/* Return true if the caller needs to pass a mask operand for an insn pattern
   with INSN_FLAGS.  */
static bool
need_mask_operand_p (unsigned insn_flags)
{
  return (insn_flags & HAS_MASK_P)
	 && !(insn_flags & (USE_ONE_TRUE_MASK_P | USE_ALL_TRUES_MASK_P));
}
template <int MAX_OPERANDS> class insn_expander
{
public:
  insn_expander () = delete;

  insn_expander (unsigned insn_flags, bool vlmax_p)
    : m_insn_flags (insn_flags), m_opno (0), m_vlmax_p (vlmax_p),
      m_vl_op (NULL_RTX)
  {
    check_insn_flags ();
  }

  void check_insn_flags () const
  {
    if (m_insn_flags & USE_ONE_TRUE_MASK_P)
      /* USE_ONE_TRUE_MASK_P is dependent on HAS_MASK_P.  */
      gcc_assert ((m_insn_flags & HAS_MASK_P));

    if (m_insn_flags & USE_ALL_TRUES_MASK_P)
      /* USE_ALL_TRUES_MASK_P is dependent on HAS_MASK_P.  */
      gcc_assert ((m_insn_flags & HAS_MASK_P));

    /* USE_ONE_TRUE_MASK_P and USE_ALL_TRUES_MASK_P are mutually exclusive.  */
    gcc_assert (!((m_insn_flags & USE_ONE_TRUE_MASK_P)
		  && (m_insn_flags & USE_ALL_TRUES_MASK_P)));

    if (m_insn_flags & USE_VUNDEF_MERGE_P)
      /* USE_VUNDEF_MERGE_P is dependent on HAS_MERGE_P.  */
      gcc_assert ((m_insn_flags & HAS_MERGE_P));

    /* TU_POLICY_P and TDEFAULT_POLICY_P are mutually exclusive.  */
    gcc_assert (
      !((m_insn_flags & TU_POLICY_P) && (m_insn_flags & TDEFAULT_POLICY_P)));

    /* MU_POLICY_P and MDEFAULT_POLICY_P are mutually exclusive.  */
    gcc_assert (
      !((m_insn_flags & MU_POLICY_P) && (m_insn_flags & MDEFAULT_POLICY_P)));

    /* NULLARY_OP_P, UNARY_OP_P, BINARY_OP_P, TERNARY_OP_P are mutually
       exclusive.  */
    gcc_assert (
      !((m_insn_flags & NULLARY_OP_P)
	&& ((m_insn_flags & UNARY_OP_P) || (m_insn_flags & BINARY_OP_P)
	    || (m_insn_flags & TERNARY_OP_P))));
    gcc_assert (
      !((m_insn_flags & UNARY_OP_P)
	&& ((m_insn_flags & NULLARY_OP_P) || (m_insn_flags & BINARY_OP_P)
	    || (m_insn_flags & TERNARY_OP_P))));
    gcc_assert (
      !((m_insn_flags & BINARY_OP_P)
	&& ((m_insn_flags & NULLARY_OP_P) || (m_insn_flags & UNARY_OP_P)
	    || (m_insn_flags & TERNARY_OP_P))));
    gcc_assert (
      !((m_insn_flags & TERNARY_OP_P)
	&& ((m_insn_flags & NULLARY_OP_P) || (m_insn_flags & UNARY_OP_P)
	    || (m_insn_flags & BINARY_OP_P))));
  }
  void set_vl (rtx vl) { m_vl_op = vl; }

  void add_output_operand (rtx x, machine_mode mode)
  {
    create_output_operand (&m_ops[m_opno++], x, mode);
    gcc_assert (m_opno <= MAX_OPERANDS);
  }
  void add_input_operand (rtx x, machine_mode mode)
  {
    create_input_operand (&m_ops[m_opno++], x, mode);
    gcc_assert (m_opno <= MAX_OPERANDS);
  }
  void add_all_one_mask_operand (machine_mode mask_mode)
  {
    add_input_operand (CONSTM1_RTX (mask_mode), mask_mode);
  }
  void add_first_one_true_mask_operand (machine_mode mask_mode)
  {
    add_input_operand (gen_scalar_move_mask (mask_mode), mask_mode);
  }
  void add_vundef_operand (machine_mode dest_mode)
  {
    add_input_operand (RVV_VUNDEF (dest_mode), dest_mode);
  }
  void add_policy_operand ()
  {
    if (m_insn_flags & TU_POLICY_P)
      {
	rtx tail_policy_rtx = gen_int_mode (TAIL_UNDISTURBED, Pmode);
	add_input_operand (tail_policy_rtx, Pmode);
      }
    else if (m_insn_flags & TDEFAULT_POLICY_P)
      {
	rtx tail_policy_rtx = gen_int_mode (get_prefer_tail_policy (), Pmode);
	add_input_operand (tail_policy_rtx, Pmode);
      }

    if (m_insn_flags & MU_POLICY_P)
      {
	rtx mask_policy_rtx = gen_int_mode (MASK_UNDISTURBED, Pmode);
	add_input_operand (mask_policy_rtx, Pmode);
      }
    else if (m_insn_flags & MDEFAULT_POLICY_P)
      {
	rtx mask_policy_rtx = gen_int_mode (get_prefer_mask_policy (), Pmode);
	add_input_operand (mask_policy_rtx, Pmode);
      }
  }
  void add_avl_type_operand (avl_type type)
  {
    add_input_operand (gen_int_mode (type, Pmode), Pmode);
  }

  void
  add_rounding_mode_operand (enum floating_point_rounding_mode rounding_mode)
  {
    rtx frm_rtx = gen_int_mode (rounding_mode, Pmode);
    add_input_operand (frm_rtx, Pmode);
  }

  void
  add_rounding_mode_operand (enum fixed_point_rounding_mode rounding_mode)
  {
    rtx frm_rtx = gen_int_mode (rounding_mode, Pmode);
    add_input_operand (frm_rtx, Pmode);
  }
  /* Return the vtype mode based on insn_flags.
     The vtype mode is the mode the vsetvl insn sets.  */
  machine_mode
  get_vtype_mode (rtx *ops)
  {
    machine_mode vtype_mode;
    if (m_insn_flags & VTYPE_MODE_FROM_OP1_P)
      vtype_mode = GET_MODE (ops[1]);
    else
      vtype_mode = GET_MODE (ops[0]);
    return vtype_mode;
  }
  void emit_insn (enum insn_code icode, rtx *ops)
  {
    int opno = 0;
    int num_ops;
    /* It's true if any operand is memory operand.  */
    bool any_mem_p = false;

    machine_mode vtype_mode = get_vtype_mode (ops);
    machine_mode mask_mode = get_mask_mode (vtype_mode);

    /* Add dest operand.  */
    if (m_insn_flags & HAS_DEST_P)
      {
	rtx op = ops[opno++];
	any_mem_p |= MEM_P (op);
	add_output_operand (op, GET_MODE (op));
      }

    /* Add mask operand.  */
    if (m_insn_flags & USE_ONE_TRUE_MASK_P)
      add_first_one_true_mask_operand (mask_mode);
    else if (m_insn_flags & USE_ALL_TRUES_MASK_P)
      add_all_one_mask_operand (mask_mode);
    else if (m_insn_flags & HAS_MASK_P)
      {
	machine_mode mode = insn_data[(int) icode].operand[m_opno].mode;
	gcc_assert (mode != VOIDmode);
	add_input_operand (ops[opno++], mode);
      }

    /* Add merge operand.  */
    if (m_insn_flags & USE_VUNDEF_MERGE_P)
      /* Same as dest operand.  */
      add_vundef_operand (GET_MODE (ops[0]));
    else if (m_insn_flags & HAS_MERGE_P)
      {
	machine_mode mode = insn_data[(int) icode].operand[m_opno].mode;
	gcc_assert (mode != VOIDmode);
	add_input_operand (ops[opno++], mode);
      }

    if (m_insn_flags & NULLARY_OP_P)
      num_ops = 0;
    else if (m_insn_flags & UNARY_OP_P)
      num_ops = 1;
    else if (m_insn_flags & BINARY_OP_P)
      num_ops = 2;
    else if (m_insn_flags & TERNARY_OP_P)
      num_ops = 3;
    else
      gcc_unreachable ();

    /* Add the remaining operands.  */
    for (; num_ops; num_ops--, opno++)
      {
	any_mem_p |= MEM_P (ops[opno]);
	machine_mode mode = insn_data[(int) icode].operand[m_opno].mode;
	/* 'create_input_operand doesn't allow VOIDmode.
	   According to vector.md, we may have some patterns that do not have
	   explicit machine mode specifying the operand.  Such operands are
	   always Pmode.  */
	if (mode == VOIDmode)
	  mode = Pmode;
	else
	  {
	    /* Early assertion ensures same mode since maybe_legitimize_operand
	       will check it.  */
	    machine_mode required_mode = GET_MODE (ops[opno]);
	    if (required_mode != VOIDmode && required_mode != mode)
	      internal_error ("expected mode %s for operand %d of "
			      "insn %s but got mode %s.\n",
			      GET_MODE_NAME (mode), opno,
			      insn_data[(int) icode].name,
			      GET_MODE_NAME (required_mode));
	  }

	add_input_operand (ops[opno], mode);
      }

    /* Add vl operand.  */
    rtx len = m_vl_op;
    bool vls_p = false;
    if (m_vlmax_p)
      {
	if (riscv_v_ext_vls_mode_p (vtype_mode))
	  {
	    /* VLS modes always set VSETVL by
	       "vsetvl zero, rs1/imm".  */
	    poly_uint64 nunits = GET_MODE_NUNITS (vtype_mode);
	    len = gen_int_mode (nunits, Pmode);
	    vls_p = true;
	  }
	else if (can_create_pseudo_p ())
	  {
	    len = gen_reg_rtx (Pmode);
	    emit_vlmax_vsetvl (vtype_mode, len);
	  }
      }

    gcc_assert (len != NULL_RTX);
    add_input_operand (len, Pmode);

    /* Add tail and mask policy operands.  */
    add_policy_operand ();

    /* Add avl_type operand.  */
    add_avl_type_operand (
      vls_p ? avl_type::VLS
	    : (m_vlmax_p ? avl_type::VLMAX : avl_type::NONVLMAX));

    /* Add rounding mode operand.  */
    if (m_insn_flags & FRM_DYN_P)
      add_rounding_mode_operand (FRM_DYN);
    else if (m_insn_flags & FRM_RUP_P)
      add_rounding_mode_operand (FRM_RUP);
    else if (m_insn_flags & FRM_RDN_P)
      add_rounding_mode_operand (FRM_RDN);
    else if (m_insn_flags & FRM_RMM_P)
      add_rounding_mode_operand (FRM_RMM);
    else if (m_insn_flags & FRM_RNE_P)
      add_rounding_mode_operand (FRM_RNE);
    else if (m_insn_flags & VXRM_RNU_P)
      add_rounding_mode_operand (VXRM_RNU);
    else if (m_insn_flags & VXRM_RDN_P)
      add_rounding_mode_operand (VXRM_RDN);

    if (insn_data[(int) icode].n_operands != m_opno)
      internal_error ("invalid number of operands for insn %s, "
		      "expected %d but got %d.\n",
		      insn_data[(int) icode].name,
		      insn_data[(int) icode].n_operands, m_opno);

    expand (icode, any_mem_p);
  }
  void expand (enum insn_code icode, bool temporary_volatile_p = false)
  {
    if (temporary_volatile_p)
      {
	temporary_volatile_ok v (true);
	expand_insn (icode, m_opno, m_ops);
      }
    else
      expand_insn (icode, m_opno, m_ops);
  }

private:
  unsigned m_insn_flags;
  int m_opno;
  bool m_vlmax_p;
  rtx m_vl_op;
  expand_operand m_ops[MAX_OPERANDS];
};
/* Emit an RVV insn with a vector length that equals the number of units of the
   vector mode.  For VLA modes this corresponds to VLMAX.

   Unless the vector length can be encoded in the vsetivl[i] instruction this
   function must only be used as long as we can create pseudo registers.  This
   is because it will set a pseudo register to VLMAX using vsetvl and use this
   as the definition for the vector length.  */
void
emit_vlmax_insn (unsigned icode, unsigned insn_flags, rtx *ops)
{
  insn_expander<RVV_INSN_OPERANDS_MAX> e (insn_flags, true);
  gcc_assert (can_create_pseudo_p () || imm_avl_p (e.get_vtype_mode (ops)));

  e.emit_insn ((enum insn_code) icode, ops);
}
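
/* Usage sketch (illustrative only, not part of the original source): callers
   build a flat operand array and pick an insn_flags bundle, mirroring the many
   call sites later in this file, e.g. a VLMAX vector add could be emitted as

     rtx add_ops[] = {dest, src1, src2};
     emit_vlmax_insn (code_for_pred (PLUS, mode), BINARY_OP, add_ops);

   The expander then adds the mask, merge, vl, policy and avl_type operands
   itself, so the caller only lists the "real" source/destination operands.  */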
/* Like emit_vlmax_insn but must only be used when we cannot create pseudo
   registers anymore.  This function, however, takes a predefined vector length
   from the value in VL.  */
void
emit_vlmax_insn_lra (unsigned icode, unsigned insn_flags, rtx *ops, rtx vl)
{
  gcc_assert (!can_create_pseudo_p ());
  machine_mode mode = GET_MODE (ops[0]);

  if (imm_avl_p (mode))
    {
      /* Even though VL is a real hardreg already allocated since
	 it is post-RA now, we still gain benefits that we emit
	 vsetivli zero, imm instead of vsetvli VL, zero so that
	 we can be more flexible in post-RA instruction scheduling.  */
      insn_expander<RVV_INSN_OPERANDS_MAX> e (insn_flags, false);
      e.set_vl (gen_int_mode (GET_MODE_NUNITS (mode), Pmode));
      e.emit_insn ((enum insn_code) icode, ops);
    }
  else
    {
      insn_expander<RVV_INSN_OPERANDS_MAX> e (insn_flags, true);
      e.set_vl (vl);
      e.emit_insn ((enum insn_code) icode, ops);
    }
}
/* Emit an RVV insn with a predefined vector length.  Contrary to
   emit_vlmax_insn the instruction's vector length is not deduced from its mode
   but taken from the value in VL.  */
void
emit_nonvlmax_insn (unsigned icode, unsigned insn_flags, rtx *ops, rtx vl)
{
  insn_expander<RVV_INSN_OPERANDS_MAX> e (insn_flags, false);
  e.set_vl (vl);
  e.emit_insn ((enum insn_code) icode, ops);
}
/* Return true if the vector is duplicated by a super element which is the
   fusion of consecutive elements.

     v = { a, b, a, b } super element = ab, v = { ab, ab }  */
bool
rvv_builder::can_duplicate_repeating_sequence_p ()
{
  poly_uint64 new_size = exact_div (full_nelts (), npatterns ());
  unsigned int new_inner_size = m_inner_bits_size * npatterns ();
  if (m_inner_mode == Pmode
      || !int_mode_for_size (new_inner_size, 0).exists (&m_new_inner_mode)
      || GET_MODE_SIZE (m_new_inner_mode) > UNITS_PER_WORD
      || !get_vector_mode (m_new_inner_mode, new_size).exists (&m_new_mode))
    return false;
  return repeating_sequence_p (0, encoded_nelts (), npatterns ());
}
/* Return true if the vector is a simple sequence with one pattern and all
   elements the same.  */
bool
rvv_builder::is_repeating_sequence ()
{
  if (npatterns () > 1)
    return false;
  return repeating_sequence_p (0, encoded_nelts (), 1);
}
/* Return true if it is a repeating sequence for which the merge
   approach has better codegen than the default
   approach (slide1down).

   Sequence A:
     {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b}

   nelts = 16, npatterns = 2

   for merging a we need mask 101010....
   for merging b we need mask 010101....

   For each element in the npattern, we need to build a mask in a scalar
   register.  Mostly we need 3 instructions (aka COST = 3), which consists of
   2 scalar instructions and 1 scalar move to the v0 register.  Finally we
   need a vector merge such as:

     vmerge.vxm v9, v9, a1, v0

   So the overall (roughly) COST of Sequence A = (3 + 1) * npatterns = 8.
   If we use slide1down, the COST = nelts = 16 > 8 (COST of merge).
   So return true in this case as it is profitable.

   Sequence B:
     {a, b, c, d, e, f, g, h, a, b, c, d, e, f, g, h}

   nelts = 16, npatterns = 8

   COST of merge approach = (3 + 1) * npatterns = 32
   COST of slide1down approach = nelts = 16
   Return false in this case as it is NOT profitable with the merge
   approach.  */
bool
rvv_builder::repeating_sequence_use_merge_profitable_p ()
{
  if (inner_bytes_size () > UNITS_PER_WORD)
    return false;

  unsigned int nelts = full_nelts ().to_constant ();

  if (!repeating_sequence_p (0, encoded_nelts (), npatterns ()))
    return false;

  unsigned int merge_cost = 1;
  unsigned int build_merge_mask_cost = 3;
  unsigned int slide1down_cost = nelts;

  return (build_merge_mask_cost + merge_cost) * npatterns () < slide1down_cost;
}
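
/* Worked example (illustrative): for Sequence A above with npatterns = 2 the
   check computes (3 + 1) * 2 = 8 < 16 = nelts, so the merge approach wins;
   for Sequence B with npatterns = 8 it computes (3 + 1) * 8 = 32, which is
   not smaller than 16, so slide1down is preferred.  */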
/* Return true if it's worthwhile to use slideup to combine 2 vectors.  */
bool
rvv_builder::combine_sequence_use_slideup_profitable_p ()
{
  int nelts = full_nelts ().to_constant ();
  int leading_ndups = this->count_dups (0, nelts - 1, 1);
  int trailing_ndups = this->count_dups (nelts - 1, -1, -1);

  /* ??? The current heuristic is to combine 2 vectors only when:
       1. The # of leading same elements is equal to the # of trailing same
	  elements.
       2. Both of the above are equal to nelts / 2.
     Otherwise, it is not profitable.  */
  return leading_ndups == trailing_ndups && trailing_ndups == nelts / 2;
}
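
/* Worked example (illustrative): for { a, a, a, a, b, b, b, b } we have
   nelts = 8, leading_ndups = 4 and trailing_ndups = 4, so both conditions of
   the heuristic hold and the slideup combination is considered profitable.  */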
/* Return true if it's worthwhile to use merge to combine a vector with a
   scalar.  */
bool
rvv_builder::combine_sequence_use_merge_profitable_p ()
{
  int nelts = full_nelts ().to_constant ();
  int leading_ndups = this->count_dups (0, nelts - 1, 1);
  int trailing_ndups = this->count_dups (nelts - 1, -1, -1);
  int nregs = riscv_get_v_regno_alignment (int_mode ());

  if (leading_ndups + trailing_ndups != nelts)
    return false;

  /* If the number of leading elements is > 255, which exceeds the maximum
     value of QImode, we will need to use HImode.  */
  machine_mode mode;
  if (leading_ndups > 255 || nregs > 2)
    {
      if (!get_vector_mode (HImode, nelts).exists (&mode))
	return false;
      /* We will need one more AVL/VL toggling vsetvl instruction.  */
      return leading_ndups > 4 && trailing_ndups > 4;
    }

  /* { a, a, a, b, b, ... , b } and { b, b, b, a, a, ... , a }
     consume 3 slide instructions.  */
  return leading_ndups > 3 && trailing_ndups > 3;
}
/* Merge the repeating sequence into a single element and return the RTX.  */
rtx
rvv_builder::get_merged_repeating_sequence ()
{
  scalar_int_mode mode = Pmode;
  rtx target = gen_reg_rtx (mode);
  emit_move_insn (target, const0_rtx);
  rtx imm = gen_int_mode ((1ULL << m_inner_bits_size) - 1, mode);
  /* { a, b, a, b }: Generate duplicate element = b << bits | a.  */
  for (unsigned int i = 0; i < npatterns (); i++)
    {
      unsigned int loc = m_inner_bits_size * i;
      rtx shift = gen_int_mode (loc, mode);
      rtx ele = gen_lowpart (mode, elt (i));
      rtx tmp = expand_simple_binop (mode, AND, ele, imm, NULL_RTX, false,
				     OPTAB_DIRECT);
      rtx tmp2 = expand_simple_binop (mode, ASHIFT, tmp, shift, NULL_RTX, false,
				      OPTAB_DIRECT);
      rtx tmp3 = expand_simple_binop (mode, IOR, tmp2, target, NULL_RTX, false,
				      OPTAB_DIRECT);
      emit_move_insn (target, tmp3);
    }
  if (GET_MODE_SIZE (m_new_inner_mode) < UNITS_PER_WORD)
    return gen_lowpart (m_new_inner_mode, target);
  return target;
}
/* Get the mask for the merge approach.

   Consider the following case:
     {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b}
   To merge "a", the mask should be 1010....
   To merge "b", the mask should be 0101....  */
rtx
rvv_builder::get_merge_scalar_mask (unsigned int index_in_pattern,
				    machine_mode inner_mode) const
{
  unsigned HOST_WIDE_INT mask = 0;
  unsigned HOST_WIDE_INT base_mask = (1ULL << index_in_pattern);
  /* Here we construct a mask pattern that will later be broadcast
     to a vector register.  The maximum broadcast size for vmv.v.x/vmv.s.x
     is determined by the length of a vector element (ELEN) and not by
     XLEN so make sure we do not exceed it.  One example is -march=zve32*
     which mandates ELEN == 32 but can be combined with -march=rv64
     with XLEN == 64.  */
  unsigned int elen = TARGET_VECTOR_ELEN_64 ? 64 : 32;

  gcc_assert (elen % npatterns () == 0);

  int limit = elen / npatterns ();

  for (int i = 0; i < limit; i++)
    mask |= base_mask << (i * npatterns ());

  return gen_int_mode (mask, inner_mode);
}
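
/* Worked example (illustrative): with npatterns () == 2, ELEN == 64 and
   index_in_pattern == 0 we get base_mask == 1 and limit == 32, so the loop
   sets every even bit and produces 0x5555555555555555, i.e. the mask that
   selects every "a" element; index_in_pattern == 1 instead yields
   0xaaaaaaaaaaaaaaaa for the "b" elements.  */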
/* Return true if the variable-length vector is single step.
   Single step means the steps of all patterns in NPATTERNS are equal.
   Consider the following cases:

     CASE 1: NPATTERNS = 2, NELTS_PER_PATTERN = 3.
       { 0, 2, 2, 4, 4, 6, ... }
     First pattern:  step1 = 2 - 0 = 2
		     step2 = 4 - 2 = 2
     Second pattern: step1 = 4 - 2 = 2
		     step2 = 6 - 4 = 2
     Since all steps of NPATTERNS are equal, step = 2.
     Return true in this case.

     CASE 2: NPATTERNS = 2, NELTS_PER_PATTERN = 3.
       { 0, 1, 2, 4, 4, 7, ... }
     First pattern:  step1 = 2 - 0 = 2
		     step2 = 4 - 2 = 2
     Second pattern: step1 = 4 - 1 = 3
		     step2 = 7 - 4 = 3
     Since not all steps are equal, return false.  */
bool
rvv_builder::single_step_npatterns_p () const
{
  if (nelts_per_pattern () != 3)
    return false;

  poly_int64 step
    = rtx_to_poly_int64 (elt (npatterns ())) - rtx_to_poly_int64 (elt (0));
  for (unsigned int i = 0; i < npatterns (); i++)
    {
      poly_int64 ele0 = rtx_to_poly_int64 (elt (i));
      poly_int64 ele1 = rtx_to_poly_int64 (elt (npatterns () + i));
      poly_int64 ele2 = rtx_to_poly_int64 (elt (npatterns () * 2 + i));
      poly_int64 diff1 = ele1 - ele0;
      poly_int64 diff2 = ele2 - ele1;
      if (maybe_ne (step, diff1) || maybe_ne (step, diff2))
	return false;
    }
  return true;
}
/* Return true if the diff between the const vector and the vid sequence
   is repeated.  For example, as in the cases below,
   the diff means const vector - vid.

     CASE 1:
     CONST VECTOR: {3, 2, 1, 0, 7, 6, 5, 4, ... }
     VID         : {0, 1, 2, 3, 4, 5, 6, 7, ... }
     DIFF(MINUS) : {3, 1,-1,-3, 3, 1,-1,-3, ... }
     The diff sequence {3, 1,-1,-3} is repeated in the npattern, so
     return TRUE for case 1.

     CASE 2:
     CONST VECTOR: {-4, 4,-3, 5,-2, 6,-1, 7, ...}
     VID         : { 0, 1, 2, 3, 4, 5, 6, 7, ... }
     DIFF(MINUS) : {-4, 3,-5, 2,-6, 1,-7, 0, ... }
     The diff sequence {-4, 3} is not repeated in the npattern, so
     return FALSE for case 2.  */
bool
rvv_builder::npatterns_vid_diff_repeated_p () const
{
  if (nelts_per_pattern () != 3)
    return false;
  else if (npatterns () == 0)
    return false;

  for (unsigned i = 0; i < npatterns (); i++)
    {
      poly_int64 diff_0 = rtx_to_poly_int64 (elt (i)) - i;
      poly_int64 diff_1
	= rtx_to_poly_int64 (elt (npatterns () + i)) - npatterns () - i;
      if (maybe_ne (diff_0, diff_1))
	return false;
    }

  return true;
}
/* Return true if the permutation consists of two
   interleaved patterns with a constant step each.
   TODO: We currently only support NPATTERNS = 2.  */
bool
rvv_builder::interleaved_stepped_npatterns_p () const
{
  if (npatterns () != 2 || nelts_per_pattern () != 3)
    return false;
  for (unsigned int i = 0; i < npatterns (); i++)
    {
      poly_int64 ele0 = rtx_to_poly_int64 (elt (i));
      poly_int64 ele1 = rtx_to_poly_int64 (elt (npatterns () + i));
      poly_int64 ele2 = rtx_to_poly_int64 (elt (npatterns () * 2 + i));
      poly_int64 diff1 = ele1 - ele0;
      poly_int64 diff2 = ele2 - ele1;
      if (maybe_ne (diff1, diff2))
	return false;
    }
  return true;
}
/* Return true if all elements of NPATTERNS are equal.

     { 2, 2, 2, 2, 4, 4, 4, 4, 8, 8, 8, 8, 16, 16, 16, 16, ... }
     { 2, 2, 2, 2, 2, 2, 2, 2, 8, 8, 8, 8, 8, 8, 8, 8, ... }

   We only check whether ele[0] ~ ele[NPATTERNS - 1] are the same.
   We don't need to check the elements[n] with n >= NPATTERNS since
   they don't belong to the same pattern.  */
bool
rvv_builder::npatterns_all_equal_p () const
{
  poly_int64 ele0 = rtx_to_poly_int64 (elt (0));
  for (unsigned int i = 1; i < npatterns (); i++)
    {
      poly_int64 ele = rtx_to_poly_int64 (elt (i));
      if (!known_eq (ele, ele0))
	return false;
    }
  return true;
}
static unsigned
get_sew (machine_mode mode)
{
  unsigned int sew = GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
		       ? 8
		       : GET_MODE_BITSIZE (GET_MODE_INNER (mode));
  return sew;
}
/* Return true if X is a const_vector with all duplicate elements, which is in
   the range between MINVAL and MAXVAL.  */
static bool
const_vec_all_same_in_range_p (rtx x, HOST_WIDE_INT minval,
			       HOST_WIDE_INT maxval)
{
  rtx elt;
  return (const_vec_duplicate_p (x, &elt) && CONST_INT_P (elt)
	  && IN_RANGE (INTVAL (elt), minval, maxval));
}
/* Return true if VEC is a constant in which every element is in the range
   [MINVAL, MAXVAL].  The elements do not need to have the same value.

   This function also exists in aarch64, we may unify it in middle-end in the
   future.  */
static bool
const_vec_all_in_range_p (rtx vec, poly_int64 minval, poly_int64 maxval)
{
  if (!CONST_VECTOR_P (vec)
      || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
    return false;

  int nunits;
  if (!CONST_VECTOR_STEPPED_P (vec))
    nunits = const_vector_encoded_nelts (vec);
  else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
    return false;

  for (int i = 0; i < nunits; i++)
    {
      rtx vec_elem = CONST_VECTOR_ELT (vec, i);
      poly_int64 value;
      if (!poly_int_rtx_p (vec_elem, &value)
	  || maybe_lt (value, minval)
	  || maybe_gt (value, maxval))
	return false;
    }
  return true;
}
/* Returns true if the vector's elements are all duplicates in
   the range -16 ~ 15 integer or 0.0 floating-point.  */
bool
valid_vec_immediate_p (rtx x)
{
  return (satisfies_constraint_vi (x) || satisfies_constraint_Wc0 (x));
}
/* Return a const vector of VAL.  The VAL can be either const_int or
   const_poly_int.  */
static rtx
gen_const_vector_dup (machine_mode mode, poly_int64 val)
{
  scalar_mode smode = GET_MODE_INNER (mode);
  rtx c = gen_int_mode (val, smode);
  if (!val.is_constant () && GET_MODE_SIZE (smode) > GET_MODE_SIZE (Pmode))
    {
      /* When VAL is a const_poly_int value, we need to explicitly broadcast
	 it into a vector using an RVV broadcast instruction.  */
      return expand_vector_broadcast (mode, c);
    }
  return gen_const_vec_duplicate (mode, c);
}
/* Emit a vlmax vsetvl instruction.  This should only be used when
   optimization is disabled or after the vsetvl insertion pass.  */
void
emit_hard_vlmax_vsetvl (machine_mode vmode, rtx vl)
{
  unsigned int sew = get_sew (vmode);
  emit_insn (gen_vsetvl (Pmode, vl, RVV_VLMAX, gen_int_mode (sew, Pmode),
			 gen_int_mode (get_vlmul (vmode), Pmode), const0_rtx,
			 const0_rtx));
}

void
emit_vlmax_vsetvl (machine_mode vmode, rtx vl)
{
  unsigned int sew = get_sew (vmode);
  enum vlmul_type vlmul = get_vlmul (vmode);
  unsigned int ratio = calculate_ratio (sew, vlmul);

  if (!optimize)
    emit_hard_vlmax_vsetvl (vmode, vl);
  else
    emit_insn (gen_vlmax_avl (Pmode, vl, gen_int_mode (ratio, Pmode)));
}
/* Calculate the SEW/LMUL ratio.  */
unsigned int
calculate_ratio (unsigned int sew, enum vlmul_type vlmul)

/* SCALABLE means that the vector-length is agnostic (run-time invariant and
   compile-time unknown).  ZVL means that the vector-length is specific
   (compile-time known by march like zvl*b).  Both SCALABLE and ZVL are doing
   auto-vectorization using VLMAX vsetvl configuration.  */
bool
autovec_use_vlmax_p (void)
{
  return (rvv_vector_bits == RVV_VECTOR_BITS_SCALABLE
	  || rvv_vector_bits == RVV_VECTOR_BITS_ZVL);
}
/* This function emits a VLMAX vrgather instruction.  Emit vrgather.vx/vi when
   SEL is a const duplicate vector.  Otherwise, emit vrgather.vv.  */
static void
emit_vlmax_gather_insn (rtx target, rtx op, rtx sel)
{
  rtx elt;
  insn_code icode;
  machine_mode data_mode = GET_MODE (target);
  machine_mode sel_mode = GET_MODE (sel);
  if (const_vec_duplicate_p (sel, &elt))
    {
      icode = code_for_pred_gather_scalar (data_mode);
      sel = elt;
    }
  else if (maybe_ne (GET_MODE_SIZE (data_mode), GET_MODE_SIZE (sel_mode)))
    icode = code_for_pred_gatherei16 (data_mode);
  else
    icode = code_for_pred_gather (data_mode);
  rtx ops[] = {target, op, sel};
  emit_vlmax_insn (icode, BINARY_OP, ops);
}
static void
emit_vlmax_masked_gather_mu_insn (rtx target, rtx op, rtx sel, rtx mask)
{
  rtx elt;
  insn_code icode;
  machine_mode data_mode = GET_MODE (target);
  machine_mode sel_mode = GET_MODE (sel);
  if (const_vec_duplicate_p (sel, &elt))
    {
      icode = code_for_pred_gather_scalar (data_mode);
      sel = elt;
    }
  else if (maybe_ne (GET_MODE_SIZE (data_mode), GET_MODE_SIZE (sel_mode)))
    icode = code_for_pred_gatherei16 (data_mode);
  else
    icode = code_for_pred_gather (data_mode);
  rtx ops[] = {target, mask, target, op, sel};
  emit_vlmax_insn (icode, BINARY_OP_TAMU, ops);
}
/* According to the RVV ISA spec (16.5.1. Synthesizing vdecompress):
   https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc

   There is no inverse vdecompress provided, as this operation can be readily
   synthesized using iota and a masked vrgather:

   Desired functionality of 'vdecompress'
   7 6 5 4 3 2 1 0     # vid

	 e d c b a      # packed vector of 5 elements
   1 0 0 1 1 1 0 1     # mask vector of 8 elements
   p q r s t u v w     # destination register before vdecompress

   e q r d c b v a     # result of vdecompress

   # v1 holds packed data
   # v11 holds input expanded vector and result
   viota.m v10, v0                 # Calc iota from mask in v0
   vrgather.vv v11, v1, v10, v0.t  # Expand into destination
   p q r s t u v w                 # v11 destination register
	 e d c b a                 # v1 source vector
   1 0 0 1 1 1 0 1                 # v0 mask vector

   4 4 4 3 2 1 1 0                 # v10 result of viota.m
   e q r d c b v a                 # v11 destination after vrgather using
				     viota.m under mask  */
static void
emit_vlmax_decompress_insn (rtx target, rtx op0, rtx op1, rtx mask)
{
  machine_mode data_mode = GET_MODE (target);
  machine_mode sel_mode = related_int_vector_mode (data_mode).require ();
  if (GET_MODE_INNER (data_mode) == QImode)
    sel_mode = get_vector_mode (HImode, GET_MODE_NUNITS (data_mode)).require ();

  rtx sel = gen_reg_rtx (sel_mode);
  rtx iota_ops[] = {sel, mask};
  emit_vlmax_insn (code_for_pred_iota (sel_mode), UNARY_OP, iota_ops);
  emit_vlmax_gather_insn (target, op0, sel);
  emit_vlmax_masked_gather_mu_insn (target, op1, sel, mask);
}
/* Emit merge instruction.  */

static machine_mode
get_repeating_sequence_dup_machine_mode (const rvv_builder &builder,
					 machine_mode mask_bit_mode)
{
  unsigned mask_precision = GET_MODE_PRECISION (mask_bit_mode).to_constant ();
  unsigned mask_scalar_size = mask_precision > builder.inner_bits_size ()
				? builder.inner_bits_size () : mask_precision;

  scalar_mode inner_mode;
  unsigned minimal_bits_size;

  switch (mask_scalar_size)
    {
    case 8:
      inner_mode = QImode;
      minimal_bits_size = TARGET_MIN_VLEN / 8; /* AKA RVVMF8.  */
      break;
    case 16:
      inner_mode = HImode;
      minimal_bits_size = TARGET_MIN_VLEN / 4; /* AKA RVVMF4.  */
      break;
    case 32:
      inner_mode = SImode;
      minimal_bits_size = TARGET_MIN_VLEN / 2; /* AKA RVVMF2.  */
      break;
    case 64:
      inner_mode = DImode;
      minimal_bits_size = TARGET_MIN_VLEN / 1; /* AKA RVVM1.  */
      break;
    default:
      gcc_unreachable ();
    }

  gcc_assert (mask_precision % mask_scalar_size == 0);

  uint64_t dup_nunit = mask_precision > mask_scalar_size
			 ? mask_precision / mask_scalar_size
			 : minimal_bits_size / mask_scalar_size;

  return get_vector_mode (inner_mode, dup_nunit).require ();
}
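
/* Worked example (illustrative): for a mask mode with mask_precision == 16
   and a builder whose inner_bits_size () is 8, mask_scalar_size is 8, so the
   scalar mode is QImode and dup_nunit == 16 / 8 == 2, i.e. a 2-element QI
   vector mode is chosen to hold the broadcast mask bits.  */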
/* Expand a series const vector.  If VID is NULL_RTX, we use vid.v
   instructions to generate the sequence for VID:

     VID = { 0, 1, 2, 3, ... }

   Otherwise, we use the VID argument directly.  */

void
expand_vec_series (rtx dest, rtx base, rtx step, rtx vid)
{
  machine_mode mode = GET_MODE (dest);
  poly_int64 nunits_m1 = GET_MODE_NUNITS (mode) - 1;
  poly_int64 value;
  rtx result = register_operand (dest, mode) ? dest : gen_reg_rtx (mode);

  /* VECT_IV = BASE + I * STEP.  */

  /* Step 1: Generate I = { 0, 1, 2, ... } by vid.v.  */
  bool reverse_p = !vid && rtx_equal_p (step, constm1_rtx)
		   && poly_int_rtx_p (base, &value)
		   && known_eq (nunits_m1, value);
  if (!vid)
    {
      vid = gen_reg_rtx (mode);
      rtx op[] = {vid};
      emit_vlmax_insn (code_for_pred_series (mode), NULLARY_OP, op);
    }

  if (reverse_p)
    {
      /* Special case:
	   {nunits - 1, nunits - 2, ... , 0}.
	 nunits can be either const_int or const_poly_int.

	 Code sequence:
	   vid.v v
	   vrsub nunits - 1, v.  */
      rtx ops[]
	= {result, vid, gen_int_mode (nunits_m1, GET_MODE_INNER (mode))};
      insn_code icode = code_for_pred_sub_reverse_scalar (mode);
      emit_vlmax_insn (icode, BINARY_OP, ops);
    }
  else
    {
      /* Step 2: Generate I * STEP.
	 - STEP is 1, we don't emit any instructions.
	 - STEP is a power of 2, we use vsll.vi/vsll.vx.
	 - STEP is a non-power of 2, we use vmul.vx.  */
      rtx step_adj;
      if (rtx_equal_p (step, const1_rtx))
	step_adj = vid;
      else
	{
	  step_adj = gen_reg_rtx (mode);
	  if (CONST_INT_P (step) && pow2p_hwi (INTVAL (step)))
	    {
	      /* Emit logical left shift operation.  */
	      int shift = exact_log2 (INTVAL (step));
	      rtx shift_amount = gen_int_mode (shift, Pmode);
	      insn_code icode = code_for_pred_scalar (ASHIFT, mode);
	      rtx ops[] = {step_adj, vid, shift_amount};
	      emit_vlmax_insn (icode, BINARY_OP, ops);
	    }
	  else
	    {
	      insn_code icode = code_for_pred_scalar (MULT, mode);
	      rtx ops[] = {step_adj, vid, step};
	      emit_vlmax_insn (icode, BINARY_OP, ops);
	    }
	}

      /* Step 3: Generate BASE + I * STEP.
	 - BASE is 0, use the result of vid.
	 - BASE is not 0, we use vadd.vx/vadd.vi.  */
      if (rtx_equal_p (base, const0_rtx))
	emit_move_insn (result, step_adj);
      else
	{
	  insn_code icode = code_for_pred_scalar (PLUS, mode);
	  rtx ops[] = {result, step_adj, base};
	  emit_vlmax_insn (icode, BINARY_OP, ops);
	}
    }

  if (result != dest)
    emit_move_insn (dest, result);
}
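
/* Emitted-code sketch (illustrative): for BASE == 3 and STEP == 4 on an
   integer mode the expansion above roughly becomes

     vid.v    v1            # v1 = { 0, 1, 2, 3, ... }
     vsll.vi  v1, v1, 2     # step 4 is a power of two -> shift left by 2
     vadd.vi  v1, v1, 3     # add the starting value

   while STEP == 1 skips the shift/multiply step entirely.  */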
/* Subroutine of riscv_vector_expand_vector_init.

   (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
   (b) Skip leading elements from BUILDER, which are the same as
       element NELTS_REQD - 1.
   (c) Insert earlier elements in reverse order in TARGET using vslide1down.  */

static void
expand_vector_init_insert_elems (rtx target, const rvv_builder &builder,
				 int nelts_reqd)
{
  machine_mode mode = GET_MODE (target);
  rtx dup = expand_vector_broadcast (mode, builder.elt (0));
  emit_move_insn (target, dup);
  int ndups = builder.count_dups (0, nelts_reqd - 1, 1);
  for (int i = ndups; i < nelts_reqd; i++)
    {
      unsigned int unspec
	= FLOAT_MODE_P (mode) ? UNSPEC_VFSLIDE1DOWN : UNSPEC_VSLIDE1DOWN;
      insn_code icode = code_for_pred_slide (unspec, mode);
      rtx ops[] = {target, target, builder.elt (i)};
      emit_vlmax_insn (icode, BINARY_OP, ops);
    }
}
/* Subroutine of expand_vec_init to handle the case
   when all trailing elements of the builder are the same.
   This works as follows:
   (a) Use the expand_insn interface to broadcast the last vector element
       into TARGET.
   (b) Insert the remaining elements into TARGET using insr.

   ??? The heuristic used is to do the above if the number of same trailing
   elements is greater than leading_ndups, loosely based on the
   heuristic from mostly_zeros_p.  May need fine-tuning.  */

static bool
expand_vector_init_trailing_same_elem (rtx target,
				       const rtx_vector_builder &builder,
				       int nelts_reqd)
{
  int leading_ndups = builder.count_dups (0, nelts_reqd - 1, 1);
  int trailing_ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
  machine_mode mode = GET_MODE (target);

  if (trailing_ndups > leading_ndups)
    {
      rtx dup = expand_vector_broadcast (mode, builder.elt (nelts_reqd - 1));
      for (int i = nelts_reqd - trailing_ndups - 1; i >= 0; i--)
	{
	  unsigned int unspec
	    = FLOAT_MODE_P (mode) ? UNSPEC_VFSLIDE1UP : UNSPEC_VSLIDE1UP;
	  insn_code icode = code_for_pred_slide (unspec, mode);
	  rtx tmp = gen_reg_rtx (mode);
	  rtx ops[] = {tmp, dup, builder.elt (i)};
	  emit_vlmax_insn (icode, BINARY_OP, ops);
	  /* slide1up needs source and dest to be different REGs.  */
	  dup = tmp;
	}

      emit_move_insn (target, dup);
      return true;
    }

  return false;
}
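
/* Emitted-code sketch (illustrative): for { a, b, c, c, c, c, c, c } the
   trailing run of "c" is broadcast first and the two remaining leading
   elements are then inserted with vslide1up in reverse order, roughly

     vmv.v.x    v1, c
     vslide1up  v2, v1, b
     vslide1up  v3, v2, a

   matching the loop above, which walks i from nelts_reqd - trailing_ndups - 1
   down to 0.  */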
static void
expand_const_vector (rtx target, rtx src)
{
  machine_mode mode = GET_MODE (target);
  rtx result = register_operand (target, mode) ? target : gen_reg_rtx (mode);
  rtx elt;
  if (const_vec_duplicate_p (src, &elt))
    {
      if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
	{
	  gcc_assert (rtx_equal_p (elt, const0_rtx)
		      || rtx_equal_p (elt, const1_rtx));
	  rtx ops[] = {result, src};
	  emit_vlmax_insn (code_for_pred_mov (mode), UNARY_MASK_OP, ops);
	}
      /* Element in the range -16 ~ 15 integer or 0.0 floating-point,
	 we use the vmv.v.i instruction.  */
      else if (valid_vec_immediate_p (src))
	{
	  rtx ops[] = {result, src};
	  emit_vlmax_insn (code_for_pred_mov (mode), UNARY_OP, ops);
	}
      else
	{
	  /* Emit the vec_duplicate<mode> split pattern before RA so that
	     we could have a better optimization opportunity in LICM,
	     which will hoist vmv.v.x outside the loop, and in fwprop &&
	     combine, which will transform a 'vv' into a 'vx' instruction.

	     The reason we don't emit the vec_duplicate<mode> split pattern
	     during RA is that the split stage after RA is too late to
	     generate an RVV instruction which needs an additional register
	     (we can't allocate a new register after RA) for the VL operand
	     of the vsetvl instruction (vsetvl a5, zero).  */
	  if (lra_in_progress)
	    {
	      rtx ops[] = {result, elt};
	      emit_vlmax_insn (code_for_pred_broadcast (mode), UNARY_OP, ops);
	    }
	  else
	    {
	      struct expand_operand ops[2];
	      enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
	      gcc_assert (icode != CODE_FOR_nothing);
	      create_output_operand (&ops[0], result, mode);
	      create_input_operand (&ops[1], elt, GET_MODE_INNER (mode));
	      expand_insn (icode, 2, ops);
	      result = ops[0].value;
	    }
	}

      if (result != target)
	emit_move_insn (target, result);
      return;
    }
  /* Support scalable const series vectors.  */
  rtx base, step;
  if (const_vec_series_p (src, &base, &step))
    {
      expand_vec_series (result, base, step);

      if (result != target)
	emit_move_insn (target, result);
      return;
    }

  /* Handle variable-length vectors.  */
  unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
  unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
  rvv_builder builder (mode, npatterns, nelts_per_pattern);
  for (unsigned int i = 0; i < nelts_per_pattern; i++)
    {
      for (unsigned int j = 0; j < npatterns; j++)
	builder.quick_push (CONST_VECTOR_ELT (src, i * npatterns + j));
    }
  builder.finalize ();
  if (CONST_VECTOR_DUPLICATE_P (src))
    {
      /* Handle the case of a repeating sequence with NELTS_PER_PATTERN = 1.
	 E.g. NPATTERNS = 4, v = { 0, 2, 6, 7, ... }
	      NPATTERNS = 8, v = { 0, 2, 6, 7, 19, 20, 8, 7 ... }
	 The elements within NPATTERNS are not necessarily regular.  */
      if (builder.can_duplicate_repeating_sequence_p ())
	{
	  /* We handle the case that we can find a vector container to hold
	     element bitsize = NPATTERNS * ele_bitsize.

	     E.g. NPATTERNS = 8, element width = 8
		  v = { 0, 1, 2, 3, 4, 5, 6, 7, ... }
	     In this case, we can combine NPATTERNS elements into a larger
	     element.  Use element width = 64 and broadcast a vector with
	     all elements equal to 0x0706050403020100.  */
	  rtx ele = builder.get_merged_repeating_sequence ();
	  rtx dup = expand_vector_broadcast (builder.new_mode (), ele);
	  emit_move_insn (result, gen_lowpart (mode, dup));
	}
      else
	{
	  /* We handle the case that we can't find a vector container to hold
	     element bitsize = NPATTERNS * ele_bitsize.

	     E.g. NPATTERNS = 8, element width = 16
		  v = { 0, 1, 2, 3, 4, 5, 6, 7, ... }
	     Since NPATTERNS * element width = 128, we can't find a container.

	     In this case, we use NPATTERNS merge operations to generate such
	     a vector.  */
	  unsigned int nbits = npatterns - 1;

	  /* Generate vid = { 0, 1, 2, 3, 4, 5, 6, 7, ... }.  */
	  rtx vid = gen_reg_rtx (builder.int_mode ());
	  rtx op[] = {vid};
	  emit_vlmax_insn (code_for_pred_series (builder.int_mode ()),
			   NULLARY_OP, op);

	  /* Generate vid_repeat = { 0, 1, ... nbits, ... }  */
	  rtx vid_repeat = gen_reg_rtx (builder.int_mode ());
	  rtx and_ops[] = {vid_repeat, vid,
			   gen_int_mode (nbits, builder.inner_int_mode ())};
	  emit_vlmax_insn (code_for_pred_scalar (AND, builder.int_mode ()),
			   BINARY_OP, and_ops);

	  rtx tmp1 = gen_reg_rtx (builder.mode ());
	  rtx dup_ops[] = {tmp1, builder.elt (0)};
	  emit_vlmax_insn (code_for_pred_broadcast (builder.mode ()), UNARY_OP,
			   dup_ops);
	  for (unsigned int i = 1; i < builder.npatterns (); i++)
	    {
	      /* Generate the mask according to i.  */
	      rtx mask = gen_reg_rtx (builder.mask_mode ());
	      rtx const_vec = gen_const_vector_dup (builder.int_mode (), i);
	      expand_vec_cmp (mask, EQ, vid_repeat, const_vec);

	      /* Merge the scalar for each i.  */
	      rtx tmp2 = gen_reg_rtx (builder.mode ());
	      rtx merge_ops[] = {tmp2, tmp1, builder.elt (i), mask};
	      insn_code icode = code_for_pred_merge_scalar (builder.mode ());
	      emit_vlmax_insn (icode, MERGE_OP, merge_ops);
	      tmp1 = tmp2;
	    }
	  emit_move_insn (result, tmp1);
	}
    }
  else if (CONST_VECTOR_STEPPED_P (src))
    {
      gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
      if (builder.single_step_npatterns_p ())
	{
	  /* Describe the case by choosing NPATTERNS = 4 as an example.  */
	  insn_code icode;

	  /* Step 1: Generate vid = { 0, 1, 2, 3, 4, 5, 6, 7, ... }.  */
	  rtx vid = gen_reg_rtx (builder.mode ());
	  rtx vid_ops[] = {vid};
	  icode = code_for_pred_series (builder.mode ());
	  emit_vlmax_insn (icode, NULLARY_OP, vid_ops);

	  if (builder.npatterns_all_equal_p ())
	    {
	      /* Generate the variable-length vector following this rule:
		 { a, a, a + step, a + step, a + step * 2, a + step * 2, ...}
		 E.g. { 0, 0, 8, 8, 16, 16, ... }  */

	      /* We want to create a pattern where value[idx] = floor (idx /
		 NPATTERNS).  As NPATTERNS is always a power of two we can
		 rewrite this as = idx & -NPATTERNS.  */
	      /* Step 2: VID AND -NPATTERNS:
		 { 0&-4, 1&-4, 2&-4, 3 &-4, 4 &-4, 5 &-4, 6 &-4, 7 &-4, ... }  */
	      rtx imm
		= gen_int_mode (-builder.npatterns (), builder.inner_mode ());
	      rtx tmp1 = gen_reg_rtx (builder.mode ());
	      rtx and_ops[] = {tmp1, vid, imm};
	      icode = code_for_pred_scalar (AND, builder.mode ());
	      emit_vlmax_insn (icode, BINARY_OP, and_ops);

	      /* Step 3: Convert to step size 1.  */
	      rtx tmp2 = gen_reg_rtx (builder.mode ());
	      /* log2 (npatterns) to get the shift amount to convert
		 e.g. { 0, 0, 0, 0, 4, 4, ... }
		 into { 0, 0, 0, 0, 1, 1, ... }.  */
	      HOST_WIDE_INT shift_amt = exact_log2 (builder.npatterns ());
	      rtx shift = gen_int_mode (shift_amt, builder.inner_mode ());
	      rtx shift_ops[] = {tmp2, tmp1, shift};
	      icode = code_for_pred_scalar (ASHIFTRT, builder.mode ());
	      emit_vlmax_insn (icode, BINARY_OP, shift_ops);

	      /* Step 4: Multiply to step size n.  */
	      HOST_WIDE_INT step_size =
		INTVAL (builder.elt (builder.npatterns ()))
		- INTVAL (builder.elt (0));
	      rtx tmp3 = gen_reg_rtx (builder.mode ());
	      if (pow2p_hwi (step_size))
		{
		  /* A power of 2 can be handled with a left shift.  */
		  HOST_WIDE_INT shift = exact_log2 (step_size);
		  rtx shift_amount = gen_int_mode (shift, Pmode);
		  insn_code icode = code_for_pred_scalar (ASHIFT, mode);
		  rtx ops[] = {tmp3, tmp2, shift_amount};
		  emit_vlmax_insn (icode, BINARY_OP, ops);
		}
	      else
		{
		  rtx mult_amt
		    = gen_int_mode (step_size, builder.inner_mode ());
		  insn_code icode = code_for_pred_scalar (MULT, builder.mode ());
		  rtx ops[] = {tmp3, tmp2, mult_amt};
		  emit_vlmax_insn (icode, BINARY_OP, ops);
		}

	      /* Step 5: Add the starting value to all elements.  */
	      HOST_WIDE_INT init_val = INTVAL (builder.elt (0));
	      if (init_val == 0)
		emit_move_insn (result, tmp3);
	      else
		{
		  rtx dup = gen_const_vector_dup (builder.mode (), init_val);
		  rtx add_ops[] = {result, tmp3, dup};
		  icode = code_for_pred (PLUS, builder.mode ());
		  emit_vlmax_insn (icode, BINARY_OP, add_ops);
		}
	    }
	  else
	    {
	      /* Generate the variable-length vector following this rule:
		 { a, b, a + step, b + step, a + step*2, b + step*2, ... }  */

	      if (builder.npatterns_vid_diff_repeated_p ())
		{
		  /* Case 1: For example as below:
		     {3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8... }
		     We have 3 - 0 = 3 equals 7 - 4 = 3, the sequence is
		     repeated as below after minus vid.
		     {3, 1, -1, -3, 3, 1, -1, -3...}
		     Then we can simplify the diff code gen to at most
		     npatterns ().  */
		  rvv_builder v (builder.mode (), builder.npatterns (), 1);

		  /* Step 1: Generate diff = TARGET - VID.  */
		  for (unsigned int i = 0; i < v.npatterns (); ++i)
		    {
		      poly_int64 diff = rtx_to_poly_int64 (builder.elt (i)) - i;
		      v.quick_push (gen_int_mode (diff, v.inner_mode ()));
		    }

		  /* Step 2: Generate result = VID + diff.  */
		  rtx vec = v.build ();
		  rtx add_ops[] = {result, vid, vec};
		  emit_vlmax_insn (code_for_pred (PLUS, builder.mode ()),
				   BINARY_OP, add_ops);
		}
	      else
		{
		  /* Case 2: For example as below:
		     { -4, 4, -4 + 1, 4 + 1, -4 + 2, 4 + 2, -4 + 3, 4 + 3, ... }
		  */
		  rvv_builder v (builder.mode (), builder.npatterns (), 1);

		  /* Step 1: Generate { a, b, a, b, ... }  */
		  for (unsigned int i = 0; i < v.npatterns (); ++i)
		    v.quick_push (builder.elt (i));
		  rtx new_base = v.build ();

		  /* Step 2: Generate tmp1 = VID >> LOG2 (NPATTERNS).  */
		  rtx shift_count
		    = gen_int_mode (exact_log2 (builder.npatterns ()),
				    builder.inner_mode ());
		  rtx tmp1 = gen_reg_rtx (builder.mode ());
		  rtx shift_ops[] = {tmp1, vid, shift_count};
		  emit_vlmax_insn (code_for_pred_scalar
				   (LSHIFTRT, builder.mode ()), BINARY_OP,
				   shift_ops);

		  /* Step 3: Generate tmp2 = tmp1 * step.  */
		  rtx tmp2 = gen_reg_rtx (builder.mode ());
		  rtx step
		    = simplify_binary_operation (MINUS, builder.inner_mode (),
						 builder.elt (v.npatterns ()),
						 builder.elt (0));
		  expand_vec_series (tmp2, const0_rtx, step, tmp1);

		  /* Step 4: Generate result = tmp2 + new_base.  */
		  rtx add_ops[] = {result, tmp2, new_base};
		  emit_vlmax_insn (code_for_pred (PLUS, builder.mode ()),
				   BINARY_OP, add_ops);
		}
	    }
	}
      else if (builder.interleaved_stepped_npatterns_p ())
	{
	  rtx base1 = builder.elt (0);
	  rtx base2 = builder.elt (1);
	  poly_int64 step1
	    = rtx_to_poly_int64 (builder.elt (builder.npatterns ()))
	      - rtx_to_poly_int64 (base1);
	  poly_int64 step2
	    = rtx_to_poly_int64 (builder.elt (builder.npatterns () + 1))
	      - rtx_to_poly_int64 (base2);

	  /* For { 1, 0, 2, 0, ... , n - 1, 0 }, we can use a larger EEW
	     integer vector mode to generate such a vector efficiently.

	     E.g. EEW = 16, { 2, 0, 4, 0, ... }

	     can be interpreted as:

	     EEW = 32, { 2, 4, ... }.

	     This only works as long as the larger type does not overflow
	     as we can't guarantee a zero value for each second element
	     of the sequence with smaller EEW.
	     ??? For now we assume that no overflow happens with positive
	     steps and forbid negative steps altogether.  */
	  unsigned int new_smode_bitsize = builder.inner_bits_size () * 2;
	  scalar_int_mode new_smode;
	  machine_mode new_mode;
	  poly_uint64 new_nunits
	    = exact_div (GET_MODE_NUNITS (builder.mode ()), 2);
	  if (known_ge (step1, 0) && known_ge (step2, 0)
	      && int_mode_for_size (new_smode_bitsize, 0).exists (&new_smode)
	      && get_vector_mode (new_smode, new_nunits).exists (&new_mode))
	    {
	      rtx tmp1 = gen_reg_rtx (new_mode);
	      base1 = gen_int_mode (rtx_to_poly_int64 (base1), new_smode);
	      expand_vec_series (tmp1, base1, gen_int_mode (step1, new_smode));

	      if (rtx_equal_p (base2, const0_rtx) && known_eq (step2, 0))
		/* { 1, 0, 2, 0, ... }.  */
		emit_move_insn (result, gen_lowpart (mode, tmp1));
	      else if (known_eq (step2, 0))
		{
		  /* { 1, 1, 2, 1, ... }.  */
		  rtx scalar = expand_simple_binop (
		    Xmode, ASHIFT,
		    gen_int_mode (rtx_to_poly_int64 (base2), Xmode),
		    gen_int_mode (builder.inner_bits_size (), Xmode),
		    NULL_RTX, false, OPTAB_DIRECT);
		  scalar = simplify_gen_subreg (new_smode, scalar, Xmode, 0);
		  rtx tmp2 = gen_reg_rtx (new_mode);
		  rtx ior_ops[] = {tmp2, tmp1, scalar};
		  emit_vlmax_insn (code_for_pred_scalar (IOR, new_mode),
				   BINARY_OP, ior_ops);
		  emit_move_insn (result, gen_lowpart (mode, tmp2));
		}
	      else
		{
		  /* { 1, 3, 2, 6, ... }.  */
		  rtx tmp2 = gen_reg_rtx (new_mode);
		  base2 = gen_int_mode (rtx_to_poly_int64 (base2), new_smode);
		  expand_vec_series (tmp2, base2,
				     gen_int_mode (step2, new_smode));
		  rtx shifted_tmp2 = expand_simple_binop (
		    new_mode, ASHIFT, tmp2,
		    gen_int_mode (builder.inner_bits_size (), Pmode), NULL_RTX,
		    false, OPTAB_DIRECT);
		  rtx tmp3 = gen_reg_rtx (new_mode);
		  rtx ior_ops[] = {tmp3, tmp1, shifted_tmp2};
		  emit_vlmax_insn (code_for_pred (IOR, new_mode), BINARY_OP,
				   ior_ops);
		  emit_move_insn (result, gen_lowpart (mode, tmp3));
		}
	    }
	  else
	    {
	      rtx vid = gen_reg_rtx (mode);
	      expand_vec_series (vid, const0_rtx, const1_rtx);
	      /* Transform into { 0, 0, 1, 1, 2, 2, ... }.  */
	      rtx shifted_vid
		= expand_simple_binop (mode, LSHIFTRT, vid, const1_rtx,
				       NULL_RTX, false, OPTAB_DIRECT);
	      rtx tmp1 = gen_reg_rtx (mode);
	      rtx tmp2 = gen_reg_rtx (mode);
	      expand_vec_series (tmp1, base1,
				 gen_int_mode (step1, builder.inner_mode ()),
				 shifted_vid);
	      expand_vec_series (tmp2, base2,
				 gen_int_mode (step2, builder.inner_mode ()),
				 shifted_vid);

	      /* Transform into { 0, 1, 0, 1, 0, 1, ... }.  */
	      rtx and_vid = gen_reg_rtx (mode);
	      rtx and_ops[] = {and_vid, vid, const1_rtx};
	      emit_vlmax_insn (code_for_pred_scalar (AND, mode), BINARY_OP,
			       and_ops);
	      rtx mask = gen_reg_rtx (builder.mask_mode ());
	      expand_vec_cmp (mask, EQ, and_vid, CONST1_RTX (mode));

	      rtx ops[] = {result, tmp1, tmp2, mask};
	      emit_vlmax_insn (code_for_pred_merge (mode), MERGE_OP, ops);
	    }
	}
      else
	/* TODO: We will enable more variable-length vectors in the future.  */
	gcc_unreachable ();
    }

  if (result != target)
    emit_move_insn (target, result);
}
/* Get the frm mode with the given CONST_INT rtx, the default mode is
   FRM_DYN.  */
enum floating_point_rounding_mode
get_frm_mode (rtx operand)
{
  gcc_assert (CONST_INT_P (operand));

  switch (INTVAL (operand))
/* Expand a pre-RA RVV data move from SRC to DEST.
   It expands moves for RVV fractional vector modes.
   Return true if the move has already been emitted.  */
bool
legitimize_move (rtx dest, rtx *srcp)
{
  rtx src = *srcp;
  machine_mode mode = GET_MODE (dest);
  if (CONST_VECTOR_P (src))
    {
      expand_const_vector (dest, src);
      return true;
    }

  if (riscv_v_ext_vls_mode_p (mode))
    {
      if (GET_MODE_NUNITS (mode).to_constant () <= 31)
	{
	  /* For NUNITS <= 31 VLS modes, we don't need to extract
	     scalar registers so we apply the naive (set (op0) (op1))
	     pattern.  */
	  if (can_create_pseudo_p ())
	    {
	      /* Need to force register if mem <- !reg.  */
	      if (MEM_P (dest) && !REG_P (src))
		*srcp = force_reg (mode, src);

	      return false;
	    }
	}
      else if (GET_MODE_NUNITS (mode).to_constant () > 31 && lra_in_progress)
	{
	  emit_insn (gen_mov_lra (mode, Pmode, dest, src));
	  return true;
	}
    }
  else
    {
      /* In order to decrease the memory traffic, we don't use whole register
       * load/store for the LMUL less than 1 and mask mode, so those cases
       * will require one extra general purpose register, but it's not allowed
       * during the LRA process, so we have a special move pattern used for
       * LRA, which will defer the expansion until after LRA.  */
      if ((known_lt (GET_MODE_SIZE (mode), BYTES_PER_RISCV_VECTOR)
	   || GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
	  && lra_in_progress)
	{
	  emit_insn (gen_mov_lra (mode, Pmode, dest, src));
	  return true;
	}

      if (known_ge (GET_MODE_SIZE (mode), BYTES_PER_RISCV_VECTOR)
	  && GET_MODE_CLASS (mode) != MODE_VECTOR_BOOL)
	{
	  /* Need to force register if mem <- !reg.  */
	  if (MEM_P (dest) && !REG_P (src))
	    *srcp = force_reg (mode, src);

	  return false;
	}
    }

  if (register_operand (src, mode) && register_operand (dest, mode))
    {
      emit_insn (gen_rtx_SET (dest, src));
      return true;
    }

  unsigned insn_flags
    = GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL ? UNARY_MASK_OP : UNARY_OP;
  if (!register_operand (src, mode) && !register_operand (dest, mode))
    {
      rtx tmp = gen_reg_rtx (mode);
      if (MEM_P (src))
	{
	  rtx ops[] = {tmp, src};
	  emit_vlmax_insn (code_for_pred_mov (mode), insn_flags, ops);
	}
      else
	emit_move_insn (tmp, src);
      src = tmp;
    }

  if (satisfies_constraint_vu (src))
    return false;

  rtx ops[] = {dest, src};
  emit_vlmax_insn (code_for_pred_mov (mode), insn_flags, ops);
  return true;
}
/* VTYPE information for machine_mode.  */
struct mode_vtype_group
{
  enum vlmul_type vlmul[NUM_MACHINE_MODES];
  uint8_t ratio[NUM_MACHINE_MODES];
  machine_mode subpart_mode[NUM_MACHINE_MODES];
  uint8_t nf[NUM_MACHINE_MODES];
  mode_vtype_group ()
  {
#define ENTRY(MODE, REQUIREMENT, VLMUL, RATIO)                                 \
  vlmul[MODE##mode] = VLMUL;                                                   \
  ratio[MODE##mode] = RATIO;
#define TUPLE_ENTRY(MODE, REQUIREMENT, SUBPART_MODE, NF, VLMUL, RATIO)         \
  subpart_mode[MODE##mode] = SUBPART_MODE##mode;                               \
  nf[MODE##mode] = NF;                                                         \
  vlmul[MODE##mode] = VLMUL;                                                   \
  ratio[MODE##mode] = RATIO;
#include "riscv-vector-switch.def"
#undef ENTRY
#undef TUPLE_ENTRY
  }
};

static mode_vtype_group mode_vtype_infos;
/* Get the vlmul field value by comparing LMUL with BYTES_PER_RISCV_VECTOR.  */
enum vlmul_type
get_vlmul (machine_mode mode)
{
  /* For VLS modes, the vlmul should be dynamically
     calculated since we need to adjust VLMUL according
     to TARGET_MIN_VLEN.  */
  if (riscv_v_ext_vls_mode_p (mode))
    {
      int size = GET_MODE_BITSIZE (mode).to_constant ();
      int inner_size = GET_MODE_BITSIZE (GET_MODE_INNER (mode));
      if (size < TARGET_MIN_VLEN)
	{
	  int factor = TARGET_MIN_VLEN / size;
	  if (inner_size == 8)
	    factor = MIN (factor, 8);
	  else if (inner_size == 16)
	    factor = MIN (factor, 4);
	  else if (inner_size == 32)
	    factor = MIN (factor, 2);
	  else if (inner_size == 64)
	    factor = MIN (factor, 1);
	}
      else
	{
	  int factor = size / TARGET_MIN_VLEN;
	}
    }
  return mode_vtype_infos.vlmul[mode];
}
/* Return the VLMAX rtx of vector mode MODE.  */
rtx
get_vlmax_rtx (machine_mode mode)
{
  gcc_assert (riscv_v_ext_vector_mode_p (mode));
  return gen_int_mode (GET_MODE_NUNITS (mode), Pmode);
}
/* Return the NF value of the corresponding mode.  */
unsigned int
get_nf (machine_mode mode)
{
  /* We don't allow non-tuple modes to go through this function.  */
  gcc_assert (riscv_v_ext_tuple_mode_p (mode));
  return mode_vtype_infos.nf[mode];
}
/* Return the subpart mode of the tuple mode.  For RVVM2x2SImode,
   the subpart mode is RVVM2SImode.  This will help to build
   array/struct types in builtins.  */
machine_mode
get_subpart_mode (machine_mode mode)
{
  /* We don't allow non-tuple modes to go through this function.  */
  gcc_assert (riscv_v_ext_tuple_mode_p (mode));
  return mode_vtype_infos.subpart_mode[mode];
}
/* Get the ratio according to the machine mode.  */
unsigned int
get_ratio (machine_mode mode)
{
  if (riscv_v_ext_vls_mode_p (mode))
    {
      unsigned int sew = get_sew (mode);
      vlmul_type vlmul = get_vlmul (mode);
    }
  return mode_vtype_infos.ratio[mode];
}
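
/* Illustrative values (assuming the standard SEW/LMUL ratio definition): a
   mode with SEW == 32 and LMUL == 1 has ratio 32, and SEW == 64 with
   LMUL == 2 also has ratio 32; two modes can share one vsetvl exactly when
   their ratios match.  */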
/* Get ta according to operand[tail_op_idx].  */
int
get_ta (rtx ta)
{
  if (INTVAL (ta) == TAIL_ANY)
    return INVALID_ATTRIBUTE;
  return INTVAL (ta);
}

/* Get ma according to operand[mask_op_idx].  */
int
get_ma (rtx ma)
{
  if (INTVAL (ma) == MASK_ANY)
    return INVALID_ATTRIBUTE;
  return INTVAL (ma);
}
/* Get the preferred tail policy.  */
enum tail_policy
get_prefer_tail_policy ()
{
  /* TODO: By default, we choose to use TAIL_ANY which allows the
     compiler to pick either agnostic or undisturbed.  Maybe we
     will have a compile option like -mprefer=agnostic to set
     this value.  */
  return TAIL_ANY;
}

/* Get the preferred mask policy.  */
enum mask_policy
get_prefer_mask_policy ()
{
  /* TODO: By default, we choose to use MASK_ANY which allows the
     compiler to pick either agnostic or undisturbed.  Maybe we
     will have a compile option like -mprefer=agnostic to set
     this value.  */
  return MASK_ANY;
}

/* Get avl_type rtx.  */
rtx
get_avl_type_rtx (enum avl_type type)
{
  return gen_int_mode (type, Pmode);
}
/* Return the appropriate mask mode for MODE.  */
machine_mode
get_mask_mode (machine_mode mode)
{
  poly_int64 nunits = GET_MODE_NUNITS (mode);
  if (riscv_v_ext_tuple_mode_p (mode))
    {
      unsigned int nf = get_nf (mode);
      nunits = exact_div (nunits, nf);
    }
  return get_vector_mode (BImode, nunits).require ();
}
/* Return the appropriate LMUL mode for MODE.  */
opt_machine_mode
get_lmul_mode (scalar_mode mode, int lmul)
{
  poly_uint64 lmul_nunits;
  unsigned int bytes = GET_MODE_SIZE (mode);
  if (multiple_p (BYTES_PER_RISCV_VECTOR * lmul, bytes, &lmul_nunits))
    return get_vector_mode (mode, lmul_nunits);
  return E_VOIDmode;
}
/* Return the appropriate M1 mode for MODE.  */
static opt_machine_mode
get_m1_mode (machine_mode mode)
{
  scalar_mode smode = GET_MODE_INNER (mode);
  unsigned int bytes = GET_MODE_SIZE (smode);
  poly_uint64 m1_nunits = exact_div (BYTES_PER_RISCV_VECTOR, bytes);
  return get_vector_mode (smode, m1_nunits);
}
/* Return the RVV vector mode that has NUNITS elements of mode INNER_MODE.
   This function is not only used by builtins, but also will be used by
   auto-vectorization in the future.  */
opt_machine_mode
get_vector_mode (scalar_mode inner_mode, poly_uint64 nunits)
{
  enum mode_class mclass;
  if (inner_mode == E_BImode)
    mclass = MODE_VECTOR_BOOL;
  else if (FLOAT_MODE_P (inner_mode))
    mclass = MODE_VECTOR_FLOAT;
  else
    mclass = MODE_VECTOR_INT;
  machine_mode mode;
  FOR_EACH_MODE_IN_CLASS (mode, mclass)
    if (inner_mode == GET_MODE_INNER (mode)
	&& known_eq (nunits, GET_MODE_NUNITS (mode))
	&& (riscv_v_ext_vector_mode_p (mode)
	    || riscv_v_ext_vls_mode_p (mode)))
      return mode;
  return opt_machine_mode ();
}
/* Return the RVV tuple mode if we can find the legal tuple mode for the
   corresponding subpart mode and NF.  */
opt_machine_mode
get_tuple_mode (machine_mode subpart_mode, unsigned int nf)
{
  poly_uint64 nunits = GET_MODE_NUNITS (subpart_mode) * nf;
  scalar_mode inner_mode = GET_MODE_INNER (subpart_mode);
  enum mode_class mclass = GET_MODE_CLASS (subpart_mode);
  machine_mode mode;
  FOR_EACH_MODE_IN_CLASS (mode, mclass)
    if (inner_mode == GET_MODE_INNER (mode)
	&& known_eq (nunits, GET_MODE_NUNITS (mode))
	&& riscv_v_ext_tuple_mode_p (mode)
	&& get_subpart_mode (mode) == subpart_mode)
      return mode;
  return opt_machine_mode ();
}
bool
simm5_p (rtx x)
{
  if (!CONST_INT_P (x))
    return false;
  return IN_RANGE (INTVAL (x), -16, 15);
}

bool
neg_simm5_p (rtx x)
{
  if (!CONST_INT_P (x))
    return false;
  return IN_RANGE (INTVAL (x), -15, 16);
}

bool
has_vi_variant_p (rtx_code code, rtx x)
{
      return neg_simm5_p (x);
bool
sew64_scalar_helper (rtx *operands, rtx *scalar_op, rtx vl,
		     machine_mode vector_mode, bool has_vi_variant_p,
		     void (*emit_vector_func) (rtx *, rtx), enum avl_type type)
{
  machine_mode scalar_mode = GET_MODE_INNER (vector_mode);
  if (has_vi_variant_p)
    {
      *scalar_op = force_reg (scalar_mode, *scalar_op);
      return false;
    }

  if (TARGET_64BIT)
    {
      if (!rtx_equal_p (*scalar_op, const0_rtx))
	*scalar_op = force_reg (scalar_mode, *scalar_op);
      return false;
    }

  if (immediate_operand (*scalar_op, Pmode))
    {
      if (!rtx_equal_p (*scalar_op, const0_rtx))
	*scalar_op = force_reg (Pmode, *scalar_op);

      *scalar_op = gen_rtx_SIGN_EXTEND (scalar_mode, *scalar_op);
      return false;
    }

  if (CONST_INT_P (*scalar_op))
    {
      if (maybe_gt (GET_MODE_SIZE (scalar_mode), GET_MODE_SIZE (Pmode)))
	*scalar_op = force_const_mem (scalar_mode, *scalar_op);
      else
	*scalar_op = force_reg (scalar_mode, *scalar_op);
    }

  rtx tmp = gen_reg_rtx (vector_mode);
  rtx ops[] = {tmp, *scalar_op};
  if (type == VLMAX)
    emit_vlmax_insn (code_for_pred_broadcast (vector_mode), UNARY_OP, ops);
  else
    emit_nonvlmax_insn (code_for_pred_broadcast (vector_mode), UNARY_OP, ops,
			vl);

  emit_vector_func (operands, tmp);

  return true;
}
/* Get the { ... ,0, 0, 0, ..., 0, 0, 0, 1 } mask.  */
rtx
gen_scalar_move_mask (machine_mode mode)
{
  rtx_vector_builder builder (mode, 1, 2);
  builder.quick_push (const1_rtx);
  builder.quick_push (const0_rtx);
  return builder.build ();
}
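
/* Illustration: the two encoded elements { 1, 0 } expand to the mask
   { 1, 0, 0, 0, ... }, i.e. only element 0 is active, which is exactly what
   a scalar-move style insertion needs.  */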
static unsigned
compute_vlmax (unsigned vector_bits, unsigned elt_size, unsigned min_size)
{
  // Original equation:
  //   VLMAX = (VectorBits / EltSize) * LMUL
  //   where LMUL = MinSize / TARGET_MIN_VLEN
  // The following equations have been reordered to prevent loss of precision
  // when calculating fractional LMUL.
  return ((vector_bits / elt_size) * min_size) / TARGET_MIN_VLEN;
}
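/* Worked example of the equation above (numbers are illustrative only):
   with vector_bits = 256, elt_size = 32 and min_size = 64 on a target
   where TARGET_MIN_VLEN = 128 (i.e. LMUL = 64 / 128 = 1/2) we get
     VLMAX = ((256 / 32) * 64) / 128 = 4,
   which matches VLMAX = (VectorBits / EltSize) * LMUL = 8 * 1/2 = 4.  */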
static unsigned
get_unknown_min_value (machine_mode mode)
{
  enum vlmul_type vlmul = get_vlmul (mode);
  switch (vlmul)
    {
    case LMUL_1:
      return TARGET_MIN_VLEN;
    case LMUL_2:
      return TARGET_MIN_VLEN * 2;
    case LMUL_4:
      return TARGET_MIN_VLEN * 4;
    case LMUL_8:
      return TARGET_MIN_VLEN * 8;
    default:
      gcc_unreachable ();
    }
}
static rtx
force_vector_length_operand (rtx vl)
{
  if (CONST_INT_P (vl) && !satisfies_constraint_K (vl))
    return force_reg (Pmode, vl);
  return vl;
}
static rtx
gen_no_side_effects_vsetvl_rtx (machine_mode vmode, rtx vl, rtx avl)
{
  unsigned int sew = get_sew (vmode);
  rtx tail_policy = gen_int_mode (get_prefer_tail_policy (), Pmode);
  rtx mask_policy = gen_int_mode (get_prefer_mask_policy (), Pmode);
  return gen_vsetvl_no_side_effects (Pmode, vl, avl, gen_int_mode (sew, Pmode),
                                     gen_int_mode (get_vlmul (vmode), Pmode),
                                     tail_policy, mask_policy);
}
/* GET VL * 2 rtx.  */
static rtx
get_vl_x2_rtx (rtx avl, machine_mode mode, machine_mode demote_mode)
{
  rtx i32vl = NULL_RTX;
  if (CONST_INT_P (avl))
    {
      unsigned elt_size = GET_MODE_BITSIZE (GET_MODE_INNER (mode));
      unsigned min_size = get_unknown_min_value (mode);
      unsigned vlen_max = RVV_65536;
      unsigned vlmax_max = compute_vlmax (vlen_max, elt_size, min_size);
      unsigned vlen_min = TARGET_MIN_VLEN;
      unsigned vlmax_min = compute_vlmax (vlen_min, elt_size, min_size);

      unsigned HOST_WIDE_INT avl_int = INTVAL (avl);
      if (avl_int <= vlmax_min)
        i32vl = gen_int_mode (2 * avl_int, Pmode);
      else if (avl_int >= 2 * vlmax_max)
        {
          // Just set i32vl to VLMAX in this situation
          i32vl = gen_reg_rtx (Pmode);
          emit_insn (
            gen_no_side_effects_vsetvl_rtx (demote_mode, i32vl, RVV_VLMAX));
        }
      else
        {
          // For AVL between (MinVLMAX, 2 * MaxVLMAX), the actual working vl
          // is related to the hardware implementation.
          // So let the following code handle it.
        }
    }
  if (!i32vl)
    {
      // Using vsetvli instruction to get actually used length which related to
      // the hardware implementation
      rtx i64vl = gen_reg_rtx (Pmode);
      emit_insn (
        gen_no_side_effects_vsetvl_rtx (mode, i64vl, force_reg (Pmode, avl)));
      // scale 2 for 32-bit length
      i32vl = gen_reg_rtx (Pmode);
      emit_insn (
        gen_rtx_SET (i32vl, gen_rtx_ASHIFT (Pmode, i64vl, const1_rtx)));
    }

  return force_vector_length_operand (i32vl);
}
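/* Illustration of the constant-AVL case above (numbers are only an
   example): for SEW = 64, LMUL = 1 and TARGET_MIN_VLEN = 128, MinVLMAX
   is 2, so an AVL of 2 yields i32vl = 4, i.e. twice as many SEW = 32
   elements are processed in the demoted mode.  */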
bool
slide1_sew64_helper (int unspec, machine_mode mode, machine_mode demote_mode,
                     machine_mode demote_mask_mode, rtx *ops)
{
  rtx scalar_op = ops[4];
  rtx avl = ops[5];
  machine_mode scalar_mode = GET_MODE_INNER (mode);
  if (rtx_equal_p (scalar_op, const0_rtx))
    {
      ops[5] = force_vector_length_operand (ops[5]);
      return false;
    }

  if (TARGET_64BIT)
    {
      ops[4] = force_reg (scalar_mode, scalar_op);
      ops[5] = force_vector_length_operand (ops[5]);
      return false;
    }

  if (immediate_operand (scalar_op, Pmode))
    {
      ops[4] = gen_rtx_SIGN_EXTEND (scalar_mode, force_reg (Pmode, scalar_op));
      ops[5] = force_vector_length_operand (ops[5]);
      return false;
    }

  if (CONST_INT_P (scalar_op))
    scalar_op = force_reg (scalar_mode, scalar_op);

  rtx vl_x2 = get_vl_x2_rtx (avl, mode, demote_mode);

  rtx demote_scalar_op1, demote_scalar_op2;
  if (unspec == UNSPEC_VSLIDE1UP)
    {
      demote_scalar_op1 = gen_highpart (Pmode, scalar_op);
      demote_scalar_op2 = gen_lowpart (Pmode, scalar_op);
    }
  else
    {
      demote_scalar_op1 = gen_lowpart (Pmode, scalar_op);
      demote_scalar_op2 = gen_highpart (Pmode, scalar_op);
    }

  rtx temp = gen_reg_rtx (demote_mode);
  rtx ta = gen_int_mode (get_prefer_tail_policy (), Pmode);
  rtx ma = gen_int_mode (get_prefer_mask_policy (), Pmode);
  rtx merge = RVV_VUNDEF (demote_mode);
  /* Handle vslide1<ud>_tu.  */
  if (register_operand (ops[2], mode)
      && rtx_equal_p (ops[1], CONSTM1_RTX (GET_MODE (ops[1]))))
    merge = gen_lowpart (demote_mode, ops[2]);

  emit_insn (gen_pred_slide (unspec, demote_mode, temp,
                             CONSTM1_RTX (demote_mask_mode), merge,
                             gen_lowpart (demote_mode, ops[3]),
                             demote_scalar_op1, vl_x2, ta, ma, ops[8]));
  emit_insn (gen_pred_slide (unspec, demote_mode,
                             gen_lowpart (demote_mode, ops[0]),
                             CONSTM1_RTX (demote_mask_mode), merge, temp,
                             demote_scalar_op2, vl_x2, ta, ma, ops[8]));

  if (!rtx_equal_p (ops[1], CONSTM1_RTX (GET_MODE (ops[1])))
      && !rtx_equal_p (ops[2], RVV_VUNDEF (GET_MODE (ops[2]))))
    emit_insn (gen_pred_merge (mode, ops[0], ops[2], ops[2], ops[0], ops[1],
                               force_vector_length_operand (ops[5]), ops[6],
                               ops[8]));

  return true;
}
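/* Conceptual sketch of the rv32 fallback above: one 64-bit vslide1<ud>
   is emulated in the demoted SEW = 32 mode by two slides, inserting the
   two 32-bit halves of the scalar in the appropriate order, with the
   vector length doubled by get_vl_x2_rtx so the same number of 64-bit
   elements is covered.  */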
static rtx
gen_avl_for_scalar_move (rtx avl)
{
  /* AVL for scalar move has different behavior between 0 and large than 0.  */
  if (CONST_INT_P (avl))
    {
      /* So we could just set AVL to 1 for any constant other than 0.  */
      if (rtx_equal_p (avl, const0_rtx))
        return avl;
      else
        return const1_rtx;
    }
  else
    {
      /* For non-constant value, we set any non zero value to 1 by
         `sgtu new_avl,input_avl,zero` + `vsetvli`.  */
      rtx tmp = gen_reg_rtx (Pmode);
      emit_insn (
        gen_rtx_SET (tmp, gen_rtx_fmt_ee (GTU, Pmode, avl, const0_rtx)));
      return tmp;
    }
}
/* Expand tuple modes data movement.  */
void
expand_tuple_move (rtx *ops)
{
  unsigned int i;
  machine_mode tuple_mode = GET_MODE (ops[0]);
  machine_mode subpart_mode = get_subpart_mode (tuple_mode);
  poly_int64 subpart_size = GET_MODE_SIZE (subpart_mode);
  unsigned int nf = get_nf (tuple_mode);
  bool fractional_p = known_lt (subpart_size, BYTES_PER_RISCV_VECTOR);

  if (REG_P (ops[0]) && CONST_VECTOR_P (ops[1]))
    {
      rtx val;
      gcc_assert (can_create_pseudo_p ()
                  && const_vec_duplicate_p (ops[1], &val));
      for (i = 0; i < nf; ++i)
        {
          poly_int64 offset = i * subpart_size;
          rtx subreg
            = simplify_gen_subreg (subpart_mode, ops[0], tuple_mode, offset);
          rtx dup = gen_const_vec_duplicate (subpart_mode, val);
          emit_move_insn (subreg, dup);
        }
    }
  else if (REG_P (ops[0]) && REG_P (ops[1]))
    {
      for (i = 0; i < nf; ++i)
        {
          int index = i;

          /* Take NF = 2 and LMUL = 1 for example: when the source and
             destination register groups overlap, copy the subparts in
             the order that does not clobber a not-yet-copied source.  */
          if (REGNO (ops[0]) > REGNO (ops[1]))
            index = nf - 1 - i;
          poly_int64 offset = index * subpart_size;
          rtx dst_subreg
            = simplify_gen_subreg (subpart_mode, ops[0], tuple_mode, offset);
          rtx src_subreg
            = simplify_gen_subreg (subpart_mode, ops[1], tuple_mode, offset);
          emit_insn (gen_rtx_SET (dst_subreg, src_subreg));
        }
    }
  else
    {
      /* Expand tuple memory data movement.  */
      gcc_assert (MEM_P (ops[0]) || MEM_P (ops[1]));
      rtx offset = gen_int_mode (subpart_size, Pmode);
      if (!subpart_size.is_constant ())
        {
          emit_move_insn (ops[2], gen_int_mode (BYTES_PER_RISCV_VECTOR, Pmode));
          if (fractional_p)
            {
              unsigned int factor
                = exact_div (BYTES_PER_RISCV_VECTOR, subpart_size)
                    .to_constant ();
              rtx pat
                = gen_rtx_ASHIFTRT (Pmode, ops[2],
                                    gen_int_mode (exact_log2 (factor), Pmode));
              emit_insn (gen_rtx_SET (ops[2], pat));
            }

          if (known_gt (subpart_size, BYTES_PER_RISCV_VECTOR))
            {
              unsigned int factor
                = exact_div (subpart_size, BYTES_PER_RISCV_VECTOR)
                    .to_constant ();
              rtx pat
                = gen_rtx_ASHIFT (Pmode, ops[2],
                                  gen_int_mode (exact_log2 (factor), Pmode));
              emit_insn (gen_rtx_SET (ops[2], pat));
            }
          offset = ops[2];
        }

      /* Non-fractional LMUL has whole register moves that don't require a
         vsetvl for VLMAX.  */
      if (fractional_p)
        emit_vlmax_vsetvl (subpart_mode, ops[4]);

      if (MEM_P (ops[1]))
        {
          /* Load operations.  */
          emit_move_insn (ops[3], XEXP (ops[1], 0));
          for (i = 0; i < nf; i++)
            {
              rtx subreg = simplify_gen_subreg (subpart_mode, ops[0],
                                                tuple_mode, i * subpart_size);
              if (i != 0)
                {
                  rtx new_addr = gen_rtx_PLUS (Pmode, ops[3], offset);
                  emit_insn (gen_rtx_SET (ops[3], new_addr));
                }
              rtx mem = gen_rtx_MEM (subpart_mode, ops[3]);

              if (fractional_p)
                {
                  rtx operands[] = {subreg, mem};
                  emit_vlmax_insn_lra (code_for_pred_mov (subpart_mode),
                                       UNARY_OP, operands, ops[4]);
                }
              else
                emit_move_insn (subreg, mem);
            }
        }
      else
        {
          /* Store operations.  */
          emit_move_insn (ops[3], XEXP (ops[0], 0));
          for (i = 0; i < nf; i++)
            {
              rtx subreg = simplify_gen_subreg (subpart_mode, ops[1],
                                                tuple_mode, i * subpart_size);
              if (i != 0)
                {
                  rtx new_addr = gen_rtx_PLUS (Pmode, ops[3], offset);
                  emit_insn (gen_rtx_SET (ops[3], new_addr));
                }
              rtx mem = gen_rtx_MEM (subpart_mode, ops[3]);

              if (fractional_p)
                {
                  rtx operands[] = {mem, subreg};
                  emit_vlmax_insn_lra (code_for_pred_mov (subpart_mode),
                                       UNARY_OP, operands, ops[4]);
                }
              else
                emit_move_insn (mem, subreg);
            }
        }
    }
}
/* Return the vectorization machine mode for RVV according to LMUL.  */
machine_mode
preferred_simd_mode (scalar_mode mode)
{
  if (autovec_use_vlmax_p ())
    {
      /* We use LMUL = 1 as base bytesize which is BYTES_PER_RISCV_VECTOR and
         rvv_max_lmul as multiply factor to calculate the NUNITS to
         get the auto-vectorization mode.  */
      poly_uint64 nunits;
      poly_uint64 vector_size = BYTES_PER_RISCV_VECTOR * TARGET_MAX_LMUL;
      poly_uint64 scalar_size = GET_MODE_SIZE (mode);
      /* Disable vectorization when we can't find a RVV mode for it.
         E.g. -march=rv64gc_zve32x doesn't have a vector mode to vectorize
         a double (DFmode) type.  */
      if (!multiple_p (vector_size, scalar_size, &nunits))
        return word_mode;
      machine_mode rvv_mode;
      if (get_vector_mode (mode, nunits).exists (&rvv_mode))
        return rvv_mode;
    }
  return word_mode;
}
/* Use merge approach to initialize the vector with repeating sequence.
     v = {a, b, a, b, a, b, a, b}.

     v = broadcast (a).
     mask = 0b01010101....
     v = merge (v, b, mask)  */
static void
expand_vector_init_merge_repeating_sequence (rtx target,
                                             const rvv_builder &builder)
{
  /* We can't use BIT mode (BI) directly to generate mask = 0b01010...
     since we don't have such instruction in RVV.
     Instead, we should use INT mode (QI/HI/SI/DI) with integer move
     instruction to generate the mask data we want.  */
  machine_mode mask_bit_mode = get_mask_mode (builder.mode ());
  machine_mode mask_int_mode
    = get_repeating_sequence_dup_machine_mode (builder, mask_bit_mode);
  uint64_t full_nelts = builder.full_nelts ().to_constant ();

  /* Step 1: Broadcast the first pattern.  */
  rtx ops[] = {target, force_reg (builder.inner_mode (), builder.elt (0))};
  emit_vlmax_insn (code_for_pred_broadcast (builder.mode ()),
                   UNARY_OP, ops);
  /* Step 2: Merge the rest iteration of pattern.  */
  for (unsigned int i = 1; i < builder.npatterns (); i++)
    {
      /* Step 2-1: Generate mask register v0 for each merge.  */
      rtx merge_mask
        = builder.get_merge_scalar_mask (i, GET_MODE_INNER (mask_int_mode));
      rtx mask = gen_reg_rtx (mask_bit_mode);
      rtx dup = gen_reg_rtx (mask_int_mode);

      if (full_nelts <= builder.inner_bits_size ()) /* vmv.s.x.  */
        {
          rtx ops[] = {dup, merge_mask};
          emit_nonvlmax_insn (code_for_pred_broadcast (GET_MODE (dup)),
                              SCALAR_MOVE_OP, ops, CONST1_RTX (Pmode));
        }
      else /* vmv.v.x.  */
        {
          rtx ops[] = {dup,
                       force_reg (GET_MODE_INNER (mask_int_mode), merge_mask)};
          rtx vl = gen_int_mode (CEIL (full_nelts, builder.inner_bits_size ()),
                                 Pmode);
          emit_nonvlmax_insn (code_for_pred_broadcast (mask_int_mode), UNARY_OP,
                              ops, vl);
        }

      emit_move_insn (mask, gen_lowpart (mask_bit_mode, dup));

      /* Step 2-2: Merge pattern according to the mask.  */
      rtx ops[] = {target, target, builder.elt (i), mask};
      emit_vlmax_insn (code_for_pred_merge_scalar (GET_MODE (target)),
                       MERGE_OP, ops);
    }
}
/* Use slideup approach to combine the vectors.
     v = {a, a, a, a, b, b, b, b}

     v1 = {a, a, a, a, a, a, a, a}
     v2 = {b, b, b, b, b, b, b, b}
     v = slideup (v1, v2, nelt / 2)  */
static void
expand_vector_init_slideup_combine_sequence (rtx target,
                                             const rvv_builder &builder)
{
  machine_mode mode = GET_MODE (target);
  int nelts = builder.full_nelts ().to_constant ();
  rtx first_elt = builder.elt (0);
  rtx last_elt = builder.elt (nelts - 1);
  rtx low = expand_vector_broadcast (mode, first_elt);
  rtx high = expand_vector_broadcast (mode, last_elt);
  insn_code icode = code_for_pred_slide (UNSPEC_VSLIDEUP, mode);
  rtx ops[] = {target, low, high, gen_int_mode (nelts / 2, Pmode)};
  emit_vlmax_insn (icode, SLIDEUP_OP_MERGE, ops);
}
/* Use merge approach to merge a scalar into a vector.
     v = {a, a, a, a, a, a, b, b}

     v1 = {a, a, a, a, a, a, a, a}
     mask = {0, 0, 0, 0, 0, 0, 1, 1}
     v = merge (v1, b, mask)  */
static void
expand_vector_init_merge_combine_sequence (rtx target,
                                           const rvv_builder &builder)
{
  machine_mode mode = GET_MODE (target);
  machine_mode imode = builder.int_mode ();
  machine_mode mmode = builder.mask_mode ();
  int nelts = builder.full_nelts ().to_constant ();
  int leading_ndups = builder.count_dups (0, nelts - 1, 1);
  if ((leading_ndups > 255 && GET_MODE_INNER (imode) == QImode)
      || riscv_get_v_regno_alignment (imode) > 1)
    imode = get_vector_mode (HImode, nelts).require ();

  /* Generate vid = { 0, 1, 2, ..., n }.  */
  rtx vid = gen_reg_rtx (imode);
  expand_vec_series (vid, const0_rtx, const1_rtx);

  /* Generate mask.  */
  rtx mask = gen_reg_rtx (mmode);
  insn_code icode = code_for_pred_cmp_scalar (imode);
  rtx index = gen_int_mode (leading_ndups - 1, builder.inner_int_mode ());
  rtx dup_rtx = gen_rtx_VEC_DUPLICATE (imode, index);
  /* vmsgtu.vi/vmsgtu.vx.  */
  rtx cmp = gen_rtx_fmt_ee (GTU, mmode, vid, dup_rtx);
  rtx sel = builder.elt (nelts - 1);
  rtx mask_ops[] = {mask, cmp, vid, index};
  emit_vlmax_insn (icode, COMPARE_OP, mask_ops);

  /* Duplicate the first elements.  */
  rtx dup = expand_vector_broadcast (mode, builder.elt (0));
  /* Merge scalar into vector according to mask.  */
  rtx merge_ops[] = {target, dup, sel, mask};
  icode = code_for_pred_merge_scalar (mode);
  emit_vlmax_insn (icode, MERGE_OP, merge_ops);
}
/* Initialize register TARGET from the elements in PARALLEL rtx VALS.  */
void
expand_vec_init (rtx target, rtx vals)
{
  machine_mode mode = GET_MODE (target);
  int nelts = XVECLEN (vals, 0);

  rvv_builder v (mode, nelts, 1);
  for (int i = 0; i < nelts; i++)
    v.quick_push (XVECEXP (vals, 0, i));
  v.finalize ();

  /* If the sequence is v = { a, a, a, a } just broadcast an element.  */
  if (v.is_repeating_sequence ())
    {
      machine_mode mode = GET_MODE (target);
      rtx dup = expand_vector_broadcast (mode, v.elt (0));
      emit_move_insn (target, dup);
      return;
    }

  if (nelts > 3)
    {
      /* Case 1: Convert v = { a, b, a, b } into v = { ab, ab }.  */
      if (v.can_duplicate_repeating_sequence_p ())
        {
          rtx ele = v.get_merged_repeating_sequence ();
          rtx dup = expand_vector_broadcast (v.new_mode (), ele);
          emit_move_insn (target, gen_lowpart (mode, dup));
          return;
        }

      /* Case 2: Optimize repeating sequence cases that Case 1 can
         not handle and it is profitable.  For example:
         ELEMENT BITSIZE = 64.
         v = {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b}.
         We can't find a vector mode for "ab" which will be combined into
         128-bit element to duplicate.  */
      if (v.repeating_sequence_use_merge_profitable_p ())
        {
          expand_vector_init_merge_repeating_sequence (target, v);
          return;
        }

      /* Case 3: Optimize combine sequence.
         E.g. v = {a, a, a, a, a, a, a, a, b, b, b, b, b, b, b, b}.

         We can transform it into:
         v1 = {a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a}.
         v2 = {b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b}.
         v = slideup (v1, v2, nelt / 2).  */
      if (v.combine_sequence_use_slideup_profitable_p ())
        {
          expand_vector_init_slideup_combine_sequence (target, v);
          return;
        }

      /* Case 4: Optimize combine sequence.
         E.g. v = {a, a, a, a, a, a, a, a, a, a, a, b, b, b, b, b}.

         Generate vector:
         v = {a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a}.

         Generate mask:
         mask = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1}.

         Merge b into v by mask:
         v = {a, a, a, a, a, a, a, a, a, a, a, b, b, b, b, b}.  */
      if (v.combine_sequence_use_merge_profitable_p ())
        {
          expand_vector_init_merge_combine_sequence (target, v);
          return;
        }
    }

  /* Optimize trailing same elements sequence:
     v = {y, y2, y3, y4, y5, x, x, x, x, x, x, x, x, x, x, x};  */
  if (!expand_vector_init_trailing_same_elem (target, v, nelts))
    /* Handle common situation by vslide1down.  This function can handle any
       situation of vec_init<mode>.  Only the cases that are not optimized
       above will fall through here.  */
    expand_vector_init_insert_elems (target, v, nelts);
}
/* Get insn code for corresponding comparison.  */
static insn_code
get_cmp_insn_code (rtx_code code, machine_mode mode)
{
  insn_code icode;
  switch (code)
    {
    case EQ:
    case NE:
    case LE:
    case LEU:
    case GT:
    case GTU:
    case LTGT:
      icode = code_for_pred_cmp (mode);
      break;
    case LT:
    case LTU:
    case GE:
    case GEU:
      if (FLOAT_MODE_P (mode))
        icode = code_for_pred_cmp (mode);
      else
        icode = code_for_pred_ltge (mode);
      break;
    default:
      gcc_unreachable ();
    }
  return icode;
}
/* This hook gives the vectorizer more vector mode options.  We want it to not
   only try modes with the maximum number of units a full vector can hold but
   for example also half the number of units for a smaller elements size.
   Such vectors can be promoted to a full vector of widened elements
   (still with the same number of elements, essentially vectorizing at a
   fixed number of units rather than a fixed number of bytes).  */
unsigned int
autovectorize_vector_modes (vector_modes *modes, bool)
{
  if (autovec_use_vlmax_p ())
    {
      poly_uint64 full_size = BYTES_PER_RISCV_VECTOR * TARGET_MAX_LMUL;

      /* Start with a RVV<LMUL>QImode where LMUL is the number of units that
         fit a whole vector.
         Then try LMUL = nunits / 2, nunits / 4 and nunits / 8 which
         is guided by the extensions we have available (vf2, vf4 and vf8).

         - full_size: Try using full vectors for all element types.
         - full_size / 2:
           Try using 16-bit containers for 8-bit elements and full vectors
           for wider elements.
         - full_size / 4:
           Try using 32-bit containers for 8-bit and 16-bit elements and
           full vectors for wider elements.
         - full_size / 8:
           Try using 64-bit containers for all element types.  */
      static const int rvv_factors[] = {1, 2, 4, 8, 16, 32, 64};
      for (unsigned int i = 0; i < sizeof (rvv_factors) / sizeof (int); i++)
        {
          poly_uint64 units;
          machine_mode mode;
          if (can_div_trunc_p (full_size, rvv_factors[i], &units)
              && get_vector_mode (QImode, units).exists (&mode))
            modes->safe_push (mode);
        }
    }
  /* Push all VLSmodes according to TARGET_MIN_VLEN.  */
  unsigned int i = 0;
  unsigned int base_size = TARGET_MIN_VLEN * TARGET_MAX_LMUL / 8;
  unsigned int size = base_size;
  machine_mode mode;
  while (size > 0 && get_vector_mode (QImode, size).exists (&mode))
    {
      if (vls_mode_valid_p (mode))
        modes->safe_push (mode);
      i++;
      size = base_size / (1U << i);
    }
  /* Enable LOOP_VINFO comparison in COST model.  */
  return VECT_COMPARE_COSTS;
}
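/* Example of the modes collected above (assuming TARGET_MIN_VLEN = 128
   and TARGET_MAX_LMUL = 8, for illustration only): full_size is the poly
   size (128, 128) bytes, so factors 1, 2, 4 and 8 push QImode vectors of
   (128, 128), (64, 64), (32, 32) and (16, 16) units; the VLS loop then
   pushes fixed-size QImode vectors of 128, 64, 32, ... bytes as long as
   they are valid.  */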
/* Return true if we can find the related MODE according to default LMUL.  */
static bool
can_find_related_mode_p (machine_mode vector_mode, scalar_mode element_mode,
                         poly_uint64 *nunits)
{
  if (!autovec_use_vlmax_p ())
    return false;
  if (riscv_v_ext_vector_mode_p (vector_mode)
      && multiple_p (BYTES_PER_RISCV_VECTOR * TARGET_MAX_LMUL,
                     GET_MODE_SIZE (element_mode), nunits))
    return true;
  if (riscv_v_ext_vls_mode_p (vector_mode)
      && multiple_p (TARGET_MIN_VLEN * TARGET_MAX_LMUL,
                     GET_MODE_SIZE (element_mode), nunits))
    return true;
  return false;
}
/* If the given VECTOR_MODE is an RVV mode, first get the largest number
   of units that fit into a full vector at the given ELEMENT_MODE.
   We will have the vectorizer call us with a successively decreasing
   number of units (as specified in autovectorize_vector_modes).
   The starting mode is always the one specified by preferred_simd_mode.  */
opt_machine_mode
vectorize_related_mode (machine_mode vector_mode, scalar_mode element_mode,
                        poly_uint64 nunits)
{
  /* TODO: We will support RVV VLS auto-vectorization mode in the future.  */
  poly_uint64 min_units;
  if (can_find_related_mode_p (vector_mode, element_mode, &min_units))
    {
      machine_mode rvv_mode;
      if (maybe_ne (nunits, 0U))
        {
          /* If we were given a number of units NUNITS, try to find an
             RVV vector mode of inner mode ELEMENT_MODE with the same
             number of units.  */
          if (multiple_p (min_units, nunits)
              && get_vector_mode (element_mode, nunits).exists (&rvv_mode))
            return rvv_mode;
        }
      else
        {
          /* Look for a vector mode with the same number of units as the
             VECTOR_MODE we were given.  We keep track of the minimum
             number of units so far which determines the smallest necessary
             but largest possible, suitable mode for vectorization.  */
          min_units = ordered_min (min_units, GET_MODE_SIZE (vector_mode));
          if (get_vector_mode (element_mode, min_units).exists (&rvv_mode))
            return rvv_mode;
        }
    }

  return default_vectorize_related_mode (vector_mode, element_mode, nunits);
}
/* Expand an RVV comparison.  */
void
expand_vec_cmp (rtx target, rtx_code code, rtx op0, rtx op1, rtx mask,
                rtx maskoff)
{
  machine_mode mask_mode = GET_MODE (target);
  machine_mode data_mode = GET_MODE (op0);
  insn_code icode = get_cmp_insn_code (code, data_mode);

  if (code == LTGT)
    {
      rtx lt = gen_reg_rtx (mask_mode);
      rtx gt = gen_reg_rtx (mask_mode);
      expand_vec_cmp (lt, LT, op0, op1, mask, maskoff);
      expand_vec_cmp (gt, GT, op0, op1, mask, maskoff);
      icode = code_for_pred (IOR, mask_mode);
      rtx ops[] = {target, lt, gt};
      emit_vlmax_insn (icode, BINARY_MASK_OP, ops);
      return;
    }

  rtx cmp = gen_rtx_fmt_ee (code, mask_mode, op0, op1);
  if (!mask && !maskoff)
    {
      rtx ops[] = {target, cmp, op0, op1};
      emit_vlmax_insn (icode, COMPARE_OP, ops);
    }
  else
    {
      rtx ops[] = {target, mask, maskoff, cmp, op0, op1};
      emit_vlmax_insn (icode, COMPARE_OP_MU, ops);
    }
}
/* Expand an RVV floating-point comparison:

   If CAN_INVERT_P is true, the caller can also handle inverted results;
   return true if the result is in fact inverted.  */
bool
expand_vec_cmp_float (rtx target, rtx_code code, rtx op0, rtx op1,
                      bool can_invert_p)
{
  machine_mode mask_mode = GET_MODE (target);
  machine_mode data_mode = GET_MODE (op0);

  /* If can_invert_p = true:
     It suffices to implement a u>= b as !(a < b) but with the NaNs masked off:

       vmfeq.vv v0, va, va
       vmfeq.vv v1, vb, vb
       vmand.mm v0, v0, v1
       vmflt.vv v0, va, vb, v0.t
       vmnot.m v0, v0

     And, if !HONOR_SNANS, then you can remove the vmand.mm by masking the
     second vmfeq.vv:

       vmfeq.vv v0, va, va
       vmfeq.vv v0, vb, vb, v0.t
       vmflt.vv v0, va, vb, v0.t
       vmnot.m v0, v0

     If can_invert_p = false:

     # Example of implementing isgreater()
     vmfeq.vv v0, va, va	# Only set where A is not NaN.
     vmfeq.vv v1, vb, vb	# Only set where B is not NaN.
     vmand.mm v0, v0, v1	# Only set where A and B are ordered,
     vmfgt.vv v0, va, vb, v0.t	# so only set flags on ordered values.  */

  rtx eq0 = gen_reg_rtx (mask_mode);
  rtx eq1 = gen_reg_rtx (mask_mode);
  switch (code)
    {
    case EQ:
    case NE:
    case LT:
    case LE:
    case GT:
    case GE:
    case LTGT:
      /* There is native support for the comparison.  */
      expand_vec_cmp (target, code, op0, op1);
      return false;
    case UNEQ:
    case ORDERED:
    case UNORDERED:
    case UNLT:
    case UNLE:
    case UNGT:
    case UNGE:
      /* vmfeq.vv v0, va, va  */
      expand_vec_cmp (eq0, EQ, op0, op0);
      if (HONOR_SNANS (data_mode))
        {
          /* vmfeq.vv v0, va, va
             vmfeq.vv v1, vb, vb
             vmand.mm v0, v0, v1  */
          expand_vec_cmp (eq1, EQ, op1, op1);
          insn_code icode = code_for_pred (AND, mask_mode);
          rtx ops[] = {eq0, eq0, eq1};
          emit_vlmax_insn (icode, BINARY_MASK_OP, ops);
        }
      else
        {
          /* vmfeq.vv v0, vb, vb, v0.t  */
          expand_vec_cmp (eq0, EQ, op1, op1, eq0, eq0);
        }
      break;
    default:
      gcc_unreachable ();
    }

  if (code == ORDERED)
    {
      emit_move_insn (target, eq0);
      return false;
    }

  /* There is native support for the inverse comparison.  */
  code = reverse_condition_maybe_unordered (code);
  if (code == ORDERED)
    emit_move_insn (target, eq0);
  else
    expand_vec_cmp (eq0, code, op0, op1, eq0, eq0);

  if (can_invert_p)
    {
      emit_move_insn (target, eq0);
      return true;
    }

  /* We use one_cmpl<mode>2 to make Combine PASS to combine mask instructions
     into: vmand.mm/vmnor.mm/vmnand.mm/vmxnor.mm.  */
  emit_insn (gen_rtx_SET (target, gen_rtx_NOT (mask_mode, eq0)));
  return false;
}
/* Modulo all SEL indices to ensure they are all in range if [0, MAX_SEL].
   MAX_SEL is nunits - 1 if rtx_equal_p (op0, op1).  Otherwise, it is
   2 * nunits - 1.  */
static rtx
modulo_sel_indices (rtx op0, rtx op1, rtx sel)
{
  rtx sel_mod;
  machine_mode sel_mode = GET_MODE (sel);
  poly_uint64 nunits = GET_MODE_NUNITS (sel_mode);
  poly_uint64 max_sel = rtx_equal_p (op0, op1) ? nunits - 1 : 2 * nunits - 1;
  /* If SEL is variable-length CONST_VECTOR, we don't need to modulo it.
     Or if SEL is constant-length within [0, MAX_SEL], no need to modulo the
     indices.  */
  if (CONST_VECTOR_P (sel)
      && (!nunits.is_constant () || const_vec_all_in_range_p (sel, 0, max_sel)))
    sel_mod = sel;
  else
    {
      rtx mod = gen_const_vector_dup (sel_mode, max_sel);
      sel_mod
        = expand_simple_binop (sel_mode, AND, sel, mod, NULL, 0, OPTAB_DIRECT);
    }
  return sel_mod;
}
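/* Worked example: with op0 != op1 and a 4-element selector {1, 9, 6, 14},
   max_sel = 2 * 4 - 1 = 7, so AND-ing with the {7, 7, 7, 7} dup yields
   {1, 1, 6, 6}, i.e. the wrap-around semantics vec_perm expects.  */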
/* Implement vec_perm<mode>.  */
void
expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
{
  machine_mode data_mode = GET_MODE (target);
  machine_mode sel_mode = GET_MODE (sel);
  poly_uint64 nunits = GET_MODE_NUNITS (sel_mode);

  /* Check if the sel only references the first values vector.  If each select
     index is in range of [0, nunits - 1].  A single vrgather instructions is
     enough.  Since we will use vrgatherei16.vv for variable-length vector,
     it is never out of range and we don't need to modulo the index.  */
  if (nunits.is_constant () && const_vec_all_in_range_p (sel, 0, nunits - 1))
    {
      emit_vlmax_gather_insn (target, op0, sel);
      return;
    }

  /* Check if all the indices are same.  */
  rtx elt;
  if (const_vec_duplicate_p (sel, &elt))
    {
      poly_uint64 value = rtx_to_poly_int64 (elt);
      rtx op = op0;
      if (maybe_gt (value, nunits - 1))
        {
          sel = gen_const_vector_dup (sel_mode, value - nunits);
          op = op1;
        }
      emit_vlmax_gather_insn (target, op, sel);
      return;
    }

  /* Note: vec_perm indices are supposed to wrap when they go beyond the
     size of the two value vectors, i.e. the upper bits of the indices
     are effectively ignored.  RVV vrgather instead produces 0 for any
     out-of-range indices, so we need to modulo all the vec_perm indices
     to ensure they are all in range of [0, nunits - 1] when op0 == op1
     or all in range of [0, 2 * nunits - 1] when op0 != op1.  */
  rtx sel_mod = modulo_sel_indices (op0, op1, sel);

  /* Check if the two values vectors are the same.  */
  if (rtx_equal_p (op0, op1))
    {
      emit_vlmax_gather_insn (target, op0, sel_mod);
      return;
    }

  /* This following sequence is handling the case that:
     __builtin_shufflevector (vec1, vec2, index...), the index can be any
     value in range of [0, 2 * nunits - 1].  */
  machine_mode mask_mode;
  mask_mode = get_mask_mode (data_mode);
  rtx mask = gen_reg_rtx (mask_mode);
  rtx max_sel = gen_const_vector_dup (sel_mode, nunits);

  /* Step 1: generate a mask that should select everything >= nunits into the
     mask.  */
  expand_vec_cmp (mask, GEU, sel_mod, max_sel);

  /* Step2: gather every op0 values indexed by sel into target,
     we don't need to care about the result of the element
     whose index >= nunits.  */
  emit_vlmax_gather_insn (target, op0, sel_mod);

  /* Step3: shift the range from (nunits, max_of_mode] to
     [0, max_of_mode - nunits].  */
  rtx tmp = gen_reg_rtx (sel_mode);
  rtx ops[] = {tmp, sel_mod, max_sel};
  emit_vlmax_insn (code_for_pred (MINUS, sel_mode), BINARY_OP, ops);

  /* Step4: gather those into the previously masked-out elements
     of target.  */
  emit_vlmax_masked_gather_mu_insn (target, op1, tmp, mask);
}
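/* Worked example of the two-source sequence above, for sel = {0, 5, 2, 7}
   with nunits = 4 (illustrative):
     mask   = sel >= 4            -> {0, 1, 0, 1}
     target = vrgather (op0, sel)    (the masked lanes are don't-care)
     sel'   = sel - 4             -> {., 1, ., 3}
     target = vrgather (op1, sel', mask, mu)   merges b1 and b3 in.  */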
/* Implement TARGET_VECTORIZE_VEC_PERM_CONST for RVV.  */

/* vec_perm support.  */

struct expand_vec_perm_d
{
  rtx target, op0, op1;
  vec_perm_indices perm;
  machine_mode vmode;
  machine_mode op_mode;
  bool one_vector_p;
  bool testing_p;
};
/* Return the appropriate index mode for gather instructions.  */
opt_machine_mode
get_gather_index_mode (struct expand_vec_perm_d *d)
{
  machine_mode sel_mode = related_int_vector_mode (d->vmode).require ();
  poly_uint64 nunits = GET_MODE_NUNITS (d->vmode);

  if (GET_MODE_INNER (d->vmode) == QImode)
    {
      if (nunits.is_constant ())
        {
          /* If indice is LMUL8 CONST_VECTOR and any element value
             exceed the range of 0 ~ 255, Forbid such permutation
             since we need vector HI mode to hold such indice and
             we don't have it.  */
          if (!d->perm.all_in_range_p (0, 255)
              && !get_vector_mode (HImode, nunits).exists (&sel_mode))
            return opt_machine_mode ();
        }
      else
        {
          /* Permuting two SEW8 variable-length vectors need vrgatherei16.vv.
             Otherwise, it could overflow the index range.  */
          if (!get_vector_mode (HImode, nunits).exists (&sel_mode))
            return opt_machine_mode ();
        }
    }
  else if (riscv_get_v_regno_alignment (sel_mode) > 1
           && GET_MODE_INNER (sel_mode) != HImode)
    sel_mode = get_vector_mode (HImode, nunits).require ();
  return sel_mode;
}
/* Recognize the patterns that we can use merge operation to shuffle the
   vectors.  The value of Each element (index i) in selector can only be
   either i or nunits + i.  We will check the pattern is actually monotonic.

   E.g.
   v = VEC_PERM_EXPR (v0, v1, selector),
   selector = { 0, nunits + 1, 2, nunits + 3, 4, nunits + 5, ... }

   We can transform such pattern into:

   v = vcond_mask (v0, v1, mask),
   mask = { 0, 1, 0, 1, 0, 1, ... }.  */
static bool
shuffle_merge_patterns (struct expand_vec_perm_d *d)
{
  machine_mode vmode = d->vmode;
  machine_mode sel_mode = related_int_vector_mode (vmode).require ();
  int n_patterns = d->perm.encoding ().npatterns ();
  poly_int64 vec_len = d->perm.length ();

  for (int i = 0; i < n_patterns; ++i)
    if (!known_eq (d->perm[i], i) && !known_eq (d->perm[i], vec_len + i))
      return false;

  /* Check the pattern is monotonic here, otherwise, return false.  */
  for (int i = n_patterns; i < n_patterns * 2; i++)
    if (!d->perm.series_p (i, n_patterns, i, n_patterns)
        && !d->perm.series_p (i, n_patterns, vec_len + i, n_patterns))
      return false;

  /* We need to use precomputed mask for such situation and such mask
     can only be computed in compile-time known size modes.  */
  bool indices_fit_selector_p
    = GET_MODE_BITSIZE (GET_MODE_INNER (vmode)) > 8 || known_lt (vec_len, 256);
  if (!indices_fit_selector_p && !vec_len.is_constant ())
    return false;

  if (d->testing_p)
    return true;

  machine_mode mask_mode = get_mask_mode (vmode);
  rtx mask = gen_reg_rtx (mask_mode);

  if (indices_fit_selector_p && vec_len.is_constant ())
    {
      /* For a constant vector length we can generate the needed mask at
         compile time and load it as mask at runtime.
         This saves a compare at runtime.  */
      rtx_vector_builder sel (mask_mode, d->perm.encoding ().npatterns (),
                              d->perm.encoding ().nelts_per_pattern ());
      unsigned int encoded_nelts = sel.encoded_nelts ();
      for (unsigned int i = 0; i < encoded_nelts; i++)
        sel.quick_push (gen_int_mode (d->perm[i].to_constant ()
                                      < vec_len.to_constant (),
                                      GET_MODE_INNER (mask_mode)));
      mask = sel.build ();
    }
  else if (indices_fit_selector_p)
    {
      /* For a dynamic vector length < 256 we keep the permutation
         indices in the literal pool, load it at runtime and create the
         mask by selecting either OP0 or OP1 by

           INDICES < NUNITS ? 1 : 0.  */
      rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
      rtx x = gen_int_mode (vec_len, GET_MODE_INNER (sel_mode));
      insn_code icode = code_for_pred_cmp_scalar (sel_mode);
      rtx cmp = gen_rtx_fmt_ee (LTU, mask_mode, sel, x);
      rtx ops[] = {mask, cmp, sel, x};
      emit_vlmax_insn (icode, COMPARE_OP, ops);
    }
  else
    {
      /* For EEW8 and NUNITS may be larger than 255, we can't use vmsltu
         directly to generate the selector mask, instead, we can only use
         a precomputed mask.

         E.g. selector = <0, 257, 2, 259> for EEW8 vector with NUNITS = 256, we
         don't have a QImode scalar register to hold larger than 255.
         We also cannot hold that in a vector QImode register if LMUL = 8, and,
         since there is no larger HI mode vector we cannot create a larger
         selector.

         As the mask is a simple {0, 1, ...} pattern and the length is known we
         can store it in a scalar register and broadcast it to a mask register.
      */
      gcc_assert (vec_len.is_constant ());
      int size = CEIL (GET_MODE_NUNITS (mask_mode).to_constant (), 8);
      machine_mode mode = get_vector_mode (QImode, size).require ();
      rtx tmp = gen_reg_rtx (mode);
      rvv_builder v (mode, 1, size);
      for (int i = 0; i < vec_len.to_constant () / 8; i++)
        {
          unsigned int value = 0;
          for (int j = 0; j < 8; j++)
            {
              int index = i * 8 + j;
              if (known_lt (d->perm[index], 256))
                value |= 1 << j;
            }
          v.quick_push (gen_int_mode (value, QImode));
        }
      emit_move_insn (tmp, v.build ());
      emit_move_insn (mask, gen_lowpart (mask_mode, tmp));
    }

  /* TARGET = MASK ? OP0 : OP1.  */
  /* swap op0 and op1 since the order is opposite to pred_merge.  */
  rtx ops2[] = {d->target, d->op1, d->op0, mask};
  emit_vlmax_insn (code_for_pred_merge (vmode), MERGE_OP, ops2);
  return true;
}
/* Recognize the consecutive index that we can use a single
   vrgather.v[x|i] to shuffle the vectors.

   e.g. short[8] = VEC_PERM_EXPR <a, a, {0,1,0,1,0,1,0,1}>
   Use SEW = 32, index = 1 vrgather.vi to get the result.  */
static bool
shuffle_consecutive_patterns (struct expand_vec_perm_d *d)
{
  machine_mode vmode = d->vmode;
  scalar_mode smode = GET_MODE_INNER (vmode);
  poly_int64 vec_len = d->perm.length ();
  HOST_WIDE_INT elt;

  if (!vec_len.is_constant () || !d->perm[0].is_constant (&elt))
    return false;
  int vlen = vec_len.to_constant ();

  /* Compute the last element index of consecutive pattern from the leading
     consecutive elements.  */
  int last_consecutive_idx = -1;
  int consecutive_num = -1;
  for (int i = 1; i < vlen; i++)
    {
      if (maybe_ne (d->perm[i], d->perm[i - 1] + 1))
        break;
      last_consecutive_idx = i;
      consecutive_num = last_consecutive_idx + 1;
    }

  int new_vlen = vlen / consecutive_num;
  if (last_consecutive_idx < 0 || consecutive_num == vlen
      || !pow2p_hwi (consecutive_num) || !pow2p_hwi (new_vlen))
    return false;

  /* VEC_PERM <..., (index, index + 1, ... index + consecutive_num - 1)>.
     All elements of index, index + 1, ... index + consecutive_num - 1 should
     locate at the same vector.  */
  if (maybe_ge (d->perm[0], vec_len)
      != maybe_ge (d->perm[last_consecutive_idx], vec_len))
    return false;

  /* If a vector has 8 elements.  We allow optimizations on consecutive
     patterns e.g. <0, 1, 2, 3, 0, 1, 2, 3> or <4, 5, 6, 7, 4, 5, 6, 7>.
     Other patterns like <2, 3, 4, 5, 2, 3, 4, 5> are not feasible patterns
     to be optimized.  */
  if (d->perm[0].to_constant () % consecutive_num != 0)
    return false;

  unsigned int container_bits = consecutive_num * GET_MODE_BITSIZE (smode);
  if (container_bits > 64)
    return false;
  else if (container_bits == 64)
    {
      if (!TARGET_VECTOR_ELEN_64)
        return false;
      else if (FLOAT_MODE_P (smode) && !TARGET_VECTOR_ELEN_FP_64)
        return false;
    }

  /* Check the rest of elements are the same consecutive pattern.  */
  for (int i = consecutive_num; i < vlen; i++)
    if (maybe_ne (d->perm[i], d->perm[i % consecutive_num]))
      return false;

  if (FLOAT_MODE_P (smode))
    smode = float_mode_for_size (container_bits).require ();
  else
    smode = int_mode_for_size (container_bits, 0).require ();
  if (!get_vector_mode (smode, new_vlen).exists (&vmode))
    return false;
  machine_mode sel_mode = related_int_vector_mode (vmode).require ();

  /* Success!  */
  if (d->testing_p)
    return true;

  int index = elt / consecutive_num;
  if (index >= new_vlen)
    index = index - new_vlen;
  rtx sel = gen_const_vector_dup (sel_mode, index);
  rtx op = elt >= vlen ? d->op0 : d->op1;
  emit_vlmax_gather_insn (gen_lowpart (vmode, d->target),
                          gen_lowpart (vmode, op), sel);
  return true;
}
/* Recognize the patterns that we can use compress operation to shuffle the
   vectors.  The perm selector of compress pattern is divided into 2 part:
   The first part is the random index number < NUNITS.
   The second part is consecutive last N index number >= NUNITS.

   E.g.
   v = VEC_PERM_EXPR (v0, v1, selector),
   selector = { 0, 2, 6, 7 }

   We can transform such pattern into:

   op1 = vcompress (op0, mask)
   mask = { 1, 0, 1, 0 }
   v = op1.  */
static bool
shuffle_compress_patterns (struct expand_vec_perm_d *d)
{
  machine_mode vmode = d->vmode;
  poly_int64 vec_len = d->perm.length ();

  if (!vec_len.is_constant ())
    return false;

  int vlen = vec_len.to_constant ();

  /* It's not worthwhile the compress pattern has elements < 4
     and we can't modulo indices for compress pattern.  */
  if (known_ge (d->perm[vlen - 1], vlen * 2) || vlen < 4)
    return false;

  /* Compress pattern doesn't work for one vector.  */
  if (d->one_vector_p)
    return false;

  /* Compress point is the point that all elements value with index i >=
     compress point of the selector are all consecutive series increasing and
     each selector value >= NUNITS.  In this case, we could compress all
     elements of i < compress point into the op1.  */
  int compress_point = -1;
  for (int i = 0; i < vlen; i++)
    {
      if (compress_point < 0 && known_ge (d->perm[i], vec_len))
        {
          compress_point = i;
          break;
        }
    }

  /* We don't apply compress approach if we can't find the compress point.  */
  if (compress_point < 0)
    return false;

  /* We can only apply compress approach when all index values from 0 to
     compress point are increasing.  */
  for (int i = 1; i < compress_point; i++)
    if (maybe_le (d->perm[i], d->perm[i - 1]))
      return false;

  /* It must be series increasing from compress point.  */
  for (int i = 1 + compress_point; i < vlen; i++)
    if (maybe_ne (d->perm[i], d->perm[i - 1] + 1))
      return false;

  /* Success!  */
  if (d->testing_p)
    return true;

  /* Check whether we need to slideup op1 to apply compress approach.

     E.g. For index = { 0, 2, 6, 7}, since d->perm[i - 1] = 7 which
          is 2 * NUNITS - 1, so we don't need to slide up.

          For index = { 0, 2, 5, 6}, we need to slide op1 up before
          we apply compress approach.  */
  bool need_slideup_p = maybe_ne (d->perm[vlen - 1], 2 * vec_len - 1)
                        && !const_vec_duplicate_p (d->op1);

  /* If we leave it directly be handled by general gather,
     the code sequence will be:
       VECTOR LOAD  selector
       GEU          mask, selector, NUNITS
       GATHER       dest, op0, selector
       SUB          selector, selector, NUNITS
       GATHER       dest, op1, selector, mask
     Each ALU operation is considered as COST = 1 and VECTOR LOAD is considered
     as COST = 4.  So, we consider the general gather handling COST = 9.
     TODO: This cost is not accurate, we can adjust it by tune info.  */
  int general_cost = 9;

  /* If we can use compress approach, the code sequence will be:
       MASK LOAD    mask
       COMPRESS     op1, op0, mask
     If it needs slide up, it will be:
       MASK LOAD    mask
       SLIDEUP      op1
       COMPRESS     op1, op0, mask
     By default, mask load COST = 2.
     TODO: This cost is not accurate, we can adjust it by tune info.  */
  int compress_cost = 4;

  if (general_cost <= compress_cost)
    return false;

  /* Build a mask that is true when selector element is true.  */
  machine_mode mask_mode = get_mask_mode (vmode);
  rvv_builder builder (mask_mode, vlen, 1);
  for (int i = 0; i < vlen; i++)
    {
      bool is_compress_index = false;
      for (int j = 0; j < compress_point; j++)
        {
          if (known_eq (d->perm[j], i))
            {
              is_compress_index = true;
              break;
            }
        }
      if (is_compress_index)
        builder.quick_push (CONST1_RTX (BImode));
      else
        builder.quick_push (CONST0_RTX (BImode));
    }
  rtx mask = force_reg (mask_mode, builder.build ());

  rtx merge = d->op1;
  if (need_slideup_p)
    {
      int slideup_cnt = vlen - (d->perm[vlen - 1].to_constant () % vlen) - 1;
      merge = gen_reg_rtx (vmode);
      rtx ops[] = {merge, d->op1, gen_int_mode (slideup_cnt, Pmode)};
      insn_code icode = code_for_pred_slide (UNSPEC_VSLIDEUP, vmode);
      emit_vlmax_insn (icode, BINARY_OP, ops);
    }

  insn_code icode = code_for_pred_compress (vmode);
  rtx ops[] = {d->target, merge, d->op0, mask};
  emit_nonvlmax_insn (icode, COMPRESS_OP_MERGE, ops,
                      gen_int_mode (vlen, Pmode));
  return true;
}
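/* Worked example for selector = {0, 2, 6, 7} with NUNITS = 4 (illustrative):
   compress_point = 2 and the mask built above is {1, 0, 1, 0}, so elements
   0 and 2 of op0 are kept; d->perm[3] = 7 = 2 * NUNITS - 1, so no slideup
   of op1 is needed and a single vcompress with op1 as the merge operand
   produces {a0, a2, b2, b3}.  */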
/* Recognize patterns like [4 5 6 7 12 13 14 15] where either the lower
   or the higher parts of both vectors are combined into one.  */
static bool
shuffle_slide_patterns (struct expand_vec_perm_d *d)
{
  machine_mode vmode = d->vmode;
  poly_int64 vec_len = d->perm.length ();

  if (!vec_len.is_constant ())
    return false;

  int vlen = vec_len.to_constant ();

  if (d->one_vector_p)
    return false;

  /* For a slideup OP0 can stay, for a slidedown OP1 can.
     The former requires that the first element of the permutation
     is the first element of OP0, the latter that the last permutation
     element is the last element of OP1.  */
  bool slideup = false;
  bool slidedown = false;

  /* For a slideup the permutation must start at OP0's first element.  */
  if (known_eq (d->perm[0], 0))
    slideup = true;

  /* For a slidedown the permutation must end at OP1's last element.  */
  if (known_eq (d->perm[vlen - 1], 2 * vlen - 1))
    slidedown = true;

  if (slideup && slidedown)
    return false;

  if (!slideup && !slidedown)
    return false;

  /* Check for a monotonic sequence with one pivot.  */
  int pivot = -1;
  for (int i = 0; i < vlen; i++)
    {
      if (pivot == -1 && known_ge (d->perm[i], vec_len))
        pivot = i;
      if (i > 0 && i != pivot
          && maybe_ne (d->perm[i], d->perm[i - 1] + 1))
        return false;
    }

  if (pivot == -1)
    return false;

  /* For a slideup OP1's part (to be slid up) must be a low part,
     i.e. starting with its first element.  */
  if (slideup && maybe_ne (d->perm[pivot], vlen))
    return false;

  /* For a slidedown OP0's part (to be slid down) must be a high part,
     i.e. ending with its last element.  */
  if (slidedown && maybe_ne (d->perm[pivot - 1], vlen - 1))
    return false;

  /* Success!  */
  if (d->testing_p)
    return true;

  /* PIVOT is the start of the lower/higher part of OP1 or OP2.
     For a slideup it indicates how many elements of OP1 to
     skip/slide over.  For a slidedown it indicates how long
     OP1's high part is, while VLEN - PIVOT is the amount to slide.  */
  int slide_cnt = slideup ? pivot : vlen - pivot;
  insn_code icode;
  if (slideup)
    {
      /* No need for a vector length because we slide up until the
         end of OP1 anyway.  */
      rtx ops[] = {d->target, d->op0, d->op1, gen_int_mode (slide_cnt, Pmode)};
      icode = code_for_pred_slide (UNSPEC_VSLIDEUP, vmode);
      emit_vlmax_insn (icode, SLIDEUP_OP_MERGE, ops);
    }
  else
    {
      /* Here we need a length because we slide to the beginning of OP1
         leaving the remaining elements undisturbed.  */
      int len = pivot;
      rtx ops[] = {d->target, d->op1, d->op0,
                   gen_int_mode (slide_cnt, Pmode)};
      icode = code_for_pred_slide (UNSPEC_VSLIDEDOWN, vmode);
      emit_nonvlmax_insn (icode, BINARY_OP_TUMA, ops,
                          gen_int_mode (len, Pmode));
    }

  return true;
}
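/* Worked example for [4 5 6 7 12 13 14 15] with vlen = 8 (illustrative):
   the last element is 15 = 2 * vlen - 1, so this is a slidedown with
   pivot = 4 and slide_cnt = vlen - pivot = 4; op0's high half is slid
   down into the low lanes while op1's high half stays undisturbed,
   giving {a4, a5, a6, a7, b4, b5, b6, b7}.  */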
/* Recognize interleaving patterns like [0 4 1 5].  */
static bool
shuffle_interleave_patterns (struct expand_vec_perm_d *d)
{
  machine_mode vmode = d->vmode;
  machine_mode sel_mode = related_int_vector_mode (vmode).require ();
  poly_int64 vec_len = d->perm.length ();
  int n_patterns = d->perm.encoding ().npatterns ();

  if (!vec_len.is_constant ())
    return false;

  if (n_patterns != 2)
    return false;

  unsigned vlen = vec_len.to_constant ();

  if (vlen < 4 || vlen > 64)
    return false;

  if (d->one_vector_p)
    return false;

  bool low = true;
  if (d->perm.series_p (0, 2, 0, 1)
      && d->perm.series_p (1, 2, vlen, 1))
    low = true;
  else if (d->perm.series_p (0, 2, vlen / 2, 1)
           && d->perm.series_p (1, 2, vlen + vlen / 2, 1))
    low = false;
  else
    return false;

  vec_perm_builder sel (vlen, 2, 1);
  sel.safe_grow (vlen);
  int cnt = 0;
  for (unsigned i = 0; i < vlen; i += 2)
    {
      sel[i] = cnt;
      sel[i + 1] = cnt + vlen / 2;
      cnt++;
    }

  vec_perm_indices indices (sel, 2, vlen);

  if (vlen != indices.length ().to_constant ())
    return false;

  /* Success!  */
  if (d->testing_p)
    return true;

  int slide_cnt = vlen / 2;
  rtx tmp = gen_reg_rtx (vmode);

  if (low)
    {
      /* No need for a vector length because we slide up until the
         end of OP1 anyway.  */
      rtx ops[] = {tmp, d->op0, d->op1, gen_int_mode (slide_cnt, Pmode)};
      insn_code icode = code_for_pred_slide (UNSPEC_VSLIDEUP, vmode);
      emit_vlmax_insn (icode, SLIDEUP_OP_MERGE, ops);
    }
  else
    {
      rtx ops[] = {tmp, d->op1, d->op0, gen_int_mode (slide_cnt, Pmode)};
      insn_code icode = code_for_pred_slide (UNSPEC_VSLIDEDOWN, vmode);
      emit_nonvlmax_insn (icode, BINARY_OP_TUMA, ops,
                          gen_int_mode (slide_cnt, Pmode));
    }

  rtx sel_rtx = vec_perm_indices_to_rtx (sel_mode, indices);
  emit_vlmax_gather_insn (gen_lowpart (vmode, d->target), tmp, sel_rtx);

  return true;
}
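/* Worked example for [0 4 1 5] with vlen = 4 (low halves interleaved,
   illustrative): slide op1 up by vlen / 2 = 2 into tmp = {a0, a1, b0, b1},
   then vrgather tmp with the index vector {0, 2, 1, 3} built above to get
   {a0, b0, a1, b1}.  */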
/* Recognize even/odd patterns like [0 2 4 6].  We use two compress
   and one slideup.  */
static bool
shuffle_even_odd_patterns (struct expand_vec_perm_d *d)
{
  machine_mode vmode = d->vmode;
  poly_int64 vec_len = d->perm.length ();
  int n_patterns = d->perm.encoding ().npatterns ();

  if (n_patterns != 1)
    return false;

  if (!vec_len.is_constant ())
    return false;

  int vlen = vec_len.to_constant ();
  if (vlen < 4 || vlen > 64)
    return false;

  if (d->one_vector_p)
    return false;

  bool even = true;
  if (!d->perm.series_p (0, 1, 0, 2))
    {
      even = false;
      if (!d->perm.series_p (0, 1, 1, 2))
        return false;
    }

  /* Success!  */
  if (d->testing_p)
    return true;

  machine_mode mask_mode = get_mask_mode (vmode);
  rvv_builder builder (mask_mode, vlen, 1);
  int bit = even ? 0 : 1;
  for (int i = 0; i < vlen; i++)
    {
      bit ^= 1;
      if (bit)
        builder.quick_push (CONST1_RTX (BImode));
      else
        builder.quick_push (CONST0_RTX (BImode));
    }
  rtx mask = force_reg (mask_mode, builder.build ());

  insn_code icode = code_for_pred_compress (vmode);
  rtx ops1[] = {d->target, d->op0, mask};
  emit_vlmax_insn (icode, COMPRESS_OP, ops1);

  rtx tmp2 = gen_reg_rtx (vmode);
  rtx ops2[] = {tmp2, d->op1, mask};
  emit_vlmax_insn (icode, COMPRESS_OP, ops2);

  rtx ops[] = {d->target, d->target, tmp2, gen_int_mode (vlen / 2, Pmode)};
  icode = code_for_pred_slide (UNSPEC_VSLIDEUP, vmode);
  emit_vlmax_insn (icode, SLIDEUP_OP_MERGE, ops);

  return true;
}
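/* Worked example for [0 2 4 6] with two 4-element inputs (illustrative):
   compressing op0 with the even mask gives {a0, a2, ...}, compressing op1
   with the same mask gives {b0, b2, ...}, and sliding the second result up
   by vlen / 2 = 2 yields {a0, a2, b0, b2}.  */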
/* Recognize decompress patterns:

   1. VEC_PERM_EXPR op0 and op1
      with isel = { 0, nunits, 1, nunits + 1, ... }.
      Decompress op0 and op1 vector with the mask = { 0, 1, 0, 1, ... }.

   2. VEC_PERM_EXPR op0 and op1
      with isel = { 1/2 nunits, 3/2 nunits, 1/2 nunits+1, 3/2 nunits+1,... }.
      Slide down op0 and op1 with OFFSET = 1/2 nunits.
      Decompress op0 and op1 vector with the mask = { 0, 1, 0, 1, ... }.  */
static bool
shuffle_decompress_patterns (struct expand_vec_perm_d *d)
{
  poly_uint64 nelt = d->perm.length ();
  machine_mode mask_mode = get_mask_mode (d->vmode);

  /* For constant size indices, we dont't need to handle it here.
     Just leave it to vec_perm<mode>.  */
  if (d->perm.length ().is_constant ())
    return false;

  poly_uint64 first = d->perm[0];
  if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
      || !d->perm.series_p (0, 2, first, 1)
      || !d->perm.series_p (1, 2, first + nelt, 1))
    return false;

  /* Permuting two SEW8 variable-length vectors need vrgatherei16.vv.
     Otherwise, it could overflow the index range.  */
  machine_mode sel_mode = related_int_vector_mode (d->vmode).require ();
  if (GET_MODE_INNER (d->vmode) == QImode
      && !get_vector_mode (HImode, nelt).exists (&sel_mode))
    return false;

  /* Success!  */
  if (d->testing_p)
    return true;

  rtx op0, op1;
  if (known_eq (first, 0U))
    {
      op0 = d->op0;
      op1 = d->op1;
    }
  else
    {
      op0 = gen_reg_rtx (d->vmode);
      op1 = gen_reg_rtx (d->vmode);
      insn_code icode = code_for_pred_slide (UNSPEC_VSLIDEDOWN, d->vmode);
      rtx ops0[] = {op0, d->op0, gen_int_mode (first, Pmode)};
      rtx ops1[] = {op1, d->op1, gen_int_mode (first, Pmode)};
      emit_vlmax_insn (icode, BINARY_OP, ops0);
      emit_vlmax_insn (icode, BINARY_OP, ops1);
    }

  /* Generate { 0, 1, .... } mask.  */
  rtx vid = gen_reg_rtx (sel_mode);
  rtx vid_repeat = gen_reg_rtx (sel_mode);
  expand_vec_series (vid, const0_rtx, const1_rtx);
  rtx and_ops[] = {vid_repeat, vid, const1_rtx};
  emit_vlmax_insn (code_for_pred_scalar (AND, sel_mode), BINARY_OP, and_ops);
  rtx const_vec = gen_const_vector_dup (sel_mode, 1);
  rtx mask = gen_reg_rtx (mask_mode);
  expand_vec_cmp (mask, EQ, vid_repeat, const_vec);
  emit_vlmax_decompress_insn (d->target, op0, op1, mask);

  return true;
}
static bool
shuffle_bswap_pattern (struct expand_vec_perm_d *d)
{
  HOST_WIDE_INT diff;
  unsigned i, size, step;

  if (!d->one_vector_p || !d->perm[0].is_constant (&diff) || !diff)
    return false;

  step = diff + 1;
  size = step * GET_MODE_UNIT_BITSIZE (d->vmode);

  switch (size)
    {
    case 16:
      break;
    case 32:
    case 64:
      /* We will have VEC_PERM_EXPR after rtl expand when invoking
         __builtin_bswap.  It will generate about 9 instructions in
         loop as below, no matter it is bswap16, bswap32 or bswap64.
           ...
         5 vrgatherei16.vv v1,v4,v2
           ...

         But for bswap16 we may have a even simple code gen, which
         has only 7 instructions in loop as below.
           ...

         Unfortunately, the instructions in loop will grow to 13 and 24
         for bswap32 and bswap64.  Thus, we will leverage vrgather (9 insn)
         for both the bswap64 and bswap32, but take shift and or (7 insn)
         for bswap16.  */
      return false;
    default:
      gcc_unreachable ();
    }

  for (i = 0; i < step; i++)
    if (!d->perm.series_p (i, step, diff - i, step))
      return false;

  /* Disable when nunits < 4 since the later generic approach
     is more profitable on BSWAP.  */
  if (!known_gt (GET_MODE_NUNITS (d->vmode), 2))
    return false;

  if (d->testing_p)
    return true;

  machine_mode vhi_mode;
  poly_uint64 vhi_nunits = exact_div (GET_MODE_NUNITS (d->vmode), 2);

  if (!get_vector_mode (HImode, vhi_nunits).exists (&vhi_mode))
    return false;

  /* Step-1: Move op0 to src with VHI mode.  */
  rtx src = gen_reg_rtx (vhi_mode);
  emit_move_insn (src, gen_lowpart (vhi_mode, d->op0));

  /* Step-2: Shift right 8 bits to dest.  */
  rtx dest = expand_binop (vhi_mode, lshr_optab, src, gen_int_mode (8, Pmode),
                           NULL_RTX, 0, OPTAB_DIRECT);

  /* Step-3: Shift left 8 bits to src.  */
  src = expand_binop (vhi_mode, ashl_optab, src, gen_int_mode (8, Pmode),
                      NULL_RTX, 0, OPTAB_DIRECT);

  /* Step-4: Logic Or dest and src to dest.  */
  dest = expand_binop (vhi_mode, ior_optab, dest, src,
                       NULL_RTX, 0, OPTAB_DIRECT);

  /* Step-5: Move src to target with VQI mode.  */
  emit_move_insn (d->target, gen_lowpart (d->vmode, dest));

  return true;
}
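/* Sketch of the byte swap performed above: each 16-bit lane x of the
   source becomes (x >> 8) | (x << 8), so for example a vector of
   {0x1234, 0xabcd} turns into {0x3412, 0xcdab}.  */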
/* Recognize patterns like [3 4 5 6] where we combine the last element
   of the first vector and the first n - 1 elements of the second vector.
   This can be implemented by slides or by extracting and re-inserting
   (slide1up) the first vector's last element.  */
static bool
shuffle_off_by_one_patterns (struct expand_vec_perm_d *d)
{
  poly_int64 nunits = GET_MODE_NUNITS (d->vmode);

  /* Recognize { nunits - 1, nunits, nunits + 1, ... }.  */
  if (!d->perm.series_p (0, 2, nunits - 1, 2)
      || !d->perm.series_p (1, 2, nunits, 2))
    return false;

  /* Disable when nunits < 4 since the later generic approach
     is more profitable on indice = { nunits - 1, nunits }.  */
  if (!known_gt (nunits, 2))
    return false;

  /* Success!  */
  if (d->testing_p)
    return true;

  int scalar_cost = riscv_register_move_cost (d->vmode, V_REGS, GR_REGS)
                    + riscv_register_move_cost (d->vmode, GR_REGS, V_REGS) + 2;
  int slide_cost = 2;

  if (slide_cost < scalar_cost)
    {
      /* This variant should always be preferable because we just need two
         slides.  The extract-variant also requires two slides but additionally
         pays the latency for register-file crossing.  */
      rtx tmp = gen_reg_rtx (d->vmode);
      rtx ops[] = {tmp, d->op1, gen_int_mode (1, Pmode)};
      insn_code icode = code_for_pred_slide (UNSPEC_VSLIDEUP, d->vmode);
      emit_vlmax_insn (icode, BINARY_OP, ops);

      rtx ops2[] = {d->target, tmp, d->op0, gen_int_mode (nunits - 1, Pmode)};
      icode = code_for_pred_slide (UNSPEC_VSLIDEDOWN, d->vmode);
      emit_nonvlmax_insn (icode, BINARY_OP_TUMA, ops2, gen_int_mode (1, Pmode));
    }
  else
    {
      /* Extract the last element of the first vector.  */
      scalar_mode smode = GET_MODE_INNER (d->vmode);
      rtx tmp = gen_reg_rtx (smode);
      emit_vec_extract (tmp, d->op0, gen_int_mode (nunits - 1, Pmode));

      /* Insert the scalar into element 0.  */
      int unspec
        = FLOAT_MODE_P (d->vmode) ? UNSPEC_VFSLIDE1UP : UNSPEC_VSLIDE1UP;
      insn_code icode = code_for_pred_slide (unspec, d->vmode);
      rtx ops[] = {d->target, d->op1, tmp};
      emit_vlmax_insn (icode, BINARY_OP, ops);
    }

  return true;
}
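/* Worked example for [3 4 5 6] with nunits = 4, using the slide variant
   above (illustrative):
     tmp    = vslideup (op1, 1)                      -> {., b0, b1, b2}
     target = vslidedown (op0, 3), VL = 1, tail from tmp
            -> {a3, b0, b1, b2}.  */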
/* This looks for a series pattern in the provided vector permute structure D.
   If successful it emits a series insn as well as a gather to implement it.
   Return true if successful, false otherwise.  */
static bool
shuffle_series_patterns (struct expand_vec_perm_d *d)
{
  if (!d->one_vector_p || d->perm.encoding ().npatterns () != 1)
    return false;

  poly_int64 el1 = d->perm[0];
  poly_int64 el2 = d->perm[1];
  poly_int64 el3 = d->perm[2];

  poly_int64 step1 = el2 - el1;
  poly_int64 step2 = el3 - el2;

  bool need_insert = false;
  bool have_series = false;

  /* Check for a full series.  */
  if (known_ne (step1, 0) && d->perm.series_p (0, 1, el1, step1))
    have_series = true;

  /* Check for a series starting at the second element.  */
  else if (known_ne (step2, 0) && d->perm.series_p (1, 1, el2, step2))
    {
      have_series = true;
      need_insert = true;
    }

  if (!have_series)
    return false;

  /* Disable shuffle if we can't find an appropriate integer index mode for
     gather.  */
  machine_mode sel_mode;
  if (!get_gather_index_mode (d).exists (&sel_mode))
    return false;

  /* Success!  */
  if (d->testing_p)
    return true;

  /* Create the series.  */
  machine_mode eltmode = Pmode;
  rtx series = gen_reg_rtx (sel_mode);
  expand_vec_series (series, gen_int_mode (need_insert ? el2 : el1, eltmode),
                     gen_int_mode (need_insert ? step2 : step1, eltmode));

  /* Insert the remaining element if necessary.  */
  if (need_insert)
    {
      insn_code icode = code_for_pred_slide (UNSPEC_VSLIDE1UP, sel_mode);
      rtx ops[]
        = {series, series, gen_int_mode (el1, GET_MODE_INNER (sel_mode))};
      emit_vlmax_insn (icode, BINARY_OP, ops);
    }

  emit_vlmax_gather_insn (d->target, d->op0, series);

  return true;
}
/* Recognize the pattern that can be shuffled by generic approach.  */
static bool
shuffle_generic_patterns (struct expand_vec_perm_d *d)
{
  machine_mode sel_mode;

  /* We don't enable SLP for non-power of 2 NPATTERNS.  */
  if (!pow2p_hwi (d->perm.encoding().npatterns ()))
    return false;

  /* Disable shuffle if we can't find an appropriate integer index mode for
     gather.  */
  if (!get_gather_index_mode (d).exists (&sel_mode))
    return false;

  rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
  poly_uint64 nunits = GET_MODE_NUNITS (sel_mode);
  rtx elt;

  bool is_simple = d->one_vector_p
                   || const_vec_duplicate_p (sel, &elt)
                   || (nunits.is_constant ()
                       && const_vec_all_in_range_p (sel, 0, nunits - 1));

  if (!is_simple && !riscv_two_source_permutes)
    return false;

  /* Success!  */
  if (d->testing_p)
    return true;

  /* Some FIXED-VLMAX/VLS vector permutation situations call targethook
     instead of expand vec_perm<mode>, we handle it directly.  */
  expand_vec_perm (d->target, d->op0, d->op1, sel);

  return true;
}
/* This function recognizes and supports different permutation patterns
   and enable VLA SLP auto-vectorization.  */
static bool
expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
{
  gcc_assert (d->op_mode != E_VOIDmode);

  /* The pattern matching functions above are written to look for a small
     number to begin the sequence (0, 1, N/2).  If we begin with an index
     from the second operand, we can swap the operands.  */
  poly_int64 nelt = d->perm.length ();
  if (known_ge (d->perm[0], nelt))
    {
      d->perm.rotate_inputs (1);
      std::swap (d->op0, d->op1);
    }

  if (known_gt (nelt, 1))
    {
      if (d->vmode == d->op_mode)
        {
          if (shuffle_merge_patterns (d))
            return true;
          if (shuffle_consecutive_patterns (d))
            return true;
          if (shuffle_slide_patterns (d))
            return true;
          if (shuffle_interleave_patterns (d))
            return true;
          if (shuffle_even_odd_patterns (d))
            return true;
          if (shuffle_compress_patterns (d))
            return true;
          if (shuffle_decompress_patterns (d))
            return true;
          if (shuffle_bswap_pattern (d))
            return true;
          if (shuffle_off_by_one_patterns (d))
            return true;
          if (shuffle_series_patterns (d))
            return true;
          if (shuffle_generic_patterns (d))
            return true;
        }
      else
        return false;
    }

  return false;
}
4026 expand_vec_perm_const (machine_mode vmode
, machine_mode op_mode
, rtx target
,
4027 rtx op0
, rtx op1
, const vec_perm_indices
&sel
)
4029 /* RVV doesn't have Mask type pack/unpack instructions and we don't use
4030 mask to do the iteration loop control. Just disable it directly. */
4031 if (GET_MODE_CLASS (vmode
) == MODE_VECTOR_BOOL
)
4034 struct expand_vec_perm_d d
;
4036 /* Check whether the mask can be applied to a single vector. */
4037 if (sel
.ninputs () == 1 || (op0
&& rtx_equal_p (op0
, op1
)))
4038 d
.one_vector_p
= true;
4039 else if (sel
.all_from_input_p (0))
4041 d
.one_vector_p
= true;
4044 else if (sel
.all_from_input_p (1))
4046 d
.one_vector_p
= true;
4050 d
.one_vector_p
= false;
4052 d
.perm
.new_vector (sel
.encoding (), d
.one_vector_p
? 1 : 2,
4053 sel
.nelts_per_input ());
4055 d
.op_mode
= op_mode
;
4062 d
.testing_p
= !target
;
4065 return expand_vec_perm_const_1 (&d
);
4067 rtx_insn
*last
= get_last_insn ();
4068 bool ret
= expand_vec_perm_const_1 (&d
);
4069 gcc_assert (last
== get_last_insn ());
/* Generate no side effects vsetvl to get the vector length.  */
void
expand_select_vl (rtx *ops)
{
  poly_int64 nunits = rtx_to_poly_int64 (ops[2]);
  if (CONST_INT_P (ops[1]) && known_le (INTVAL (ops[1]), nunits))
    {
      /* If length is known <= VF, we just use the length directly instead
         of using vsetvl.

         E.g. _255 = .SELECT_VL (3, POLY_INT_CST [4, 4]);
         We move 3 into _255 instead of using explicit vsetvl.  */
      emit_move_insn (ops[0], ops[1]);
      return;
    }

  /* We arbitrary picked QImode as inner scalar mode to get vector mode.
     since vsetvl only demand ratio.  We let VSETVL PASS to optimize it.  */
  scalar_int_mode mode = QImode;
  machine_mode rvv_mode = get_vector_mode (mode, nunits).require ();
  emit_insn (gen_no_side_effects_vsetvl_rtx (rvv_mode, ops[0], ops[1]));
}
/* Return RVV_VUNDEF if the ELSE value is scratch rtx.  */
static rtx
get_else_operand (rtx op)
{
  return GET_CODE (op) == SCRATCH ? RVV_VUNDEF (GET_MODE (op)) : op;
}
4103 /* Expand MASK_LEN_{LOAD,STORE}. */
4105 expand_load_store (rtx
*ops
, bool is_load
)
4108 rtx mask
= ops
[idx
++];
4109 /* A masked load has a merge/else operand. */
4111 get_else_operand (ops
[idx
++]);
4113 machine_mode mode
= GET_MODE (ops
[0]);
4115 if (is_vlmax_len_p (mode
, len
))
4117 /* If the length operand is equal to VF, it is VLMAX load/store. */
4120 rtx m_ops
[] = {ops
[0], mask
, ops
[1]};
4121 emit_vlmax_insn (code_for_pred_mov (mode
), UNARY_OP_TAMA
, m_ops
);
4125 len
= gen_reg_rtx (Pmode
);
4126 emit_vlmax_vsetvl (mode
, len
);
4127 emit_insn (gen_pred_store (mode
, ops
[0], mask
, ops
[1], len
,
4128 get_avl_type_rtx (VLMAX
)));
4133 if (!satisfies_constraint_K (len
))
4134 len
= force_reg (Pmode
, len
);
4137 rtx m_ops
[] = {ops
[0], mask
, ops
[1]};
4138 emit_nonvlmax_insn (code_for_pred_mov (mode
), UNARY_OP_TAMA
, m_ops
,
4142 emit_insn (gen_pred_store (mode
, ops
[0], mask
, ops
[1], len
,
4143 get_avl_type_rtx (NONVLMAX
)));
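/* A scalar model of the MASK_LEN_LOAD semantics expanded above (an
   illustrative sketch only; "unspecified" reflects the TAMA policy used
   for masked-off and tail elements):

     for (int i = 0; i < vf; ++i)
       dest[i] = (i < len && mask[i]) ? mem[i] : <unspecified>;

   The store direction conversely writes back only the active elements
   below LEN and leaves the rest of memory untouched.  */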
4147 /* Expand MASK_LEN_STRIDED_LOAD. */
4149 expand_strided_load (machine_mode mode
, rtx
*ops
)
4153 rtx stride
= ops
[2];
4156 get_else_operand (ops
[idx
++]);
4160 insn_code icode
= code_for_pred_strided_load (mode
);
4161 rtx emit_ops
[] = {v_reg
, mask
, gen_rtx_MEM (mode
, base
), stride
};
4163 if (poly_int_rtx_p (len
, &len_val
)
4164 && known_eq (len_val
, GET_MODE_NUNITS (mode
)))
4165 emit_vlmax_insn (icode
, BINARY_OP_TAMA
, emit_ops
);
4168 len
= satisfies_constraint_K (len
) ? len
: force_reg (Pmode
, len
);
4169 emit_nonvlmax_insn (icode
, BINARY_OP_TAMA
, emit_ops
, len
);
4173 /* Expand MASK_LEN_STRIDED_STORE. */
4175 expand_strided_store (machine_mode mode
, rtx
*ops
)
4179 rtx stride
= ops
[1];
4185 if (poly_int_rtx_p (len
, &len_val
)
4186 && known_eq (len_val
, GET_MODE_NUNITS (mode
)))
4188 len
= gen_reg_rtx (Pmode
);
4189 emit_vlmax_vsetvl (mode
, len
);
4190 vl_type
= get_avl_type_rtx (VLMAX
);
4194 len
= satisfies_constraint_K (len
) ? len
: force_reg (Pmode
, len
);
4195 vl_type
= get_avl_type_rtx (NONVLMAX
);
4198 emit_insn (gen_pred_strided_store (mode
, gen_rtx_MEM (mode
, base
),
4199 mask
, stride
, v_reg
, len
, vl_type
));
/* Return true if the operation is a floating-point operation that needs FRM.  */
4204 needs_fp_rounding (unsigned icode
, machine_mode mode
)
4206 if (!FLOAT_MODE_P (mode
))
4209 return icode
!= maybe_code_for_pred (SMIN
, mode
)
4210 && icode
!= maybe_code_for_pred (UNSPEC_VFMIN
, mode
)
4211 && icode
!= maybe_code_for_pred (SMAX
, mode
)
4212 && icode
!= maybe_code_for_pred (UNSPEC_VFMAX
, mode
)
4213 && icode
!= maybe_code_for_pred (NEG
, mode
)
4214 && icode
!= maybe_code_for_pred (ABS
, mode
)
4215 /* narrower-FP -> FP */
4216 && icode
!= maybe_code_for_pred_extend (mode
)
4217 /* narrower-INT -> FP */
4218 && icode
!= maybe_code_for_pred_widen (FLOAT
, mode
)
4219 && icode
!= maybe_code_for_pred_widen (UNSIGNED_FLOAT
, mode
)
4221 && icode
!= maybe_code_for_pred (UNSPEC_VCOPYSIGN
, mode
)
4222 && icode
!= maybe_code_for_pred_mov (mode
);
4225 /* Subroutine to expand COND_LEN_* patterns. */
4227 expand_cond_len_op (unsigned icode
, insn_flags op_type
, rtx
*ops
, rtx len
)
4231 machine_mode mode
= GET_MODE (dest
);
4232 machine_mode mask_mode
= GET_MODE (mask
);
4233 bool is_dummy_mask
= rtx_equal_p (mask
, CONSTM1_RTX (mask_mode
));
4234 bool is_vlmax_len
= is_vlmax_len_p (mode
, len
);
4236 unsigned insn_flags
= HAS_DEST_P
| HAS_MASK_P
| HAS_MERGE_P
| op_type
;
/* FIXME: We don't support simplification of COND_LEN_NEG (..., dummy len,
   dummy mask) into NEG_EXPR in GIMPLE FOLD yet.  So we do such
   simplification in the RISC-V backend and may do it in the middle end in
   the future.  */
4241 if (is_dummy_mask
&& is_vlmax_len
)
4242 insn_flags
|= TDEFAULT_POLICY_P
| MDEFAULT_POLICY_P
;
4243 else if (is_dummy_mask
)
4244 insn_flags
|= TU_POLICY_P
| MDEFAULT_POLICY_P
;
4245 else if (is_vlmax_len
)
4246 insn_flags
|= TDEFAULT_POLICY_P
| MU_POLICY_P
;
4248 insn_flags
|= TU_POLICY_P
| MU_POLICY_P
;
4250 if (needs_fp_rounding (icode
, mode
))
4251 insn_flags
|= FRM_DYN_P
;
4254 emit_vlmax_insn (icode
, insn_flags
, ops
);
4256 emit_nonvlmax_insn (icode
, insn_flags
, ops
, len
);
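/* For example (illustrative): a COND_LEN_ADD whose mask is all-ones and
   whose length equals VF is emitted with TA/MA policies and thus behaves
   like a plain vadd, whereas a genuine mask or a partial length keeps
   TU/MU so that inactive and tail elements are taken from the merge
   operand.  */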
4259 /* Expand unary ops COND_LEN_*. */
4261 expand_cond_len_unop (unsigned icode
, rtx
*ops
)
4266 rtx merge
= get_else_operand (ops
[3]);
4269 rtx cond_ops
[] = {dest
, mask
, merge
, src
};
4270 expand_cond_len_op (icode
, UNARY_OP_P
, cond_ops
, len
);
4273 /* Expand unary ops COND_*. */
4275 expand_cond_unop (unsigned icode
, rtx
*ops
)
4280 rtx merge
= get_else_operand (ops
[3]);
4281 rtx len
= gen_int_mode (GET_MODE_NUNITS (GET_MODE (dest
)), Pmode
);
4283 rtx cond_ops
[] = {dest
, mask
, merge
, src
};
4284 expand_cond_len_op (icode
, UNARY_OP_P
, cond_ops
, len
);
4287 /* Expand binary ops COND_LEN_*. */
4289 expand_cond_len_binop (unsigned icode
, rtx
*ops
)
4295 rtx merge
= get_else_operand (ops
[4]);
4298 rtx cond_ops
[] = {dest
, mask
, merge
, src1
, src2
};
4299 expand_cond_len_op (icode
, BINARY_OP_P
, cond_ops
, len
);
4302 /* Expand binary ops COND_*. */
4304 expand_cond_binop (unsigned icode
, rtx
*ops
)
4310 rtx merge
= get_else_operand (ops
[4]);
4311 rtx len
= gen_int_mode (GET_MODE_NUNITS (GET_MODE (dest
)), Pmode
);
4313 rtx cond_ops
[] = {dest
, mask
, merge
, src1
, src2
};
4314 expand_cond_len_op (icode
, BINARY_OP_P
, cond_ops
, len
);
4317 /* Prepare insn_code for gather_load/scatter_store according to
4318 the vector mode and index mode. */
4320 prepare_gather_scatter (machine_mode vec_mode
, machine_mode idx_mode
,
4324 return code_for_pred_indexed_store (UNSPEC_UNORDERED
, vec_mode
, idx_mode
);
4327 unsigned src_eew_bitsize
= GET_MODE_BITSIZE (GET_MODE_INNER (idx_mode
));
4328 unsigned dst_eew_bitsize
= GET_MODE_BITSIZE (GET_MODE_INNER (vec_mode
));
4329 if (dst_eew_bitsize
== src_eew_bitsize
)
4330 return code_for_pred_indexed_load_same_eew (UNSPEC_UNORDERED
, vec_mode
);
4331 else if (dst_eew_bitsize
> src_eew_bitsize
)
4333 unsigned factor
= dst_eew_bitsize
/ src_eew_bitsize
;
4337 return code_for_pred_indexed_load_x2_greater_eew (
4338 UNSPEC_UNORDERED
, vec_mode
);
4340 return code_for_pred_indexed_load_x4_greater_eew (
4341 UNSPEC_UNORDERED
, vec_mode
);
4343 return code_for_pred_indexed_load_x8_greater_eew (
4344 UNSPEC_UNORDERED
, vec_mode
);
4351 unsigned factor
= src_eew_bitsize
/ dst_eew_bitsize
;
4355 return code_for_pred_indexed_load_x2_smaller_eew (
4356 UNSPEC_UNORDERED
, vec_mode
);
4358 return code_for_pred_indexed_load_x4_smaller_eew (
4359 UNSPEC_UNORDERED
, vec_mode
);
4361 return code_for_pred_indexed_load_x8_smaller_eew (
4362 UNSPEC_UNORDERED
, vec_mode
);
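/* For example (illustrative): gathering DImode elements with SImode
   offsets selects the "x2 greater EEW" indexed-load pattern, since the
   destination EEW (64) is twice the index EEW (32); equal EEWs use the
   same-EEW pattern and narrower destinations use the "smaller EEW"
   variants.  */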
4370 /* Expand LEN_MASK_{GATHER_LOAD,SCATTER_STORE}. */
4372 expand_gather_scatter (rtx
*ops
, bool is_load
)
4374 rtx ptr
, vec_offset
, vec_reg
;
4385 vec_offset
= ops
[2];
4386 zero_extend_p
= INTVAL (ops
[3]);
4387 shift
= exact_log2 (INTVAL (ops
[4]));
4393 vec_offset
= ops
[1];
4394 zero_extend_p
= INTVAL (ops
[2]);
4395 shift
= exact_log2 (INTVAL (ops
[3]));
4398 machine_mode vec_mode
= GET_MODE (vec_reg
);
4399 machine_mode idx_mode
= GET_MODE (vec_offset
);
4400 scalar_mode inner_idx_mode
= GET_MODE_INNER (idx_mode
);
4401 unsigned inner_offsize
= GET_MODE_BITSIZE (inner_idx_mode
);
4402 poly_int64 nunits
= GET_MODE_NUNITS (vec_mode
);
4403 bool is_vlmax
= is_vlmax_len_p (vec_mode
, len
);
4405 bool use_widening_shift
= false;
4407 /* Extend the offset element to address width. */
4408 if (inner_offsize
< BITS_PER_WORD
)
4410 use_widening_shift
= TARGET_ZVBB
&& zero_extend_p
&& shift
== 1;
4411 /* 7.2. Vector Load/Store Addressing Modes.
4412 If the vector offset elements are narrower than XLEN, they are
4413 zero-extended to XLEN before adding to the ptr effective address. If
4414 the vector offset elements are wider than XLEN, the least-significant
4415 XLEN bits are used in the address calculation. An implementation must
4416 raise an illegal instruction exception if the EEW is not supported for
4419 RVV spec only refers to the shift == 0 case. */
4420 if (!zero_extend_p
|| shift
)
4424 = int_mode_for_size (inner_offsize
* 2, 0).require ();
4426 inner_idx_mode
= int_mode_for_size (BITS_PER_WORD
, 0).require ();
4427 machine_mode new_idx_mode
4428 = get_vector_mode (inner_idx_mode
, nunits
).require ();
4429 if (!use_widening_shift
)
4431 rtx tmp
= gen_reg_rtx (new_idx_mode
);
4432 emit_insn (gen_extend_insn (tmp
, vec_offset
, new_idx_mode
, idx_mode
,
4433 zero_extend_p
? true : false));
4436 idx_mode
= new_idx_mode
;
4443 if (!use_widening_shift
)
4444 tmp
= expand_binop (idx_mode
, ashl_optab
, vec_offset
,
4445 gen_int_mode (shift
, Pmode
), NULL_RTX
, 0,
4449 tmp
= gen_reg_rtx (idx_mode
);
4450 insn_code icode
= code_for_pred_vwsll_scalar (idx_mode
);
4451 rtx ops
[] = {tmp
, vec_offset
, const1_rtx
};
4452 emit_vlmax_insn (icode
, BINARY_OP
, ops
);
4458 insn_code icode
= prepare_gather_scatter (vec_mode
, idx_mode
, is_load
);
4464 = {vec_reg
, mask
, ptr
, vec_offset
};
4465 emit_vlmax_insn (icode
, BINARY_OP_TAMA
, load_ops
);
4469 rtx store_ops
[] = {mask
, ptr
, vec_offset
, vec_reg
};
4470 emit_vlmax_insn (icode
, SCATTER_OP_M
, store_ops
);
4478 = {vec_reg
, mask
, ptr
, vec_offset
};
4479 emit_nonvlmax_insn (icode
, BINARY_OP_TAMA
, load_ops
, len
);
4483 rtx store_ops
[] = {mask
, ptr
, vec_offset
, vec_reg
};
4484 emit_nonvlmax_insn (icode
, SCATTER_OP_M
, store_ops
, len
);
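/* Scalar model of the per-element address computation prepared above (an
   illustrative sketch; uint32_t is just an example of an offset element
   narrower than XLEN, and the extension and shift are really performed on
   the whole offset vector before the indexed access):

     uint64_t
     effective_address (uint64_t ptr, uint32_t offset,
                        bool zero_extend_p, int shift)
     {
       uint64_t off = zero_extend_p
                      ? (uint64_t) offset
                      : (uint64_t) (int64_t) (int32_t) offset;
       return ptr + (off << shift);
     }
*/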
4489 /* Expand COND_LEN_*. */
4491 expand_cond_len_ternop (unsigned icode
, rtx
*ops
)
4498 rtx merge
= get_else_operand (ops
[5]);
4501 rtx cond_ops
[] = {dest
, mask
, src1
, src2
, src3
, merge
};
4502 expand_cond_len_op (icode
, TERNARY_OP_P
, cond_ops
, len
);
4505 /* Expand COND_*. */
4507 expand_cond_ternop (unsigned icode
, rtx
*ops
)
4514 rtx merge
= get_else_operand (ops
[5]);
4515 rtx len
= gen_int_mode (GET_MODE_NUNITS (GET_MODE (dest
)), Pmode
);
4517 rtx cond_ops
[] = {dest
, mask
, src1
, src2
, src3
, merge
};
4518 expand_cond_len_op (icode
, TERNARY_OP_P
, cond_ops
, len
);
4521 /* Expand reduction operations.
4522 Case 1: ops = {scalar_dest, vector_src}
4523 Case 2: ops = {scalar_dest, vector_src, mask, vl}
4526 expand_reduction (unsigned unspec
, unsigned unspec_for_vl0_safe
,
4527 unsigned insn_flags
, rtx
*ops
, rtx init
)
4529 rtx scalar_dest
= ops
[0];
4530 rtx vector_src
= ops
[1];
4531 machine_mode vmode
= GET_MODE (vector_src
);
4532 machine_mode vel_mode
= GET_MODE (scalar_dest
);
4533 machine_mode m1_mode
= get_m1_mode (vel_mode
).require ();
4534 rtx vl_op
= NULL_RTX
;
4535 bool need_vl0_safe
= false;
4536 if (need_mask_operand_p (insn_flags
))
4539 need_vl0_safe
= !CONST_INT_P (vl_op
) && !CONST_POLY_INT_P (vl_op
);
4542 rtx m1_tmp
= gen_reg_rtx (m1_mode
);
4543 rtx scalar_move_ops
[] = {m1_tmp
, init
};
4544 insn_code icode
= code_for_pred_broadcast (m1_mode
);
4545 if (need_mask_operand_p (insn_flags
))
4548 emit_nonvlmax_insn (icode
, SCALAR_MOVE_OP
, scalar_move_ops
, const1_rtx
);
4550 emit_nonvlmax_insn (icode
, SCALAR_MOVE_OP
, scalar_move_ops
, vl_op
);
4553 emit_vlmax_insn (icode
, SCALAR_MOVE_OP
, scalar_move_ops
);
4555 rtx m1_tmp2
= gen_reg_rtx (m1_mode
);
4556 rtx reduc_ops
[] = {m1_tmp2
, vector_src
, m1_tmp
};
4559 icode
= code_for_pred (unspec_for_vl0_safe
, vmode
);
4561 icode
= code_for_pred (unspec
, vmode
);
4563 if (need_mask_operand_p (insn_flags
))
4565 rtx mask_len_reduc_ops
[] = {m1_tmp2
, ops
[2], vector_src
, m1_tmp
};
4566 emit_nonvlmax_insn (icode
, insn_flags
, mask_len_reduc_ops
, vl_op
);
4569 emit_vlmax_insn (icode
, insn_flags
, reduc_ops
);
4571 emit_insn (gen_pred_extract_first (m1_mode
, scalar_dest
, m1_tmp2
));
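/* The emitted sequence is roughly (illustrative, for an integer sum
   reduction; the actual instructions depend on UNSPEC and the mode):

     vmv.s.x    v_tmp, init            # move INIT into element 0
     vredsum.vs v_tmp2, v_src, v_tmp   # reduce across the source vector
     vmv.x.s    scalar_dest, v_tmp2    # extract element 0
*/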
4574 /* Prepare ops for ternary operations.
4575 It can be called before or after RA. */
4577 prepare_ternary_operands (rtx
*ops
)
4579 machine_mode mode
= GET_MODE (ops
[0]);
4581 if (!rtx_equal_p (ops
[5], RVV_VUNDEF (mode
))
4582 && (VECTOR_MODE_P (GET_MODE (ops
[2]))
4583 && !rtx_equal_p (ops
[2], ops
[5]))
4584 && !rtx_equal_p (ops
[3], ops
[5])
4585 && !rtx_equal_p (ops
[4], ops
[5]))
/* Otherwise RA will fail to find a vector REG and report an ICE, so we
   pre-merge the ops for LMUL = 8.  */
4589 if (satisfies_constraint_Wc1 (ops
[1]))
4591 emit_move_insn (ops
[0], ops
[5]);
4592 emit_insn (gen_pred_mov (mode
, ops
[0], ops
[1], ops
[0], ops
[4], ops
[6],
4593 ops
[7], ops
[8], ops
[9]));
4596 emit_insn (gen_pred_merge (mode
, ops
[0], RVV_VUNDEF (mode
), ops
[5],
4597 ops
[4], ops
[1], ops
[6], ops
[7], ops
[9]));
4598 ops
[5] = ops
[4] = ops
[0];
4602 /* Swap the multiplication ops if the fallback value is the
4603 second of the two. */
4604 if (rtx_equal_p (ops
[3], ops
[5]))
4605 std::swap (ops
[2], ops
[3]);
4607 /* TODO: ??? Maybe we could support splitting FMA (a, 4, b)
4608 into PLUS (ASHIFT (a, 2), b) according to uarchs. */
4610 gcc_assert (rtx_equal_p (ops
[5], RVV_VUNDEF (mode
))
4611 || rtx_equal_p (ops
[5], ops
[2]) || rtx_equal_p (ops
[5], ops
[4]));
4614 /* Expand VEC_MASK_LEN_{LOAD_LANES,STORE_LANES}. */
4616 expand_lanes_load_store (rtx
*ops
, bool is_load
)
4622 rtx addr
= is_load
? XEXP (ops
[1], 0) : XEXP (ops
[0], 0);
4623 rtx reg
= is_load
? ops
[0] : ops
[1];
4624 machine_mode mode
= GET_MODE (ops
[0]);
4626 if (is_vlmax_len_p (mode
, len
))
4628 /* If the length operand is equal to VF, it is VLMAX load/store. */
4631 rtx m_ops
[] = {reg
, mask
, addr
};
4632 emit_vlmax_insn (code_for_pred_unit_strided_load (mode
), UNARY_OP_TAMA
,
4637 len
= gen_reg_rtx (Pmode
);
4638 emit_vlmax_vsetvl (mode
, len
);
4639 emit_insn (gen_pred_unit_strided_store (mode
, mask
, addr
, reg
, len
,
4640 get_avl_type_rtx (VLMAX
)));
4645 if (!satisfies_constraint_K (len
))
4646 len
= force_reg (Pmode
, len
);
4649 rtx m_ops
[] = {reg
, mask
, addr
};
4650 emit_nonvlmax_insn (code_for_pred_unit_strided_load (mode
),
4651 UNARY_OP_TAMA
, m_ops
, len
);
4654 emit_insn (gen_pred_unit_strided_store (mode
, mask
, addr
, reg
, len
,
4655 get_avl_type_rtx (NONVLMAX
)));
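/* For instance (illustrative): a 3-field segment load of 32-bit elements
   is expected to become a vlseg3e32.v and the corresponding store a
   vsseg3e32.v, with the AVL being either VLMAX or the given LEN as
   selected above.  */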
4659 /* Expand LEN_FOLD_EXTRACT_LAST. */
4661 expand_fold_extract_last (rtx
*ops
)
4664 rtx default_value
= ops
[1];
4666 rtx anchor
= gen_reg_rtx (Pmode
);
4667 rtx index
= gen_reg_rtx (Pmode
);
4669 rtx else_label
= gen_label_rtx ();
4670 rtx end_label
= gen_label_rtx ();
4672 machine_mode mode
= GET_MODE (vect
);
4673 machine_mode mask_mode
= GET_MODE (mask
);
4674 rtx compress_vect
= gen_reg_rtx (mode
);
4675 rtx slide_vect
= gen_reg_rtx (mode
);
4678 if (is_vlmax_len_p (mode
, len
))
/* Calculate the number of 1-bits in the mask.  */
4682 rtx cpop_ops
[] = {anchor
, mask
};
4684 emit_nonvlmax_insn (code_for_pred_popcount (mask_mode
, Pmode
), CPOP_OP
,
4687 emit_vlmax_insn (code_for_pred_popcount (mask_mode
, Pmode
), CPOP_OP
,
4690 riscv_expand_conditional_branch (else_label
, EQ
, anchor
, const0_rtx
);
4691 emit_insn (gen_rtx_SET (index
, gen_rtx_PLUS (Pmode
, anchor
, constm1_rtx
)));
4692 /* Compress the vector. */
4693 icode
= code_for_pred_compress (mode
);
4694 rtx compress_ops
[] = {compress_vect
, vect
, mask
};
4696 emit_nonvlmax_insn (icode
, COMPRESS_OP
, compress_ops
, len
);
4698 emit_vlmax_insn (icode
, COMPRESS_OP
, compress_ops
);
4699 /* Emit the slide down to index 0 in a new vector. */
4700 rtx slide_ops
[] = {slide_vect
, compress_vect
, index
};
4701 icode
= code_for_pred_slide (UNSPEC_VSLIDEDOWN
, mode
);
4703 emit_nonvlmax_insn (icode
, BINARY_OP
, slide_ops
, len
);
4705 emit_vlmax_insn (icode
, BINARY_OP
, slide_ops
);
4706 /* Emit v(f)mv.[xf].s. */
4707 emit_insn (gen_pred_extract_first (mode
, dst
, slide_vect
));
4709 emit_jump_insn (gen_jump (end_label
));
4711 emit_label (else_label
);
4712 emit_move_insn (dst
, default_value
);
4713 emit_label (end_label
);
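/* Scalar model of LEN_FOLD_EXTRACT_LAST as expanded above (an illustrative
   sketch using int elements; the real expansion uses vcpop/vcompress/
   vslidedown instead of a loop):

     int
     fold_extract_last (int default_value, const bool *mask,
                        const int *vect, int len)
     {
       int last = default_value;
       bool any = false;
       for (int i = 0; i < len; ++i)
         if (mask[i])
           {
             last = vect[i];    /* The last active element wins.  */
             any = true;
           }
       return any ? last : default_value;
     }
*/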
/* Return true if the LMUL of the comparison mode is less than or equal to
   one.  */
cmp_lmul_le_one (machine_mode mode)
  if (riscv_v_ext_vector_mode_p (mode))
    return known_le (GET_MODE_SIZE (mode), BYTES_PER_RISCV_VECTOR);
  else if (riscv_v_ext_vls_mode_p (mode))
    return known_le (GET_MODE_BITSIZE (mode), TARGET_MIN_VLEN);
/* Return true if the LMUL of the comparison mode is greater than one.  */
cmp_lmul_gt_one (machine_mode mode)
  if (riscv_v_ext_vector_mode_p (mode))
    return known_gt (GET_MODE_SIZE (mode), BYTES_PER_RISCV_VECTOR);
  else if (riscv_v_ext_vls_mode_p (mode))
    return known_gt (GET_MODE_BITSIZE (mode), TARGET_MIN_VLEN);
/* Return true if the VLS mode is legal.  There are 2 cases here.
   1. Enable VLS modes for VLA vectorization, since the fixed-length VLMAX
      mode is the highest-priority choice and should not conflict with VLS
      modes.
   2. Enable VLS modes for some cases in fixed-vlmax, i.e. when the bitsize
      of the VLS mode is smaller than the minimal VLA mode.
   Take vlen = 2048 as an example for case 2.
   Note: the table below is based on vlen = 2048.
4748 +----------------------------------------------------+----------------------+
4749 | VLS mode | VLA mode |
4750 +----------------------------------------------------+----------------------+
4751 | Name | Precision | Inner Precision | Enabled | Min mode | Min bits |
4752 +------------+-----------+-----------------+---------+-----------+----------+
4753 | V1BI | 1 | 1 | Yes | RVVMF64BI | 32 |
4754 | V2BI | 2 | 1 | Yes | RVVMF64BI | 32 |
4755 | V4BI | 4 | 1 | Yes | RVVMF64BI | 32 |
4756 | V8BI | 8 | 1 | Yes | RVVMF64BI | 32 |
4757 | V16BI | 16 | 1 | Yes | RVVMF64BI | 32 |
4758 | V32BI | 32 | 1 | NO | RVVMF64BI | 32 |
4759 | V64BI | 64 | 1 | NO | RVVMF64BI | 32 |
4760 | ... | ... | ... | ... | RVVMF64BI | 32 |
4761 | V4096BI | 4096 | 1 | NO | RVVMF64BI | 32 |
4762 +------------+-----------+-----------------+---------+-----------+----------+
4763 | V1QI | 8 | 8 | Yes | RVVMF8QI | 256 |
4764 | V2QI | 16 | 8 | Yes | RVVMF8QI | 256 |
4765 | V4QI | 32 | 8 | Yes | RVVMF8QI | 256 |
4766 | V8QI | 64 | 8 | Yes | RVVMF8QI | 256 |
4767 | V16QI | 128 | 8 | Yes | RVVMF8QI | 256 |
4768 | V32QI | 256 | 8 | NO | RVVMF8QI | 256 |
4769 | V64QI | 512 | 8 | NO | RVVMF8QI | 256 |
4770 | ... | ... | .. | ... | RVVMF8QI | 256 |
4771 | V4096QI | 32768 | 8 | NO | RVVMF8QI | 256 |
4772 +------------+-----------+-----------------+---------+-----------+----------+
4773 | V1HI | 16 | 16 | Yes | RVVMF4HI | 512 |
4774 | V2HI | 32 | 16 | Yes | RVVMF4HI | 512 |
4775 | V4HI | 64 | 16 | Yes | RVVMF4HI | 512 |
4776 | V8HI | 128 | 16 | Yes | RVVMF4HI | 512 |
4777 | V16HI | 256 | 16 | Yes | RVVMF4HI | 512 |
4778 | V32HI | 512 | 16 | NO | RVVMF4HI | 512 |
4779 | V64HI | 1024 | 16 | NO | RVVMF4HI | 512 |
4780 | ... | ... | .. | ... | RVVMF4HI | 512 |
4781 | V2048HI | 32768 | 16 | NO | RVVMF4HI | 512 |
4782 +------------+-----------+-----------------+---------+-----------+----------+
4783 | V1SI/SF | 32 | 32 | Yes | RVVMF2SI | 1024 |
4784 | V2SI/SF | 64 | 32 | Yes | RVVMF2SI | 1024 |
4785 | V4SI/SF | 128 | 32 | Yes | RVVMF2SI | 1024 |
4786 | V8SI/SF | 256 | 32 | Yes | RVVMF2SI | 1024 |
4787 | V16SI/SF | 512 | 32 | Yes | RVVMF2SI | 1024 |
4788 | V32SI/SF | 1024 | 32 | NO | RVVMF2SI | 1024 |
4789 | V64SI/SF | 2048 | 32 | NO | RVVMF2SI | 1024 |
4790 | ... | ... | .. | ... | RVVMF2SI | 1024 |
4791 | V1024SI/SF | 32768 | 32 | NO | RVVMF2SI | 1024 |
4792 +------------+-----------+-----------------+---------+-----------+----------+
4793 | V1DI/DF | 64 | 64 | Yes | RVVM1DI | 2048 |
4794 | V2DI/DF | 128 | 64 | Yes | RVVM1DI | 2048 |
4795 | V4DI/DF | 256 | 64 | Yes | RVVM1DI | 2048 |
4796 | V8DI/DF | 512 | 64 | Yes | RVVM1DI | 2048 |
4797 | V16DI/DF | 1024 | 64 | Yes | RVVM1DI | 2048 |
4798 | V32DI/DF | 2048 | 64 | NO | RVVM1DI | 2048 |
4799 | V64DI/DF | 4096 | 64 | NO | RVVM1DI | 2048 |
4800 | ... | ... | .. | ... | RVVM1DI | 2048 |
4801 | V512DI/DF | 32768 | 64 | NO | RVVM1DI | 2048 |
4802 +------------+-----------+-----------------+---------+-----------+----------+
Then the condition for a VLS mode in fixed-vlmax is:
PRECISION (VLSmode) < VLEN / (64 / PRECISION (VLS_inner_mode)).  */
4807 vls_mode_valid_p (machine_mode vls_mode
)
4809 if (!TARGET_VECTOR
|| TARGET_XTHEADVECTOR
)
4812 if (rvv_vector_bits
== RVV_VECTOR_BITS_SCALABLE
)
4814 if (GET_MODE_CLASS (vls_mode
) != MODE_VECTOR_BOOL
4815 && !ordered_p (TARGET_MAX_LMUL
* BITS_PER_RISCV_VECTOR
,
4816 GET_MODE_PRECISION (vls_mode
)))
/* We only enable VLS modes which are aligned with TARGET_MAX_LMUL and
   BITS_PER_RISCV_VECTOR.
   E.g. when TARGET_MAX_LMUL = 1 and BITS_PER_RISCV_VECTOR = (128, 128),
   we enable VLS modes that have a fixed size <= 128 bits.  Since ordered_p
   is false between VLA modes with size = (128, 128) bits and VLS modes
   with size > 128 bits, we would otherwise end up with multiple ICEs in
   middle-end generic code.  */
4829 if (rvv_vector_bits
== RVV_VECTOR_BITS_ZVL
)
4831 machine_mode inner_mode
= GET_MODE_INNER (vls_mode
);
4832 int precision
= GET_MODE_PRECISION (inner_mode
).to_constant ();
4833 int min_vlmax_bitsize
= TARGET_MIN_VLEN
/ (64 / precision
);
4835 return GET_MODE_PRECISION (vls_mode
).to_constant () < min_vlmax_bitsize
;
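/* Worked example of the fixed-vlmax condition above, with vlen = 2048 as
   in the table: V16QI has precision 128 < 2048 / (64 / 8) = 256, so it is
   enabled, while V32QI has precision 256, which is not < 256, so it is
   rejected.  */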
/* We don't have to convert a floating-point value to an integer when it
   cannot have a fractional part.  Thus, there is a limit for half, single
   and double precision floating point: no fractional part can be
   represented once the value is greater than or equal to that limit.
   1. Half floating point.
   +-----------+---------------+
   | float     | binary layout |
   +-----------+---------------+
   |  1023.5   |    0x63ff     |
   +-----------+---------------+
   |  1024.0   |    0x6400     |
   +-----------+---------------+
   |  1025.0   |    0x6401     |
   +-----------+---------------+
   All half-precision floating-point values are unchanged by ceil if they
   are greater than or equal to 1024.
4861 2. Single floating point.
4862 +-----------+---------------+
4863 | float | binary layout |
4864 +-----------+---------------+
4865 | 8388607.5 | 0x4affffff |
4866 +-----------+---------------+
4867 | 8388608.0 | 0x4b000000 |
4868 +-----------+---------------+
4869 | 8388609.0 | 0x4b000001 |
4870 +-----------+---------------+
All single-precision floating-point values are unchanged by ceil if they
are greater than or equal to 8388608.
4876 3. Double floating point.
4877 +--------------------+--------------------+
4878 | float | binary layout |
4879 +--------------------+--------------------+
4880 | 4503599627370495.5 | 0X432fffffffffffff |
4881 +--------------------+--------------------+
4882 | 4503599627370496.0 | 0X4330000000000000 |
4883 +--------------------+--------------------+
| 4503599627370497.0 | 0X4330000000000001 |
4885 +--------------------+--------------------+
All double-precision floating-point values are unchanged by ceil if they
are greater than or equal to 4503599627370496.
4892 get_fp_rounding_coefficient (machine_mode inner_mode
)
4894 REAL_VALUE_TYPE real
;
4896 if (inner_mode
== E_HFmode
)
4897 real_from_integer (&real
, inner_mode
, 1024, SIGNED
);
4898 else if (inner_mode
== E_SFmode
)
4899 real_from_integer (&real
, inner_mode
, 8388608, SIGNED
);
4900 else if (inner_mode
== E_DFmode
)
4901 real_from_integer (&real
, inner_mode
, 4503599627370496, SIGNED
);
4905 return const_double_from_real_value (real
, inner_mode
);
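/* A quick scalar check of the single-precision limit used above (an
   illustrative sketch only; 2^23 = 8388608 is where the 24-bit significand
   can no longer hold any fractional bit):

     #include <cassert>
     #include <cmath>

     int
     main ()
     {
       float below = 8388607.5f;                /* Still has a fraction.  */
       assert (below != std::trunc (below));
       float at = 8388608.0f;                   /* 2^23.  */
       assert (at + 0.5f == at);                /* Fractions no longer representable.  */
       return 0;
     }
*/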
4909 emit_vec_float_cmp_mask (rtx fp_vector
, rtx_code code
, rtx fp_scalar
,
4910 machine_mode vec_fp_mode
)
4912 /* Step-1: Prepare the scalar float compare register. */
4913 rtx fp_reg
= gen_reg_rtx (GET_MODE_INNER (vec_fp_mode
));
4914 emit_insn (gen_move_insn (fp_reg
, fp_scalar
));
4916 /* Step-2: Generate the mask. */
4917 machine_mode mask_mode
= get_mask_mode (vec_fp_mode
);
4918 rtx mask
= gen_reg_rtx (mask_mode
);
4919 rtx cmp
= gen_rtx_fmt_ee (code
, mask_mode
, fp_vector
, fp_reg
);
4920 rtx cmp_ops
[] = {mask
, cmp
, fp_vector
, fp_reg
};
4921 insn_code icode
= code_for_pred_cmp_scalar (vec_fp_mode
);
4922 emit_vlmax_insn (icode
, COMPARE_OP
, cmp_ops
);
4928 emit_vec_copysign (rtx op_dest
, rtx op_src_0
, rtx op_src_1
,
4929 machine_mode vec_mode
)
4931 rtx sgnj_ops
[] = {op_dest
, op_src_0
, op_src_1
};
4932 insn_code icode
= code_for_pred (UNSPEC_VCOPYSIGN
, vec_mode
);
4934 emit_vlmax_insn (icode
, BINARY_OP
, sgnj_ops
);
4938 emit_vec_abs (rtx op_dest
, rtx op_src
, machine_mode vec_mode
)
4940 rtx abs_ops
[] = {op_dest
, op_src
};
4941 insn_code icode
= code_for_pred (ABS
, vec_mode
);
4943 emit_vlmax_insn (icode
, UNARY_OP
, abs_ops
);
4947 emit_vec_cvt_x_f (rtx op_dest
, rtx op_src
, rtx mask
,
4948 insn_type type
, machine_mode vec_mode
)
4950 insn_code icode
= code_for_pred_fcvt_x_f (UNSPEC_VFCVT
, vec_mode
);
4952 if (type
& USE_VUNDEF_MERGE_P
)
4954 rtx cvt_x_ops
[] = {op_dest
, mask
, op_src
};
4955 emit_vlmax_insn (icode
, type
, cvt_x_ops
);
4959 rtx cvt_x_ops
[] = {op_dest
, mask
, op_dest
, op_src
};
4960 emit_vlmax_insn (icode
, type
, cvt_x_ops
);
4965 emit_vec_cvt_x_f (rtx op_dest
, rtx op_src
, insn_type type
,
4966 machine_mode vec_mode
)
4968 rtx ops
[] = {op_dest
, op_src
};
4969 insn_code icode
= code_for_pred_fcvt_x_f (UNSPEC_VFCVT
, vec_mode
);
4971 emit_vlmax_insn (icode
, type
, ops
);
4975 emit_vec_narrow_cvt_x_f (rtx op_dest
, rtx op_src
, insn_type type
,
4976 machine_mode vec_mode
)
4978 rtx ops
[] = {op_dest
, op_src
};
4979 insn_code icode
= code_for_pred_narrow_fcvt_x_f (UNSPEC_VFCVT
, vec_mode
);
4981 emit_vlmax_insn (icode
, type
, ops
);
4985 emit_vec_widen_cvt_x_f (rtx op_dest
, rtx op_src
, insn_type type
,
4986 machine_mode vec_mode
)
4988 rtx ops
[] = {op_dest
, op_src
};
4989 insn_code icode
= code_for_pred_widen_fcvt_x_f (UNSPEC_VFCVT
, vec_mode
);
4991 emit_vlmax_insn (icode
, type
, ops
);
4995 emit_vec_widen_cvt_f_f (rtx op_dest
, rtx op_src
, insn_type type
,
4996 machine_mode vec_mode
)
4998 rtx ops
[] = {op_dest
, op_src
};
4999 insn_code icode
= code_for_pred_extend (vec_mode
);
5001 emit_vlmax_insn (icode
, type
, ops
);
5005 emit_vec_cvt_f_x (rtx op_dest
, rtx op_src
, rtx mask
,
5006 insn_type type
, machine_mode vec_mode
)
5008 rtx cvt_fp_ops
[] = {op_dest
, mask
, op_dest
, op_src
};
5009 insn_code icode
= code_for_pred (FLOAT
, vec_mode
);
5011 emit_vlmax_insn (icode
, type
, cvt_fp_ops
);
5015 emit_vec_cvt_x_f_rtz (rtx op_dest
, rtx op_src
, rtx mask
,
5016 insn_type type
, machine_mode vec_mode
)
5018 insn_code icode
= code_for_pred (FIX
, vec_mode
);
5020 if (type
& USE_VUNDEF_MERGE_P
)
5022 rtx cvt_x_ops
[] = {op_dest
, mask
, op_src
};
5023 emit_vlmax_insn (icode
, type
, cvt_x_ops
);
5027 rtx cvt_x_ops
[] = {op_dest
, mask
, op_dest
, op_src
};
5028 emit_vlmax_insn (icode
, type
, cvt_x_ops
);
5033 emit_vec_binary_alu (rtx op_dest
, rtx op_1
, rtx op_2
, enum rtx_code rcode
,
5034 machine_mode vec_mode
)
5036 rtx ops
[] = {op_dest
, op_1
, op_2
};
5037 insn_code icode
= code_for_pred (rcode
, vec_mode
);
5039 emit_vlmax_insn (icode
, BINARY_OP
, ops
);
5043 expand_vec_ceil (rtx op_0
, rtx op_1
, machine_mode vec_fp_mode
,
5044 machine_mode vec_int_mode
)
5046 /* Step-1: Get the abs float value for mask generation. */
5047 emit_vec_abs (op_0
, op_1
, vec_fp_mode
);
5049 /* Step-2: Generate the mask on const fp. */
5050 rtx const_fp
= get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode
));
5051 rtx mask
= emit_vec_float_cmp_mask (op_0
, LT
, const_fp
, vec_fp_mode
);
5053 /* Step-3: Convert to integer on mask, with rounding up (aka ceil). */
5054 rtx tmp
= gen_reg_rtx (vec_int_mode
);
5055 emit_vec_cvt_x_f (tmp
, op_1
, mask
, UNARY_OP_TAMA_FRM_RUP
, vec_fp_mode
);
/* Step-4: Convert back to floating-point under the mask for the final
   result.  To avoid an unnecessary FRM register write we reuse RUP here;
   it never actually rounds anything up because tmp already holds integral
   values produced by the float-to-int conversion.  */
5061 emit_vec_cvt_f_x (op_0
, tmp
, mask
, UNARY_OP_TAMU_FRM_RUP
, vec_fp_mode
);
5063 /* Step-5: Retrieve the sign bit for -0.0. */
5064 emit_vec_copysign (op_0
, op_0
, op_1
, vec_fp_mode
);
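/* Scalar sketch of the sequence above for single precision (illustrative
   only; the vector code performs the same steps element-wise under the
   mask):

     #include <cfenv>
     #include <cmath>

     float
     ceil_like (float x)
     {
       if (!(std::fabs (x) < 8388608.0f))   /* Step-2 mask: no fraction possible (or NaN).  */
         return x;
       int save = std::fegetround ();
       std::fesetround (FE_UPWARD);         /* FRM = RUP.  */
       long long i = std::llrint (x);       /* Step-3: FP -> int, rounding up.  */
       std::fesetround (save);
       return std::copysign ((float) i, x); /* Steps 4-5: back to FP, keep the sign of -0.0.  */
     }
*/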
5068 expand_vec_floor (rtx op_0
, rtx op_1
, machine_mode vec_fp_mode
,
5069 machine_mode vec_int_mode
)
5071 /* Step-1: Get the abs float value for mask generation. */
5072 emit_vec_abs (op_0
, op_1
, vec_fp_mode
);
5074 /* Step-2: Generate the mask on const fp. */
5075 rtx const_fp
= get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode
));
5076 rtx mask
= emit_vec_float_cmp_mask (op_0
, LT
, const_fp
, vec_fp_mode
);
5078 /* Step-3: Convert to integer on mask, with rounding down (aka floor). */
5079 rtx tmp
= gen_reg_rtx (vec_int_mode
);
5080 emit_vec_cvt_x_f (tmp
, op_1
, mask
, UNARY_OP_TAMA_FRM_RDN
, vec_fp_mode
);
5082 /* Step-4: Convert to floating-point on mask for the floor result. */
5083 emit_vec_cvt_f_x (op_0
, tmp
, mask
, UNARY_OP_TAMU_FRM_RDN
, vec_fp_mode
);
5085 /* Step-5: Retrieve the sign bit for -0.0. */
5086 emit_vec_copysign (op_0
, op_0
, op_1
, vec_fp_mode
);
5090 expand_vec_nearbyint (rtx op_0
, rtx op_1
, machine_mode vec_fp_mode
,
5091 machine_mode vec_int_mode
)
5093 /* Step-1: Get the abs float value for mask generation. */
5094 emit_vec_abs (op_0
, op_1
, vec_fp_mode
);
5096 /* Step-2: Generate the mask on const fp. */
5097 rtx const_fp
= get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode
));
5098 rtx mask
= emit_vec_float_cmp_mask (op_0
, LT
, const_fp
, vec_fp_mode
);
/* Step-3: Back up the FP exception flags; nearbyint never raises exceptions.  */
5101 rtx fflags
= gen_reg_rtx (SImode
);
5102 emit_insn (gen_riscv_frflags (fflags
));
/* Step-4: Convert to integer under the mask, using the dynamic rounding mode (aka nearbyint).  */
5105 rtx tmp
= gen_reg_rtx (vec_int_mode
);
5106 emit_vec_cvt_x_f (tmp
, op_1
, mask
, UNARY_OP_TAMA_FRM_DYN
, vec_fp_mode
);
5108 /* Step-5: Convert to floating-point on mask for the nearbyint result. */
5109 emit_vec_cvt_f_x (op_0
, tmp
, mask
, UNARY_OP_TAMU_FRM_DYN
, vec_fp_mode
);
5111 /* Step-6: Restore FP exception flags. */
5112 emit_insn (gen_riscv_fsflags (fflags
));
5114 /* Step-7: Retrieve the sign bit for -0.0. */
5115 emit_vec_copysign (op_0
, op_0
, op_1
, vec_fp_mode
);
5119 expand_vec_rint (rtx op_0
, rtx op_1
, machine_mode vec_fp_mode
,
5120 machine_mode vec_int_mode
)
5122 /* Step-1: Get the abs float value for mask generation. */
5123 emit_vec_abs (op_0
, op_1
, vec_fp_mode
);
5125 /* Step-2: Generate the mask on const fp. */
5126 rtx const_fp
= get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode
));
5127 rtx mask
= emit_vec_float_cmp_mask (op_0
, LT
, const_fp
, vec_fp_mode
);
5129 /* Step-3: Convert to integer on mask, with dyn rounding (aka rint). */
5130 rtx tmp
= gen_reg_rtx (vec_int_mode
);
5131 emit_vec_cvt_x_f (tmp
, op_1
, mask
, UNARY_OP_TAMA_FRM_DYN
, vec_fp_mode
);
5133 /* Step-4: Convert to floating-point on mask for the rint result. */
5134 emit_vec_cvt_f_x (op_0
, tmp
, mask
, UNARY_OP_TAMU_FRM_DYN
, vec_fp_mode
);
5136 /* Step-5: Retrieve the sign bit for -0.0. */
5137 emit_vec_copysign (op_0
, op_0
, op_1
, vec_fp_mode
);
5141 expand_vec_round (rtx op_0
, rtx op_1
, machine_mode vec_fp_mode
,
5142 machine_mode vec_int_mode
)
5144 /* Step-1: Get the abs float value for mask generation. */
5145 emit_vec_abs (op_0
, op_1
, vec_fp_mode
);
5147 /* Step-2: Generate the mask on const fp. */
5148 rtx const_fp
= get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode
));
5149 rtx mask
= emit_vec_float_cmp_mask (op_0
, LT
, const_fp
, vec_fp_mode
);
/* Step-3: Convert to integer under the mask, rounding to nearest with ties away from zero (aka round).  */
5152 rtx tmp
= gen_reg_rtx (vec_int_mode
);
5153 emit_vec_cvt_x_f (tmp
, op_1
, mask
, UNARY_OP_TAMA_FRM_RMM
, vec_fp_mode
);
5155 /* Step-4: Convert to floating-point on mask for the round result. */
5156 emit_vec_cvt_f_x (op_0
, tmp
, mask
, UNARY_OP_TAMU_FRM_RMM
, vec_fp_mode
);
5158 /* Step-5: Retrieve the sign bit for -0.0. */
5159 emit_vec_copysign (op_0
, op_0
, op_1
, vec_fp_mode
);
5163 expand_vec_trunc (rtx op_0
, rtx op_1
, machine_mode vec_fp_mode
,
5164 machine_mode vec_int_mode
)
5166 /* Step-1: Get the abs float value for mask generation. */
5167 emit_vec_abs (op_0
, op_1
, vec_fp_mode
);
5169 /* Step-2: Generate the mask on const fp. */
5170 rtx const_fp
= get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode
));
5171 rtx mask
= emit_vec_float_cmp_mask (op_0
, LT
, const_fp
, vec_fp_mode
);
5173 /* Step-3: Convert to integer on mask, rounding to zero (aka truncate). */
5174 rtx tmp
= gen_reg_rtx (vec_int_mode
);
5175 emit_vec_cvt_x_f_rtz (tmp
, op_1
, mask
, UNARY_OP_TAMA
, vec_fp_mode
);
/* Step-4: Convert to floating-point under the mask for the trunc result.  */
5178 emit_vec_cvt_f_x (op_0
, tmp
, mask
, UNARY_OP_TAMU_FRM_DYN
, vec_fp_mode
);
5180 /* Step-5: Retrieve the sign bit for -0.0. */
5181 emit_vec_copysign (op_0
, op_0
, op_1
, vec_fp_mode
);
5185 expand_vec_roundeven (rtx op_0
, rtx op_1
, machine_mode vec_fp_mode
,
5186 machine_mode vec_int_mode
)
5188 /* Step-1: Get the abs float value for mask generation. */
5189 emit_vec_abs (op_0
, op_1
, vec_fp_mode
);
5191 /* Step-2: Generate the mask on const fp. */
5192 rtx const_fp
= get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode
));
5193 rtx mask
= emit_vec_float_cmp_mask (op_0
, LT
, const_fp
, vec_fp_mode
);
5195 /* Step-3: Convert to integer on mask, rounding to nearest, ties to even. */
5196 rtx tmp
= gen_reg_rtx (vec_int_mode
);
5197 emit_vec_cvt_x_f (tmp
, op_1
, mask
, UNARY_OP_TAMA_FRM_RNE
, vec_fp_mode
);
/* Step-4: Convert to floating-point under the mask for the roundeven result.  */
5200 emit_vec_cvt_f_x (op_0
, tmp
, mask
, UNARY_OP_TAMU_FRM_RNE
, vec_fp_mode
);
5202 /* Step-5: Retrieve the sign bit for -0.0. */
5203 emit_vec_copysign (op_0
, op_0
, op_1
, vec_fp_mode
);
/* Handle the rounding from floating-point to int/long/long long.  */
5208 emit_vec_rounding_to_integer (rtx op_0
, rtx op_1
, insn_type type
,
5209 machine_mode vec_fp_mode
,
5210 machine_mode vec_int_mode
,
5211 machine_mode vec_bridge_mode
= E_VOIDmode
)
5213 poly_uint16 vec_fp_size
= GET_MODE_SIZE (vec_fp_mode
);
5214 poly_uint16 vec_int_size
= GET_MODE_SIZE (vec_int_mode
);
5216 if (known_eq (vec_fp_size
, vec_int_size
)) /* SF => SI, DF => DI. */
5217 emit_vec_cvt_x_f (op_0
, op_1
, type
, vec_fp_mode
);
5218 else if (maybe_eq (vec_fp_size
, vec_int_size
* 2)) /* DF => SI. */
5219 emit_vec_narrow_cvt_x_f (op_0
, op_1
, type
, vec_fp_mode
);
5220 else if (maybe_eq (vec_fp_size
* 2, vec_int_size
)) /* SF => DI, HF => SI. */
5221 emit_vec_widen_cvt_x_f (op_0
, op_1
, type
, vec_int_mode
);
5222 else if (maybe_eq (vec_fp_size
* 4, vec_int_size
)) /* HF => DI. */
5224 gcc_assert (vec_bridge_mode
!= E_VOIDmode
);
5226 rtx op_sf
= gen_reg_rtx (vec_bridge_mode
);
5228 /* Step-1: HF => SF, no rounding here. */
5229 emit_vec_widen_cvt_f_f (op_sf
, op_1
, UNARY_OP
, vec_bridge_mode
);
5230 /* Step-2: SF => DI. */
5231 emit_vec_widen_cvt_x_f (op_0
, op_sf
, type
, vec_int_mode
);
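/* For example (illustrative): lrint from a half-precision vector to a
   64-bit integer vector (HF => DI, a 4x size difference) cannot be done in
   a single widening conversion, so it is bridged through single precision:
   first HF => SF with vfwcvt.f.f.v (no rounding involved), then SF => DI
   with vfwcvt.x.f.v using the requested rounding mode.  */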
5238 expand_vec_lrint (rtx op_0
, rtx op_1
, machine_mode vec_fp_mode
,
5239 machine_mode vec_int_mode
, machine_mode vec_bridge_mode
)
5241 emit_vec_rounding_to_integer (op_0
, op_1
, UNARY_OP_FRM_DYN
, vec_fp_mode
,
5242 vec_int_mode
, vec_bridge_mode
);
5246 expand_vec_lround (rtx op_0
, rtx op_1
, machine_mode vec_fp_mode
,
5247 machine_mode vec_int_mode
, machine_mode vec_bridge_mode
)
5249 emit_vec_rounding_to_integer (op_0
, op_1
, UNARY_OP_FRM_RMM
, vec_fp_mode
,
5250 vec_int_mode
, vec_bridge_mode
);
5254 expand_vec_lceil (rtx op_0
, rtx op_1
, machine_mode vec_fp_mode
,
5255 machine_mode vec_int_mode
)
5257 emit_vec_rounding_to_integer (op_0
, op_1
, UNARY_OP_FRM_RUP
, vec_fp_mode
,
5262 expand_vec_lfloor (rtx op_0
, rtx op_1
, machine_mode vec_fp_mode
,
5263 machine_mode vec_int_mode
)
5265 emit_vec_rounding_to_integer (op_0
, op_1
, UNARY_OP_FRM_RDN
, vec_fp_mode
,
/* Expand the standard name usadd<mode>3 for vector modes; we can leverage
   the vector single-width saturating add (fixed-point) directly.  */
5273 expand_vec_usadd (rtx op_0
, rtx op_1
, rtx op_2
, machine_mode vec_mode
)
5275 emit_vec_binary_alu (op_0
, op_1
, op_2
, US_PLUS
, vec_mode
);
/* Expand the standard name ssadd<mode>3 for vector modes; we can leverage
   the vector single-width saturating add (fixed-point) directly.  */
5282 expand_vec_ssadd (rtx op_0
, rtx op_1
, rtx op_2
, machine_mode vec_mode
)
5284 emit_vec_binary_alu (op_0
, op_1
, op_2
, SS_PLUS
, vec_mode
);
/* Expand the standard name ussub<mode>3 for vector modes; we can leverage
   the vector single-width saturating subtract (fixed-point) directly.  */
5291 expand_vec_ussub (rtx op_0
, rtx op_1
, rtx op_2
, machine_mode vec_mode
)
5293 emit_vec_binary_alu (op_0
, op_1
, op_2
, US_MINUS
, vec_mode
);
/* Expand the standard name sssub<mode>3 for vector modes; we can leverage
   the vector single-width saturating subtract (fixed-point) directly.  */
5300 expand_vec_sssub (rtx op_0
, rtx op_1
, rtx op_2
, machine_mode vec_mode
)
5302 emit_vec_binary_alu (op_0
, op_1
, op_2
, SS_MINUS
, vec_mode
);
/* Expand the standard name ustrunc<m><n>2 for a double-width narrowing,
   like DI => SI.  We can leverage the vector narrowing fixed-point clip
   directly.  */
5310 expand_vec_double_ustrunc (rtx op_0
, rtx op_1
, machine_mode vec_mode
)
5313 rtx zero
= CONST0_RTX (Xmode
);
5314 enum unspec unspec
= UNSPEC_VNCLIPU
;
5315 rtx ops
[] = {op_0
, op_1
, zero
};
5317 icode
= code_for_pred_narrow_clip_scalar (unspec
, vec_mode
);
5318 emit_vlmax_insn (icode
, BINARY_OP_VXRM_RNU
, ops
);
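/* Worked example (illustrative): for a DI => SI ustrunc, the vnclipu with
   a zero shift amount saturates instead of wrapping, so an element holding
   0x1_0000_0000 becomes 0xffffffff rather than 0x0.  */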
/* Expand the standard name sstrunc<m><n>2 for a double-width narrowing,
   like DI => SI.  We can leverage the vector narrowing fixed-point clip
   directly.  */
5326 expand_vec_double_sstrunc (rtx op_0
, rtx op_1
, machine_mode vec_mode
)
5329 rtx zero
= CONST0_RTX (Xmode
);
5330 enum unspec unspec
= UNSPEC_VNCLIP
;
5331 rtx ops
[] = {op_0
, op_1
, zero
};
5333 icode
= code_for_pred_narrow_clip_scalar (unspec
, vec_mode
);
5334 emit_vlmax_insn (icode
, BINARY_OP_VXRM_RNU
, ops
);
/* Expand the standard name ustrunc<m><n>2 for a quad-width narrowing,
   like DI => HI.  We can leverage the vector narrowing fixed-point clip
   twice (DI => SI => HI).  */
5342 expand_vec_quad_ustrunc (rtx op_0
, rtx op_1
, machine_mode vec_mode
,
5343 machine_mode double_mode
)
5345 rtx double_rtx
= gen_reg_rtx (double_mode
);
5347 expand_vec_double_ustrunc (double_rtx
, op_1
, vec_mode
);
5348 expand_vec_double_ustrunc (op_0
, double_rtx
, double_mode
);
/* Expand the standard name sstrunc<m><n>2 for a quad-width narrowing,
   like DI => HI.  We can leverage the vector narrowing fixed-point clip
   twice (DI => SI => HI).  */
5356 expand_vec_quad_sstrunc (rtx op_0
, rtx op_1
, machine_mode vec_mode
,
5357 machine_mode double_mode
)
5359 rtx double_rtx
= gen_reg_rtx (double_mode
);
5361 expand_vec_double_sstrunc (double_rtx
, op_1
, vec_mode
);
5362 expand_vec_double_sstrunc (op_0
, double_rtx
, double_mode
);
/* Expand the standard name ustrunc<m><n>2 for an oct-width narrowing,
   like DI => QI.  We can leverage the vector narrowing fixed-point clip
   three times (DI => SI => HI => QI).  */
5370 expand_vec_oct_ustrunc (rtx op_0
, rtx op_1
, machine_mode vec_mode
,
5371 machine_mode double_mode
, machine_mode quad_mode
)
5373 rtx double_rtx
= gen_reg_rtx (double_mode
);
5374 rtx quad_rtx
= gen_reg_rtx (quad_mode
);
5376 expand_vec_double_ustrunc (double_rtx
, op_1
, vec_mode
);
5377 expand_vec_double_ustrunc (quad_rtx
, double_rtx
, double_mode
);
5378 expand_vec_double_ustrunc (op_0
, quad_rtx
, quad_mode
);
/* Expand the standard name sstrunc<m><n>2 for an oct-width narrowing,
   like DI => QI.  We can leverage the vector narrowing fixed-point clip
   three times (DI => SI => HI => QI).  */
5386 expand_vec_oct_sstrunc (rtx op_0
, rtx op_1
, machine_mode vec_mode
,
5387 machine_mode double_mode
, machine_mode quad_mode
)
5389 rtx double_rtx
= gen_reg_rtx (double_mode
);
5390 rtx quad_rtx
= gen_reg_rtx (quad_mode
);
5392 expand_vec_double_sstrunc (double_rtx
, op_1
, vec_mode
);
5393 expand_vec_double_sstrunc (quad_rtx
, double_rtx
, double_mode
);
5394 expand_vec_double_sstrunc (op_0
, quad_rtx
, quad_mode
);
/* Vectorize popcount by the Wilkes-Wheeler-Gill algorithm that libgcc uses
   as well.  */
5400 expand_popcount (rtx
*ops
)
5404 machine_mode mode
= GET_MODE (dst
);
5405 scalar_mode imode
= GET_MODE_INNER (mode
);
5406 static const uint64_t m5
= 0x5555555555555555ULL
;
5407 static const uint64_t m3
= 0x3333333333333333ULL
;
5408 static const uint64_t mf
= 0x0F0F0F0F0F0F0F0FULL
;
5409 static const uint64_t m1
= 0x0101010101010101ULL
;
5411 rtx x1
= gen_reg_rtx (mode
);
5412 rtx x2
= gen_reg_rtx (mode
);
5413 rtx x3
= gen_reg_rtx (mode
);
5414 rtx x4
= gen_reg_rtx (mode
);
/* x1 = src - ((src >> 1) & 0x5555555555555555ULL);  */
5417 rtx shift1
= expand_binop (mode
, lshr_optab
, src
, GEN_INT (1), NULL
, true,
5420 rtx and1
= gen_reg_rtx (mode
);
5421 rtx ops1
[] = {and1
, shift1
, gen_int_mode (m5
, imode
)};
5422 emit_vlmax_insn (code_for_pred_scalar (AND
, mode
), riscv_vector::BINARY_OP
,
5425 x1
= expand_binop (mode
, sub_optab
, src
, and1
, NULL
, true, OPTAB_DIRECT
);
5427 /* x2 = (x1 & 0x3333333333333333ULL) + ((x1 >> 2) & 0x3333333333333333ULL);
5429 rtx and2
= gen_reg_rtx (mode
);
5430 rtx ops2
[] = {and2
, x1
, gen_int_mode (m3
, imode
)};
5431 emit_vlmax_insn (code_for_pred_scalar (AND
, mode
), riscv_vector::BINARY_OP
,
5434 rtx shift2
= expand_binop (mode
, lshr_optab
, x1
, GEN_INT (2), NULL
, true,
5437 rtx and22
= gen_reg_rtx (mode
);
5438 rtx ops22
[] = {and22
, shift2
, gen_int_mode (m3
, imode
)};
5439 emit_vlmax_insn (code_for_pred_scalar (AND
, mode
), riscv_vector::BINARY_OP
,
5442 x2
= expand_binop (mode
, add_optab
, and2
, and22
, NULL
, true, OPTAB_DIRECT
);
5444 /* x3 = (x2 + (x2 >> 4)) & 0x0f0f0f0f0f0f0f0fULL; */
5445 rtx shift3
= expand_binop (mode
, lshr_optab
, x2
, GEN_INT (4), NULL
, true,
5449 = expand_binop (mode
, add_optab
, x2
, shift3
, NULL
, true, OPTAB_DIRECT
);
5451 rtx ops3
[] = {x3
, plus3
, gen_int_mode (mf
, imode
)};
5452 emit_vlmax_insn (code_for_pred_scalar (AND
, mode
), riscv_vector::BINARY_OP
,
5455 /* dest = (x3 * 0x0101010101010101ULL) >> 56; */
5456 rtx mul4
= gen_reg_rtx (mode
);
5457 rtx ops4
[] = {mul4
, x3
, gen_int_mode (m1
, imode
)};
5458 emit_vlmax_insn (code_for_pred_scalar (MULT
, mode
), riscv_vector::BINARY_OP
,
5461 x4
= expand_binop (mode
, lshr_optab
, mul4
,
5462 GEN_INT (GET_MODE_BITSIZE (imode
) - 8), NULL
, true,
5465 emit_move_insn (dst
, x4
);
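/* For reference, the scalar equivalent of the sequence emitted above (an
   illustrative sketch; the vector expansion performs the same steps
   element-wise):

     #include <cstdint>

     uint64_t
     popcount64 (uint64_t x)
     {
       x -= (x >> 1) & 0x5555555555555555ULL;                           /* x1 */
       x = (x & 0x3333333333333333ULL)
           + ((x >> 2) & 0x3333333333333333ULL);                        /* x2 */
       x = (x + (x >> 4)) & 0x0F0F0F0F0F0F0F0FULL;                      /* x3 */
       return (x * 0x0101010101010101ULL) >> 56;                        /* x4 = dest */
     }
*/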
5468 /* Return true if it is VLMAX AVL TYPE. */
5470 vlmax_avl_type_p (rtx_insn
*rinsn
)
5472 extract_insn_cached (rinsn
);
5473 int index
= get_attr_avl_type_idx (rinsn
);
5474 if (index
== INVALID_ATTRIBUTE
)
5477 gcc_assert (index
< recog_data
.n_operands
);
5479 rtx avl_type
= recog_data
.operand
[index
];
5480 return INTVAL (avl_type
) == VLMAX
;
/* Return true if it is an RVV instruction that depends on the VL global
   status.  */
5486 has_vl_op (rtx_insn
*rinsn
)
5488 return recog_memoized (rinsn
) >= 0 && get_attr_has_vl_op (rinsn
);
5491 /* Get default tail policy. */
/* For instructions that don't require TA, we still need a default value
   to emit a vsetvl.  We pick the default value according to the preferred
   policy.  */
5497 return (bool) (get_prefer_tail_policy () & 0x1
5498 || (get_prefer_tail_policy () >> 1 & 0x1));
5501 /* Helper function to get TA operand. */
5503 tail_agnostic_p (rtx_insn
*rinsn
)
5505 /* If it doesn't have TA, we return agnostic by default. */
5506 extract_insn_cached (rinsn
);
5507 int ta
= get_attr_ta (rinsn
);
5508 return ta
== INVALID_ATTRIBUTE
? get_default_ta () : IS_AGNOSTIC (ta
);
/* Change the insn and assert that the change succeeds.  */
5513 validate_change_or_fail (rtx object
, rtx
*loc
, rtx new_rtx
, bool in_group
)
5515 bool change_p
= validate_change (object
, loc
, new_rtx
, in_group
);
5516 gcc_assert (change_p
);
5519 /* Return true if it is NONVLMAX AVL TYPE. */
5521 nonvlmax_avl_type_p (rtx_insn
*rinsn
)
5523 extract_insn_cached (rinsn
);
5524 int index
= get_attr_avl_type_idx (rinsn
);
5525 if (index
== INVALID_ATTRIBUTE
)
5528 gcc_assert (index
< recog_data
.n_operands
);
5530 rtx avl_type
= recog_data
.operand
[index
];
5531 return INTVAL (avl_type
) == NONVLMAX
;
5534 /* Return true if RTX is RVV VLMAX AVL. */
5538 return x
&& rtx_equal_p (x
, RVV_VLMAX
);
5541 /* Helper function to get SEW operand. We always have SEW value for
5542 all RVV instructions that have VTYPE OP. */
5544 get_sew (rtx_insn
*rinsn
)
5546 return get_attr_sew (rinsn
);
5549 /* Helper function to get VLMUL operand. We always have VLMUL value for
5550 all RVV instructions that have VTYPE OP. */
5552 get_vlmul (rtx_insn
*rinsn
)
5554 return (enum vlmul_type
) get_attr_vlmul (rinsn
);
/* Count the number of occurrences of REGNO in RINSN.  */
5559 count_regno_occurrences (rtx_insn
*rinsn
, unsigned int regno
)
5562 extract_insn (rinsn
);
5563 for (int i
= 0; i
< recog_data
.n_operands
; i
++)
5564 if (refers_to_regno_p (regno
, recog_data
.operand
[i
]))
/* Return true if OP can be broadcast directly.  */
5571 can_be_broadcasted_p (rtx op
)
5573 machine_mode mode
= GET_MODE (op
);
/* We don't allow RA (register allocation) reload to generate
   (vec_duplicate:DI reg) on an RV32 system, whereas we do allow
   (vec_duplicate:DI mem) there.  */
5577 if (!can_create_pseudo_p () && !FLOAT_MODE_P (mode
)
5578 && maybe_gt (GET_MODE_SIZE (mode
), GET_MODE_SIZE (Pmode
))
5579 && !satisfies_constraint_Wdm (op
))
5582 if (satisfies_constraint_K (op
) || register_operand (op
, mode
)
5583 || satisfies_constraint_Wdm (op
) || rtx_equal_p (op
, CONST0_RTX (mode
)))
5586 return can_create_pseudo_p () && nonmemory_operand (op
, mode
);
5590 emit_vec_extract (rtx target
, rtx src
, rtx index
)
5592 machine_mode vmode
= GET_MODE (src
);
5593 machine_mode smode
= GET_MODE (target
);
5594 class expand_operand ops
[3];
5595 enum insn_code icode
5596 = convert_optab_handler (vec_extract_optab
, vmode
, smode
);
5597 gcc_assert (icode
!= CODE_FOR_nothing
);
5598 create_output_operand (&ops
[0], target
, smode
);
5600 create_input_operand (&ops
[1], src
, vmode
);
5603 if (poly_int_rtx_p (index
, &val
))
5604 create_integer_operand (&ops
[2], val
);
5606 create_input_operand (&ops
[2], index
, Pmode
);
5608 expand_insn (icode
, 3, ops
);
5609 if (ops
[0].value
!= target
)
5610 emit_move_insn (target
, ops
[0].value
);
/* Return true if the offset mode is a valid mode that we can use for
   gather/scatter auto-vectorization.  */
5616 gather_scatter_valid_offset_p (machine_mode mode
)
5618 /* If the element size of offset mode is already >= Pmode size,
5619 we don't need any extensions. */
5620 if (known_ge (GET_MODE_SIZE (GET_MODE_INNER (mode
)), UNITS_PER_WORD
))
/* Since we will very likely need to extend the offset mode into a vector
   of Pmode elements, disable gather/scatter auto-vectorization if that
   extension is not possible.  */
5626 if (!get_vector_mode (Pmode
, GET_MODE_NUNITS (mode
)).exists ())
5631 /* Implement TARGET_ESTIMATED_POLY_VALUE.
5632 Look into the tuning structure for an estimate.
5633 KIND specifies the type of requested estimate: min, max or likely.
5634 For cores with a known VLA width all three estimates are the same.
5635 For generic VLA tuning we want to distinguish the maximum estimate from
5636 the minimum and likely ones.
5637 The likely estimate is the same as the minimum in that case to give a
5638 conservative behavior of auto-vectorizing with VLA when it is a win
5639 even for VLA vectorization.
5640 When VLA width information is available VAL.coeffs[1] is multiplied by
5641 the number of VLA chunks over the initial VLS bits. */
5643 estimated_poly_value (poly_int64 val
, unsigned int kind
)
5645 unsigned int width_source
5646 = BITS_PER_RISCV_VECTOR
.is_constant ()
5647 ? (unsigned int) BITS_PER_RISCV_VECTOR
.to_constant ()
5648 : (unsigned int) RVV_VECTOR_BITS_SCALABLE
;
5650 /* If there is no core-specific information then the minimum and likely
5651 values are based on TARGET_MIN_VLEN vectors and the maximum is based on
5652 the architectural maximum of 65536 bits. */
5653 unsigned int min_vlen_bytes
= TARGET_MIN_VLEN
/ 8 - 1;
5654 if (width_source
== RVV_VECTOR_BITS_SCALABLE
)
5657 case POLY_VALUE_MIN
:
5658 case POLY_VALUE_LIKELY
:
5659 return val
.coeffs
[0];
5661 case POLY_VALUE_MAX
:
5662 return val
.coeffs
[0] + val
.coeffs
[1] * min_vlen_bytes
;
5665 /* Allow BITS_PER_RISCV_VECTOR to be a bitmask of different VL, treating the
5666 lowest as likely. This could be made more general if future -mtune
5667 options need it to be. */
5668 if (kind
== POLY_VALUE_MAX
)
5669 width_source
= 1 << floor_log2 (width_source
);
5671 width_source
= least_bit_hwi (width_source
);
5673 /* If the core provides width information, use that. */
5674 HOST_WIDE_INT over_min_vlen
= width_source
- TARGET_MIN_VLEN
;
5675 return val
.coeffs
[0] + val
.coeffs
[1] * over_min_vlen
/ TARGET_MIN_VLEN
;
/* Return true if it is a whole register-to-register move.  */
5680 whole_reg_to_reg_move_p (rtx
*ops
, machine_mode mode
, int avl_type_index
)
/* An operation is a whole-register move if either
   (1) its avl_type operand equals VLMAX, or
   (2) its vl operand equals the number of units of its mode.  */
5685 if (register_operand (ops
[0], mode
)
5686 && register_operand (ops
[3], mode
)
5687 && satisfies_constraint_vu (ops
[2])
5688 && satisfies_constraint_Wc1 (ops
[1]))
5690 if (INTVAL (ops
[avl_type_index
]) == VLMAX
)
/* The AVL propagation pass will transform FIXED-VLMAX with NUNITS < 32
   into NON-VLMAX with LEN = NUNITS.  */
5694 else if (CONST_INT_P (ops
[4])
5695 && known_eq (INTVAL (ops
[4]), GET_MODE_NUNITS (mode
)))
5701 /* Return true if we can transform vmv.v.x/vfmv.v.f to vmv.s.x/vfmv.s.f. */
5703 splat_to_scalar_move_p (rtx
*ops
)
5705 return satisfies_constraint_Wc1 (ops
[1])
5706 && satisfies_constraint_vu (ops
[2])
5708 && satisfies_constraint_k01 (ops
[4])
5709 && INTVAL (ops
[7]) == NONVLMAX
5710 && known_ge (GET_MODE_SIZE (Pmode
), GET_MODE_SIZE (GET_MODE (ops
[3])));
5713 } // namespace riscv_vector