/* Cost model implementation for RISC-V 'V' Extension for GNU compiler.
   Copyright (C) 2023-2025 Free Software Foundation, Inc.
   Contributed by Juzhe Zhong (juzhe.zhong@rivai.ai), RiVAI Technologies Ltd.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.

GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#define IN_TARGET_CODE 1

#define INCLUDE_STRING
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "basic-block.h"
#include "targhooks.h"
#include "fold-const.h"
#include "tree-vectorizer.h"
#include "gimple-iterator.h"
#include "tree-data-ref.h"
#include "tree-ssa-loop-niter.h"
#include "tree-hash-traits.h"

/* This file should be included last.  */
#include "riscv-vector-costs.h"
namespace riscv_vector {
/* Dynamic LMUL philosophy - Local linear-scan SSA live range based analysis.

   - Collect all vectorized STMTs locally for each loop block.
   - Build a program point based graph, ignoring non-vectorized STMTs:

	vectorize STMT 0 - point 0
	scalar STMT 0    - ignore.
	vectorize STMT 1 - point 1
   - Compute the number of live V_REGs at each program point.
   - Determine the LMUL in the VECTOR COST model according to the program
     point which has the maximum number of live V_REGs.

   Note:

   - BIGGEST_MODE is the biggest LMUL auto-vectorization element mode.
     It's important for mixed-size auto-vectorization (conversions, ... etc).
     E.g. for a loop vectorizing a conversion of INT32 -> INT64,
     the biggest mode is DImode with LMUL = 8, and LMUL = 4 for SImode.
     We compute the number of live V_REGs at each program point according to
     this information.
   - We only compute program points and live ranges locally (within a block)
     since we just need the number of live V_REGs at each program point and
     we are not really allocating the registers for each SSA name.
     A variable simply gets another local live range in another block if it
     is live out of/into that block.  Such an approach doesn't affect the
     accuracy of the live range analysis.
   - The current analysis doesn't consider any instruction scheduling which
     might reduce the register pressure, so the analysis is conservative and
     may end up with a smaller LMUL.
     TODO: Maybe we could support a reasonable live range shrinking algorithm
     that takes advantage of instruction scheduling.
   - The analysis may walk the following sequences of autovec modes:

	1. M8 -> M4 -> M2 -> M1 (stop analysis here) -> MF2 -> MF4 -> MF8
	2. M8 -> M1(M4) -> MF2(M2) -> MF4(M1) (stop analysis here) -> MF8(MF2)
	3. M1(M8) -> MF2(M4) -> MF4(M2) -> MF8(M1)  */
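
/* For instance, the 'V' extension provides 32 vector registers (v0-v31).
   If the analysis at a candidate LMUL finds a program point where more
   than 32 V_REGs are live at once, that LMUL is assumed to cause register
   spilling and a smaller LMUL from the sequences above is preferred.  */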
static bool
is_gimple_assign_or_call (gimple *stmt)
{
  return is_gimple_assign (stmt) || is_gimple_call (stmt);
}
/* Return the program point of the 1st vectorized lanes statement.  */
static unsigned int
get_first_lane_point (const vec<stmt_point> program_points,
                      stmt_vec_info stmt_info)
{
  for (const auto program_point : program_points)
    if (program_point.stmt_info == DR_GROUP_FIRST_ELEMENT (stmt_info))
      return program_point.point;
  return 0;
}
/* Return the program point of the last vectorized lanes statement.  */
static unsigned int
get_last_lane_point (const vec<stmt_point> program_points,
                     stmt_vec_info stmt_info)
{
  unsigned int max_point = 0;
  for (auto s = DR_GROUP_FIRST_ELEMENT (stmt_info); s != NULL;
       s = DR_GROUP_NEXT_ELEMENT (s))
    {
      for (const auto program_point : program_points)
        if (program_point.stmt_info == s && program_point.point > max_point)
          max_point = program_point.point;
    }
  return max_point;
}
/* Return the last variable that is in the live range list.  */
static pair *
get_live_range (hash_map<tree, pair> *live_ranges, tree arg)
{
  auto *r = live_ranges->get (arg);
  if (r)
    return r;

  tree t = arg;
  gimple *def_stmt = NULL;
  while (t && TREE_CODE (t) == SSA_NAME && !r
         && (def_stmt = SSA_NAME_DEF_STMT (t)))
    {
      if (gimple_assign_cast_p (def_stmt))
        {
          t = gimple_assign_rhs1 (def_stmt);
          r = live_ranges->get (t);
        }
      else
        /* FIXME: Currently we don't see any fold for
           non-conversion statements.  */
        break;
    }

  if (r)
    return r;
  bool insert_p = live_ranges->put (arg, pair (0, 0));
  gcc_assert (!insert_p);
  return live_ranges->get (arg);
}
/* Collect all STMTs that are vectorized and compute their program points.
   Note that we don't care about the STMTs that are not vectorized and
   we only build the local graph (within a block) of program points.

   E.g.
     bb 2:
       STMT 1 (be vectorized)      -- point 0
       STMT 2 (not be vectorized)  -- ignored
       STMT 3 (be vectorized)      -- point 1
       STMT 4 (be vectorized)      -- point 2
       STMT 5 (be vectorized)      -- point 3
     bb 3:
       STMT 1 (be vectorized)      -- point 0
       STMT 2 (be vectorized)      -- point 1
       STMT 3 (not be vectorized)  -- ignored
       STMT 4 (not be vectorized)  -- ignored
       STMT 5 (be vectorized)      -- point 2  */
static void
compute_local_program_points (
  vec_info *vinfo,
  hash_map<basic_block, vec<stmt_point>> &program_points_per_bb)
{
  if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo))
    {
      class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
      basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
      unsigned int nbbs = loop->num_nodes;
      gimple_stmt_iterator si;
      unsigned int i;
      /* Collect the stmts that are vectorized and mark their program
         point.  */
      for (i = 0; i < nbbs; i++)
        {
          unsigned int point = 1;
          basic_block bb = bbs[i];
          vec<stmt_point> program_points = vNULL;
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "Compute local program points for bb %d:\n",
                             bb->index);
          for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
            {
              if (!is_gimple_assign_or_call (gsi_stmt (si)))
                continue;
              stmt_vec_info stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
              enum stmt_vec_info_type type
                = STMT_VINFO_TYPE (vect_stmt_to_vectorize (stmt_info));
              if (type != undef_vec_info_type)
                {
                  stmt_point info = {point, gsi_stmt (si), stmt_info};
                  program_points.safe_push (info);
                  point++;
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_NOTE, vect_location,
                                     "program point %d: %G", info.point,
                                     gsi_stmt (si));
                }
            }
          program_points_per_bb.put (bb, program_points);
        }
    }
}
static machine_mode
get_biggest_mode (machine_mode mode1, machine_mode mode2)
{
  unsigned int mode1_size = GET_MODE_BITSIZE (mode1).to_constant ();
  unsigned int mode2_size = GET_MODE_BITSIZE (mode2).to_constant ();
  return mode1_size >= mode2_size ? mode1 : mode2;
}
/* Return true if OP is loop invariant.  */
static bool
loop_invariant_op_p (class loop *loop, tree op)
{
  if (is_gimple_constant (op))
    return true;
  if (SSA_NAME_IS_DEFAULT_DEF (op)
      || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (op))))
    return true;
  return false;
}
/* Return true if the variable should be counted into liveness.  */
static bool
variable_vectorized_p (class loop *loop, stmt_vec_info stmt_info, tree var,
                       bool lhs_p)
{
  if (!var)
    return false;
  gimple *stmt = STMT_VINFO_STMT (stmt_info);
  enum stmt_vec_info_type type
    = STMT_VINFO_TYPE (vect_stmt_to_vectorize (stmt_info));
  if (is_gimple_call (stmt) && gimple_call_internal_p (stmt))
    {
      if (gimple_call_internal_fn (stmt) == IFN_MASK_STORE
          || gimple_call_internal_fn (stmt) == IFN_MASK_LOAD)
        {
          /* .MASK_LOAD (_5, 32B, _33)

             Only the 3rd argument will be vectorized and consume
             a vector register.  */
          if (TREE_CODE (TREE_TYPE (var)) == BOOLEAN_TYPE
              || (is_gimple_reg (var) && !POINTER_TYPE_P (TREE_TYPE (var))))
            return true;
          else
            return false;
        }
    }
  else if (is_gimple_assign (stmt))
    {
      tree_code tcode = gimple_assign_rhs_code (stmt);
      /* vi variant doesn't need to allocate such statement.
         E.g. tmp_15 = _4 + 1; will be transformed into vadd.vi
         so the INTEGER_CST '1' doesn't need a vector register.  */
      switch (tcode)
        {
        case PLUS_EXPR:
        case BIT_IOR_EXPR:
        case BIT_XOR_EXPR:
        case BIT_AND_EXPR:
          return TREE_CODE (var) != INTEGER_CST
                 || !tree_fits_shwi_p (var)
                 || !IN_RANGE (tree_to_shwi (var), -16, 15);
        case MINUS_EXPR:
          return TREE_CODE (var) != INTEGER_CST
                 || !tree_fits_shwi_p (var)
                 || !IN_RANGE (tree_to_shwi (var), -16, 15)
                 || gimple_assign_rhs1 (stmt) != var;
        case LSHIFT_EXPR:
        case RSHIFT_EXPR:
          return gimple_assign_rhs2 (stmt) != var
                 || !loop_invariant_op_p (loop, var);
        default:
          break;
        }
    }
  if (lhs_p)
    return is_gimple_reg (var)
           && (!POINTER_TYPE_P (TREE_TYPE (var))
               || type != store_vec_info_type);
  else
    return poly_int_tree_p (var)
           || (is_gimple_val (var)
               && (!POINTER_TYPE_P (TREE_TYPE (var))
                   || type != load_vec_info_type));
}
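
/* Note: the [-16, 15] range checked above corresponds to the 5-bit signed
   immediate of the RVV .vi instruction forms (e.g. vadd.vi), so only such
   small constants avoid occupying a vector register; a larger constant has
   to be splatted into a vector register first.  */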
/* Compute local live ranges of each vectorized variable.
   Note that we only compute local live ranges (within a block) since
   local live ranges information is accurate enough for us to determine
   the LMUL/vectorization factor of the loop.

   E.g.
     bb 2:
       STMT 1               -- point 0
       STMT 2 (def SSA 1)   -- point 1
       STMT 3 (use SSA 1)   -- point 2
       STMT 4               -- point 3
     bb 3:
       STMT 1               -- point 0
       STMT 2               -- point 1
       STMT 3               -- point 2
       STMT 4 (use SSA 2)   -- point 3
       STMT 5               -- point 4

   The live range of SSA 1 is [1, 3] in bb 2.
   The live range of SSA 2 is [0, 4] in bb 3.  */
static machine_mode
compute_local_live_ranges (
  loop_vec_info loop_vinfo,
  const hash_map<basic_block, vec<stmt_point>> &program_points_per_bb,
  hash_map<basic_block, hash_map<tree, pair>> &live_ranges_per_bb)
{
  machine_mode biggest_mode = QImode;
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  if (!program_points_per_bb.is_empty ())
    {
      auto_vec<tree> visited_vars;
      unsigned int i;
      for (hash_map<basic_block, vec<stmt_point>>::iterator iter
           = program_points_per_bb.begin ();
           iter != program_points_per_bb.end (); ++iter)
        {
          basic_block bb = (*iter).first;
          vec<stmt_point> program_points = (*iter).second;
          bool existed_p = false;
          hash_map<tree, pair> *live_ranges
            = &live_ranges_per_bb.get_or_insert (bb, &existed_p);
          gcc_assert (!existed_p);
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "Compute local live ranges for bb %d:\n",
                             bb->index);
          for (const auto program_point : program_points)
            {
              unsigned int point = program_point.point;
              gimple *stmt = program_point.stmt;
              tree lhs = gimple_get_lhs (stmt);
              if (variable_vectorized_p (loop, program_point.stmt_info, lhs,
                                         true))
                {
                  biggest_mode
                    = get_biggest_mode (biggest_mode,
                                        TYPE_MODE (TREE_TYPE (lhs)));
                  bool existed_p = false;
                  pair &live_range
                    = live_ranges->get_or_insert (lhs, &existed_p);
                  gcc_assert (!existed_p);
                  if (STMT_VINFO_MEMORY_ACCESS_TYPE (program_point.stmt_info)
                      == VMAT_LOAD_STORE_LANES)
                    point = get_first_lane_point (program_points,
                                                  program_point.stmt_info);
                  live_range = pair (point, point);
                }
              for (i = 0; i < gimple_num_args (stmt); i++)
                {
                  tree var = gimple_arg (stmt, i);
                  if (variable_vectorized_p (loop, program_point.stmt_info,
                                             var, false))
                    {
                      biggest_mode
                        = get_biggest_mode (biggest_mode,
                                            TYPE_MODE (TREE_TYPE (var)));
                      bool existed_p = false;
                      pair &live_range
                        = live_ranges->get_or_insert (var, &existed_p);
                      if (STMT_VINFO_MEMORY_ACCESS_TYPE (
                            program_point.stmt_info)
                          == VMAT_LOAD_STORE_LANES)
                        point = get_last_lane_point (program_points,
                                                     program_point.stmt_info);
                      else if (existed_p)
                        point = MAX (live_range.second, point);
                      if (existed_p)
                        /* We will grow the live range for each use.  */
                        live_range = pair (live_range.first, point);
                      else
                        {
                          gimple *def_stmt;
                          if (TREE_CODE (var) == SSA_NAME
                              && (def_stmt = SSA_NAME_DEF_STMT (var))
                              && gimple_bb (def_stmt) == bb
                              && is_gimple_assign_or_call (def_stmt))
                            {
                              live_ranges->remove (var);
                              for (unsigned int j = 0;
                                   j < gimple_num_args (def_stmt); j++)
                                {
                                  tree arg = gimple_arg (def_stmt, j);
                                  auto *r = get_live_range (live_ranges, arg);
                                  gcc_assert (r);
                                  (*r).second = MAX (point, (*r).second);
                                  biggest_mode = get_biggest_mode (
                                    biggest_mode, TYPE_MODE (TREE_TYPE (arg)));
                                }
                            }
                          else
                            /* The splat vector lives the whole block.  */
                            live_range = pair (0, program_points.length ());
                        }
                    }
                }
            }
          if (dump_enabled_p ())
            for (hash_map<tree, pair>::iterator iter = live_ranges->begin ();
                 iter != live_ranges->end (); ++iter)
              dump_printf_loc (MSG_NOTE, vect_location,
                               "%T: type = %T, start = %d, end = %d\n",
                               (*iter).first, TREE_TYPE ((*iter).first),
                               (*iter).second.first, (*iter).second.second);
        }
    }
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "Biggest mode = %s\n",
                     GET_MODE_NAME (biggest_mode));
  return biggest_mode;
}
/* Compute the number of V_REGs that a variable of mode MODE consumes at
   the given LMUL, relative to BIGGEST_MODE.

   E.g. if mode = SImode, biggest_mode = DImode and LMUL = M4, then an
   SImode variable is effectively vectorized with LMUL = 2 and consumes
   2 V_REGs.  */
static unsigned int
compute_nregs_for_mode (loop_vec_info loop_vinfo, machine_mode mode,
                        machine_mode biggest_mode, int lmul)
{
  unsigned int rgroup_size = LOOP_VINFO_LENS (loop_vinfo).is_empty ()
                               ? 1
                               : LOOP_VINFO_LENS (loop_vinfo).length ();
  unsigned int mode_size = GET_MODE_SIZE (mode).to_constant ();
  unsigned int biggest_size = GET_MODE_SIZE (biggest_mode).to_constant ();
  gcc_assert (biggest_size >= mode_size);
  unsigned int ratio = biggest_size / mode_size;
  /* RVV mask bool modes always consume 1 vector register regardless of
     LMUL.  */
  unsigned int nregs = mode == BImode ? 1 : lmul / ratio;
  return MAX (nregs, 1) * rgroup_size;
}
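
/* Worked example of the computation above: at lmul = 8 with
   biggest_mode = DImode, an HImode variable has ratio = 4 and consumes
   8 / 4 = 2 V_REGs, while a BImode mask always counts as 1; the result
   is then multiplied by the number of rgroups when loop lengths are
   in use.  */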
/* This function helps to determine whether the current LMUL will cause
   potential vector register (V_REG) spillings according to live range
   information.

     - First, compute how many variables are alive at each program point
       in each bb of the loop.
     - Second, compute how many V_REGs are alive at each program point
       in each bb of the loop according to BIGGEST_MODE and the variable
       modes.
     - Third, return the maximum number of V_REGs alive in the loop.  */
static unsigned int
max_number_of_live_regs (loop_vec_info loop_vinfo, const basic_block bb,
                         const hash_map<tree, pair> &live_ranges,
                         unsigned int max_point, machine_mode biggest_mode,
                         int lmul)
{
  unsigned int max_nregs = 0;
  unsigned int i;
  unsigned int live_point = 0;
  auto_vec<unsigned int> live_vars_vec;
  live_vars_vec.safe_grow_cleared (max_point, true);
  for (hash_map<tree, pair>::iterator iter = live_ranges.begin ();
       iter != live_ranges.end (); ++iter)
    {
      tree var = (*iter).first;
      pair live_range = (*iter).second;
      for (i = live_range.first + 1; i <= live_range.second; i++)
        {
          machine_mode mode;
          if (TREE_CODE (TREE_TYPE (var)) == BOOLEAN_TYPE)
            mode = BImode;
          /* Constants do not have a mode, just use the biggest so
             compute_nregs will return 1.  */
          else if (TREE_CODE (var) == INTEGER_CST)
            mode = biggest_mode;
          else
            mode = TYPE_MODE (TREE_TYPE (var));
          unsigned int nregs
            = compute_nregs_for_mode (loop_vinfo, mode, biggest_mode, lmul);
          live_vars_vec[i] += nregs;
          if (live_vars_vec[i] > max_nregs)
            {
              max_nregs = live_vars_vec[i];
              live_point = i;
            }
        }
    }

  /* Collect user explicit RVV type.  */
  auto_vec<basic_block> all_preds
    = get_all_dominated_blocks (CDI_POST_DOMINATORS, bb);
  tree t;
  FOR_EACH_SSA_NAME (i, t, cfun)
    {
      machine_mode mode = TYPE_MODE (TREE_TYPE (t));
      if (!lookup_vector_type_attribute (TREE_TYPE (t))
          && !riscv_v_ext_vls_mode_p (mode))
        continue;

      gimple *def = SSA_NAME_DEF_STMT (t);
      if (gimple_bb (def) && !all_preds.contains (gimple_bb (def)))
        continue;
      imm_use_iterator iterator;
      use_operand_p use_p;

      FOR_EACH_IMM_USE_FAST (use_p, iterator, t)
        {
          if (!USE_STMT (use_p) || is_gimple_debug (USE_STMT (use_p))
              || !dominated_by_p (CDI_POST_DOMINATORS, bb,
                                  gimple_bb (USE_STMT (use_p))))
            continue;

          int regno_alignment = riscv_get_v_regno_alignment (mode);
          max_nregs += regno_alignment;
          if (dump_enabled_p ())
            dump_printf_loc (
              MSG_NOTE, vect_location,
              "Explicit used SSA %T, vectype = %T, mode = %s, cause %d "
              "V_REG live in bb %d at program point %d\n",
              t, TREE_TYPE (t), GET_MODE_NAME (mode), regno_alignment,
              bb->index, live_point);
          break;
        }
    }

  if (dump_enabled_p ())
    dump_printf_loc (
      MSG_NOTE, vect_location,
      "Maximum lmul = %d, At most %d number of live V_REG at program "
      "point %d for bb %d\n",
      lmul, max_nregs, live_point, bb->index);
  return max_nregs;
}
/* Get STORE value.  */
static tree
get_store_value (gimple *stmt)
{
  if (is_gimple_call (stmt) && gimple_call_internal_p (stmt))
    {
      if (gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
        return gimple_call_arg (stmt, 3);
      else
        gcc_unreachable ();
    }
  else
    return gimple_assign_rhs1 (stmt);
}
/* Return true if additional vector vars are needed.  */
static bool
need_additional_vector_vars_p (stmt_vec_info stmt_info)
{
  enum stmt_vec_info_type type
    = STMT_VINFO_TYPE (vect_stmt_to_vectorize (stmt_info));
  if (type == load_vec_info_type || type == store_vec_info_type)
    {
      if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
          && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER)
        return true;

      machine_mode mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
      int lmul = riscv_get_v_regno_alignment (mode);
      if (DR_GROUP_SIZE (stmt_info) * lmul > RVV_M8)
        return true;
    }
  return false;
}
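
/* Note: RVV segment loads/stores are limited to NF * LMUL <= 8, so a group
   whose DR_GROUP_SIZE times LMUL exceeds M8 cannot be handled by a single
   segment access and will instead be costed with additional permute
   variables (see update_local_live_ranges below).  */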
/* Return the LMUL of the current analysis.  */
static int
compute_estimated_lmul (loop_vec_info loop_vinfo, machine_mode mode)
{
  gcc_assert (GET_MODE_BITSIZE (mode).is_constant ());
  int regno_alignment = riscv_get_v_regno_alignment (loop_vinfo->vector_mode);
  if (riscv_v_ext_vls_mode_p (loop_vinfo->vector_mode))
    return regno_alignment;
  else if (known_eq (LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo), 1U))
    {
      int estimated_vf = vect_vf_for_cost (loop_vinfo);
      int estimated_lmul = estimated_vf * GET_MODE_BITSIZE (mode).to_constant ()
                           / TARGET_MIN_VLEN;
      if (estimated_lmul > RVV_M8)
        return regno_alignment;
      else
        return estimated_lmul;
    }
  else
    {
      /* Estimate the VLA SLP LMUL.  */
      if (regno_alignment > RVV_M1)
        return regno_alignment;
      else if (mode != QImode
               || LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo).is_constant ())
        {
          int ratio;
          if (can_div_trunc_p (BYTES_PER_RISCV_VECTOR,
                               GET_MODE_SIZE (loop_vinfo->vector_mode), &ratio))
            {
              if (ratio == 1)
                return RVV_M4;
              else if (ratio == 2)
                return RVV_M2;
            }
        }
    }
  return RVV_M1;
}
/* Update the live ranges according to the PHI nodes.

   E.g.
     bb 2:
       ...
       STMT 2 (def SSA 1)   -- point 1
       STMT 3 (use SSA 1)   -- point 2
       ...
     bb 3:
       ...
       STMT 3 (use SSA 2)   -- point 2
       ...

   Before this function, the SSA 1 live range is [2, 3] in bb 2
   and SSA 2 is [0, 3] in bb 3.

   Then, after this function, we update the SSA 1 live range in bb 2
   into [2, 4] since SSA 1 is live out into bb 3.  */
static void
update_local_live_ranges (
  vec_info *vinfo,
  hash_map<basic_block, vec<stmt_point>> &program_points_per_bb,
  hash_map<basic_block, hash_map<tree, pair>> &live_ranges_per_bb,
  machine_mode *biggest_mode)
{
  loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo);
  if (!loop_vinfo)
    return;

  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  unsigned int nbbs = loop->num_nodes;
  unsigned int i, j;
  gphi_iterator psi;
  gimple_stmt_iterator si;
  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "Update local program points for bb %d:\n",
                         bb->index);
      for (psi = gsi_start_phis (bb); !gsi_end_p (psi); gsi_next (&psi))
        {
          gphi *phi = psi.phi ();
          stmt_vec_info stmt_info = vinfo->lookup_stmt (phi);
          if (STMT_VINFO_TYPE (vect_stmt_to_vectorize (stmt_info))
              == undef_vec_info_type)
            continue;

          for (j = 0; j < gimple_phi_num_args (phi); j++)
            {
              edge e = gimple_phi_arg_edge (phi, j);
              tree def = gimple_phi_arg_def (phi, j);
              auto *live_ranges = live_ranges_per_bb.get (bb);
              auto *live_range = live_ranges->get (def);
              if (poly_int_tree_p (def))
                {
                  /* Insert live range of INTEGER_CST or POLY_CST since we will
                     need to allocate a vector register for it.

                     E.g. # j_17 = PHI <j_11(9), 0(5)> will be transformed
                     into # vect_vec_iv_.8_24 = PHI <_25(9), { 0, ... }(5)>

                     The live range for such value is short which only lives
                     from program point 0 to 1.  */
                  if (live_range)
                    {
                      unsigned int start = (*live_range).first;
                      (*live_range).first = 0;
                      if (dump_enabled_p ())
                        dump_printf_loc (
                          MSG_NOTE, vect_location,
                          "Update %T start point from %d to 0:\n", def, start);
                    }
                  else
                    {
                      live_ranges->put (def, pair (0, 1));
                      auto &program_points = (*program_points_per_bb.get (bb));
                      if (program_points.is_empty ())
                        {
                          stmt_point info = {1, phi, stmt_info};
                          program_points.safe_push (info);
                        }
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_NOTE, vect_location,
                                         "Add %T start point from 0 to 1:\n",
                                         def);
                    }
                  continue;
                }
              if (live_range && flow_bb_inside_loop_p (loop, e->src))
                {
                  unsigned int start = (*live_range).first;
                  (*live_range).first = 0;
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_NOTE, vect_location,
                                     "Update %T start point from %d to %d:\n",
                                     def, start, (*live_range).first);
                }
              live_ranges = live_ranges_per_bb.get (e->src);
              if (!program_points_per_bb.get (e->src))
                continue;
              unsigned int max_point
                = (*program_points_per_bb.get (e->src)).length ();
              live_range = live_ranges->get (def);
              if (!live_range)
                continue;

              unsigned int end = (*live_range).second;
              (*live_range).second = max_point;
              if (dump_enabled_p ())
                dump_printf_loc (MSG_NOTE, vect_location,
                                 "Update %T end point from %d to %d:\n", def,
                                 end, (*live_range).second);
            }
        }
      for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
        {
          if (!is_gimple_assign_or_call (gsi_stmt (si)))
            continue;
          stmt_vec_info stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
          enum stmt_vec_info_type type
            = STMT_VINFO_TYPE (vect_stmt_to_vectorize (stmt_info));
          if (need_additional_vector_vars_p (stmt_info))
            {
              /* For non-adjacent load/store STMT, we will potentially
                 convert it into:

                   1. MASK_LEN_GATHER_LOAD (..., perm indice).
                   2. Contiguous load/store + VEC_PERM (..., perm indice)

                 We will be likely using one more vector variable.  */
              unsigned int max_point
                = (*program_points_per_bb.get (bb)).length ();
              auto *live_ranges = live_ranges_per_bb.get (bb);
              bool existed_p = false;
              tree var = type == load_vec_info_type
                           ? gimple_get_lhs (gsi_stmt (si))
                           : get_store_value (gsi_stmt (si));
              tree sel_type = build_nonstandard_integer_type (
                TYPE_PRECISION (TREE_TYPE (var)), 1);
              *biggest_mode
                = get_biggest_mode (*biggest_mode, TYPE_MODE (sel_type));
              tree sel = build_decl (UNKNOWN_LOCATION, VAR_DECL,
                                     get_identifier ("vect_perm"), sel_type);
              pair &live_range = live_ranges->get_or_insert (sel, &existed_p);
              gcc_assert (!existed_p);
              live_range = pair (0, max_point);
              if (dump_enabled_p ())
                dump_printf_loc (MSG_NOTE, vect_location,
                                 "Add perm indice %T, start = 0, end = %d\n",
                                 sel, max_point);
              if (!LOOP_VINFO_LENS (loop_vinfo).is_empty ()
                  && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
                {
                  /* If we are vectorizing a permutation when the rgroup number
                     > 1, we will need an additional mask to shuffle the second
                     vector.  */
                  tree mask = build_decl (UNKNOWN_LOCATION, VAR_DECL,
                                          get_identifier ("vect_perm_mask"),
                                          boolean_type_node);
                  pair &live_range
                    = live_ranges->get_or_insert (mask, &existed_p);
                  gcc_assert (!existed_p);
                  live_range = pair (0, max_point);
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_NOTE, vect_location,
                                     "Add perm mask %T, start = 0, end = %d\n",
                                     mask, max_point);
                }
            }
        }
    }
}
/* Compute the maximum live V_REGS.  */
static bool
has_unexpected_spills_p (loop_vec_info loop_vinfo)
{
  /* Compute local program points.
     It's a fast and effective computation.  */
  hash_map<basic_block, vec<stmt_point>> program_points_per_bb;
  compute_local_program_points (loop_vinfo, program_points_per_bb);

  /* Compute local live ranges.  */
  hash_map<basic_block, hash_map<tree, pair>> live_ranges_per_bb;
  machine_mode biggest_mode
    = compute_local_live_ranges (loop_vinfo, program_points_per_bb,
                                 live_ranges_per_bb);

  /* Update live ranges according to PHI.  */
  update_local_live_ranges (loop_vinfo, program_points_per_bb,
                            live_ranges_per_bb, &biggest_mode);

  int lmul = compute_estimated_lmul (loop_vinfo, biggest_mode);
  gcc_assert (lmul <= RVV_M8);
  /* TODO: We calculate the maximum live vars based on the current STMT
     sequence.  We can support live range shrinking if it can give us
     a big improvement in the future.  */
  if (!live_ranges_per_bb.is_empty ())
    {
      unsigned int max_nregs = 0;
      for (hash_map<basic_block, hash_map<tree, pair>>::iterator iter
           = live_ranges_per_bb.begin ();
           iter != live_ranges_per_bb.end (); ++iter)
        {
          basic_block bb = (*iter).first;
          unsigned int max_point
            = (*program_points_per_bb.get (bb)).length () + 1;
          if ((*iter).second.is_empty ())
            continue;
          /* We prefer larger LMUL unless it causes register spillings.  */
          unsigned int nregs
            = max_number_of_live_regs (loop_vinfo, bb, (*iter).second,
                                       max_point, biggest_mode, lmul);
          if (nregs > max_nregs)
            max_nregs = nregs;
        }
      live_ranges_per_bb.empty ();
      if (max_nregs > V_REG_NUM)
        return true;
    }
  if (!program_points_per_bb.is_empty ())
    {
      for (hash_map<basic_block, vec<stmt_point>>::iterator iter
           = program_points_per_bb.begin ();
           iter != program_points_per_bb.end (); ++iter)
        {
          vec<stmt_point> program_points = (*iter).second;
          if (!program_points.is_empty ())
            program_points.release ();
        }
      program_points_per_bb.empty ();
    }
  return false;
}
costs::costs (vec_info *vinfo, bool costing_for_scalar)
  : vector_costs (vinfo, costing_for_scalar)
{
  if (costing_for_scalar)
    m_cost_type = SCALAR_COST;
  else if (riscv_v_ext_vector_mode_p (vinfo->vector_mode))
    m_cost_type = VLA_VECTOR_COST;
  else
    m_cost_type = VLS_VECTOR_COST;
}
/* Do one-time initialization of the costs given that we're
   costing the loop vectorization described by LOOP_VINFO.  */
void
costs::analyze_loop_vinfo (loop_vec_info loop_vinfo)
{
  /* Detect whether we're vectorizing for VLA and should apply the unrolling
     heuristic described above m_unrolled_vls_niters.  */
  record_potential_vls_unrolling (loop_vinfo);

  /* Detect whether the LOOP has unexpected spills.  */
  record_potential_unexpected_spills (loop_vinfo);
}
/* Analyze the vectorized program statements and use the dynamic LMUL
   heuristic to detect whether the loop has unexpected spills.  */
void
costs::record_potential_unexpected_spills (loop_vec_info loop_vinfo)
{
  /* We only want to apply the heuristic if LOOP_VINFO is being
     vectorized for VLA or as a known-NITERS VLS loop.  */
  if (rvv_max_lmul == RVV_DYNAMIC
      && (m_cost_type == VLA_VECTOR_COST
          || (m_cost_type == VLS_VECTOR_COST
              && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))))
    {
      bool post_dom_available_p = dom_info_available_p (CDI_POST_DOMINATORS);
      if (!post_dom_available_p)
        calculate_dominance_info (CDI_POST_DOMINATORS);
      m_has_unexpected_spills_p = has_unexpected_spills_p (loop_vinfo);
      if (!post_dom_available_p)
        free_dominance_info (CDI_POST_DOMINATORS);
    }
}
/* Decide whether to use the unrolling heuristic described above
   m_unrolled_vls_niters, updating that field if so.  LOOP_VINFO
   describes the loop that we're vectorizing.  */
void
costs::record_potential_vls_unrolling (loop_vec_info loop_vinfo)
{
  /* We only want to apply the heuristic if LOOP_VINFO is being
     vectorized for VLA.  */
  if (m_cost_type != VLA_VECTOR_COST)
    return;

  /* We don't want to apply the heuristic to outer loops, since it's
     harder to track two levels of unrolling.  */
  if (LOOP_VINFO_LOOP (loop_vinfo)->inner)
    return;

  /* Only handle cases in which the number of VLS iterations
     would be known at compile time but the number of VLA iterations
     would not.  */
  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      || BYTES_PER_RISCV_VECTOR.is_constant ())
    return;

  /* Guess how many times the VLS loop would iterate and make
     sure that it is within the complete unrolling limit.  Even if the
     number of iterations is small enough, the number of statements might
     not be, which is why we need to estimate the number of statements too.  */
  unsigned int vls_vf = vect_vf_for_cost (loop_vinfo);
  unsigned HOST_WIDE_INT unrolled_vls_niters
    = LOOP_VINFO_INT_NITERS (loop_vinfo) / vls_vf;
  if (unrolled_vls_niters > (unsigned int) param_max_completely_peel_times)
    return;

  /* Record that we're applying the heuristic and should try to estimate
     the number of statements in the VLS loop.  */
  m_unrolled_vls_niters = unrolled_vls_niters;
}
/* Return true if (a) we're applying the VLS vs. VLA unrolling
   heuristic described above m_unrolled_vls_niters and (b) the heuristic
   says that we should prefer the VLS loop.  */
bool
costs::prefer_unrolled_loop () const
{
  if (!m_unrolled_vls_stmts)
    return false;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "Number of insns in"
                     " unrolled VLS loop = " HOST_WIDE_INT_PRINT_UNSIGNED
                     "\n",
                     m_unrolled_vls_stmts);

  /* The balance here is tricky.  On the one hand, we can't be sure whether
     the code is vectorizable with VLS or not.  However, even if
     it isn't vectorizable with VLS, there's a possibility that
     the scalar code could also be unrolled.  Some of the code might then
     benefit from SLP, or from using LDP and STP.  We therefore apply
     the heuristic regardless of can_use_vls_p.  */
  return (m_unrolled_vls_stmts
          && (m_unrolled_vls_stmts
              <= (unsigned int) param_max_completely_peeled_insns));
}
bool
costs::better_main_loop_than_p (const vector_costs *uncast_other) const
{
  auto other = static_cast<const costs *> (uncast_other);
  auto this_loop_vinfo = as_a<loop_vec_info> (this->m_vinfo);
  auto other_loop_vinfo = as_a<loop_vec_info> (other->m_vinfo);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "Comparing two main loops (%s at VF %d vs %s at VF %d)\n",
                     GET_MODE_NAME (this_loop_vinfo->vector_mode),
                     vect_vf_for_cost (this_loop_vinfo),
                     GET_MODE_NAME (other_loop_vinfo->vector_mode),
                     vect_vf_for_cost (other_loop_vinfo));

  /* Apply the unrolling heuristic described above m_unrolled_vls_niters.  */
  if (bool (m_unrolled_vls_stmts) != bool (other->m_unrolled_vls_stmts)
      && m_cost_type != other->m_cost_type)
    {
      bool this_prefer_unrolled = this->prefer_unrolled_loop ();
      bool other_prefer_unrolled = other->prefer_unrolled_loop ();
      if (this_prefer_unrolled != other_prefer_unrolled)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "Preferring VLS loop because"
                             " it can be unrolled\n");
          return other_prefer_unrolled;
        }
    }
  else if (rvv_max_lmul == RVV_DYNAMIC)
    {
      if (other->m_has_unexpected_spills_p)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "Preferring smaller LMUL loop because"
                             " it has unexpected spills\n");
          return true;
        }
      else if (riscv_v_ext_vector_mode_p (other_loop_vinfo->vector_mode))
        {
          if (LOOP_VINFO_NITERS_KNOWN_P (other_loop_vinfo))
            {
              if (maybe_gt (LOOP_VINFO_INT_NITERS (this_loop_vinfo),
                            LOOP_VINFO_VECT_FACTOR (this_loop_vinfo)))
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_NOTE, vect_location,
                                     "Keep current LMUL loop because"
                                     " known NITERS exceed the new VF\n");
                  return false;
                }
            }
          else
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_NOTE, vect_location,
                                 "Keep current LMUL loop because"
                                 " it is unknown NITERS\n");
              return false;
            }
        }
    }
  /* If NITERS is unknown, we should not use VLS modes to vectorize
     the loop since we don't support partial vectors for VLS modes,
     that is, we will have full vectors (VLSmodes) on loop body
     and partial vectors (VLAmodes) on loop epilogue which is very
     inefficient.  Instead, we should apply partial vectors (VLAmodes)
     on loop body without an epilogue on unknown NITERS loop.  */
  else if (!LOOP_VINFO_NITERS_KNOWN_P (this_loop_vinfo)
           && m_cost_type == VLS_VECTOR_COST)
    return false;

  return vector_costs::better_main_loop_than_p (other);
}
/* Returns the group size, i.e. the number of vectors to be loaded by a
   segmented load/store instruction.  Return 0 if it is not a segmented
   load/store.  */
static int
segment_loadstore_group_size (enum vect_cost_for_stmt kind,
                              stmt_vec_info stmt_info)
{
  if (stmt_info
      && (kind == vector_load || kind == vector_store)
      && STMT_VINFO_DATA_REF (stmt_info))
    {
      stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
      if (stmt_info
          && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_LOAD_STORE_LANES)
        return DR_GROUP_SIZE (stmt_info);
    }
  return 0;
}
/* Adjust vectorization cost after calling riscv_builtin_vectorization_cost.
   For some statements, we would like to further fine-grain tweak the cost on
   top of riscv_builtin_vectorization_cost handling which doesn't have any
   information on statement operation codes etc.  */
unsigned
costs::adjust_stmt_cost (enum vect_cost_for_stmt kind, loop_vec_info loop,
                         stmt_vec_info stmt_info,
                         slp_tree, tree vectype, int stmt_cost)
{
  const cpu_vector_cost *costs = get_vector_costs ();
  switch (kind)
    {
    case scalar_to_vec:
      stmt_cost += (FLOAT_TYPE_P (vectype) ? costs->regmove->FR2VR
                                           : costs->regmove->GR2VR);
      break;
    case vec_to_scalar:
      stmt_cost += (FLOAT_TYPE_P (vectype) ? costs->regmove->VR2FR
                                           : costs->regmove->VR2GR);
      break;
    case vector_load:
    case vector_store:
      if (stmt_info && stmt_info->stmt && STMT_VINFO_DATA_REF (stmt_info))
        {
          /* Segment loads and stores.  When the group size is > 1
             the vectorizer will add a vector load/store statement for
             each vector in the group.  Here we additionally add permute
             costs for each.  */
          /* TODO: Indexed and ordered/unordered cost.  */
          int group_size = segment_loadstore_group_size (kind, stmt_info);
          if (group_size > 1)
            {
              switch (group_size)
                {
                case 2:
                  if (riscv_v_ext_vector_mode_p (loop->vector_mode))
                    stmt_cost += costs->vla->segment_permute_2;
                  else
                    stmt_cost += costs->vls->segment_permute_2;
                  break;
                case 3:
                  if (riscv_v_ext_vector_mode_p (loop->vector_mode))
                    stmt_cost += costs->vla->segment_permute_3;
                  else
                    stmt_cost += costs->vls->segment_permute_3;
                  break;
                case 4:
                  if (riscv_v_ext_vector_mode_p (loop->vector_mode))
                    stmt_cost += costs->vla->segment_permute_4;
                  else
                    stmt_cost += costs->vls->segment_permute_4;
                  break;
                case 5:
                  if (riscv_v_ext_vector_mode_p (loop->vector_mode))
                    stmt_cost += costs->vla->segment_permute_5;
                  else
                    stmt_cost += costs->vls->segment_permute_5;
                  break;
                case 6:
                  if (riscv_v_ext_vector_mode_p (loop->vector_mode))
                    stmt_cost += costs->vla->segment_permute_6;
                  else
                    stmt_cost += costs->vls->segment_permute_6;
                  break;
                case 7:
                  if (riscv_v_ext_vector_mode_p (loop->vector_mode))
                    stmt_cost += costs->vla->segment_permute_7;
                  else
                    stmt_cost += costs->vls->segment_permute_7;
                  break;
                case 8:
                  if (riscv_v_ext_vector_mode_p (loop->vector_mode))
                    stmt_cost += costs->vla->segment_permute_8;
                  else
                    stmt_cost += costs->vls->segment_permute_8;
                  break;
                default:
                  break;
                }
            }
          else
            {
              /* Unit-stride vector loads and stores do not have offset
                 addressing as opposed to scalar loads and stores.
                 If the address depends on a variable we need an additional
                 add/sub for each load/store in the worst case.  */
              data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
              class loop *father = stmt_info->stmt->bb->loop_father;
              tree ref;
              if (!loop && father && !father->inner && father->superloops)
                {
                  if (TREE_CODE (dr->ref) != MEM_REF
                      || !(ref = TREE_OPERAND (dr->ref, 0))
                      || TREE_CODE (ref) != SSA_NAME)
                    break;

                  if (SSA_NAME_IS_DEFAULT_DEF (ref))
                    break;

                  if (memrefs.contains ({ref, cst0}))
                    break;

                  memrefs.add ({ref, cst0});

                  /* In case we have not seen REF before and the base
                     address is a pointer operation try a bit harder.  */
                  tree base = DR_BASE_ADDRESS (dr);
                  if (TREE_CODE (base) == POINTER_PLUS_EXPR
                      || TREE_CODE (base) == POINTER_DIFF_EXPR)
                    {
                      /* Deconstruct BASE's first operand.  If it is a
                         binary operation, i.e. a base and an "offset"
                         store this pair.  Only increase the stmt_cost if
                         we haven't seen it before.  */
                      tree argp = TREE_OPERAND (base, 1);
                      typedef std::pair<tree, tree> addr_pair;
                      addr_pair pair;
                      if (TREE_CODE_CLASS (TREE_CODE (argp)) == tcc_binary)
                        {
                          tree argp0 = tree_strip_nop_conversions
                            (TREE_OPERAND (argp, 0));
                          tree argp1 = TREE_OPERAND (argp, 1);
                          pair = addr_pair (argp0, argp1);
                          if (memrefs.contains (pair))
                            break;
                          memrefs.add (pair);
                          stmt_cost
                            += builtin_vectorization_cost (scalar_stmt,
                                                           NULL_TREE, 0);
                        }
                    }
                }
            }
        }
      break;
    default:
      break;
    }
  return stmt_cost;
}
unsigned
costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
                      stmt_vec_info stmt_info, slp_tree node, tree vectype,
                      int misalign, vect_cost_model_location where)
{
  int stmt_cost
    = targetm.vectorize.builtin_vectorization_cost (kind, vectype, misalign);

  /* Do one-time initialization based on the vinfo.  */
  loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo);
  if (!m_analyzed_vinfo)
    {
      if (loop_vinfo)
        analyze_loop_vinfo (loop_vinfo);

      m_analyzed_vinfo = true;
    }

  if (loop_vinfo)
    {
      /* If we're applying the VLA vs. VLS unrolling heuristic,
         estimate the number of statements in the unrolled VLS
         loop.  For simplicity, we assume that one iteration of the
         VLS loop would need the same number of statements
         as one iteration of the VLA loop.  */
      if (where == vect_body && m_unrolled_vls_niters)
        m_unrolled_vls_stmts += count * m_unrolled_vls_niters;
    }

  stmt_cost = adjust_stmt_cost (kind, loop_vinfo, stmt_info, node, vectype,
                                stmt_cost);

  return record_stmt_cost (stmt_info, where, count * stmt_cost);
}
/* For some target specific vectorization cost which can't be handled per stmt,
   we check the requisite conditions and adjust the vectorization cost
   accordingly if satisfied.  One typical example is to model and adjust
   loop_len cost for known_lt (NITERS, VF).  */
void
costs::adjust_vect_cost_per_loop (loop_vec_info loop_vinfo)
{
  if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)
      && !LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
    {
      /* In the middle-end loop vectorizer, we don't count the loop_len cost in
         vect_estimate_min_profitable_iters when NITERS < VF, that is, we only
         count the cost of len when we need to iterate the loop more than once
         with VF.  It's correct for most of the cases:

           for (int i = 0; i < 3; i ++)
             a[i] += b[i];

         We don't need to cost MIN_EXPR or SELECT_VL for the case above.

         However, for some inefficient vectorized cases, it does use MIN_EXPR
         to generate len.

         E.g. VF = [256, 256]

           # loop_len_110 = PHI <18(2), _119(11)>
           ...
           _117 = MIN_EXPR <ivtmp_114, 18>;
           ...
           _119 = MIN_EXPR <_118, POLY_INT_CST [256, 256]>;
           ...
           _112 = .VEC_EXTRACT (vect_patt_27.14_109, _111);
           ...

         We cost 1 unconditionally for this situation like other targets which
         apply mask as the loop control.  */
      rgroup_controls *rgc;
      unsigned int num_vectors_m1;
      unsigned int body_stmts = 0;
      FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
        body_stmts += num_vectors_m1 + 1;

      add_stmt_cost (body_stmts, scalar_stmt, NULL, NULL, NULL_TREE, 0,
                     vect_body);
    }
}

void
costs::finish_cost (const vector_costs *scalar_costs)
{
  if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo))
    adjust_vect_cost_per_loop (loop_vinfo);

  vector_costs::finish_cost (scalar_costs);
}

1326 } // namespace riscv_vector