/* AVL propagation pass for RISC-V 'V' Extension for GNU compiler.
   Copyright (C) 2023-2025 Free Software Foundation, Inc.
   Contributed by Juzhe Zhong (juzhe.zhong@rivai.ai), RiVAI Technologies Ltd.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.

GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
/* Pre-RA RTL_SSA-based pass that propagates AVL for RVV instructions.
   A standalone AVL propagation pass is designed because:

   - Better code maintainability:
     The current LCM-based VSETVL pass is so complicated that the code
     there will only become harder to maintain.  A straightforward
     AVL propagation pass is much easier to maintain.

   - Reduced scalar register pressure:
     One type of AVL propagation is to propagate the AVL from a
     NON-VLMAX instruction to a VLMAX instruction.
     Note: the VLMAX instruction must be tail agnostic (TA) and its
     result must be used by the NON-VLMAX instruction.
     This optimization mostly targets auto-vectorized code:

       vsetvli r136, r137       --- SELECT_VL
       vle8.v (use avl = r136)  --- IFN_MASK_LEN_LOAD
       vadd.vv (use VLMAX)      --- PLUS_EXPR
       vse8.v (use avl = r136)  --- IFN_MASK_LEN_STORE

     Without AVL propagation:

       vsetvli a5, a4, ta
       vle8.v v1
       vsetvli t0, zero, ta
       vadd.vv v2, v1, v1
       vse8.v v2

     We can propagate the AVL to 'vadd.vv' since its result
     is consumed by a 'vse8.v' which has AVL = a5 and its
     tail elements are agnostic.
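
     A sketch of the expected result, assuming the later VSETVL pass
     then removes the now-redundant 'vsetvli t0, zero' once 'vadd.vv'
     carries AVL a5 (illustrative output, not from an actual compiler
     run):

       vsetvli a5, a4, ta
       vle8.v v1
       vadd.vv v2, v1, v1
       vse8.v v2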
     We DON'T do this optimization in the VSETVL pass, since that is a
     post-RA pass which has already consumed 't0', whereas a standalone
     pre-RA AVL propagation pass allows us to elide the consumption
     of the pseudo register of 't0' and thus reduce scalar
     register pressure.

   - More AVL propagation opportunities:
     A pre-RA pass is more flexible for the AVL REG def-use chain,
     thus we will get more potential AVL propagations as long as
     they don't increase the scalar register pressure.  */
#define IN_TARGET_CODE 1
#define INCLUDE_ALGORITHM
#define INCLUDE_FUNCTIONAL
#define INCLUDE_ARRAY

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "tm.h"
#include "backend.h"
#include "rtl.h"
#include "target.h"
#include "tree-pass.h"
#include "df.h"
#include "rtl-ssa.h"
#include "cfgcleanup.h"
#include "insn-attr.h"
#include "tm-constrs.h"
#include "insn-opinit.h"

using namespace rtl_ssa;
using namespace riscv_vector;
enum avlprop_type
{
  /* VLMAX AVL and tail agnostic candidates.  */
  AVLPROP_VLMAX_TA,
  AVLPROP_NONE
};

/* Dump helper functions.  */
static const char *
avlprop_type_to_str (enum avlprop_type type)
{
  switch (type)
    {
    case AVLPROP_VLMAX_TA:
      return "vlmax_ta";

    default:
      gcc_unreachable ();
    }
}
/* Return true if the AVL of the INSN can be propagated.  */
static bool
avl_can_be_propagated_p (rtx_insn *rinsn)
{
  /* We can't do AVL propagation when the instruction may touch an
     element with index i >= AVL.  So we don't do AVL propagation in
     the following situations:

     vrgather:
       - The index operand of "vrgather dest, source, index" may pick
	 an element which has index >= AVL, so we can't strip the
	 elements of the source register that have index >= AVL
	 (see the sketch after this comment).
     vslide1down:
       - The last element of vslide1down is at AVL + 1 according to
	 the RVV ISA:
	   vstart <= i < vl-1   vd[i] = vs2[i+1] if v0.mask[i] enabled
     vslidedown:
       - The last several elements of vslidedown can be elements with
	 index >= AVL according to the RVV ISA:
	   0 <= i+OFFSET < VLMAX   src[i] = vs2[i+OFFSET]
	   vstart <= i < vl        vd[i] = src[i] if v0.mask[i] enabled.
     vcompress:
       - According to the ISA, the first vl elements of vector register
	 group vs2 should be extracted and packed for vcompress, and the
	 highest element of vs2 may be touched by the mask.  For
	 example, given vlmax = 4 here:
	   v0 = 0b1000
	   v1 = {0x1, 0x2, 0x3, 0x4}
	   v2 = {0x5, 0x6, 0x7, 0x8}
	 vcompress v1, v2, v0 with avl = 4: v1 = {0x8, 0x2, 0x3, 0x4}.
	 vcompress v1, v2, v0 with avl = 2: v1 is unchanged.
       Thus, we cannot propagate the AVL of vcompress because doing so
       may change the semantics of the result.  */
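
  /* A concrete vrgather sketch (hypothetical values chosen purely for
     illustration): with vlmax = 4,
       index  v2 = {3, 0, 1, 2}
       source v1 = {0x1, 0x2, 0x3, 0x4}
     "vrgather.vv vd, v1, v2" with avl = 2 still reads v1[3] = 0x4 even
     though 3 >= avl, so shrinking the AVL of the instruction that
     defines v1 could change the gathered result.  */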
  return get_attr_type (rinsn) != TYPE_VGATHER
	 && get_attr_type (rinsn) != TYPE_VSLIDEDOWN
	 && get_attr_type (rinsn) != TYPE_VISLIDE1DOWN
	 && get_attr_type (rinsn) != TYPE_VFSLIDE1DOWN
	 && get_attr_type (rinsn) != TYPE_VCOMPRESS;
}
static bool
vlmax_ta_p (rtx_insn *rinsn)
{
  return vlmax_avl_type_p (rinsn) && tail_agnostic_p (rinsn);
}
static machine_mode
get_insn_vtype_mode (rtx_insn *rinsn)
{
  extract_insn_cached (rinsn);
  int mode_idx = get_attr_mode_idx (rinsn);
  gcc_assert (mode_idx != INVALID_ATTRIBUTE);
  return GET_MODE (recog_data.operand[mode_idx]);
}
/* Return a new pattern for AVL propagation.
   Normally, we just replace the AVL operand for most
   of the instructions.  However, instructions like
   fault-only-first loads use the AVL TYPE twice in the pattern,
   which would cause an ICE in the later AVL TYPE change, so we
   regenerate the whole pattern for such instructions.  */
static rtx
simplify_replace_avl (rtx_insn *rinsn, rtx new_avl)
{
  /* Replace the AVL operand.  */
  extract_insn_cached (rinsn);
  rtx avl = recog_data.operand[get_attr_vl_op_idx (rinsn)];
  int count = count_regno_occurrences (rinsn, REGNO (avl));
  gcc_assert (count == 1);
  rtx new_pat = NULL_RTX;
  if (get_attr_type (rinsn) == TYPE_VLDFF
      || get_attr_type (rinsn) == TYPE_VLSEGDFF)
    new_pat
      = gen_pred_fault_load (recog_data.operand_mode[0], recog_data.operand[0],
			     recog_data.operand[1], recog_data.operand[2],
			     recog_data.operand[3], new_avl,
			     recog_data.operand[5], recog_data.operand[6],
			     get_avl_type_rtx (avl_type::NONVLMAX));
  else
    new_pat = simplify_replace_rtx (PATTERN (rinsn), avl, new_avl);
  return new_pat;
}
static void
simplify_replace_vlmax_avl (rtx_insn *rinsn, rtx new_avl)
{
  if (dump_file && (dump_flags & TDF_DETAILS))
    {
      fprintf (dump_file, "\nPropagating AVL: ");
      print_rtl_single (dump_file, new_avl);
      fprintf (dump_file, "into: ");
      print_rtl_single (dump_file, rinsn);
    }
  rtx new_pat = simplify_replace_avl (rinsn, new_avl);
  validate_change_or_fail (rinsn, &PATTERN (rinsn), new_pat, false);

  /* Change the AVL TYPE into NONVLMAX if it is VLMAX.  */
  if (vlmax_avl_type_p (rinsn))
    {
      int index = get_attr_avl_type_idx (rinsn);
      gcc_assert (index != INVALID_ATTRIBUTE);
      validate_change_or_fail (rinsn, recog_data.operand_loc[index],
			       get_avl_type_rtx (avl_type::NONVLMAX), false);
    }
  if (dump_file && (dump_flags & TDF_DETAILS))
    {
      fprintf (dump_file, "Successfully matched this instruction: ");
      print_rtl_single (dump_file, rinsn);
    }
}
const pass_data pass_data_avlprop = {
  RTL_PASS, /* type */
  "avlprop", /* name */
  OPTGROUP_NONE, /* optinfo_flags */
  TV_NONE, /* tv_id */
  0, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  0, /* todo_flags_finish */
};
class pass_avlprop : public rtl_opt_pass
{
public:
  pass_avlprop (gcc::context *ctxt) : rtl_opt_pass (pass_data_avlprop, ctxt) {}

  /* opt_pass methods: */
  virtual bool gate (function *) final override
  {
    return TARGET_VECTOR && optimize > 0;
  }
  virtual unsigned int execute (function *) final override;

private:
  /* The AVL propagation instructions and corresponding preferred AVL.
     It will be updated during the analysis.  */
  hash_map<insn_info *, rtx> *m_avl_propagations;

  /* Potentially feasible AVL propagation candidates.  */
  auto_vec<std::pair<enum avlprop_type, insn_info *>> m_candidates;

  rtx get_preferred_avl (const std::pair<enum avlprop_type, insn_info *>) const;
  rtx get_vlmax_ta_preferred_avl (insn_info *) const;
  rtx get_nonvlmax_avl (insn_info *) const;

  void avlprop_init (function *);
  void avlprop_done (void);
}; // class pass_avlprop
void
pass_avlprop::avlprop_init (function *fn)
{
  calculate_dominance_info (CDI_DOMINATORS);
  df_analyze ();
  crtl->ssa = new function_info (fn);
  m_avl_propagations = new hash_map<insn_info *, rtx>;
}
void
pass_avlprop::avlprop_done (void)
{
  free_dominance_info (CDI_DOMINATORS);
  if (crtl->ssa->perform_pending_updates ())
    cleanup_cfg (0);
  delete crtl->ssa;
  crtl->ssa = nullptr;
  delete m_avl_propagations;
  m_avl_propagations = NULL;
  if (!m_candidates.is_empty ())
    m_candidates.release ();
}
/* If we have a preferred AVL to propagate, return that AVL.
   Otherwise, return NULL_RTX as we don't have any preferred
   AVL.  */

rtx
pass_avlprop::get_preferred_avl (
  const std::pair<enum avlprop_type, insn_info *> candidate) const
{
  switch (candidate.first)
    {
    case AVLPROP_VLMAX_TA:
      return get_vlmax_ta_preferred_avl (candidate.second);
    default:
      gcc_unreachable ();
    }
  return NULL_RTX;
}
/* This is a straightforward pattern that ALWAYS appears in partial
   auto-vectorization:

     VL = SELECT_AVL (AVL, ...)
     V0 = MASK_LEN_LOAD (..., VL)
     V1 = MASK_LEN_LOAD (..., VL)
     V2 = V0 + V1 --- Missed LEN information.
     MASK_LEN_STORE (..., V2, VL)

   We prefer PLUS_EXPR (V0 + V1) over COND_LEN_ADD (V0, V1, dummy LEN)
   because:

   - Fewer code changes in the Loop Vectorizer.
   - It reuses the current clean flow of partial vectorization; that is,
     apply the predicate LEN or MASK to LOAD/STORE operations and other
     special arithmetic operations (e.g. DIV), then perform the whole
     vector register operation if it DOESN'T affect correctness.
     This flow is used by all other targets like x86, sve, s390, ... etc.
   - PLUS_EXPR has better gimple optimizations than COND_LEN_ADD.

   We propagate the AVL from NON-VLMAX to VLMAX for gimple IR like
   PLUS_EXPR, which generates the VLMAX instruction due to missed LEN
   information.  The later VSETVL pass will elide the redundant
   vsetvls.  */
rtx
pass_avlprop::get_vlmax_ta_preferred_avl (insn_info *insn) const
{
  if (!avl_can_be_propagated_p (insn->rtl ()))
    return NULL_RTX;
  int sew = get_sew (insn->rtl ());
  enum vlmul_type vlmul = get_vlmul (insn->rtl ());
  int ratio = calculate_ratio (sew, vlmul);

  rtx use_avl = NULL_RTX;
  for (def_info *def : insn->defs ())
    {
      if (!is_a<set_info *> (def) || def->is_mem ())
	return NULL_RTX;
      const auto *set = dyn_cast<set_info *> (def);

      /* FIXME: Stop AVL propagation if any USE is not an RVV real
	 instruction.  This should be entirely sufficient for vectorized
	 code, since its uses are always located in extended basic
	 blocks.

	 TODO: We can extend the PHI checking for intrinsic code if
	 necessary in the future.  */
      if (!set->is_local_to_ebb ())
	return NULL_RTX;

      for (use_info *use : set->nondebug_insn_uses ())
	{
	  insn_info *use_insn = use->insn ();
	  if (!use_insn->can_be_optimized () || use_insn->is_asm ()
	      || use_insn->is_call () || use_insn->has_volatile_refs ()
	      || use_insn->has_pre_post_modify ()
	      || !has_vl_op (use_insn->rtl ())
	      || !avl_can_be_propagated_p (use_insn->rtl ()))
	    return NULL_RTX;

	  /* We should only propagate a non-VLMAX AVL into a VLMAX insn
	     when the potential tail elements of that insn (after
	     propagation) are not used.  So we must make sure nothing
	     depends on the outcome of those tail elements.  */
	  extract_insn_cached (use_insn->rtl ());
	  int merge_op_idx = get_attr_merge_op_idx (use_insn->rtl ());
	  if (merge_op_idx != INVALID_ATTRIBUTE
	      && !satisfies_constraint_vu (recog_data.operand[merge_op_idx])
	      && refers_to_regno_p (set->regno (),
				    recog_data.operand[merge_op_idx])
	      && !tail_agnostic_p (use_insn->rtl ()))
	    return NULL_RTX;
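
	  /* A given AVL yields the same VL for two instructions only
	     when their SEW/LMUL ratios match, since VLMAX
	     = VLEN * LMUL / SEW.  E.g. (SEW=8, LMUL=1) and
	     (SEW=16, LMUL=2) both have ratio 8 and hence the same
	     VLMAX, while (SEW=16, LMUL=1) has ratio 16 and half that
	     VLMAX, so propagation between differing ratios could
	     change the VL.  */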
	  int new_sew = get_sew (use_insn->rtl ());
	  enum vlmul_type new_vlmul = get_vlmul (use_insn->rtl ());
	  int new_ratio = calculate_ratio (new_sew, new_vlmul);
	  if (new_ratio != ratio)
	    return NULL_RTX;

	  rtx new_use_avl = get_nonvlmax_avl (use_insn);
	  if (!new_use_avl || SUBREG_P (new_use_avl))
	    return NULL_RTX;
	  if (REG_P (new_use_avl))
	    {
	      resource_info resource = full_register (REGNO (new_use_avl));
	      def_lookup dl = crtl->ssa->find_def (resource, use_insn);
	      if (dl.matching_set ())
		return NULL_RTX;
	      def_info *def1 = dl.prev_def (insn);
	      def_info *def2 = dl.prev_def (use_insn);
	      if (!def1 || !def2 || def1 != def2)
		return NULL_RTX;

	      /* For vectorized code, we always use SELECT_VL/MIN_EXPR to
		 calculate the loop len at the header of the loop.
		 We only allow AVL propagation for real instructions for
		 now.
		 TODO: We may enhance it for intrinsic code if necessary.  */
	      if (!def1->insn ()->is_real ())
		return NULL_RTX;

	      /* FIXME: We only allow AVL propagation within a block, which
		 should be totally sufficient for vectorized code.

		 TODO: We can enhance this for intrinsic code in the future
		 if necessary.  */
	      if (def1->insn ()->bb () != insn->bb ()
		  && !dominated_by_p (CDI_DOMINATORS, insn->bb ()->cfg_bb (),
				      def1->insn ()->bb ()->cfg_bb ()))
		return NULL_RTX;
	      if (def1->insn ()->bb () == insn->bb ()
		  && def1->insn ()->compare_with (insn) >= 0)
		return NULL_RTX;
	    }

	  if (!use_avl)
	    use_avl = new_use_avl;
	  else if (!rtx_equal_p (use_avl, new_use_avl))
	    return NULL_RTX;
	}
    }

  return use_avl;
}
/* Try to get the NONVLMAX AVL of the INSN.
   INSN can either have a NON-VLMAX AVL itself, or be an insn with a
   VLMAX AVL before the pass that was given a NON-VLMAX AVL in an
   earlier propagation round.  */
rtx
pass_avlprop::get_nonvlmax_avl (insn_info *insn) const
{
  if (m_avl_propagations->get (insn))
    return (*m_avl_propagations->get (insn));
  else if (nonvlmax_avl_type_p (insn->rtl ()))
    {
      extract_insn_cached (insn->rtl ());
      return recog_data.operand[get_attr_vl_op_idx (insn->rtl ())];
    }

  return NULL_RTX;
}
/* Main entry point for this pass.  */
unsigned int
pass_avlprop::execute (function *fn)
{
  avlprop_init (fn);

  /* Iterate the whole function in reverse order (which could speed up
     convergence) to collect all potential candidates whose AVL could be
     propagated.

     Note that **NOT** all the candidates will be successfully AVL
     propagated.  */
  for (bb_info *bb : crtl->ssa->reverse_bbs ())
    {
      for (insn_info *insn : bb->reverse_real_nondebug_insns ())
	{
	  /* We only forward the AVL to instructions that have an AVL/VL
	     operand and can be optimized at the RTL_SSA level.  */
	  if (!insn->can_be_optimized () || !has_vl_op (insn->rtl ()))
	    continue;

	  /* TODO: We only do AVL propagation for VLMAX AVL with tail
	     agnostic policy, since partial autovectorization has missed
	     LEN information.  We could add more AVL propagation
	     for intrinsic code in the future.  */
	  if (vlmax_ta_p (insn->rtl ()))
	    m_candidates.safe_push (std::make_pair (AVLPROP_VLMAX_TA, insn));
	}
    }
  if (dump_file && (dump_flags & TDF_DETAILS))
    {
      fprintf (dump_file, "\nNumber of potential AVL propagations: %d\n",
	       m_candidates.length ());
      for (const auto &candidate : m_candidates)
	{
	  fprintf (dump_file, "\nAVL propagation type: %s\n",
		   avlprop_type_to_str (candidate.first));
	  print_rtl_single (dump_file, candidate.second->rtl ());
	}
    }
  /* Go through all the candidates looking for AVLs that we could
     propagate.  Iterate to a fixpoint: a propagation into one candidate
     may enable a propagation into another.  */
  bool change_p = true;
  while (change_p)
    {
      change_p = false;
      for (auto &candidate : m_candidates)
	{
	  rtx new_avl = get_preferred_avl (candidate);
	  if (new_avl)
	    {
	      gcc_assert (!vlmax_avl_p (new_avl));
	      auto &update
		= m_avl_propagations->get_or_insert (candidate.second);
	      /* Keep iterating if any propagation changed; don't let a
		 later unchanged candidate reset the flag.  */
	      change_p |= !rtx_equal_p (update, new_avl);
	      update = new_avl;
	    }
	}
    }
  if (dump_file && (dump_flags & TDF_DETAILS))
    fprintf (dump_file, "\nNumber of successful AVL propagations: %d\n\n",
	     (int) m_avl_propagations->elements ());

  for (const auto prop : *m_avl_propagations)
    {
      rtx_insn *rinsn = prop.first->rtl ();
      simplify_replace_vlmax_avl (rinsn, prop.second);
    }
  if (rvv_vector_bits == RVV_VECTOR_BITS_ZVL)
    {
      /* Simplify VLMAX AVL into immediate AVL.
	 E.g. simplify the following case:

	   vsetvl a5, zero, e32, m1
	   vadd.vv

	 into:

	   vsetvl zero, 4, e32, m1
	   vadd.vv

	 if GET_MODE_NUNITS (RVVM1SImode) == 4.  */
      if (dump_file && (dump_flags & TDF_DETAILS))
	fprintf (dump_file, "\nSimplifying VLMAX AVL into IMM AVL\n\n");
      for (auto &candidate : m_candidates)
	{
	  rtx_insn *rinsn = candidate.second->rtl ();
	  machine_mode vtype_mode = get_insn_vtype_mode (rinsn);
	  if (candidate.first == AVLPROP_VLMAX_TA
	      && !m_avl_propagations->get (candidate.second)
	      && imm_avl_p (vtype_mode))
	    {
	      rtx new_avl = gen_int_mode (GET_MODE_NUNITS (vtype_mode), Pmode);
	      simplify_replace_vlmax_avl (rinsn, new_avl);
	    }
	}
    }

  avlprop_done ();
  return 0;
}
rtl_opt_pass *
make_pass_avlprop (gcc::context *ctxt)
{
  return new pass_avlprop (ctxt);
}