[PATCH] RISC-V: Move UNSPEC_SSP_SET and UNSPEC_SSP_TEST to correct enum
[gcc.git] / gcc / gimple-loop-jam.cc
blob5e6c04a7d7f2e15a9d12a54b56bed9997e532103
1 /* Loop unroll-and-jam.
2 Copyright (C) 2017-2025 Free Software Foundation, Inc.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify it
7 under the terms of the GNU General Public License as published by the
8 Free Software Foundation; either version 3, or (at your option) any
9 later version.
11 GCC is distributed in the hope that it will be useful, but WITHOUT
12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 for more details.
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
20 #include "config.h"
21 #include "system.h"
22 #include "coretypes.h"
23 #include "tree-pass.h"
24 #include "backend.h"
25 #include "tree.h"
26 #include "gimple.h"
27 #include "ssa.h"
28 #include "fold-const.h"
29 #include "tree-cfg.h"
30 #include "tree-ssa.h"
31 #include "tree-ssa-loop-niter.h"
32 #include "tree-ssa-loop.h"
33 #include "tree-ssa-loop-manip.h"
34 #include "cfgloop.h"
35 #include "tree-scalar-evolution.h"
36 #include "gimple-iterator.h"
37 #include "cfghooks.h"
38 #include "tree-data-ref.h"
39 #include "tree-ssa-loop-ivopts.h"
40 #include "tree-vectorizer.h"
41 #include "tree-ssa-sccvn.h"
42 #include "tree-cfgcleanup.h"
44 /* Unroll and Jam transformation
46 This is a combination of two transformations, where the second
47 is not always valid. It's applicable if a loop nest has redundancies
48 over the iterations of an outer loop while not having that with
49 an inner loop.
51 Given this nest:
52 for (i) {
53 for (j) {
54 B(i,j)
58 first unroll:
59 for (i by 2) {
60 for (j) {
61 B(i,j)
63 for (j) {
64 B(i+1,j)
68 then fuse the two adjacent inner loops resulting from that:
69 for (i by 2) {
70 for (j) {
71 B(i,j)
72 B(i+1,j)
76 As the order of evaluations of the body B changes this is valid
77 only in certain situations: all distance vectors need to be forward.
78 Additionally if there are multiple induction variables than just
79 a counting control IV (j above) we can also deal with some situations.
81 The validity is checked by unroll_jam_possible_p, and the data-dep
82 testing below.
84 A trivial example where the fusion is wrong would be when
85 B(i,j) == x[j-1] = x[j];
86 for (i by 2) {
87 for (j) {
88 x[j-1] = x[j];
90 for (j) {
91 x[j-1] = x[j];
93 } effect: move content to front by two elements
94 -->
95 for (i by 2) {
96 for (j) {
97 x[j-1] = x[j];
98 x[j-1] = x[j];
100 } effect: move content to front by one element
103 /* Modify the loop tree for the fact that all code once belonging
104 to the OLD loop or the outer loop of OLD now is inside LOOP. */
106 static void
107 merge_loop_tree (class loop *loop, class loop *old)
109 basic_block *bbs;
110 int i, n;
111 class loop *subloop;
112 edge e;
113 edge_iterator ei;
115 /* Find its nodes. */
116 bbs = XNEWVEC (basic_block, n_basic_blocks_for_fn (cfun));
117 n = get_loop_body_with_size (loop, bbs, n_basic_blocks_for_fn (cfun));
119 for (i = 0; i < n; i++)
121 /* If the block was direct child of OLD loop it's now part
122 of LOOP. If it was outside OLD, then it moved into LOOP
123 as well. This avoids changing the loop father for BBs
124 in inner loops of OLD. */
125 if (bbs[i]->loop_father == old
126 || loop_depth (bbs[i]->loop_father) < loop_depth (old))
128 remove_bb_from_loops (bbs[i]);
129 add_bb_to_loop (bbs[i], loop);
130 continue;
133 /* If we find a direct subloop of OLD, move it to LOOP. */
134 subloop = bbs[i]->loop_father;
135 if (loop_outer (subloop) == old && subloop->header == bbs[i])
137 flow_loop_tree_node_remove (subloop);
138 flow_loop_tree_node_add (loop, subloop);
142 /* Update the information about loop exit edges. */
143 for (i = 0; i < n; i++)
145 FOR_EACH_EDGE (e, ei, bbs[i]->succs)
147 rescan_loop_exit (e, false, false);
151 loop->num_nodes = n;
153 free (bbs);
156 /* BB is part of the outer loop of an unroll-and-jam situation.
157 Check if any statements therein would prevent the transformation. */
159 static bool
160 bb_prevents_fusion_p (basic_block bb)
162 gimple_stmt_iterator gsi;
163 /* BB is duplicated by outer unrolling and then all N-1 first copies
164 move into the body of the fused inner loop. If BB exits the outer loop
165 the last copy still does so, and the first N-1 copies are cancelled
166 by loop unrolling, so also after fusion it's the exit block.
167 But there might be other reasons that prevent fusion:
168 * stores or unknown side-effects prevent fusion
169 * loads don't
170 * computations into SSA names: these aren't problematic. Their
171 result will be unused on the exit edges of the first N-1 copies
172 (those aren't taken after unrolling). If they are used on the
173 other edge (the one leading to the outer latch block) they are
174 loop-carried (on the outer loop) and the Nth copy of BB will
175 compute them again (i.e. the first N-1 copies will be dead). */
176 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
178 gimple *g = gsi_stmt (gsi);
179 if (gimple_vdef (g) || gimple_has_side_effects (g))
180 return true;
182 return false;
185 /* Given an inner loop LOOP (of some OUTER loop) determine if
186 we can safely fuse copies of it (generated by outer unrolling).
187 If so return true, otherwise return false. */
189 static bool
190 unroll_jam_possible_p (class loop *outer, class loop *loop)
192 basic_block *bbs;
193 int i, n;
194 class tree_niter_desc niter;
196 /* When fusing the loops we skip the latch block
197 of the first one, so it mustn't have any effects to
198 preserve. */
199 if (!empty_block_p (loop->latch))
200 return false;
202 edge exit;
203 if (!(exit = single_exit (loop)))
204 return false;
206 /* We need a perfect nest. Quick check for adjacent inner loops. */
207 if (outer->inner != loop || loop->next)
208 return false;
210 /* Prevent head-controlled inner loops, that we usually have.
211 The guard block would need to be accepted
212 (invariant condition either entering or skipping the loop),
213 without also accepting arbitrary control flow. When unswitching
214 ran before us (as with -O3) this won't be a problem because its
215 outer loop unswitching will have moved out the invariant condition.
217 If we do that we need to extend fuse_loops() to cope with this
218 by threading through the (still invariant) copied condition
219 between the two loop copies. */
220 if (!dominated_by_p (CDI_DOMINATORS, outer->latch, loop->header))
221 return false;
223 /* The number of iterations of the inner loop must be loop invariant
224 with respect to the outer loop. */
225 if (!number_of_iterations_exit (loop, single_exit (loop), &niter,
226 false, true)
227 || niter.cmp == ERROR_MARK
228 || !integer_zerop (niter.may_be_zero)
229 || !expr_invariant_in_loop_p (outer, niter.niter))
230 return false;
232 /* If the inner loop produces any values that are used inside the
233 outer loop (except the virtual op) then it can flow
234 back (perhaps indirectly) into the inner loop. This prevents
235 fusion: without fusion the value at the last iteration is used,
236 with fusion the value after the initial iteration is used.
238 If all uses are outside the outer loop this doesn't prevent fusion;
239 the value of the last iteration is still used (and the values from
240 all intermediate iterations are dead). */
241 gphi_iterator psi;
242 for (psi = gsi_start_phis (single_exit (loop)->dest);
243 !gsi_end_p (psi); gsi_next (&psi))
245 imm_use_iterator imm_iter;
246 use_operand_p use_p;
247 tree op = gimple_phi_result (psi.phi ());
248 if (virtual_operand_p (op))
249 continue;
250 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, op)
252 gimple *use_stmt = USE_STMT (use_p);
253 if (!is_gimple_debug (use_stmt)
254 && flow_bb_inside_loop_p (outer, gimple_bb (use_stmt)))
255 return false;
259 /* And check blocks belonging to just outer loop. */
260 bbs = XNEWVEC (basic_block, n_basic_blocks_for_fn (cfun));
261 n = get_loop_body_with_size (outer, bbs, n_basic_blocks_for_fn (cfun));
263 for (i = 0; i < n; i++)
264 if (bbs[i]->loop_father == outer
265 && (bb_prevents_fusion_p (bbs[i])
266 /* Outer loop exits must come after the inner loop, otherwise
267 we'll put the outer loop exit into the fused inner loop. */
268 || (loop_exits_from_bb_p (outer, bbs[i])
269 && !dominated_by_p (CDI_DOMINATORS, bbs[i], exit->src))))
270 break;
271 free (bbs);
272 if (i != n)
273 return false;
275 /* For now we can safely fuse copies of LOOP only if all
276 loop carried variables are inductions (or the virtual op).
278 We could handle reductions as well (the initial value in the second
279 body would be the after-iter value of the first body) if it's over
280 an associative and commutative operation. We wouldn't
281 be able to handle unknown cycles. */
282 bool inner_vdef = false;
283 for (psi = gsi_start_phis (loop->header); !gsi_end_p (psi); gsi_next (&psi))
285 affine_iv iv;
286 tree op = gimple_phi_result (psi.phi ());
288 if (virtual_operand_p (op))
290 inner_vdef = true;
291 continue;
293 if (!simple_iv (loop, loop, op, &iv, true))
294 return false;
295 /* The inductions must be regular, loop invariant step and initial
296 value. */
297 if (!expr_invariant_in_loop_p (outer, iv.step)
298 || !expr_invariant_in_loop_p (outer, iv.base))
299 return false;
300 /* XXX With more effort we could also be able to deal with inductions
301 where the initial value is loop variant but a simple IV in the
302 outer loop. The initial value for the second body would be
303 the original initial value plus iv.base.step. The next value
304 for the fused loop would be the original next value of the first
305 copy, _not_ the next value of the second body. */
308 /* When there's no inner loop virtual PHI IV we cannot handle the update
309 required to the inner loop if that doesn't already have one. See
310 PR117113. */
311 if (!inner_vdef && get_virtual_phi (outer->header))
312 return false;
314 return true;
317 /* Fuse LOOP with all further neighbors. The loops are expected to
318 be in appropriate form. */
320 static void
321 fuse_loops (class loop *loop)
323 class loop *next = loop->next;
325 while (next)
327 edge e;
329 remove_branch (single_pred_edge (loop->latch));
330 /* Make delete_basic_block not fiddle with the loop structure. */
331 basic_block oldlatch = loop->latch;
332 loop->latch = NULL;
333 delete_basic_block (oldlatch);
334 e = redirect_edge_and_branch (loop_latch_edge (next),
335 loop->header);
336 loop->latch = e->src;
337 flush_pending_stmts (e);
339 gcc_assert (EDGE_COUNT (next->header->preds) == 1);
341 /* The PHI nodes of the second body (single-argument now)
342 need adjustments to use the right values: either directly
343 the value of the corresponding PHI in the first copy or
344 the one leaving the first body which unrolling did for us.
346 See also unroll_jam_possible_p() for further possibilities. */
347 gphi_iterator psi_first, psi_second;
348 e = single_pred_edge (next->header);
349 for (psi_first = gsi_start_phis (loop->header),
350 psi_second = gsi_start_phis (next->header);
351 !gsi_end_p (psi_first);
352 gsi_next (&psi_first), gsi_next (&psi_second))
354 gphi *phi_first = psi_first.phi ();
355 gphi *phi_second = psi_second.phi ();
356 tree firstop = gimple_phi_result (phi_first);
357 /* The virtual operand is correct already as it's
358 always live at exit, hence has a LCSSA node and outer
359 loop unrolling updated SSA form. */
360 if (virtual_operand_p (firstop))
361 continue;
363 /* Due to unroll_jam_possible_p() we know that this is
364 an induction. The second body goes over the same
365 iteration space. */
366 add_phi_arg (phi_second, firstop, e,
367 gimple_location (phi_first));
369 gcc_assert (gsi_end_p (psi_second));
371 merge_loop_tree (loop, next);
372 gcc_assert (!next->num_nodes);
373 class loop *ln = next->next;
374 delete_loop (next);
375 next = ln;
379 /* Return true if any of the access functions for dataref A
380 isn't invariant with respect to loop LOOP_NEST. */
381 static bool
382 any_access_function_variant_p (const struct data_reference *a,
383 const class loop *loop_nest)
385 vec<tree> fns = DR_ACCESS_FNS (a);
387 for (tree t : fns)
388 if (!evolution_function_is_invariant_p (t, loop_nest->num))
389 return true;
391 return false;
394 /* Returns true if the distance in DDR can be determined and adjusts
395 the unroll factor in *UNROLL to make unrolling valid for that distance.
396 Otherwise return false. DDR is with respect to the outer loop of INNER.
398 If this data dep can lead to a removed memory reference, increment
399 *REMOVED and adjust *PROFIT_UNROLL to be the necessary unroll factor
400 for this to happen. */
402 static bool
403 adjust_unroll_factor (class loop *inner, struct data_dependence_relation *ddr,
404 unsigned *unroll, unsigned *profit_unroll,
405 unsigned *removed)
407 bool ret = false;
408 if (DDR_ARE_DEPENDENT (ddr) != chrec_known)
410 if (DDR_NUM_DIST_VECTS (ddr) == 0)
411 return false;
412 unsigned i;
413 lambda_vector dist_v;
414 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
416 /* A distance (a,b) is at worst transformed into (a/N,b) by the
417 unrolling (factor N), so the transformation is valid if
418 a >= N, or b > 0, or b is zero and a > 0. Otherwise the unroll
419 factor needs to be limited so that the first condition holds.
420 That may limit the factor down to zero in the worst case. */
421 lambda_int dist = dist_v[0];
422 if (dist < 0)
423 gcc_unreachable ();
424 else if (dist >= (lambda_int)*unroll)
426 else if (lambda_vector_zerop (dist_v + 1, DDR_NB_LOOPS (ddr) - 1))
428 /* We have (a,0) with a < N, so this will be transformed into
429 (0,0) after unrolling by N. This might potentially be a
430 problem, if it's not a read-read dependency. */
431 if (DR_IS_READ (DDR_A (ddr)) && DR_IS_READ (DDR_B (ddr)))
433 else
435 /* So, at least one is a write, and we might reduce the
436 distance vector to (0,0). This is still no problem
437 if both data-refs are affine with respect to the inner
438 loops. But if one of them is invariant with respect
439 to an inner loop our reordering implicit in loop fusion
440 corrupts the program, as our data dependences don't
441 capture this. E.g. for:
442 for (0 <= i < n)
443 for (0 <= j < m)
444 a[i][0] = a[i+1][0] + 2; // (1)
445 b[i][j] = b[i+1][j] + 2; // (2)
446 the distance vector for both statements is (-1,0),
447 but exchanging the order for (2) is okay, while
448 for (1) it is not. To see this, write out the original
449 accesses (assume m is 2):
450 a i j original
451 0 0 0 r a[1][0] b[1][0]
452 1 0 0 w a[0][0] b[0][0]
453 2 0 1 r a[1][0] b[1][1]
454 3 0 1 w a[0][0] b[0][1]
455 4 1 0 r a[2][0] b[2][0]
456 5 1 0 w a[1][0] b[1][0]
457 after unroll-by-2 and fusion the accesses are done in
458 this order (from column a): 0,1, 4,5, 2,3, i.e. this:
459 a i j transformed
460 0 0 0 r a[1][0] b[1][0]
461 1 0 0 w a[0][0] b[0][0]
462 4 1 0 r a[2][0] b[2][0]
463 5 1 0 w a[1][0] b[1][0]
464 2 0 1 r a[1][0] b[1][1]
465 3 0 1 w a[0][0] b[0][1]
466 Note how access 2 accesses the same element as access 5
467 for array 'a' but not for array 'b'. */
468 if (any_access_function_variant_p (DDR_A (ddr), inner)
469 && any_access_function_variant_p (DDR_B (ddr), inner))
471 else
472 /* And if any dataref of this pair is invariant with
473 respect to the inner loop, we have no chance than
474 to reduce the unroll factor. */
475 *unroll = dist;
478 else if (lambda_vector_lexico_pos (dist_v + 1, DDR_NB_LOOPS (ddr) - 1))
480 else
481 *unroll = dist;
483 /* With a distance (a,0) it's always profitable to unroll-and-jam
484 (by a+1), because one memory reference will go away. With
485 (a,b) and b != 0 that's less clear. We will increase the
486 number of streams without lowering the number of mem refs.
487 So for now only handle the first situation. */
488 if (lambda_vector_zerop (dist_v + 1, DDR_NB_LOOPS (ddr) - 1))
490 *profit_unroll = MAX (*profit_unroll, (unsigned)dist + 1);
491 (*removed)++;
494 ret = true;
497 return ret;
500 /* Main entry point for the unroll-and-jam transformation
501 described above. */
503 static unsigned int
504 tree_loop_unroll_and_jam (void)
506 unsigned int todo = 0;
508 gcc_assert (scev_initialized_p ());
510 /* Go through all innermost loops. */
511 for (auto loop : loops_list (cfun, LI_ONLY_INNERMOST))
513 class loop *outer = loop_outer (loop);
515 if (loop_depth (loop) < 2
516 || optimize_loop_nest_for_size_p (outer))
517 continue;
519 if (!unroll_jam_possible_p (outer, loop))
520 continue;
522 vec<data_reference_p> datarefs = vNULL;
523 vec<ddr_p> dependences = vNULL;
524 unsigned unroll_factor, profit_unroll, removed;
525 class tree_niter_desc desc;
526 bool unroll = false;
528 auto_vec<loop_p, 3> loop_nest;
529 if (!compute_data_dependences_for_loop (outer, true, &loop_nest,
530 &datarefs, &dependences))
532 if (dump_file && (dump_flags & TDF_DETAILS))
533 fprintf (dump_file, "Cannot analyze data dependencies\n");
534 free_data_refs (datarefs);
535 free_dependence_relations (dependences);
536 continue;
538 if (!datarefs.length ())
539 continue;
541 if (dump_file && (dump_flags & TDF_DETAILS))
542 dump_data_dependence_relations (dump_file, dependences);
544 unroll_factor = (unsigned)-1;
545 profit_unroll = 1;
546 removed = 0;
548 /* Check all dependencies. */
549 unsigned i;
550 struct data_dependence_relation *ddr;
551 FOR_EACH_VEC_ELT (dependences, i, ddr)
553 struct data_reference *dra, *drb;
555 /* If the refs are independend there's nothing to do. */
556 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
557 continue;
559 dra = DDR_A (ddr);
560 drb = DDR_B (ddr);
562 /* Nothing interesting for the self dependencies, except for WAW if
563 the access function is not affine or constant because we may end
564 up reordering writes to the same location. */
565 if (dra == drb)
567 if (DR_IS_WRITE (dra)
568 && !DR_ACCESS_FNS (dra).is_empty ()
569 && DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
571 unroll_factor = 0;
572 break;
574 else
575 continue;
578 /* Now check the distance vector, for determining a sensible
579 outer unroll factor, and for validity of merging the inner
580 loop copies. */
581 if (!adjust_unroll_factor (loop, ddr, &unroll_factor, &profit_unroll,
582 &removed))
584 /* Couldn't get the distance vector. For two reads that's
585 harmless (we assume we should unroll). For at least
586 one write this means we can't check the dependence direction
587 and hence can't determine safety. */
589 if (DR_IS_WRITE (dra) || DR_IS_WRITE (drb))
591 unroll_factor = 0;
592 break;
597 /* We regard a user-specified minimum percentage of zero as a request
598 to ignore all profitability concerns and apply the transformation
599 always. */
600 if (!param_unroll_jam_min_percent)
601 profit_unroll = MAX(2, profit_unroll);
602 else if (removed * 100 / datarefs.length ()
603 < (unsigned)param_unroll_jam_min_percent)
604 profit_unroll = 1;
605 if (unroll_factor > profit_unroll)
606 unroll_factor = profit_unroll;
607 if (unroll_factor > (unsigned)param_unroll_jam_max_unroll)
608 unroll_factor = param_unroll_jam_max_unroll;
609 unroll = (unroll_factor > 1
610 && can_unroll_loop_p (outer, unroll_factor, &desc));
612 if (unroll)
614 if (dump_enabled_p ())
615 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | TDF_DETAILS,
616 find_loop_location (outer),
617 "applying unroll and jam with factor %d\n",
618 unroll_factor);
619 initialize_original_copy_tables ();
620 tree_unroll_loop (outer, unroll_factor, &desc);
621 free_original_copy_tables ();
622 fuse_loops (outer->inner);
623 todo |= TODO_cleanup_cfg;
625 auto_bitmap exit_bbs;
626 bitmap_set_bit (exit_bbs, single_exit (outer)->dest->index);
627 todo |= do_rpo_vn (cfun, loop_preheader_edge (outer), exit_bbs);
630 loop_nest.release ();
631 free_dependence_relations (dependences);
632 free_data_refs (datarefs);
635 if (todo)
637 free_dominance_info (CDI_DOMINATORS);
638 /* We need to cleanup the CFG first since otherwise SSA form can
639 be not up-to-date from the VN run. */
640 if (todo & TODO_cleanup_cfg)
642 cleanup_tree_cfg ();
643 todo &= ~TODO_cleanup_cfg;
645 rewrite_into_loop_closed_ssa (NULL, 0);
646 scev_reset ();
648 return todo;
651 /* Pass boilerplate */
653 namespace {
655 const pass_data pass_data_loop_jam =
657 GIMPLE_PASS, /* type */
658 "unrolljam", /* name */
659 OPTGROUP_LOOP, /* optinfo_flags */
660 TV_LOOP_JAM, /* tv_id */
661 PROP_cfg, /* properties_required */
662 0, /* properties_provided */
663 0, /* properties_destroyed */
664 0, /* todo_flags_start */
665 0, /* todo_flags_finish */
668 class pass_loop_jam : public gimple_opt_pass
670 public:
671 pass_loop_jam (gcc::context *ctxt)
672 : gimple_opt_pass (pass_data_loop_jam, ctxt)
675 /* opt_pass methods: */
676 bool gate (function *) final override { return flag_unroll_jam != 0; }
677 unsigned int execute (function *) final override;
681 unsigned int
682 pass_loop_jam::execute (function *fun)
684 if (number_of_loops (fun) <= 1)
685 return 0;
687 return tree_loop_unroll_and_jam ();
692 gimple_opt_pass *
693 make_pass_loop_jam (gcc::context *ctxt)
695 return new pass_loop_jam (ctxt);