1 //===- ScheduleOptimizer.cpp - Calculate an optimized schedule ------------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 // This pass generates an entirely new schedule tree from the data dependences
10 // and iteration domains. The new schedule tree is computed in two steps:
12 // 1) The isl scheduling optimizer is run
14 // The isl scheduling optimizer creates a new schedule tree that maximizes
15 // parallelism and tileability and minimizes data-dependence distances. The
16 // algorithm used is a modified version of the ``Pluto'' algorithm:
18 // U. Bondhugula, A. Hartono, J. Ramanujam, and P. Sadayappan.
19 // A Practical Automatic Polyhedral Parallelizer and Locality Optimizer.
20 // In Proceedings of the 2008 ACM SIGPLAN Conference On Programming Language
21 // Design and Implementation, PLDI ’08, pages 101–113. ACM, 2008.
23 // 2) A set of post-scheduling transformations is applied on the schedule tree.
25 // These optimizations include:
27 // - Tiling of the innermost tilable bands
// - Prevectorization - The choice of a possible outer loop that is strip-mined
//   to the innermost level to enable inner-loop vectorization.
31 // - Some optimizations for spatial locality are also planned.
// For a detailed description of the schedule tree itself please see section 6
// of:
36 // Polyhedral AST generation is more than scanning polyhedra
37 // Tobias Grosser, Sven Verdoolaege, Albert Cohen
38 // ACM Transactions on Programming Languages and Systems (TOPLAS),
40 // http://www.grosser.es/#pub-polyhedral-AST-generation
42 // This publication also contains a detailed discussion of the different options
43 // for polyhedral loop unrolling, full/partial tile separation and other uses
44 // of the schedule tree.
46 //===----------------------------------------------------------------------===//
48 #include "polly/ScheduleOptimizer.h"
49 #include "polly/CodeGen/CodeGeneration.h"
50 #include "polly/DependenceInfo.h"
51 #include "polly/ManualOptimizer.h"
52 #include "polly/MatmulOptimizer.h"
53 #include "polly/Options.h"
54 #include "polly/ScheduleTreeTransform.h"
55 #include "polly/Support/ISLOStream.h"
56 #include "polly/Support/ISLTools.h"
57 #include "llvm/ADT/Sequence.h"
58 #include "llvm/ADT/Statistic.h"
59 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
60 #include "llvm/InitializePasses.h"
61 #include "llvm/Support/CommandLine.h"
62 #include "isl/options.h"
65 using namespace polly
;
72 #include "polly/Support/PollyDebug.h"
73 #define DEBUG_TYPE "polly-opt-isl"
75 static cl::opt
<std::string
>
76 OptimizeDeps("polly-opt-optimize-only",
77 cl::desc("Only a certain kind of dependences (all/raw)"),
78 cl::Hidden
, cl::init("all"), cl::cat(PollyCategory
));
80 static cl::opt
<std::string
>
81 SimplifyDeps("polly-opt-simplify-deps",
82 cl::desc("Dependences should be simplified (yes/no)"),
83 cl::Hidden
, cl::init("yes"), cl::cat(PollyCategory
));
85 static cl::opt
<int> MaxConstantTerm(
86 "polly-opt-max-constant-term",
87 cl::desc("The maximal constant term allowed (-1 is unlimited)"), cl::Hidden
,
88 cl::init(20), cl::cat(PollyCategory
));
90 static cl::opt
<int> MaxCoefficient(
91 "polly-opt-max-coefficient",
92 cl::desc("The maximal coefficient allowed (-1 is unlimited)"), cl::Hidden
,
93 cl::init(20), cl::cat(PollyCategory
));
95 static cl::opt
<std::string
>
96 MaximizeBandDepth("polly-opt-maximize-bands",
97 cl::desc("Maximize the band depth (yes/no)"), cl::Hidden
,
98 cl::init("yes"), cl::cat(PollyCategory
));
101 ScheduleComputeOut("polly-schedule-computeout",
102 cl::desc("Bound the scheduler by maximal amount"
103 "of computational steps. "),
104 cl::Hidden
, cl::init(300000), cl::ZeroOrMore
,
105 cl::cat(PollyCategory
));
108 GreedyFusion("polly-loopfusion-greedy",
109 cl::desc("Aggressively try to fuse everything"), cl::Hidden
,
110 cl::cat(PollyCategory
));
112 static cl::opt
<std::string
> OuterCoincidence(
113 "polly-opt-outer-coincidence",
114 cl::desc("Try to construct schedules where the outer member of each band "
115 "satisfies the coincidence constraints (yes/no)"),
116 cl::Hidden
, cl::init("no"), cl::cat(PollyCategory
));
118 static cl::opt
<int> PrevectorWidth(
119 "polly-prevect-width",
121 "The number of loop iterations to strip-mine for pre-vectorization"),
122 cl::Hidden
, cl::init(4), cl::cat(PollyCategory
));
124 static cl::opt
<bool> FirstLevelTiling("polly-tiling",
125 cl::desc("Enable loop tiling"),
126 cl::init(true), cl::cat(PollyCategory
));
128 static cl::opt
<int> FirstLevelDefaultTileSize(
129 "polly-default-tile-size",
130 cl::desc("The default tile size (if not enough were provided by"
131 " --polly-tile-sizes)"),
132 cl::Hidden
, cl::init(32), cl::cat(PollyCategory
));
135 FirstLevelTileSizes("polly-tile-sizes",
136 cl::desc("A tile size for each loop dimension, filled "
137 "with --polly-default-tile-size"),
138 cl::Hidden
, cl::CommaSeparated
, cl::cat(PollyCategory
));
141 SecondLevelTiling("polly-2nd-level-tiling",
142 cl::desc("Enable a 2nd level loop of loop tiling"),
143 cl::cat(PollyCategory
));
145 static cl::opt
<int> SecondLevelDefaultTileSize(
146 "polly-2nd-level-default-tile-size",
147 cl::desc("The default 2nd-level tile size (if not enough were provided by"
148 " --polly-2nd-level-tile-sizes)"),
149 cl::Hidden
, cl::init(16), cl::cat(PollyCategory
));
152 SecondLevelTileSizes("polly-2nd-level-tile-sizes",
153 cl::desc("A tile size for each loop dimension, filled "
154 "with --polly-default-tile-size"),
155 cl::Hidden
, cl::CommaSeparated
,
156 cl::cat(PollyCategory
));
158 static cl::opt
<bool> RegisterTiling("polly-register-tiling",
159 cl::desc("Enable register tiling"),
160 cl::cat(PollyCategory
));
162 static cl::opt
<int> RegisterDefaultTileSize(
163 "polly-register-tiling-default-tile-size",
164 cl::desc("The default register tile size (if not enough were provided by"
165 " --polly-register-tile-sizes)"),
166 cl::Hidden
, cl::init(2), cl::cat(PollyCategory
));
169 RegisterTileSizes("polly-register-tile-sizes",
170 cl::desc("A tile size for each loop dimension, filled "
171 "with --polly-register-tile-size"),
172 cl::Hidden
, cl::CommaSeparated
, cl::cat(PollyCategory
));
174 static cl::opt
<bool> PragmaBasedOpts(
175 "polly-pragma-based-opts",
176 cl::desc("Apply user-directed transformation from metadata"),
177 cl::init(true), cl::cat(PollyCategory
));
179 static cl::opt
<bool> EnableReschedule("polly-reschedule",
180 cl::desc("Optimize SCoPs using ISL"),
181 cl::init(true), cl::cat(PollyCategory
));
184 PMBasedOpts("polly-pattern-matching-based-opts",
185 cl::desc("Perform optimizations based on pattern matching"),
186 cl::init(true), cl::cat(PollyCategory
));
189 EnablePostopts("polly-postopts",
190 cl::desc("Apply post-rescheduling optimizations such as "
191 "tiling (requires -polly-reschedule)"),
192 cl::init(true), cl::cat(PollyCategory
));
194 static cl::opt
<bool> OptimizedScops(
195 "polly-optimized-scops",
196 cl::desc("Polly - Dump polyhedral description of Scops optimized with "
197 "the isl scheduling optimizer and the set of post-scheduling "
198 "transformations is applied on the schedule tree"),
199 cl::cat(PollyCategory
));
201 STATISTIC(ScopsProcessed
, "Number of scops processed");
202 STATISTIC(ScopsRescheduled
, "Number of scops rescheduled");
203 STATISTIC(ScopsOptimized
, "Number of scops optimized");
205 STATISTIC(NumAffineLoopsOptimized
, "Number of affine loops optimized");
206 STATISTIC(NumBoxedLoopsOptimized
, "Number of boxed loops optimized");
208 #define THREE_STATISTICS(VARNAME, DESC) \
209 static Statistic VARNAME[3] = { \
210 {DEBUG_TYPE, #VARNAME "0", DESC " (original)"}, \
211 {DEBUG_TYPE, #VARNAME "1", DESC " (after scheduler)"}, \
212 {DEBUG_TYPE, #VARNAME "2", DESC " (after optimizer)"}}
214 THREE_STATISTICS(NumBands
, "Number of bands");
215 THREE_STATISTICS(NumBandMembers
, "Number of band members");
216 THREE_STATISTICS(NumCoincident
, "Number of coincident band members");
217 THREE_STATISTICS(NumPermutable
, "Number of permutable bands");
218 THREE_STATISTICS(NumFilters
, "Number of filter nodes");
219 THREE_STATISTICS(NumExtension
, "Number of extension nodes");
221 STATISTIC(FirstLevelTileOpts
, "Number of first level tiling applied");
222 STATISTIC(SecondLevelTileOpts
, "Number of second level tiling applied");
223 STATISTIC(RegisterTileOpts
, "Number of register tiling applied");
224 STATISTIC(PrevectOpts
, "Number of strip-mining for prevectorization applied");
225 STATISTIC(MatMulOpts
,
226 "Number of matrix multiplication patterns detected and optimized");
229 /// Additional parameters of the schedule optimizer.
231 /// Target Transform Info and the SCoP dependencies used by the schedule
233 struct OptimizerAdditionalInfoTy
{
234 const llvm::TargetTransformInfo
*TTI
;
235 const Dependences
*D
;
242 class ScheduleTreeOptimizer final
{
244 /// Apply schedule tree transformations.
/// This function takes a (possibly already optimized) schedule tree and
247 /// applies a set of additional optimizations on the schedule tree. The
248 /// transformations applied include:
250 /// - Pattern-based optimizations
252 /// - Prevectorization
254 /// @param Schedule The schedule object the transformations will be applied
256 /// @param OAI Target Transform Info and the SCoP dependencies.
257 /// @returns The transformed schedule.
259 optimizeSchedule(isl::schedule Schedule
,
260 const OptimizerAdditionalInfoTy
*OAI
= nullptr);
262 /// Apply schedule tree transformations.
/// This function takes a node in a (possibly already optimized) schedule
265 /// tree and applies a set of additional optimizations on this schedule tree
266 /// node and its descendants. The transformations applied include:
268 /// - Pattern-based optimizations
270 /// - Prevectorization
272 /// @param Node The schedule object post-transformations will be applied to.
273 /// @param OAI Target Transform Info and the SCoP dependencies.
274 /// @returns The transformed schedule.
275 static isl::schedule_node
276 optimizeScheduleNode(isl::schedule_node Node
,
277 const OptimizerAdditionalInfoTy
*OAI
= nullptr);
279 /// Decide if the @p NewSchedule is profitable for @p S.
281 /// @param S The SCoP we optimize.
282 /// @param NewSchedule The new schedule we computed.
284 /// @return True, if we believe @p NewSchedule is an improvement for @p S.
285 static bool isProfitableSchedule(polly::Scop
&S
, isl::schedule NewSchedule
);
287 /// Isolate a set of partial tile prefixes.
289 /// This set should ensure that it contains only partial tile prefixes that
290 /// have exactly VectorWidth iterations.
292 /// @param Node A schedule node band, which is a parent of a band node,
293 /// that contains a vector loop.
294 /// @return Modified isl_schedule_node.
295 static isl::schedule_node
isolateFullPartialTiles(isl::schedule_node Node
,
299 /// Check if this node is a band node we want to tile.
301 /// We look for innermost band nodes where individual dimensions are marked as
304 /// @param Node The node to check.
305 static bool isTileableBandNode(isl::schedule_node Node
);
307 /// Check if this node is a band node we want to transform using pattern
310 /// We look for innermost band nodes where individual dimensions are marked as
311 /// permutable. There is no restriction on the number of individual
314 /// @param Node The node to check.
315 static bool isPMOptimizableBandNode(isl::schedule_node Node
);
317 /// Pre-vectorizes one scheduling dimension of a schedule band.
319 /// prevectSchedBand splits out the dimension DimToVectorize, tiles it and
320 /// sinks the resulting point loop.
322 /// Example (DimToVectorize=0, VectorWidth=4):
324 /// | Before transformation:
326 /// | A[i,j] -> [i,j]
328 /// | for (i = 0; i < 128; i++)
329 /// | for (j = 0; j < 128; j++)
332 /// | After transformation:
334 /// | for (it = 0; it < 32; it+=1)
335 /// | for (j = 0; j < 128; j++)
336 /// | for (ip = 0; ip <= 3; ip++)
337 /// | A(4 * it + ip,j);
339 /// The goal of this transformation is to create a trivially vectorizable
340 /// loop. This means a parallel loop at the innermost level that has a
341 /// constant number of iterations corresponding to the target vector width.
343 /// This transformation creates a loop at the innermost level. The loop has
344 /// a constant number of iterations, if the number of loop iterations at
345 /// DimToVectorize can be divided by VectorWidth. The default VectorWidth is
346 /// currently constant and not yet target specific. This function does not
347 /// reason about parallelism.
348 static isl::schedule_node
prevectSchedBand(isl::schedule_node Node
,
349 unsigned DimToVectorize
,
352 /// Apply additional optimizations on the bands in the schedule tree.
354 /// We are looking for an innermost band node and apply the following
358 /// - if the band is tileable
359 /// - if the band has more than one loop dimension
361 /// - Prevectorize the schedule of the band (or the point loop in case of
363 /// - if vectorization is enabled
365 /// @param Node The schedule node to (possibly) optimize.
/// @param User A pointer to the OptimizerAdditionalInfoTy that selects the
///             optimizations to apply; must not be null.
368 static isl_schedule_node
*optimizeBand(isl_schedule_node
*Node
, void *User
);
370 /// Apply tiling optimizations on the bands in the schedule tree.
372 /// @param Node The schedule node to (possibly) optimize.
373 static isl::schedule_node
applyTileBandOpt(isl::schedule_node Node
);
375 /// Apply prevectorization on the bands in the schedule tree.
377 /// @param Node The schedule node to (possibly) prevectorize.
378 static isl::schedule_node
applyPrevectBandOpt(isl::schedule_node Node
);
382 ScheduleTreeOptimizer::isolateFullPartialTiles(isl::schedule_node Node
,
384 assert(isl_schedule_node_get_type(Node
.get()) == isl_schedule_node_band
);
385 Node
= Node
.child(0).child(0);
386 isl::union_map SchedRelUMap
= Node
.get_prefix_schedule_relation();
387 isl::union_set ScheduleRangeUSet
= SchedRelUMap
.range();
388 isl::set ScheduleRange
{ScheduleRangeUSet
};
389 isl::set IsolateDomain
= getPartialTilePrefixes(ScheduleRange
, VectorWidth
);
390 auto AtomicOption
= getDimOptions(IsolateDomain
.ctx(), "atomic");
391 isl::union_set IsolateOption
= getIsolateOptions(IsolateDomain
, 1);
392 Node
= Node
.parent().parent();
393 isl::union_set Options
= IsolateOption
.unite(AtomicOption
);
394 isl::schedule_node_band Result
=
395 Node
.as
<isl::schedule_node_band
>().set_ast_build_options(Options
);
399 struct InsertSimdMarkers final
: ScheduleNodeRewriter
<InsertSimdMarkers
> {
400 isl::schedule_node
visitBand(isl::schedule_node_band Band
) {
401 isl::schedule_node Node
= visitChildren(Band
);
403 // Only add SIMD markers to innermost bands.
404 if (!Node
.first_child().isa
<isl::schedule_node_leaf
>())
407 isl::id LoopMarker
= isl::id::alloc(Band
.ctx(), "SIMD", nullptr);
408 return Band
.insert_mark(LoopMarker
);
412 isl::schedule_node
ScheduleTreeOptimizer::prevectSchedBand(
413 isl::schedule_node Node
, unsigned DimToVectorize
, int VectorWidth
) {
414 assert(isl_schedule_node_get_type(Node
.get()) == isl_schedule_node_band
);
416 auto Space
= isl::manage(isl_schedule_node_band_get_space(Node
.get()));
417 unsigned ScheduleDimensions
= unsignedFromIslSize(Space
.dim(isl::dim::set
));
418 assert(DimToVectorize
< ScheduleDimensions
);
420 if (DimToVectorize
> 0) {
422 isl_schedule_node_band_split(Node
.release(), DimToVectorize
));
423 Node
= Node
.child(0);
425 if (DimToVectorize
< ScheduleDimensions
- 1)
426 Node
= isl::manage(isl_schedule_node_band_split(Node
.release(), 1));
427 Space
= isl::manage(isl_schedule_node_band_get_space(Node
.get()));
428 auto Sizes
= isl::multi_val::zero(Space
);
429 Sizes
= Sizes
.set_val(0, isl::val(Node
.ctx(), VectorWidth
));
431 isl::manage(isl_schedule_node_band_tile(Node
.release(), Sizes
.release()));
432 Node
= isolateFullPartialTiles(Node
, VectorWidth
);
433 Node
= Node
.child(0);
434 // Make sure the "trivially vectorizable loop" is not unrolled. Otherwise,
435 // we will have troubles to match it in the backend.
436 Node
= Node
.as
<isl::schedule_node_band
>().set_ast_build_options(
437 isl::union_set(Node
.ctx(), "{ unroll[x]: 1 = 0 }"));
439 // Sink the inner loop into the smallest possible statements to make them
440 // represent a single vector instruction if possible.
441 Node
= isl::manage(isl_schedule_node_band_sink(Node
.release()));
443 // Add SIMD markers to those vector statements.
444 InsertSimdMarkers SimdMarkerInserter
;
445 Node
= SimdMarkerInserter
.visit(Node
);
448 return Node
.parent();
451 static bool isSimpleInnermostBand(const isl::schedule_node
&Node
) {
452 assert(isl_schedule_node_get_type(Node
.get()) == isl_schedule_node_band
);
453 assert(isl_schedule_node_n_children(Node
.get()) == 1);
455 auto ChildType
= isl_schedule_node_get_type(Node
.child(0).get());
457 if (ChildType
== isl_schedule_node_leaf
)
460 if (ChildType
!= isl_schedule_node_sequence
)
463 auto Sequence
= Node
.child(0);
465 for (int c
= 0, nc
= isl_schedule_node_n_children(Sequence
.get()); c
< nc
;
467 auto Child
= Sequence
.child(c
);
468 if (isl_schedule_node_get_type(Child
.get()) != isl_schedule_node_filter
)
470 if (isl_schedule_node_get_type(Child
.child(0).get()) !=
471 isl_schedule_node_leaf
)
477 /// Check if this node is a band node, which has only one child.
479 /// @param Node The node to check.
480 static bool isOneTimeParentBandNode(isl::schedule_node Node
) {
481 if (isl_schedule_node_get_type(Node
.get()) != isl_schedule_node_band
)
484 if (isl_schedule_node_n_children(Node
.get()) != 1)
490 bool ScheduleTreeOptimizer::isTileableBandNode(isl::schedule_node Node
) {
491 if (!isOneTimeParentBandNode(Node
))
494 if (!isl_schedule_node_band_get_permutable(Node
.get()))
497 auto Space
= isl::manage(isl_schedule_node_band_get_space(Node
.get()));
499 if (unsignedFromIslSize(Space
.dim(isl::dim::set
)) <= 1u)
502 return isSimpleInnermostBand(Node
);
505 bool ScheduleTreeOptimizer::isPMOptimizableBandNode(isl::schedule_node Node
) {
506 if (!isOneTimeParentBandNode(Node
))
509 return Node
.child(0).isa
<isl::schedule_node_leaf
>();
512 __isl_give
isl::schedule_node
513 ScheduleTreeOptimizer::applyTileBandOpt(isl::schedule_node Node
) {
514 if (FirstLevelTiling
) {
515 Node
= tileNode(Node
, "1st level tiling", FirstLevelTileSizes
,
516 FirstLevelDefaultTileSize
);
517 FirstLevelTileOpts
++;
520 if (SecondLevelTiling
) {
521 Node
= tileNode(Node
, "2nd level tiling", SecondLevelTileSizes
,
522 SecondLevelDefaultTileSize
);
523 SecondLevelTileOpts
++;
526 if (RegisterTiling
) {
528 applyRegisterTiling(Node
, RegisterTileSizes
, RegisterDefaultTileSize
);
536 ScheduleTreeOptimizer::applyPrevectBandOpt(isl::schedule_node Node
) {
537 auto Space
= isl::manage(isl_schedule_node_band_get_space(Node
.get()));
538 int Dims
= unsignedFromIslSize(Space
.dim(isl::dim::set
));
540 for (int i
= Dims
- 1; i
>= 0; i
--)
541 if (Node
.as
<isl::schedule_node_band
>().member_get_coincident(i
)) {
542 Node
= prevectSchedBand(Node
, i
, PrevectorWidth
);
549 __isl_give isl_schedule_node
*
550 ScheduleTreeOptimizer::optimizeBand(__isl_take isl_schedule_node
*NodeArg
,
552 const OptimizerAdditionalInfoTy
*OAI
=
553 static_cast<const OptimizerAdditionalInfoTy
*>(User
);
554 assert(OAI
&& "Expecting optimization options");
556 isl::schedule_node Node
= isl::manage(NodeArg
);
558 if (OAI
->PatternOpts
&& isPMOptimizableBandNode(Node
)) {
559 isl::schedule_node PatternOptimizedSchedule
=
560 tryOptimizeMatMulPattern(Node
, OAI
->TTI
, OAI
->D
);
561 if (!PatternOptimizedSchedule
.is_null()) {
563 OAI
->DepsChanged
= true;
564 return PatternOptimizedSchedule
.release();
568 if (!isTileableBandNode(Node
))
569 return Node
.release();
572 Node
= applyTileBandOpt(Node
);
575 // FIXME: Prevectorization requirements are different from those checked by
576 // isTileableBandNode.
577 Node
= applyPrevectBandOpt(Node
);
580 return Node
.release();
584 ScheduleTreeOptimizer::optimizeSchedule(isl::schedule Schedule
,
585 const OptimizerAdditionalInfoTy
*OAI
) {
586 auto Root
= Schedule
.get_root();
587 Root
= optimizeScheduleNode(Root
, OAI
);
588 return Root
.get_schedule();
591 isl::schedule_node
ScheduleTreeOptimizer::optimizeScheduleNode(
592 isl::schedule_node Node
, const OptimizerAdditionalInfoTy
*OAI
) {
593 Node
= isl::manage(isl_schedule_node_map_descendant_bottom_up(
594 Node
.release(), optimizeBand
,
595 const_cast<void *>(static_cast<const void *>(OAI
))));
599 bool ScheduleTreeOptimizer::isProfitableSchedule(Scop
&S
,
600 isl::schedule NewSchedule
) {
601 // To understand if the schedule has been optimized we check if the schedule
602 // has changed at all.
603 // TODO: We can improve this by tracking if any necessarily beneficial
604 // transformations have been performed. This can e.g. be tiling, loop
605 // interchange, or ...) We can track this either at the place where the
606 // transformation has been performed or, in case of automatic ILP based
607 // optimizations, by comparing (yet to be defined) performance metrics
608 // before/after the scheduling optimizer
609 // (e.g., #stride-one accesses)
610 // FIXME: A schedule tree whose union_map-conversion is identical to the
611 // original schedule map may still allow for parallelization, i.e. can still
613 auto NewScheduleMap
= NewSchedule
.get_map();
614 auto OldSchedule
= S
.getSchedule();
615 assert(!OldSchedule
.is_null() &&
616 "Only IslScheduleOptimizer can insert extension nodes "
617 "that make Scop::getSchedule() return nullptr.");
618 bool changed
= !OldSchedule
.is_equal(NewScheduleMap
);
622 class IslScheduleOptimizerWrapperPass final
: public ScopPass
{
626 explicit IslScheduleOptimizerWrapperPass() : ScopPass(ID
) {}
628 /// Optimize the schedule of the SCoP @p S.
629 bool runOnScop(Scop
&S
) override
;
631 /// Print the new schedule for the SCoP @p S.
632 void printScop(raw_ostream
&OS
, Scop
&S
) const override
;
634 /// Register all analyses and transformation required.
635 void getAnalysisUsage(AnalysisUsage
&AU
) const override
;
637 /// Release the internal memory.
638 void releaseMemory() override
{
644 std::shared_ptr
<isl_ctx
> IslCtx
;
645 isl::schedule LastSchedule
;
648 char IslScheduleOptimizerWrapperPass::ID
= 0;
651 static void printSchedule(llvm::raw_ostream
&OS
, const isl::schedule
&Schedule
,
653 isl::ctx Ctx
= Schedule
.ctx();
654 isl_printer
*P
= isl_printer_to_str(Ctx
.get());
655 P
= isl_printer_set_yaml_style(P
, ISL_YAML_STYLE_BLOCK
);
656 P
= isl_printer_print_schedule(P
, Schedule
.get());
657 char *Str
= isl_printer_get_str(P
);
658 OS
<< Desc
<< ": \n" << Str
<< "\n";
664 /// Collect statistics for the schedule tree.
666 /// @param Schedule The schedule tree to analyze. If not a schedule tree it is
668 /// @param Version The version of the schedule tree that is analyzed.
669 /// 0 for the original schedule tree before any transformation.
670 /// 1 for the schedule tree after isl's rescheduling.
671 /// 2 for the schedule tree after optimizations are applied
672 /// (tiling, pattern matching)
673 static void walkScheduleTreeForStatistics(isl::schedule Schedule
, int Version
) {
674 auto Root
= Schedule
.get_root();
678 isl_schedule_node_foreach_descendant_top_down(
680 [](__isl_keep isl_schedule_node
*nodeptr
, void *user
) -> isl_bool
{
681 isl::schedule_node Node
= isl::manage_copy(nodeptr
);
682 int Version
= *static_cast<int *>(user
);
684 switch (isl_schedule_node_get_type(Node
.get())) {
685 case isl_schedule_node_band
: {
687 if (isl_schedule_node_band_get_permutable(Node
.get()) ==
689 NumPermutable
[Version
]++;
691 int CountMembers
= isl_schedule_node_band_n_member(Node
.get());
692 NumBandMembers
[Version
] += CountMembers
;
693 for (int i
= 0; i
< CountMembers
; i
+= 1) {
694 if (Node
.as
<isl::schedule_node_band
>().member_get_coincident(i
))
695 NumCoincident
[Version
]++;
700 case isl_schedule_node_filter
:
701 NumFilters
[Version
]++;
704 case isl_schedule_node_extension
:
705 NumExtension
[Version
]++;
712 return isl_bool_true
;
717 static void runIslScheduleOptimizer(
719 function_ref
<const Dependences
&(Dependences::AnalysisLevel
)> GetDeps
,
720 TargetTransformInfo
*TTI
, OptimizationRemarkEmitter
*ORE
,
721 isl::schedule
&LastSchedule
, bool &DepsChanged
) {
722 // Skip empty SCoPs but still allow code generation as it will delete the
723 // loops present but not needed.
724 if (S
.getSize() == 0) {
731 // Schedule without optimizations.
732 isl::schedule Schedule
= S
.getScheduleTree();
733 walkScheduleTreeForStatistics(S
.getScheduleTree(), 0);
734 POLLY_DEBUG(printSchedule(dbgs(), Schedule
, "Original schedule tree"));
736 bool HasUserTransformation
= false;
737 if (PragmaBasedOpts
) {
738 isl::schedule ManuallyTransformed
= applyManualTransformations(
739 &S
, Schedule
, GetDeps(Dependences::AL_Statement
), ORE
);
740 if (ManuallyTransformed
.is_null()) {
741 POLLY_DEBUG(dbgs() << "Error during manual optimization\n");
745 if (ManuallyTransformed
.get() != Schedule
.get()) {
746 // User transformations have precedence over other transformations.
747 HasUserTransformation
= true;
748 Schedule
= std::move(ManuallyTransformed
);
750 printSchedule(dbgs(), Schedule
, "After manual transformations"));
754 // Only continue if either manual transformations have been applied or we are
755 // allowed to apply heuristics.
756 // TODO: Detect disabled heuristics and no user-directed transformation
757 // metadata earlier in ScopDetection.
758 if (!HasUserTransformation
&& S
.hasDisableHeuristicsHint()) {
759 POLLY_DEBUG(dbgs() << "Heuristic optimizations disabled by metadata\n");
763 // Get dependency analysis.
764 const Dependences
&D
= GetDeps(Dependences::AL_Statement
);
765 if (D
.getSharedIslCtx() != S
.getSharedIslCtx()) {
766 POLLY_DEBUG(dbgs() << "DependenceInfo for another SCoP/isl_ctx\n");
769 if (!D
.hasValidDependences()) {
770 POLLY_DEBUG(dbgs() << "Dependency information not available\n");
// Apply ISL's algorithm only if not overridden by the user. Note that
775 // post-rescheduling optimizations (tiling, pattern-based, prevectorization)
776 // rely on the coincidence/permutable annotations on schedule tree bands that
777 // are added by the rescheduling analyzer. Therefore, disabling the
778 // rescheduler implicitly also disables these optimizations.
779 if (!EnableReschedule
) {
780 POLLY_DEBUG(dbgs() << "Skipping rescheduling due to command line option\n");
781 } else if (HasUserTransformation
) {
783 dbgs() << "Skipping rescheduling due to manual transformation\n");
787 Dependences::TYPE_RAW
| Dependences::TYPE_WAR
| Dependences::TYPE_WAW
;
790 if (OptimizeDeps
== "all")
792 Dependences::TYPE_RAW
| Dependences::TYPE_WAR
| Dependences::TYPE_WAW
;
793 else if (OptimizeDeps
== "raw")
794 ProximityKinds
= Dependences::TYPE_RAW
;
796 errs() << "Do not know how to optimize for '" << OptimizeDeps
<< "'"
797 << " Falling back to optimizing all dependences.\n";
799 Dependences::TYPE_RAW
| Dependences::TYPE_WAR
| Dependences::TYPE_WAW
;
802 isl::union_set Domain
= S
.getDomains();
804 if (Domain
.is_null())
807 isl::union_map Validity
= D
.getDependences(ValidityKinds
);
808 isl::union_map Proximity
= D
.getDependences(ProximityKinds
);
810 // Simplify the dependences by removing the constraints introduced by the
811 // domains. This can speed up the scheduling time significantly, as large
812 // constant coefficients will be removed from the dependences. The
813 // introduction of some additional dependences reduces the possible
814 // transformations, but in most cases, such transformation do not seem to be
815 // interesting anyway. In some cases this option may stop the scheduler to
816 // find any schedule.
817 if (SimplifyDeps
== "yes") {
818 Validity
= Validity
.gist_domain(Domain
);
819 Validity
= Validity
.gist_range(Domain
);
820 Proximity
= Proximity
.gist_domain(Domain
);
821 Proximity
= Proximity
.gist_range(Domain
);
822 } else if (SimplifyDeps
!= "no") {
824 << "warning: Option -polly-opt-simplify-deps should either be 'yes' "
825 "or 'no'. Falling back to default: 'yes'\n";
828 POLLY_DEBUG(dbgs() << "\n\nCompute schedule from: ");
829 POLLY_DEBUG(dbgs() << "Domain := " << Domain
<< ";\n");
830 POLLY_DEBUG(dbgs() << "Proximity := " << Proximity
<< ";\n");
831 POLLY_DEBUG(dbgs() << "Validity := " << Validity
<< ";\n");
833 int IslMaximizeBands
;
834 if (MaximizeBandDepth
== "yes") {
835 IslMaximizeBands
= 1;
836 } else if (MaximizeBandDepth
== "no") {
837 IslMaximizeBands
= 0;
840 << "warning: Option -polly-opt-maximize-bands should either be 'yes'"
841 " or 'no'. Falling back to default: 'yes'\n";
842 IslMaximizeBands
= 1;
845 int IslOuterCoincidence
;
846 if (OuterCoincidence
== "yes") {
847 IslOuterCoincidence
= 1;
848 } else if (OuterCoincidence
== "no") {
849 IslOuterCoincidence
= 0;
851 errs() << "warning: Option -polly-opt-outer-coincidence should either be "
852 "'yes' or 'no'. Falling back to default: 'no'\n";
853 IslOuterCoincidence
= 0;
856 isl_ctx
*Ctx
= S
.getIslCtx().get();
858 isl_options_set_schedule_outer_coincidence(Ctx
, IslOuterCoincidence
);
859 isl_options_set_schedule_maximize_band_depth(Ctx
, IslMaximizeBands
);
860 isl_options_set_schedule_max_constant_term(Ctx
, MaxConstantTerm
);
861 isl_options_set_schedule_max_coefficient(Ctx
, MaxCoefficient
);
862 isl_options_set_tile_scale_tile_loops(Ctx
, 0);
864 auto OnErrorStatus
= isl_options_get_on_error(Ctx
);
865 isl_options_set_on_error(Ctx
, ISL_ON_ERROR_CONTINUE
);
867 auto SC
= isl::schedule_constraints::on_domain(Domain
);
868 SC
= SC
.set_proximity(Proximity
);
869 SC
= SC
.set_validity(Validity
);
870 SC
= SC
.set_coincidence(Validity
);
873 IslMaxOperationsGuard
MaxOpGuard(Ctx
, ScheduleComputeOut
);
874 Schedule
= SC
.compute_schedule();
876 if (MaxOpGuard
.hasQuotaExceeded())
878 dbgs() << "Schedule optimizer calculation exceeds ISL quota\n");
881 isl_options_set_on_error(Ctx
, OnErrorStatus
);
884 POLLY_DEBUG(printSchedule(dbgs(), Schedule
, "After rescheduling"));
887 walkScheduleTreeForStatistics(Schedule
, 1);
889 // In cases the scheduler is not able to optimize the code, we just do not
890 // touch the schedule.
891 if (Schedule
.is_null())
895 isl::union_map Validity
= D
.getDependences(
896 Dependences::TYPE_RAW
| Dependences::TYPE_WAR
| Dependences::TYPE_WAW
);
897 Schedule
= applyGreedyFusion(Schedule
, Validity
);
898 assert(!Schedule
.is_null());
901 // Apply post-rescheduling optimizations (if enabled) and/or prevectorization.
902 const OptimizerAdditionalInfoTy OAI
= {
904 const_cast<Dependences
*>(&D
),
905 /*PatternOpts=*/!HasUserTransformation
&& PMBasedOpts
,
906 /*Postopts=*/!HasUserTransformation
&& EnablePostopts
,
907 /*Prevect=*/PollyVectorizerChoice
!= VECTORIZER_NONE
,
909 if (OAI
.PatternOpts
|| OAI
.Postopts
|| OAI
.Prevect
) {
910 Schedule
= ScheduleTreeOptimizer::optimizeSchedule(Schedule
, &OAI
);
911 Schedule
= hoistExtensionNodes(Schedule
);
912 POLLY_DEBUG(printSchedule(dbgs(), Schedule
, "After post-optimizations"));
913 walkScheduleTreeForStatistics(Schedule
, 2);
916 // Skip profitability check if user transformation(s) have been applied.
917 if (!HasUserTransformation
&&
918 !ScheduleTreeOptimizer::isProfitableSchedule(S
, Schedule
))
921 auto ScopStats
= S
.getStatistics();
923 NumAffineLoopsOptimized
+= ScopStats
.NumAffineLoops
;
924 NumBoxedLoopsOptimized
+= ScopStats
.NumBoxedLoops
;
925 LastSchedule
= Schedule
;
927 S
.setScheduleTree(Schedule
);
/// Legacy pass-manager entry point for the isl schedule optimizer.
///
/// Collects the analyses the optimizer consumes for \p S (dependences,
/// optimization remark emitter, target transform info) and forwards them to
/// runIslScheduleOptimizer together with LastSchedule.
/// NOTE(review): several lines of this definition (including its return
/// statement) are not visible in this extract.
934 bool IslScheduleOptimizerWrapperPass::runOnScop(Scop
&S
) {
937 Function
&F
= S
.getFunction();
// Record the isl context shared with the SCoP in IslCtx.
938 IslCtx
= S
.getSharedIslCtx();
// Dependence provider handed to the optimizer. The analysis level argument is
// unnamed and ignored: statement-level dependences (AL_Statement) are always
// returned.
940 auto getDependences
=
941 [this](Dependences::AnalysisLevel
) -> const Dependences
& {
942 return getAnalysis
<DependenceInfo
>().getDependences(
943 Dependences::AL_Statement
);
945 OptimizationRemarkEmitter
&ORE
=
946 getAnalysis
<OptimizationRemarkEmitterWrapperPass
>().getORE();
// Target information is queried for the function that contains the SCoP.
947 TargetTransformInfo
*TTI
=
948 &getAnalysis
<TargetTransformInfoWrapperPass
>().getTTI(F
);
// Starts out false. NOTE(review): presumably passed as the trailing argument
// of the call below; the continuation of the argument list is not visible.
950 bool DepsChanged
= false;
951 runIslScheduleOptimizer(S
, getDependences
, TTI
, &ORE
, LastSchedule
,
// Ask DependenceInfo to abandon its dependences.
// NOTE(review): any guard around this call is not visible here -- confirm it
// is conditional on DepsChanged.
954 getAnalysis
<DependenceInfo
>().abandonDependences();
/// Print \p LastSchedule to \p OS below a "Calculated schedule:" heading.
///
/// A non-null schedule is rendered to a string with an isl string printer in
/// YAML block style and written to \p OS followed by a newline.
/// NOTE(review): the declarations of p and ScheduleStr, the body of the null
/// branch and any cleanup of the printer/string are not visible in this
/// extract.
958 static void runScheduleOptimizerPrinter(raw_ostream
&OS
,
959 isl::schedule LastSchedule
) {
963 OS
<< "Calculated schedule:\n";
// Null-schedule case; what is printed here is not visible in this extract.
965 if (LastSchedule
.is_null()) {
// Create a string-backed printer on the schedule's isl context.
970 p
= isl_printer_to_str(LastSchedule
.ctx().get());
// Select YAML block style before printing the schedule.
971 p
= isl_printer_set_yaml_style(p
, ISL_YAML_STYLE_BLOCK
);
972 p
= isl_printer_print_schedule(p
, LastSchedule
.get());
// Extract the accumulated text from the printer.
973 ScheduleStr
= isl_printer_get_str(p
);
976 OS
<< ScheduleStr
<< "\n";
981 void IslScheduleOptimizerWrapperPass::printScop(raw_ostream
&OS
, Scop
&) const {
982 runScheduleOptimizerPrinter(OS
, LastSchedule
);
985 void IslScheduleOptimizerWrapperPass::getAnalysisUsage(
986 AnalysisUsage
&AU
) const {
987 ScopPass::getAnalysisUsage(AU
);
988 AU
.addRequired
<DependenceInfo
>();
989 AU
.addRequired
<TargetTransformInfoWrapperPass
>();
990 AU
.addRequired
<OptimizationRemarkEmitterWrapperPass
>();
992 AU
.addPreserved
<DependenceInfo
>();
993 AU
.addPreserved
<OptimizationRemarkEmitterWrapperPass
>();
998 Pass
*polly::createIslScheduleOptimizerWrapperPass() {
999 return new IslScheduleOptimizerWrapperPass();
1002 INITIALIZE_PASS_BEGIN(IslScheduleOptimizerWrapperPass
, "polly-opt-isl",
1003 "Polly - Optimize schedule of SCoP", false, false);
1004 INITIALIZE_PASS_DEPENDENCY(DependenceInfo
);
1005 INITIALIZE_PASS_DEPENDENCY(ScopInfoRegionPass
);
1006 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass
);
1007 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass
);
1008 INITIALIZE_PASS_END(IslScheduleOptimizerWrapperPass
, "polly-opt-isl",
1009 "Polly - Optimize schedule of SCoP", false, false)
/// Shared new-pass-manager driver for the isl schedule optimizer.
///
/// Obtains the DependenceAnalysis result for \p S, runs
/// runIslScheduleOptimizer on it and returns PreservedAnalyses::all().
/// The visible code also writes a "Printing analysis ..." header and the
/// computed schedule to *OS.
/// NOTE(review): the final parameter (the stream dereferenced as *OS) and the
/// conditions guarding the abandon/print steps are not visible in this
/// extract.
1011 static llvm::PreservedAnalyses
1012 runIslScheduleOptimizerUsingNPM(Scop
&S
, ScopAnalysisManager
&SAM
,
1013 ScopStandardAnalysisResults
&SAR
, SPMUpdater
&U
,
1015 DependenceAnalysis::Result
&Deps
= SAM
.getResult
<DependenceAnalysis
>(S
, SAR
);
// Dependence provider for the optimizer: the analysis level argument is
// unnamed and ignored, statement-level dependences are always returned.
1016 auto GetDeps
= [&Deps
](Dependences::AnalysisLevel
) -> const Dependences
& {
1017 return Deps
.getDependences(Dependences::AL_Statement
);
// A remark emitter is constructed locally for the SCoP's function.
1019 OptimizationRemarkEmitter
ORE(&S
.getFunction());
1020 TargetTransformInfo
*TTI
= &SAR
.TTI
;
// LastSchedule and DepsChanged are passed to the optimizer call below.
1021 isl::schedule LastSchedule
;
1022 bool DepsChanged
= false;
1023 runIslScheduleOptimizer(S
, GetDeps
, TTI
, &ORE
, LastSchedule
, DepsChanged
);
// Ask the dependence analysis result to abandon its dependences.
// NOTE(review): presumably guarded by DepsChanged -- the guard is not visible.
1025 Deps
.abandonDependences();
// Header naming the SCoP and its function, followed by the schedule itself.
1028 *OS
<< "Printing analysis 'Polly - Optimize schedule of SCoP' for region: '"
1029 << S
.getName() << "' in function '" << S
.getFunction().getName()
1031 runScheduleOptimizerPrinter(*OS
, LastSchedule
);
1033 return PreservedAnalyses::all();
1036 llvm::PreservedAnalyses
1037 IslScheduleOptimizerPass::run(Scop
&S
, ScopAnalysisManager
&SAM
,
1038 ScopStandardAnalysisResults
&SAR
, SPMUpdater
&U
) {
1039 return runIslScheduleOptimizerUsingNPM(S
, SAM
, SAR
, U
, nullptr);
/// New-pass-manager entry point of the printer pass: forwards to
/// runIslScheduleOptimizerUsingNPM, passing the address of OS as the last
/// argument.
/// NOTE(review): the final parameter line of this signature is not visible in
/// this extract.
1042 llvm::PreservedAnalyses
1043 IslScheduleOptimizerPrinterPass::run(Scop
&S
, ScopAnalysisManager
&SAM
,
1044 ScopStandardAnalysisResults
&SAR
,
1046 return runIslScheduleOptimizerUsingNPM(S
, SAM
, SAR
, U
, &OS
);
1049 //===----------------------------------------------------------------------===//
1052 /// Print result from IslScheduleOptimizerWrapperPass.
///
/// Legacy ScopPass that requires IslScheduleOptimizerWrapperPass and writes a
/// "Printing analysis ..." header for the SCoP to the stored output stream.
/// NOTE(review): parts of this class (access specifiers, the declaration of
/// ID and the remainder of runOnScop) are not visible in this extract.
1053 class IslScheduleOptimizerPrinterLegacyPass final
: public ScopPass
{
// Default construction delegates to the stream constructor with outs().
1057 IslScheduleOptimizerPrinterLegacyPass()
1058 : IslScheduleOptimizerPrinterLegacyPass(outs()) {}
// Binds the pass to the stream \p OS and hands ID to the ScopPass base.
1059 explicit IslScheduleOptimizerPrinterLegacyPass(llvm::raw_ostream
&OS
)
1060 : ScopPass(ID
), OS(OS
) {}
// Fetches the wrapper pass and prints a header naming that pass, the SCoP's
// region and the enclosing function.
1062 bool runOnScop(Scop
&S
) override
{
1063 IslScheduleOptimizerWrapperPass
&P
=
1064 getAnalysis
<IslScheduleOptimizerWrapperPass
>();
1066 OS
<< "Printing analysis '" << P
.getPassName() << "' for region: '"
1067 << S
.getRegion().getNameStr() << "' in function '"
1068 << S
.getFunction().getName() << "':\n";
// Requires the wrapper pass and declares that all analyses are preserved.
1074 void getAnalysisUsage(AnalysisUsage
&AU
) const override
{
1075 ScopPass::getAnalysisUsage(AU
);
1076 AU
.addRequired
<IslScheduleOptimizerWrapperPass
>();
1077 AU
.setPreservesAll();
// Output stream the pass prints to; held by reference.
1081 llvm::raw_ostream
&OS
;
1084 char IslScheduleOptimizerPrinterLegacyPass::ID
= 0;
1087 Pass
*polly::createIslScheduleOptimizerPrinterLegacyPass(raw_ostream
&OS
) {
1088 return new IslScheduleOptimizerPrinterLegacyPass(OS
);
1091 INITIALIZE_PASS_BEGIN(IslScheduleOptimizerPrinterLegacyPass
,
1092 "polly-print-opt-isl",
1093 "Polly - Print optimizer schedule of SCoP", false, false);
1094 INITIALIZE_PASS_DEPENDENCY(IslScheduleOptimizerWrapperPass
)
1095 INITIALIZE_PASS_END(IslScheduleOptimizerPrinterLegacyPass
,
1096 "polly-print-opt-isl",
1097 "Polly - Print optimizer schedule of SCoP", false, false)