[MemProf] Templatize CallStackRadixTreeBuilder (NFC) (#117014)
[llvm-project.git] / flang / lib / Optimizer / Transforms / LoopVersioning.cpp
blobadc39861840ab1eb1ef1a091f5152a9f7b7a498e
1 //===- LoopVersioning.cpp -------------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
9 //===----------------------------------------------------------------------===//
10 /// \file
11 /// This pass looks for loops iterating over assumed-shape arrays, that can
12 /// be optimized by "guessing" that the stride is element-sized.
13 ///
14 /// This is done by creating two versions of the same loop: one which assumes
15 /// that the elements are contiguous (stride == size of element), and one that
16 /// is the original generic loop.
17 ///
18 /// As a side-effect of the assumed element size stride, the array is also
19 /// flattened to make it a 1D array - this is because the internal array
20 /// structure must be either 1D or have known sizes in all dimensions - and at
21 /// least one of the dimensions here is already unknown.
22 ///
23 /// There are two distinct benefits here:
24 /// 1. The loop that iterates over the elements is somewhat simplified by the
25 /// constant stride calculation.
26 /// 2. Since the compiler can understand the size of the stride, it can use
27 /// vector instructions, where an unknown (at compile time) stride does often
28 /// prevent vector operations from being used.
29 ///
30 /// A known drawback is that the code-size is increased, in some cases that can
31 /// be quite substantial - 3-4x is quite plausible (this includes that the loop
32 /// gets vectorized, which in itself often more than doubles the size of the
33 /// code, because unless the loop size is known, there will be a modulo
34 /// vector-size remainder to deal with).
35 ///
36 /// TODO: Do we need some size limit where loops no longer get duplicated?
37 /// Maybe some sort of cost analysis.
38 /// TODO: Should some loop content - for example calls to functions and
39 /// subroutines - inhibit the versioning of the loops? Plausibly, this
40 /// could be part of the cost analysis above.
41 //===----------------------------------------------------------------------===//
43 #include "flang/ISO_Fortran_binding_wrapper.h"
44 #include "flang/Optimizer/Builder/BoxValue.h"
45 #include "flang/Optimizer/Builder/FIRBuilder.h"
46 #include "flang/Optimizer/Builder/Runtime/Inquiry.h"
47 #include "flang/Optimizer/Dialect/FIRDialect.h"
48 #include "flang/Optimizer/Dialect/FIROps.h"
49 #include "flang/Optimizer/Dialect/FIRType.h"
50 #include "flang/Optimizer/Dialect/Support/FIRContext.h"
51 #include "flang/Optimizer/Dialect/Support/KindMapping.h"
52 #include "flang/Optimizer/Support/DataLayout.h"
53 #include "flang/Optimizer/Transforms/Passes.h"
54 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
55 #include "mlir/IR/Dominance.h"
56 #include "mlir/IR/Matchers.h"
57 #include "mlir/IR/TypeUtilities.h"
58 #include "mlir/Pass/Pass.h"
59 #include "mlir/Transforms/DialectConversion.h"
60 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
61 #include "mlir/Transforms/RegionUtils.h"
62 #include "llvm/Support/Debug.h"
63 #include "llvm/Support/raw_ostream.h"
65 #include <algorithm>
67 namespace fir {
68 #define GEN_PASS_DEF_LOOPVERSIONING
69 #include "flang/Optimizer/Transforms/Passes.h.inc"
70 } // namespace fir
72 #define DEBUG_TYPE "flang-loop-versioning"
74 namespace {
76 class LoopVersioningPass
77 : public fir::impl::LoopVersioningBase<LoopVersioningPass> {
78 public:
79 void runOnOperation() override;
82 /// @struct ArgInfo
83 /// A structure to hold an argument, the size of the argument and dimension
84 /// information.
85 struct ArgInfo {
86 mlir::Value arg;
87 size_t size;
88 unsigned rank;
89 fir::BoxDimsOp dims[CFI_MAX_RANK];
92 /// @struct ArgsUsageInLoop
93 /// A structure providing information about the function arguments
94 /// usage by the instructions immediately nested in a loop.
95 struct ArgsUsageInLoop {
96 /// Mapping between the memref operand of an array indexing
97 /// operation (e.g. fir.coordinate_of) and the argument information.
98 llvm::DenseMap<mlir::Value, ArgInfo> usageInfo;
99 /// Some array indexing operations inside a loop cannot be transformed.
100 /// This vector holds the memref operands of such operations.
101 /// The vector is used to make sure that we do not try to transform
102 /// any outer loop, since this will imply the operation rewrite
103 /// in this loop.
104 llvm::SetVector<mlir::Value> cannotTransform;
106 // Debug dump of the structure members assuming that
107 // the information has been collected for the given loop.
108 void dump(fir::DoLoopOp loop) const {
109 LLVM_DEBUG({
110 mlir::OpPrintingFlags printFlags;
111 printFlags.skipRegions();
112 llvm::dbgs() << "Arguments usage info for loop:\n";
113 loop.print(llvm::dbgs(), printFlags);
114 llvm::dbgs() << "\nUsed args:\n";
115 for (auto &use : usageInfo) {
116 mlir::Value v = use.first;
117 v.print(llvm::dbgs(), printFlags);
118 llvm::dbgs() << "\n";
120 llvm::dbgs() << "\nCannot transform args:\n";
121 for (mlir::Value arg : cannotTransform) {
122 arg.print(llvm::dbgs(), printFlags);
123 llvm::dbgs() << "\n";
125 llvm::dbgs() << "====\n";
129 // Erase usageInfo and cannotTransform entries for a set
130 // of given arguments.
131 void eraseUsage(const llvm::SetVector<mlir::Value> &args) {
132 for (auto &arg : args)
133 usageInfo.erase(arg);
134 cannotTransform.set_subtract(args);
137 // Erase usageInfo and cannotTransform entries for a set
138 // of given arguments provided in the form of usageInfo map.
139 void eraseUsage(const llvm::DenseMap<mlir::Value, ArgInfo> &args) {
140 for (auto &arg : args) {
141 usageInfo.erase(arg.first);
142 cannotTransform.remove(arg.first);
146 } // namespace
148 static fir::SequenceType getAsSequenceType(mlir::Value *v) {
149 mlir::Type argTy = fir::unwrapPassByRefType(fir::unwrapRefType(v->getType()));
150 return mlir::dyn_cast<fir::SequenceType>(argTy);
153 /// if a value comes from a fir.declare, follow it to the original source,
154 /// otherwise return the value
155 static mlir::Value unwrapFirDeclare(mlir::Value val) {
156 // fir.declare is for source code variables. We don't have declares of
157 // declares
158 if (fir::DeclareOp declare = val.getDefiningOp<fir::DeclareOp>())
159 return declare.getMemref();
160 return val;
163 /// if a value comes from a fir.rebox, follow the rebox to the original source,
164 /// of the value, otherwise return the value
165 static mlir::Value unwrapReboxOp(mlir::Value val) {
166 // don't support reboxes of reboxes
167 if (fir::ReboxOp rebox = val.getDefiningOp<fir::ReboxOp>())
168 val = rebox.getBox();
169 return val;
172 /// normalize a value (removing fir.declare and fir.rebox) so that we can
173 /// more conveniently spot values which came from function arguments
174 static mlir::Value normaliseVal(mlir::Value val) {
175 return unwrapFirDeclare(unwrapReboxOp(val));
178 /// some FIR operations accept a fir.shape, a fir.shift or a fir.shapeshift.
179 /// fir.shift and fir.shapeshift allow us to extract lower bounds
180 /// if lowerbounds cannot be found, return nullptr
181 static mlir::Value tryGetLowerBoundsFromShapeLike(mlir::Value shapeLike,
182 unsigned dim) {
183 mlir::Value lowerBound{nullptr};
184 if (auto shift = shapeLike.getDefiningOp<fir::ShiftOp>())
185 lowerBound = shift.getOrigins()[dim];
186 if (auto shapeShift = shapeLike.getDefiningOp<fir::ShapeShiftOp>())
187 lowerBound = shapeShift.getOrigins()[dim];
188 return lowerBound;
191 /// attempt to get the array lower bounds of dimension dim of the memref
192 /// argument to a fir.array_coor op
193 /// 0 <= dim < rank
194 /// May return nullptr if no lower bounds can be determined
195 static mlir::Value getLowerBound(fir::ArrayCoorOp coop, unsigned dim) {
196 // 1) try to get from the shape argument to fir.array_coor
197 if (mlir::Value shapeLike = coop.getShape())
198 if (mlir::Value lb = tryGetLowerBoundsFromShapeLike(shapeLike, dim))
199 return lb;
201 // It is important not to try to read the lower bound from the box, because
202 // in the FIR lowering, boxes will sometimes contain incorrect lower bound
203 // information
205 // out of ideas
206 return {};
209 /// gets the i'th index from array coordinate operation op
210 /// dim should range between 0 and rank - 1
211 static mlir::Value getIndex(fir::FirOpBuilder &builder, mlir::Operation *op,
212 unsigned dim) {
213 if (fir::CoordinateOp coop = mlir::dyn_cast<fir::CoordinateOp>(op))
214 return coop.getCoor()[dim];
216 fir::ArrayCoorOp coop = mlir::dyn_cast<fir::ArrayCoorOp>(op);
217 assert(coop &&
218 "operation must be either fir.coordiante_of or fir.array_coor");
220 // fir.coordinate_of indices start at 0: adjust these indices to match by
221 // subtracting the lower bound
222 mlir::Value index = coop.getIndices()[dim];
223 mlir::Value lb = getLowerBound(coop, dim);
224 if (!lb)
225 // assume a default lower bound of one
226 lb = builder.createIntegerConstant(coop.getLoc(), index.getType(), 1);
228 // index_0 = index - lb;
229 if (lb.getType() != index.getType())
230 lb = builder.createConvert(coop.getLoc(), index.getType(), lb);
231 return builder.create<mlir::arith::SubIOp>(coop.getLoc(), index, lb);
234 void LoopVersioningPass::runOnOperation() {
235 LLVM_DEBUG(llvm::dbgs() << "=== Begin " DEBUG_TYPE " ===\n");
236 mlir::func::FuncOp func = getOperation();
238 // First look for arguments with assumed shape = unknown extent in the lowest
239 // dimension.
240 LLVM_DEBUG(llvm::dbgs() << "Func-name:" << func.getSymName() << "\n");
241 mlir::Block::BlockArgListType args = func.getArguments();
242 mlir::ModuleOp module = func->getParentOfType<mlir::ModuleOp>();
243 fir::KindMapping kindMap = fir::getKindMapping(module);
244 mlir::SmallVector<ArgInfo, 4> argsOfInterest;
245 std::optional<mlir::DataLayout> dl =
246 fir::support::getOrSetDataLayout(module, /*allowDefaultLayout=*/false);
247 if (!dl)
248 mlir::emitError(module.getLoc(),
249 "data layout attribute is required to perform " DEBUG_TYPE
250 "pass");
251 for (auto &arg : args) {
252 // Optional arguments must be checked for IsPresent before
253 // looking for the bounds. They are unsupported for the time being.
254 if (func.getArgAttrOfType<mlir::UnitAttr>(arg.getArgNumber(),
255 fir::getOptionalAttrName())) {
256 LLVM_DEBUG(llvm::dbgs() << "OPTIONAL is not supported\n");
257 continue;
260 if (auto seqTy = getAsSequenceType(&arg)) {
261 unsigned rank = seqTy.getDimension();
262 if (rank > 0 &&
263 seqTy.getShape()[0] == fir::SequenceType::getUnknownExtent()) {
264 size_t typeSize = 0;
265 mlir::Type elementType = fir::unwrapSeqOrBoxedSeqType(arg.getType());
266 if (mlir::isa<mlir::FloatType>(elementType) ||
267 mlir::isa<mlir::IntegerType>(elementType) ||
268 mlir::isa<mlir::ComplexType>(elementType)) {
269 auto [eleSize, eleAlign] = fir::getTypeSizeAndAlignmentOrCrash(
270 arg.getLoc(), elementType, *dl, kindMap);
271 typeSize = llvm::alignTo(eleSize, eleAlign);
273 if (typeSize)
274 argsOfInterest.push_back({arg, typeSize, rank, {}});
275 else
276 LLVM_DEBUG(llvm::dbgs() << "Type not supported\n");
281 if (argsOfInterest.empty()) {
282 LLVM_DEBUG(llvm::dbgs()
283 << "No suitable arguments.\n=== End " DEBUG_TYPE " ===\n");
284 return;
287 // A list of all loops in the function in post-order.
288 mlir::SmallVector<fir::DoLoopOp> originalLoops;
289 // Information about the arguments usage by the instructions
290 // immediately nested in a loop.
291 llvm::DenseMap<fir::DoLoopOp, ArgsUsageInLoop> argsInLoops;
293 auto &domInfo = getAnalysis<mlir::DominanceInfo>();
295 // Traverse the loops in post-order and see
296 // if those arguments are used inside any loop.
297 func.walk([&](fir::DoLoopOp loop) {
298 mlir::Block &body = *loop.getBody();
299 auto &argsInLoop = argsInLoops[loop];
300 originalLoops.push_back(loop);
301 body.walk([&](mlir::Operation *op) {
302 // Support either fir.array_coor or fir.coordinate_of.
303 if (!mlir::isa<fir::ArrayCoorOp, fir::CoordinateOp>(op))
304 return;
305 // Process only operations immediately nested in the current loop.
306 if (op->getParentOfType<fir::DoLoopOp>() != loop)
307 return;
308 mlir::Value operand = op->getOperand(0);
309 for (auto a : argsOfInterest) {
310 if (a.arg == normaliseVal(operand)) {
311 // Use the reboxed value, not the block arg when re-creating the loop.
312 a.arg = operand;
314 // Check that the operand dominates the loop?
315 // If this is the case, record such operands in argsInLoop.cannot-
316 // Transform, so that they disable the transformation for the parent
317 /// loops as well.
318 if (!domInfo.dominates(a.arg, loop))
319 argsInLoop.cannotTransform.insert(a.arg);
321 // No support currently for sliced arrays.
322 // This means that we cannot transform properly
323 // instructions referencing a.arg in the whole loop
324 // nest this loop is located in.
325 if (auto arrayCoor = mlir::dyn_cast<fir::ArrayCoorOp>(op))
326 if (arrayCoor.getSlice())
327 argsInLoop.cannotTransform.insert(a.arg);
329 if (argsInLoop.cannotTransform.contains(a.arg)) {
330 // Remove any previously recorded usage, if any.
331 argsInLoop.usageInfo.erase(a.arg);
332 break;
335 // Record the a.arg usage, if not recorded yet.
336 argsInLoop.usageInfo.try_emplace(a.arg, a);
337 break;
343 // Dump loops info after initial collection.
344 LLVM_DEBUG({
345 llvm::dbgs() << "Initial usage info:\n";
346 for (fir::DoLoopOp loop : originalLoops) {
347 auto &argsInLoop = argsInLoops[loop];
348 argsInLoop.dump(loop);
352 // Clear argument usage for parent loops if an inner loop
353 // contains a non-transformable usage.
354 for (fir::DoLoopOp loop : originalLoops) {
355 auto &argsInLoop = argsInLoops[loop];
356 if (argsInLoop.cannotTransform.empty())
357 continue;
359 fir::DoLoopOp parent = loop;
360 while ((parent = parent->getParentOfType<fir::DoLoopOp>()))
361 argsInLoops[parent].eraseUsage(argsInLoop.cannotTransform);
364 // If an argument access can be optimized in a loop and
365 // its descendant loop, then it does not make sense to
366 // generate the contiguity check for the descendant loop.
367 // The check will be produced as part of the ancestor
368 // loop's transformation. So we can clear the argument
369 // usage for all descendant loops.
370 for (fir::DoLoopOp loop : originalLoops) {
371 auto &argsInLoop = argsInLoops[loop];
372 if (argsInLoop.usageInfo.empty())
373 continue;
375 loop.getBody()->walk([&](fir::DoLoopOp dloop) {
376 argsInLoops[dloop].eraseUsage(argsInLoop.usageInfo);
380 LLVM_DEBUG({
381 llvm::dbgs() << "Final usage info:\n";
382 for (fir::DoLoopOp loop : originalLoops) {
383 auto &argsInLoop = argsInLoops[loop];
384 argsInLoop.dump(loop);
388 // Reduce the collected information to a list of loops
389 // with attached arguments usage information.
390 // The list must hold the loops in post order, so that
391 // the inner loops are transformed before the outer loops.
392 struct OpsWithArgs {
393 mlir::Operation *op;
394 mlir::SmallVector<ArgInfo, 4> argsAndDims;
396 mlir::SmallVector<OpsWithArgs, 4> loopsOfInterest;
397 for (fir::DoLoopOp loop : originalLoops) {
398 auto &argsInLoop = argsInLoops[loop];
399 if (argsInLoop.usageInfo.empty())
400 continue;
401 OpsWithArgs info;
402 info.op = loop;
403 for (auto &arg : argsInLoop.usageInfo)
404 info.argsAndDims.push_back(arg.second);
405 loopsOfInterest.emplace_back(std::move(info));
408 if (loopsOfInterest.empty()) {
409 LLVM_DEBUG(llvm::dbgs()
410 << "No loops to transform.\n=== End " DEBUG_TYPE " ===\n");
411 return;
414 // If we get here, there are loops to process.
415 fir::FirOpBuilder builder{module, std::move(kindMap)};
416 mlir::Location loc = builder.getUnknownLoc();
417 mlir::IndexType idxTy = builder.getIndexType();
419 LLVM_DEBUG(llvm::dbgs() << "Module Before transformation:");
420 LLVM_DEBUG(module->dump());
422 LLVM_DEBUG(llvm::dbgs() << "loopsOfInterest: " << loopsOfInterest.size()
423 << "\n");
424 for (auto op : loopsOfInterest) {
425 LLVM_DEBUG(op.op->dump());
426 builder.setInsertionPoint(op.op);
428 mlir::Value allCompares = nullptr;
429 // Ensure all of the arrays are unit-stride.
430 for (auto &arg : op.argsAndDims) {
431 // Fetch all the dimensions of the array, except the last dimension.
432 // Always fetch the first dimension, however, so set ndims = 1 if
433 // we have one dim
434 unsigned ndims = arg.rank;
435 for (unsigned i = 0; i < ndims; i++) {
436 mlir::Value dimIdx = builder.createIntegerConstant(loc, idxTy, i);
437 arg.dims[i] = builder.create<fir::BoxDimsOp>(loc, idxTy, idxTy, idxTy,
438 arg.arg, dimIdx);
440 // We only care about lowest order dimension, here.
441 mlir::Value elemSize =
442 builder.createIntegerConstant(loc, idxTy, arg.size);
443 mlir::Value cmp = builder.create<mlir::arith::CmpIOp>(
444 loc, mlir::arith::CmpIPredicate::eq, arg.dims[0].getResult(2),
445 elemSize);
446 if (!allCompares) {
447 allCompares = cmp;
448 } else {
449 allCompares =
450 builder.create<mlir::arith::AndIOp>(loc, cmp, allCompares);
454 auto ifOp =
455 builder.create<fir::IfOp>(loc, op.op->getResultTypes(), allCompares,
456 /*withElse=*/true);
457 builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
459 LLVM_DEBUG(llvm::dbgs() << "Creating cloned loop\n");
460 mlir::Operation *clonedLoop = op.op->clone();
461 bool changed = false;
462 for (auto &arg : op.argsAndDims) {
463 fir::SequenceType::Shape newShape;
464 newShape.push_back(fir::SequenceType::getUnknownExtent());
465 auto elementType = fir::unwrapSeqOrBoxedSeqType(arg.arg.getType());
466 mlir::Type arrTy = fir::SequenceType::get(newShape, elementType);
467 mlir::Type boxArrTy = fir::BoxType::get(arrTy);
468 mlir::Type refArrTy = builder.getRefType(arrTy);
469 auto carg = builder.create<fir::ConvertOp>(loc, boxArrTy, arg.arg);
470 auto caddr = builder.create<fir::BoxAddrOp>(loc, refArrTy, carg);
471 auto insPt = builder.saveInsertionPoint();
472 // Use caddr instead of arg.
473 clonedLoop->walk([&](mlir::Operation *coop) {
474 if (!mlir::isa<fir::CoordinateOp, fir::ArrayCoorOp>(coop))
475 return;
476 // Reduce the multi-dimensioned index to a single index.
477 // This is required becase fir arrays do not support multiple dimensions
478 // with unknown dimensions at compile time.
479 // We then calculate the multidimensional array like this:
480 // arr(x, y, z) bedcomes arr(z * stride(2) + y * stride(1) + x)
481 // where stride is the distance between elements in the dimensions
482 // 0, 1 and 2 or x, y and z.
483 if (coop->getOperand(0) == arg.arg && coop->getOperands().size() >= 2) {
484 builder.setInsertionPoint(coop);
485 mlir::Value totalIndex;
486 for (unsigned i = arg.rank - 1; i > 0; i--) {
487 mlir::Value curIndex =
488 builder.createConvert(loc, idxTy, getIndex(builder, coop, i));
489 // Multiply by the stride of this array. Later we'll divide by the
490 // element size.
491 mlir::Value scale =
492 builder.createConvert(loc, idxTy, arg.dims[i].getResult(2));
493 curIndex =
494 builder.create<mlir::arith::MulIOp>(loc, scale, curIndex);
495 totalIndex = (totalIndex) ? builder.create<mlir::arith::AddIOp>(
496 loc, curIndex, totalIndex)
497 : curIndex;
499 // This is the lowest dimension - which doesn't need scaling
500 mlir::Value finalIndex =
501 builder.createConvert(loc, idxTy, getIndex(builder, coop, 0));
502 if (totalIndex) {
503 assert(llvm::isPowerOf2_32(arg.size) &&
504 "Expected power of two here");
505 unsigned bits = llvm::Log2_32(arg.size);
506 mlir::Value elemShift =
507 builder.createIntegerConstant(loc, idxTy, bits);
508 totalIndex = builder.create<mlir::arith::AddIOp>(
509 loc,
510 builder.create<mlir::arith::ShRSIOp>(loc, totalIndex,
511 elemShift),
512 finalIndex);
513 } else {
514 totalIndex = finalIndex;
516 auto newOp = builder.create<fir::CoordinateOp>(
517 loc, builder.getRefType(elementType), caddr,
518 mlir::ValueRange{totalIndex});
519 LLVM_DEBUG(newOp->dump());
520 coop->getResult(0).replaceAllUsesWith(newOp->getResult(0));
521 coop->erase();
522 changed = true;
526 builder.restoreInsertionPoint(insPt);
528 assert(changed && "Expected operations to have changed");
530 builder.insert(clonedLoop);
531 // Forward the result(s), if any, from the loop operation to the
533 mlir::ResultRange results = clonedLoop->getResults();
534 bool hasResults = (results.size() > 0);
535 if (hasResults)
536 builder.create<fir::ResultOp>(loc, results);
538 // Add the original loop in the else-side of the if operation.
539 builder.setInsertionPointToStart(&ifOp.getElseRegion().front());
540 op.op->replaceAllUsesWith(ifOp);
541 op.op->remove();
542 builder.insert(op.op);
543 // Rely on "cloned loop has results, so original loop also has results".
544 if (hasResults) {
545 builder.create<fir::ResultOp>(loc, op.op->getResults());
546 } else {
547 // Use an assert to check this.
548 assert(op.op->getResults().size() == 0 &&
549 "Weird, the cloned loop doesn't have results, but the original "
550 "does?");
554 LLVM_DEBUG(llvm::dbgs() << "After transform:\n");
555 LLVM_DEBUG(module->dump());
557 LLVM_DEBUG(llvm::dbgs() << "=== End " DEBUG_TYPE " ===\n");