diff options
Diffstat (limited to 'mlir/lib/Transforms/LoopUnrollAndJam.cpp')
-rw-r--r-- | mlir/lib/Transforms/LoopUnrollAndJam.cpp | 235 |
1 files changed, 235 insertions, 0 deletions
diff --git a/mlir/lib/Transforms/LoopUnrollAndJam.cpp b/mlir/lib/Transforms/LoopUnrollAndJam.cpp new file mode 100644 index 00000000000..6c74d545497 --- /dev/null +++ b/mlir/lib/Transforms/LoopUnrollAndJam.cpp @@ -0,0 +1,235 @@ +//===- LoopUnrollAndJam.cpp - Code to perform loop unroll and jam ---------===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements loop unroll and jam. Unroll and jam is a transformation +// that improves locality, in particular, register reuse, while also improving +// operation level parallelism. The example below shows what it does in nearly +// the general case. Loop unroll and jam currently works if the bounds of the +// loops inner to the loop being unroll-jammed do not depend on the latter. +// +// Before After unroll and jam of i by factor 2: +// +// for i, step = 2 +// for i S1(i); +// S1; S2(i); +// S2; S1(i+1); +// for j S2(i+1); +// S3; for j +// S4; S3(i, j); +// S5; S4(i, j); +// S6; S3(i+1, j) +// S4(i+1, j) +// S5(i); +// S6(i); +// S5(i+1); +// S6(i+1); +// +// Note: 'if/else' blocks are not jammed. So, if there are loops inside if +// op's, bodies of those loops will not be jammed. +//===----------------------------------------------------------------------===// +#include "mlir/Transforms/Passes.h" + +#include "mlir/Analysis/LoopAnalysis.h" +#include "mlir/Dialect/AffineOps/AffineOps.h" +#include "mlir/IR/AffineExpr.h" +#include "mlir/IR/AffineMap.h" +#include "mlir/IR/BlockAndValueMapping.h" +#include "mlir/IR/Builders.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Transforms/LoopUtils.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/Support/CommandLine.h" + +using namespace mlir; + +#define DEBUG_TYPE "affine-loop-unroll-jam" + +static llvm::cl::OptionCategory clOptionsCategory(DEBUG_TYPE " options"); + +// Loop unroll and jam factor. +static llvm::cl::opt<unsigned> + clUnrollJamFactor("unroll-jam-factor", llvm::cl::Hidden, + llvm::cl::desc("Use this unroll jam factor for all loops" + " (default 4)"), + llvm::cl::cat(clOptionsCategory)); + +namespace { +/// Loop unroll jam pass. Currently, this just unroll jams the first +/// outer loop in a Function. +struct LoopUnrollAndJam : public FunctionPass<LoopUnrollAndJam> { + Optional<unsigned> unrollJamFactor; + static const unsigned kDefaultUnrollJamFactor = 4; + + explicit LoopUnrollAndJam(Optional<unsigned> unrollJamFactor = None) + : unrollJamFactor(unrollJamFactor) {} + + void runOnFunction() override; + LogicalResult runOnAffineForOp(AffineForOp forOp); +}; +} // end anonymous namespace + +std::unique_ptr<OpPassBase<FuncOp>> +mlir::createLoopUnrollAndJamPass(int unrollJamFactor) { + return std::make_unique<LoopUnrollAndJam>( + unrollJamFactor == -1 ? None : Optional<unsigned>(unrollJamFactor)); +} + +void LoopUnrollAndJam::runOnFunction() { + // Currently, just the outermost loop from the first loop nest is + // unroll-and-jammed by this pass. However, runOnAffineForOp can be called on + // any for operation. + auto &entryBlock = getFunction().front(); + if (auto forOp = dyn_cast<AffineForOp>(entryBlock.front())) + runOnAffineForOp(forOp); +} + +/// Unroll and jam a 'affine.for' op. Default unroll jam factor is +/// kDefaultUnrollJamFactor. Return failure if nothing was done. +LogicalResult LoopUnrollAndJam::runOnAffineForOp(AffineForOp forOp) { + // Unroll and jam by the factor that was passed if any. + if (unrollJamFactor.hasValue()) + return loopUnrollJamByFactor(forOp, unrollJamFactor.getValue()); + // Otherwise, unroll jam by the command-line factor if one was specified. + if (clUnrollJamFactor.getNumOccurrences() > 0) + return loopUnrollJamByFactor(forOp, clUnrollJamFactor); + + // Unroll and jam by four otherwise. + return loopUnrollJamByFactor(forOp, kDefaultUnrollJamFactor); +} + +LogicalResult mlir::loopUnrollJamUpToFactor(AffineForOp forOp, + uint64_t unrollJamFactor) { + Optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp); + + if (mayBeConstantTripCount.hasValue() && + mayBeConstantTripCount.getValue() < unrollJamFactor) + return loopUnrollJamByFactor(forOp, mayBeConstantTripCount.getValue()); + return loopUnrollJamByFactor(forOp, unrollJamFactor); +} + +/// Unrolls and jams this loop by the specified factor. +LogicalResult mlir::loopUnrollJamByFactor(AffineForOp forOp, + uint64_t unrollJamFactor) { + // Gathers all maximal sub-blocks of operations that do not themselves + // include a for op (a operation could have a descendant for op though + // in its tree). Ignore the block terminators. + struct JamBlockGatherer { + // Store iterators to the first and last op of each sub-block found. + std::vector<std::pair<Block::iterator, Block::iterator>> subBlocks; + + // This is a linear time walk. + void walk(Operation *op) { + for (auto ®ion : op->getRegions()) + for (auto &block : region) + walk(block); + } + void walk(Block &block) { + for (auto it = block.begin(), e = std::prev(block.end()); it != e;) { + auto subBlockStart = it; + while (it != e && !isa<AffineForOp>(&*it)) + ++it; + if (it != subBlockStart) + subBlocks.push_back({subBlockStart, std::prev(it)}); + // Process all for insts that appear next. + while (it != e && isa<AffineForOp>(&*it)) + walk(&*it++); + } + } + }; + + assert(unrollJamFactor >= 1 && "unroll jam factor should be >= 1"); + + if (unrollJamFactor == 1) + return promoteIfSingleIteration(forOp); + + if (forOp.getBody()->empty() || + forOp.getBody()->begin() == std::prev(forOp.getBody()->end())) + return failure(); + + // Loops where both lower and upper bounds are multi-result maps won't be + // unrolled (since the trip can't be expressed as an affine function in + // general). + // TODO(mlir-team): this may not be common, but we could support the case + // where the lower bound is a multi-result map and the ub is a single result + // one. + if (forOp.getLowerBoundMap().getNumResults() != 1) + return failure(); + + Optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp); + // If the trip count is lower than the unroll jam factor, no unroll jam. + if (mayBeConstantTripCount.hasValue() && + mayBeConstantTripCount.getValue() < unrollJamFactor) + return failure(); + + auto *forInst = forOp.getOperation(); + + // Gather all sub-blocks to jam upon the loop being unrolled. + JamBlockGatherer jbg; + jbg.walk(forInst); + auto &subBlocks = jbg.subBlocks; + + // Generate the cleanup loop if trip count isn't a multiple of + // unrollJamFactor. + if (getLargestDivisorOfTripCount(forOp) % unrollJamFactor != 0) { + // Insert the cleanup loop right after 'forOp'. + OpBuilder builder(forInst->getBlock(), std::next(Block::iterator(forInst))); + auto cleanupAffineForOp = cast<AffineForOp>(builder.clone(*forInst)); + // Adjust the lower bound of the cleanup loop; its upper bound is the same + // as the original loop's upper bound. + AffineMap cleanupMap; + SmallVector<Value, 4> cleanupOperands; + getCleanupLoopLowerBound(forOp, unrollJamFactor, &cleanupMap, + &cleanupOperands, builder); + cleanupAffineForOp.setLowerBound(cleanupOperands, cleanupMap); + + // Promote the cleanup loop if it has turned into a single iteration loop. + promoteIfSingleIteration(cleanupAffineForOp); + + // Adjust the upper bound of the original loop - it will be the same as the + // cleanup loop's lower bound. Its lower bound remains unchanged. + forOp.setUpperBound(cleanupOperands, cleanupMap); + } + + // Scale the step of loop being unroll-jammed by the unroll-jam factor. + int64_t step = forOp.getStep(); + forOp.setStep(step * unrollJamFactor); + + auto forOpIV = forOp.getInductionVar(); + // Unroll and jam (appends unrollJamFactor - 1 additional copies). + for (unsigned i = unrollJamFactor - 1; i >= 1; --i) { + // Operand map persists across all sub-blocks. + BlockAndValueMapping operandMapping; + for (auto &subBlock : subBlocks) { + // Builder to insert unroll-jammed bodies. Insert right at the end of + // sub-block. + OpBuilder builder(subBlock.first->getBlock(), std::next(subBlock.second)); + + // If the induction variable is used, create a remapping to the value for + // this unrolled instance. + if (!forOpIV->use_empty()) { + // iv' = iv + i, i = 1 to unrollJamFactor-1. + auto d0 = builder.getAffineDimExpr(0); + auto bumpMap = AffineMap::get(1, 0, {d0 + i * step}); + auto ivUnroll = + builder.create<AffineApplyOp>(forInst->getLoc(), bumpMap, forOpIV); + operandMapping.map(forOpIV, ivUnroll); + } + // Clone the sub-block being unroll-jammed. + for (auto it = subBlock.first; it != std::next(subBlock.second); ++it) { + builder.clone(*it, operandMapping); + } + } + } + + // Promote the loop body up if this has turned into a single iteration loop. + promoteIfSingleIteration(forOp); + return success(); +} + +static PassRegistration<LoopUnrollAndJam> pass("affine-loop-unroll-jam", + "Unroll and jam loops"); |