//===- LoopUnrollAndJam.cpp - Code to perform loop unroll and jam ---------===// // // Copyright 2019 The MLIR Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // ============================================================================= // // This file implements loop unroll and jam. Unroll and jam is a transformation // that improves locality, in particular, register reuse, while also improving // instruction level parallelism. The example below shows what it does in nearly // the general case. Loop unroll and jam currently works if the bounds of the // loops inner to the loop being unroll-jammed do not depend on the latter. // // Before After unroll and jam of i by factor 2: // // for i, step = 2 // for i S1(i); // S1; S2(i); // S2; S1(i+1); // for j S2(i+1); // S3; for j // S4; S3(i, j); // S5; S4(i, j); // S6; S3(i+1, j) // S4(i+1, j) // S5(i); // S6(i); // S5(i+1); // S6(i+1); // // Note: 'if/else' blocks are not jammed. So, if there are loops inside if // inst's, bodies of those loops will not be jammed. //===----------------------------------------------------------------------===// #include "mlir/Transforms/Passes.h" #include "mlir/AffineOps/AffineOps.h" #include "mlir/Analysis/LoopAnalysis.h" #include "mlir/IR/AffineExpr.h" #include "mlir/IR/AffineMap.h" #include "mlir/IR/BlockAndValueMapping.h" #include "mlir/IR/Builders.h" #include "mlir/IR/BuiltinOps.h" #include "mlir/Pass.h" #include "mlir/Transforms/LoopUtils.h" #include "llvm/ADT/DenseMap.h" #include "llvm/Support/CommandLine.h" using namespace mlir; #define DEBUG_TYPE "loop-unroll-jam" static llvm::cl::OptionCategory clOptionsCategory(DEBUG_TYPE " options"); // Loop unroll and jam factor. static llvm::cl::opt clUnrollJamFactor("unroll-jam-factor", llvm::cl::Hidden, llvm::cl::desc("Use this unroll jam factor for all loops" " (default 4)"), llvm::cl::cat(clOptionsCategory)); namespace { /// Loop unroll jam pass. Currently, this just unroll jams the first /// outer loop in a Function. struct LoopUnrollAndJam : public FunctionPass { Optional unrollJamFactor; static const unsigned kDefaultUnrollJamFactor = 4; explicit LoopUnrollAndJam(Optional unrollJamFactor = None) : FunctionPass(&LoopUnrollAndJam::passID), unrollJamFactor(unrollJamFactor) {} PassResult runOnFunction(Function *f) override; bool runOnAffineForOp(OpPointer forOp); static char passID; }; } // end anonymous namespace char LoopUnrollAndJam::passID = 0; FunctionPass *mlir::createLoopUnrollAndJamPass(int unrollJamFactor) { return new LoopUnrollAndJam( unrollJamFactor == -1 ? None : Optional(unrollJamFactor)); } PassResult LoopUnrollAndJam::runOnFunction(Function *f) { // Currently, just the outermost loop from the first loop nest is // unroll-and-jammed by this pass. However, runOnAffineForOp can be called on // any for Inst. auto &entryBlock = f->front(); if (!entryBlock.empty()) if (auto forOp = entryBlock.front().dyn_cast()) runOnAffineForOp(forOp); return success(); } /// Unroll and jam a 'for' inst. Default unroll jam factor is /// kDefaultUnrollJamFactor. Return false if nothing was done. bool LoopUnrollAndJam::runOnAffineForOp(OpPointer forOp) { // Unroll and jam by the factor that was passed if any. if (unrollJamFactor.hasValue()) return loopUnrollJamByFactor(forOp, unrollJamFactor.getValue()); // Otherwise, unroll jam by the command-line factor if one was specified. if (clUnrollJamFactor.getNumOccurrences() > 0) return loopUnrollJamByFactor(forOp, clUnrollJamFactor); // Unroll and jam by four otherwise. return loopUnrollJamByFactor(forOp, kDefaultUnrollJamFactor); } bool mlir::loopUnrollJamUpToFactor(OpPointer forOp, uint64_t unrollJamFactor) { Optional mayBeConstantTripCount = getConstantTripCount(forOp); if (mayBeConstantTripCount.hasValue() && mayBeConstantTripCount.getValue() < unrollJamFactor) return loopUnrollJamByFactor(forOp, mayBeConstantTripCount.getValue()); return loopUnrollJamByFactor(forOp, unrollJamFactor); } /// Unrolls and jams this loop by the specified factor. bool mlir::loopUnrollJamByFactor(OpPointer forOp, uint64_t unrollJamFactor) { // Gathers all maximal sub-blocks of instructions that do not themselves // include a for inst (a instruction could have a descendant for inst though // in its tree). struct JamBlockGatherer { // Store iterators to the first and last inst of each sub-block found. std::vector> subBlocks; // This is a linear time walk. void walk(Instruction *inst) { for (auto &blockList : inst->getBlockLists()) for (auto &block : blockList) walk(block); } void walk(Block &block) { for (auto it = block.begin(), e = block.end(); it != e;) { auto subBlockStart = it; while (it != e && !it->isa()) ++it; if (it != subBlockStart) subBlocks.push_back({subBlockStart, std::prev(it)}); // Process all for insts that appear next. while (it != e && it->isa()) walk(&*it++); } } }; assert(unrollJamFactor >= 1 && "unroll jam factor should be >= 1"); if (unrollJamFactor == 1 || forOp->getBody()->empty()) return false; Optional mayBeConstantTripCount = getConstantTripCount(forOp); if (!mayBeConstantTripCount.hasValue() && getLargestDivisorOfTripCount(forOp) % unrollJamFactor != 0) return false; auto lbMap = forOp->getLowerBoundMap(); auto ubMap = forOp->getUpperBoundMap(); // Loops with max/min expressions won't be unrolled here (the output can't be // expressed as a Function in the general case). However, the right way to // do such unrolling for a Function would be to specialize the loop for the // 'hotspot' case and unroll that hotspot. if (lbMap.getNumResults() != 1 || ubMap.getNumResults() != 1) return false; // Same operand list for lower and upper bound for now. // TODO(bondhugula): handle bounds with different sets of operands. if (!forOp->matchingBoundOperandList()) return false; // If the trip count is lower than the unroll jam factor, no unroll jam. // TODO(bondhugula): option to specify cleanup loop unrolling. if (mayBeConstantTripCount.hasValue() && mayBeConstantTripCount.getValue() < unrollJamFactor) return false; auto *forInst = forOp->getInstruction(); // Gather all sub-blocks to jam upon the loop being unrolled. JamBlockGatherer jbg; jbg.walk(forInst); auto &subBlocks = jbg.subBlocks; // Generate the cleanup loop if trip count isn't a multiple of // unrollJamFactor. if (mayBeConstantTripCount.hasValue() && mayBeConstantTripCount.getValue() % unrollJamFactor != 0) { // Insert the cleanup loop right after 'forOp'. FuncBuilder builder(forInst->getBlock(), std::next(Block::iterator(forInst))); auto cleanupAffineForOp = builder.clone(*forInst)->cast(); cleanupAffineForOp->setLowerBoundMap( getCleanupLoopLowerBound(forOp, unrollJamFactor, &builder)); // The upper bound needs to be adjusted. forOp->setUpperBoundMap( getUnrolledLoopUpperBound(forOp, unrollJamFactor, &builder)); // Promote the loop body up if this has turned into a single iteration loop. promoteIfSingleIteration(cleanupAffineForOp); } // Scale the step of loop being unroll-jammed by the unroll-jam factor. int64_t step = forOp->getStep(); forOp->setStep(step * unrollJamFactor); auto *forOpIV = forOp->getInductionVar(); for (auto &subBlock : subBlocks) { // Builder to insert unroll-jammed bodies. Insert right at the end of // sub-block. FuncBuilder builder(subBlock.first->getBlock(), std::next(subBlock.second)); // Unroll and jam (appends unrollJamFactor-1 additional copies). for (unsigned i = 1; i < unrollJamFactor; i++) { BlockAndValueMapping operandMapping; // If the induction variable is used, create a remapping to the value for // this unrolled instance. if (!forOpIV->use_empty()) { // iv' = iv + i, i = 1 to unrollJamFactor-1. auto d0 = builder.getAffineDimExpr(0); auto bumpMap = builder.getAffineMap(1, 0, {d0 + i * step}, {}); auto ivUnroll = builder.create(forInst->getLoc(), bumpMap, forOpIV); operandMapping.map(forOpIV, ivUnroll); } // Clone the sub-block being unroll-jammed. for (auto it = subBlock.first; it != std::next(subBlock.second); ++it) { builder.clone(*it, operandMapping); } } } // Promote the loop body up if this has turned into a single iteration loop. promoteIfSingleIteration(forOp); return true; } static PassRegistration pass("loop-unroll-jam", "Unroll and jam loops");