diff options
Diffstat (limited to 'mlir/lib')
| -rw-r--r-- | mlir/lib/Analysis/AffineStructures.cpp | 2 | ||||
| -rw-r--r-- | mlir/lib/Transforms/LoopFusion.cpp | 47 | ||||
| -rw-r--r-- | mlir/lib/Transforms/LoopTiling.cpp | 164 |
3 files changed, 181 insertions, 32 deletions
diff --git a/mlir/lib/Analysis/AffineStructures.cpp b/mlir/lib/Analysis/AffineStructures.cpp index df6776b8184..0687526ce73 100644 --- a/mlir/lib/Analysis/AffineStructures.cpp +++ b/mlir/lib/Analysis/AffineStructures.cpp @@ -736,7 +736,7 @@ FlatAffineConstraints::addAffineForOpDomain(ConstOpPointer<AffineForOp> forOp) { FlatAffineConstraints localVarCst; std::vector<SmallVector<int64_t, 8>> flatExprs; if (failed(getFlattenedAffineExprs(boundMap, &flatExprs, &localVarCst))) { - forOp->emitError("semi-affine expressions not yet supported"); + forOp->emitWarning("semi-affine expressions not yet supported"); return Status::failure(); } // Set values for localVarCst. diff --git a/mlir/lib/Transforms/LoopFusion.cpp b/mlir/lib/Transforms/LoopFusion.cpp index 3786177f9ec..4e619ef36a4 100644 --- a/mlir/lib/Transforms/LoopFusion.cpp +++ b/mlir/lib/Transforms/LoopFusion.cpp @@ -48,7 +48,9 @@ using namespace mlir; static llvm::cl::OptionCategory clOptionsCategory(DEBUG_TYPE " options"); -/// Disables fusion profitability check and fuses if valid. +/// Disables fusion profitability check and fuses if valid. Ignore any +/// additional (redundant) computation tolerance threshold +/// that would have prevented fusion. static llvm::cl::opt<bool> clMaximalLoopFusion("fusion-maximal", llvm::cl::desc("Enables maximal loop fusion"), @@ -66,8 +68,8 @@ static llvm::cl::opt<unsigned> clFusionFastMemorySpace( llvm::cl::desc("Faster memory space number to promote fusion buffers to"), llvm::cl::cat(clOptionsCategory)); -// A local buffer of size less than or equal to this size is promoted to fast -// memory. +// A local buffer of size less than or equal to this size is automatically +// promoted to fast memory after producer-consumer fusion. static llvm::cl::opt<unsigned long long> clFusionLocalBufThreshold( "fusion-local-buf-threshold", llvm::cl::desc("Threshold size (KiB) for promoting local buffers to fast " @@ -86,9 +88,10 @@ namespace { // and add support for more general loop fusion algorithms. struct LoopFusion : public FunctionPass<LoopFusion> { - LoopFusion(unsigned fastMemorySpace = 0, uint64_t localBufSizeThreshold = 0) + LoopFusion(unsigned fastMemorySpace = 0, uint64_t localBufSizeThreshold = 0, + bool maximalFusion = false) : localBufSizeThreshold(localBufSizeThreshold), - fastMemorySpace(fastMemorySpace) {} + fastMemorySpace(fastMemorySpace), maximalFusion(maximalFusion) {} void runOnFunction() override; @@ -96,6 +99,9 @@ struct LoopFusion : public FunctionPass<LoopFusion> { // `fastMemorySpace` if provided. uint64_t localBufSizeThreshold; Optional<unsigned> fastMemorySpace = None; + // If true, ignore any additional (redundant) computation tolerance threshold + // that would have prevented fusion. + bool maximalFusion; // The amount of additional computation that is tolerated while fusing // pair-wise as a fraction of the total computation. @@ -105,8 +111,9 @@ struct LoopFusion : public FunctionPass<LoopFusion> { } // end anonymous namespace FunctionPassBase *mlir::createLoopFusionPass(unsigned fastMemorySpace, - uint64_t localBufSizeThreshold) { - return new LoopFusion(fastMemorySpace, localBufSizeThreshold); + uint64_t localBufSizeThreshold, + bool maximalFusion) { + return new LoopFusion(fastMemorySpace, localBufSizeThreshold, maximalFusion); } namespace { @@ -1411,7 +1418,7 @@ static bool isFusionProfitable(Instruction *srcOpInst, ArrayRef<Instruction *> dstLoadOpInsts, ArrayRef<Instruction *> dstStoreOpInsts, ComputationSliceState *sliceState, - unsigned *dstLoopDepth) { + unsigned *dstLoopDepth, bool maximalFusion) { LLVM_DEBUG({ llvm::dbgs() << "Checking whether fusion is profitable between:\n"; llvm::dbgs() << " " << *srcOpInst << " and \n"; @@ -1620,7 +1627,7 @@ static bool isFusionProfitable(Instruction *srcOpInst, // (as per computeToleranceThreshold), we will simply pick the one that // reduces the intermediary size the most. if ((storageReduction > maxStorageReduction) && - (clMaximalLoopFusion || + (maximalFusion || (additionalComputeFraction < computeToleranceThreshold))) { maxStorageReduction = storageReduction; bestDstLoopDepth = i; @@ -1632,7 +1639,7 @@ static bool isFusionProfitable(Instruction *srcOpInst, // A simple cost model: fuse if it reduces the memory footprint. If // -maximal-fusion is set, fuse nevertheless. - if (!clMaximalLoopFusion && !bestDstLoopDepth.hasValue()) { + if (!maximalFusion && !bestDstLoopDepth.hasValue()) { LLVM_DEBUG( llvm::dbgs() << "All fusion choices involve more than the threshold amount of " @@ -1661,7 +1668,7 @@ static bool isFusionProfitable(Instruction *srcOpInst, Optional<double> storageReduction = None; - if (!clMaximalLoopFusion) { + if (!maximalFusion) { if (!dstMemSize.hasValue() || !srcMemSize.hasValue()) { LLVM_DEBUG( llvm::dbgs() @@ -1785,13 +1792,16 @@ public: unsigned localBufSizeThreshold; // Parameter for fast memory space. Optional<unsigned> fastMemorySpace; + // If true, ignore any additional (redundant) computation tolerance threshold + // that would have prevented fusion. + bool maximalFusion; using Node = MemRefDependenceGraph::Node; GreedyFusion(MemRefDependenceGraph *mdg, unsigned localBufSizeThreshold, - Optional<unsigned> fastMemorySpace) + Optional<unsigned> fastMemorySpace, bool maximalFusion) : mdg(mdg), localBufSizeThreshold(localBufSizeThreshold), - fastMemorySpace(fastMemorySpace) {} + fastMemorySpace(fastMemorySpace), maximalFusion(maximalFusion) {} // Initializes 'worklist' with nodes from 'mdg' void init() { @@ -1917,7 +1927,7 @@ public: // Check if fusion would be profitable. if (!isFusionProfitable(srcStoreOpInst, srcStoreOpInst, dstLoadOpInsts, dstStoreOpInsts, &sliceState, - &bestDstLoopDepth)) + &bestDstLoopDepth, maximalFusion)) continue; // Fuse computation slice of 'srcLoopNest' into 'dstLoopNest'. @@ -2076,7 +2086,8 @@ public: // Check if fusion would be profitable. if (!isFusionProfitable(sibLoadOpInst, sibStoreOpInst, dstLoadOpInsts, - dstStoreOpInsts, &sliceState, &bestDstLoopDepth)) + dstStoreOpInsts, &sliceState, &bestDstLoopDepth, + maximalFusion)) continue; // Fuse computation slice of 'sibLoopNest' into 'dstLoopNest'. @@ -2231,9 +2242,13 @@ void LoopFusion::runOnFunction() { localBufSizeThreshold = clFusionLocalBufThreshold * 1024; } + if (clMaximalLoopFusion.getNumOccurrences() > 0) + maximalFusion = clMaximalLoopFusion; + MemRefDependenceGraph g; if (g.init(getFunction())) - GreedyFusion(&g, localBufSizeThreshold, fastMemorySpace).run(); + GreedyFusion(&g, localBufSizeThreshold, fastMemorySpace, maximalFusion) + .run(); } static PassRegistration<LoopFusion> pass("loop-fusion", "Fuse loop nests"); diff --git a/mlir/lib/Transforms/LoopTiling.cpp b/mlir/lib/Transforms/LoopTiling.cpp index 7e345bf15fe..4fa9c5a44d3 100644 --- a/mlir/lib/Transforms/LoopTiling.cpp +++ b/mlir/lib/Transforms/LoopTiling.cpp @@ -23,12 +23,15 @@ #include "mlir/Analysis/AffineAnalysis.h" #include "mlir/Analysis/AffineStructures.h" #include "mlir/Analysis/LoopAnalysis.h" +#include "mlir/Analysis/Utils.h" #include "mlir/IR/Builders.h" #include "mlir/Pass/Pass.h" #include "mlir/Transforms/LoopUtils.h" #include "mlir/Transforms/Passes.h" #include "mlir/Transforms/Utils.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include <iomanip> using namespace mlir; @@ -36,34 +39,53 @@ using namespace mlir; static llvm::cl::OptionCategory clOptionsCategory(DEBUG_TYPE " options"); +static llvm::cl::opt<unsigned long long> + clCacheSizeKiB("tile-cache-size", + llvm::cl::desc("Set size of cache to tile for in KiB"), + llvm::cl::cat(clOptionsCategory)); + +// Tile size to use for all loops (overrides -tile-sizes if provided). +static llvm::cl::opt<unsigned> + clTileSize("tile-size", llvm::cl::desc("Use this tile size for all loops"), + llvm::cl::cat(clOptionsCategory)); + // List of tile sizes. If any of them aren't provided, they are filled with // clTileSize / kDefaultTileSize. static llvm::cl::list<unsigned> clTileSizes( "tile-sizes", llvm::cl::desc( - "List of tile sizes for each perfect nest (overrides -tile-size)"), + "List of tile sizes for each perfect nest (overridden by -tile-size)"), llvm::cl::ZeroOrMore, llvm::cl::cat(clOptionsCategory)); namespace { /// A pass to perform loop tiling on all suitable loop nests of a Function. struct LoopTiling : public FunctionPass<LoopTiling> { + explicit LoopTiling(uint64_t cacheSizeBytes = kDefaultCacheMemCapacity, + bool avoidMaxMinBounds = true) + : cacheSizeBytes(cacheSizeBytes), avoidMaxMinBounds(avoidMaxMinBounds) {} + void runOnFunction() override; + void getTileSizes(ArrayRef<OpPointer<AffineForOp>> band, + SmallVectorImpl<unsigned> *tileSizes); + // Default tile size if nothing is provided. constexpr static unsigned kDefaultTileSize = 4; + constexpr static uint64_t kDefaultCacheMemCapacity = 512 * 1024UL; + + // Capacity of the cache to tile for. + uint64_t cacheSizeBytes; + // If true, tile sizes are set to avoid max/min in bounds if possible. + bool avoidMaxMinBounds; }; } // end anonymous namespace -// Tile size to use for all loops (overridden by -tile-sizes if provided). -static llvm::cl::opt<unsigned> - clTileSize("tile-size", llvm::cl::init(LoopTiling::kDefaultTileSize), - llvm::cl::desc("Use this tile size for all loops"), - llvm::cl::cat(clOptionsCategory)); - -/// Creates a pass to perform loop tiling on all suitable loop nests of an +/// Creates a pass to perform loop tiling on all suitable loop nests of a /// Function. -FunctionPassBase *mlir::createLoopTilingPass() { return new LoopTiling(); } +FunctionPassBase *mlir::createLoopTilingPass(uint64_t cacheSizeBytes) { + return new LoopTiling(cacheSizeBytes); +} // Move the loop body of AffineForOp 'src' from 'src' into the specified // location in destination's body. @@ -213,7 +235,7 @@ Status mlir::tileCodeGen(MutableArrayRef<OpPointer<AffineForOp>> band, getIndexSet(band, &cst); if (!cst.isHyperRectangular(0, width)) { - rootAffineForOp->emitError("tiled code generation unimplemented for the" + rootAffineForOp->emitError("tiled code generation unimplemented for the " "non-hyperrectangular case"); return Status::failure(); } @@ -253,18 +275,130 @@ getTileableBands(Function *f, getMaximalPerfectLoopNest(forOp); } +// Reduce each tile size to the largest divisor of the corresponding trip count +// (if the trip count is known). +static void adjustToDivisorsOfTripCounts(ArrayRef<OpPointer<AffineForOp>> band, + SmallVectorImpl<unsigned> *tileSizes) { + assert(band.size() == tileSizes->size() && "invalid tile size count"); + for (unsigned i = 0, e = band.size(); i < e; i++) { + unsigned &tSizeAdjusted = (*tileSizes)[i]; + auto mayConst = getConstantTripCount(band[i]); + if (!mayConst.hasValue()) + continue; + // Adjust the tile size to largest factor of the trip count less than + // tSize. + uint64_t constTripCount = mayConst.getValue(); + if (tSizeAdjusted > constTripCount / 2) + tSizeAdjusted = constTripCount / 2; + while (constTripCount % tSizeAdjusted != 0) + tSizeAdjusted--; + } +} + +// Returns tile sizes to use. Checks CL options; if none are specified, sets it +// based on a simple model that looks at the memory footprint and determines +// tile sizes assuming identity accesses / 1:1 tile size proportional footprint +// along each of the dimensions being tiled. +// TODO(mlir-team): evolve this model. Tile size determination is a large area +// to play with in general. +void LoopTiling::getTileSizes(ArrayRef<OpPointer<AffineForOp>> band, + SmallVectorImpl<unsigned> *tileSizes) { + if (band.empty()) + return; + + tileSizes->resize(band.size()); + + // Use clTileSize for all loops if specified. + if (clTileSize.getNumOccurrences() > 0) { + std::fill(tileSizes->begin(), tileSizes->end(), clTileSize); + return; + } + + // Use clTileSizes and fill them with default tile size if it's short. + if (!clTileSizes.empty()) { + std::fill(tileSizes->begin(), tileSizes->end(), + LoopTiling::kDefaultTileSize); + std::copy(clTileSizes.begin(), + clTileSizes.begin() + std::min(clTileSizes.size(), band.size()), + tileSizes->begin()); + return; + } + + // The first loop in the band. + auto rootForOp = band[0]; + (void)rootForOp; + + // Obtain memory footprint and set tile sizes so that a tile fits in + // the cache size. This is an approximation with the assumption that the + // footprint increases with the tile size linearly in that dimension (i.e., + // assumes one-to-one access function). + auto fp = getMemoryFootprintBytes(band[0], 0); + if (!fp.hasValue()) { + // Fill with default tile sizes if footprint is unknown. + std::fill(tileSizes->begin(), tileSizes->end(), + LoopTiling::kDefaultTileSize); + if (avoidMaxMinBounds) + adjustToDivisorsOfTripCounts(band, tileSizes); + LLVM_DEBUG( + rootForOp->emitWarning("memory footprint unknown: using default tile " + "sizes adjusted to trip count divisors")); + return; + } + + // Check how many times larger the cache size is when compared to footprint. + uint64_t excessFactor = llvm::divideCeil(fp.getValue(), cacheSizeBytes); + if (excessFactor <= 1) { + // No need of any tiling - set tile size to 1. + std::fill(tileSizes->begin(), tileSizes->end(), 1); + return; + } + + // Divide all loops equally in an attempt to reduce footprint. + // TODO(bondhugula): this is approximate. Ideally, obtain reuse factor / + // profitability along each dimension and weight tile sizes based on that as + // one possible approach. Or compute a polynomial in tile sizes and solve for + // it. + + // For an n-d tilable band, compute n^th root of the excess. + unsigned tSize = + static_cast<unsigned>(floorl(std::pow(excessFactor, 1.0 / band.size()))); + // We'll keep a running product to determine the last tile size better. + unsigned cumulProductOfTileSizes = 1; + for (unsigned i = 0, e = band.size(); i < e; i++) { + if (i < e - 1) + (*tileSizes)[i] = tSize; + else + // Set last tile size to cover the balance. + (*tileSizes)[i] = std::max(1UL, excessFactor / cumulProductOfTileSizes); + cumulProductOfTileSizes *= (*tileSizes)[i]; + } + if (avoidMaxMinBounds) + adjustToDivisorsOfTripCounts(band, tileSizes); +} + void LoopTiling::runOnFunction() { + // Override cache size if provided on command line. + if (clCacheSizeKiB.getNumOccurrences() > 0) + cacheSizeBytes = clCacheSizeKiB * 1024; + + // Bands of loops to tile. std::vector<SmallVector<OpPointer<AffineForOp>, 6>> bands; getTileableBands(getFunction(), &bands); for (auto &band : bands) { // Set up tile sizes; fill missing tile sizes at the end with default tile // size or clTileSize if one was provided. - SmallVector<unsigned, 6> tileSizes(band.size(), clTileSize); - std::copy(clTileSizes.begin(), - clTileSizes.begin() + std::min(clTileSizes.size(), band.size()), - tileSizes.begin()); - + SmallVector<unsigned, 6> tileSizes; + getTileSizes(band, &tileSizes); + if (llvm::DebugFlag) { + std::stringstream msg; + msg << "using tile sizes ["; + for (auto tSize : tileSizes) + msg << tSize << " "; + msg << "]\n"; + auto rootForOp = band[0]; + rootForOp->emitNote(msg.str()); + } if (failed(tileCodeGen(band, tileSizes))) return signalPassFailure(); } |

