diff options
| author | Uday Bondhugula <bondhugula@google.com> | 2019-03-08 09:21:52 -0800 |
|---|---|---|
| committer | jpienaar <jpienaar@google.com> | 2019-03-29 17:06:51 -0700 |
| commit | ce7e59536c32e49f9f3ae8cc959571eb5367c45f (patch) | |
| tree | 12b16dbaac0981eabd1ca82faa9cd1dfb5e5bec5 /mlir/lib/Transforms/LoopTiling.cpp | |
| parent | 8b4b9b31f19846d9721b5cf9ec20d1e97a38707c (diff) | |
| download | bcm5719-llvm-ce7e59536c32e49f9f3ae8cc959571eb5367c45f.tar.gz bcm5719-llvm-ce7e59536c32e49f9f3ae8cc959571eb5367c45f.zip | |
Add a basic model to set tile sizes + some cleanup
- compute tile sizes based on a simple model that looks at memory footprints
(instead of using the hardcoded default value)
- adjust tile sizes to make them factors of trip counts based on an option
- update loop fusion CL options to allow setting maximal fusion at pass creation
- change an emitError to emitWarning (since it's not a hard error unless the client
treats it that way, in which case, it can emit one)
$ mlir-opt -debug-only=loop-tile -loop-tile test/Transforms/loop-tiling.mlir
test/Transforms/loop-tiling.mlir:81:3: note: using tile sizes [4 4 5 ]
for %i = 0 to 256 {
for %i0 = 0 to 256 step 4 {
for %i1 = 0 to 256 step 4 {
for %i2 = 0 to 250 step 5 {
for %i3 = #map4(%i0) to #map11(%i0) {
for %i4 = #map4(%i1) to #map11(%i1) {
for %i5 = #map4(%i2) to #map12(%i2) {
%0 = load %arg0[%i3, %i5] : memref<8x8xvector<64xf32>>
%1 = load %arg1[%i5, %i4] : memref<8x8xvector<64xf32>>
%2 = load %arg2[%i3, %i4] : memref<8x8xvector<64xf32>>
%3 = mulf %0, %1 : vector<64xf32>
%4 = addf %2, %3 : vector<64xf32>
store %4, %arg2[%i3, %i4] : memref<8x8xvector<64xf32>>
}
}
}
}
}
}
PiperOrigin-RevId: 237461836
Diffstat (limited to 'mlir/lib/Transforms/LoopTiling.cpp')
| -rw-r--r-- | mlir/lib/Transforms/LoopTiling.cpp | 164 |
1 files changed, 149 insertions, 15 deletions
diff --git a/mlir/lib/Transforms/LoopTiling.cpp b/mlir/lib/Transforms/LoopTiling.cpp index 7e345bf15fe..4fa9c5a44d3 100644 --- a/mlir/lib/Transforms/LoopTiling.cpp +++ b/mlir/lib/Transforms/LoopTiling.cpp @@ -23,12 +23,15 @@ #include "mlir/Analysis/AffineAnalysis.h" #include "mlir/Analysis/AffineStructures.h" #include "mlir/Analysis/LoopAnalysis.h" +#include "mlir/Analysis/Utils.h" #include "mlir/IR/Builders.h" #include "mlir/Pass/Pass.h" #include "mlir/Transforms/LoopUtils.h" #include "mlir/Transforms/Passes.h" #include "mlir/Transforms/Utils.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include <iomanip> using namespace mlir; @@ -36,34 +39,53 @@ using namespace mlir; static llvm::cl::OptionCategory clOptionsCategory(DEBUG_TYPE " options"); +static llvm::cl::opt<unsigned long long> + clCacheSizeKiB("tile-cache-size", + llvm::cl::desc("Set size of cache to tile for in KiB"), + llvm::cl::cat(clOptionsCategory)); + +// Tile size to use for all loops (overrides -tile-sizes if provided). +static llvm::cl::opt<unsigned> + clTileSize("tile-size", llvm::cl::desc("Use this tile size for all loops"), + llvm::cl::cat(clOptionsCategory)); + // List of tile sizes. If any of them aren't provided, they are filled with // clTileSize / kDefaultTileSize. static llvm::cl::list<unsigned> clTileSizes( "tile-sizes", llvm::cl::desc( - "List of tile sizes for each perfect nest (overrides -tile-size)"), + "List of tile sizes for each perfect nest (overridden by -tile-size)"), llvm::cl::ZeroOrMore, llvm::cl::cat(clOptionsCategory)); namespace { /// A pass to perform loop tiling on all suitable loop nests of a Function. struct LoopTiling : public FunctionPass<LoopTiling> { + explicit LoopTiling(uint64_t cacheSizeBytes = kDefaultCacheMemCapacity, + bool avoidMaxMinBounds = true) + : cacheSizeBytes(cacheSizeBytes), avoidMaxMinBounds(avoidMaxMinBounds) {} + void runOnFunction() override; + void getTileSizes(ArrayRef<OpPointer<AffineForOp>> band, + SmallVectorImpl<unsigned> *tileSizes); + // Default tile size if nothing is provided. constexpr static unsigned kDefaultTileSize = 4; + constexpr static uint64_t kDefaultCacheMemCapacity = 512 * 1024UL; + + // Capacity of the cache to tile for. + uint64_t cacheSizeBytes; + // If true, tile sizes are set to avoid max/min in bounds if possible. + bool avoidMaxMinBounds; }; } // end anonymous namespace -// Tile size to use for all loops (overridden by -tile-sizes if provided). -static llvm::cl::opt<unsigned> - clTileSize("tile-size", llvm::cl::init(LoopTiling::kDefaultTileSize), - llvm::cl::desc("Use this tile size for all loops"), - llvm::cl::cat(clOptionsCategory)); - -/// Creates a pass to perform loop tiling on all suitable loop nests of an +/// Creates a pass to perform loop tiling on all suitable loop nests of a /// Function. -FunctionPassBase *mlir::createLoopTilingPass() { return new LoopTiling(); } +FunctionPassBase *mlir::createLoopTilingPass(uint64_t cacheSizeBytes) { + return new LoopTiling(cacheSizeBytes); +} // Move the loop body of AffineForOp 'src' from 'src' into the specified // location in destination's body. @@ -213,7 +235,7 @@ Status mlir::tileCodeGen(MutableArrayRef<OpPointer<AffineForOp>> band, getIndexSet(band, &cst); if (!cst.isHyperRectangular(0, width)) { - rootAffineForOp->emitError("tiled code generation unimplemented for the" + rootAffineForOp->emitError("tiled code generation unimplemented for the " "non-hyperrectangular case"); return Status::failure(); } @@ -253,18 +275,130 @@ getTileableBands(Function *f, getMaximalPerfectLoopNest(forOp); } +// Reduce each tile size to the largest divisor of the corresponding trip count +// (if the trip count is known). +static void adjustToDivisorsOfTripCounts(ArrayRef<OpPointer<AffineForOp>> band, + SmallVectorImpl<unsigned> *tileSizes) { + assert(band.size() == tileSizes->size() && "invalid tile size count"); + for (unsigned i = 0, e = band.size(); i < e; i++) { + unsigned &tSizeAdjusted = (*tileSizes)[i]; + auto mayConst = getConstantTripCount(band[i]); + if (!mayConst.hasValue()) + continue; + // Adjust the tile size to largest factor of the trip count less than + // tSize. + uint64_t constTripCount = mayConst.getValue(); + if (tSizeAdjusted > constTripCount / 2) + tSizeAdjusted = constTripCount / 2; + while (constTripCount % tSizeAdjusted != 0) + tSizeAdjusted--; + } +} + +// Returns tile sizes to use. Checks CL options; if none are specified, sets it +// based on a simple model that looks at the memory footprint and determines +// tile sizes assuming identity accesses / 1:1 tile size proportional footprint +// along each of the dimensions being tiled. +// TODO(mlir-team): evolve this model. Tile size determination is a large area +// to play with in general. +void LoopTiling::getTileSizes(ArrayRef<OpPointer<AffineForOp>> band, + SmallVectorImpl<unsigned> *tileSizes) { + if (band.empty()) + return; + + tileSizes->resize(band.size()); + + // Use clTileSize for all loops if specified. + if (clTileSize.getNumOccurrences() > 0) { + std::fill(tileSizes->begin(), tileSizes->end(), clTileSize); + return; + } + + // Use clTileSizes and fill them with default tile size if it's short. + if (!clTileSizes.empty()) { + std::fill(tileSizes->begin(), tileSizes->end(), + LoopTiling::kDefaultTileSize); + std::copy(clTileSizes.begin(), + clTileSizes.begin() + std::min(clTileSizes.size(), band.size()), + tileSizes->begin()); + return; + } + + // The first loop in the band. + auto rootForOp = band[0]; + (void)rootForOp; + + // Obtain memory footprint and set tile sizes so that a tile fits in + // the cache size. This is an approximation with the assumption that the + // footprint increases with the tile size linearly in that dimension (i.e., + // assumes one-to-one access function). + auto fp = getMemoryFootprintBytes(band[0], 0); + if (!fp.hasValue()) { + // Fill with default tile sizes if footprint is unknown. + std::fill(tileSizes->begin(), tileSizes->end(), + LoopTiling::kDefaultTileSize); + if (avoidMaxMinBounds) + adjustToDivisorsOfTripCounts(band, tileSizes); + LLVM_DEBUG( + rootForOp->emitWarning("memory footprint unknown: using default tile " + "sizes adjusted to trip count divisors")); + return; + } + + // Check how many times larger the cache size is when compared to footprint. + uint64_t excessFactor = llvm::divideCeil(fp.getValue(), cacheSizeBytes); + if (excessFactor <= 1) { + // No need of any tiling - set tile size to 1. + std::fill(tileSizes->begin(), tileSizes->end(), 1); + return; + } + + // Divide all loops equally in an attempt to reduce footprint. + // TODO(bondhugula): this is approximate. Ideally, obtain reuse factor / + // profitability along each dimension and weight tile sizes based on that as + // one possible approach. Or compute a polynomial in tile sizes and solve for + // it. + + // For an n-d tilable band, compute n^th root of the excess. + unsigned tSize = + static_cast<unsigned>(floorl(std::pow(excessFactor, 1.0 / band.size()))); + // We'll keep a running product to determine the last tile size better. + unsigned cumulProductOfTileSizes = 1; + for (unsigned i = 0, e = band.size(); i < e; i++) { + if (i < e - 1) + (*tileSizes)[i] = tSize; + else + // Set last tile size to cover the balance. + (*tileSizes)[i] = std::max(1UL, excessFactor / cumulProductOfTileSizes); + cumulProductOfTileSizes *= (*tileSizes)[i]; + } + if (avoidMaxMinBounds) + adjustToDivisorsOfTripCounts(band, tileSizes); +} + void LoopTiling::runOnFunction() { + // Override cache size if provided on command line. + if (clCacheSizeKiB.getNumOccurrences() > 0) + cacheSizeBytes = clCacheSizeKiB * 1024; + + // Bands of loops to tile. std::vector<SmallVector<OpPointer<AffineForOp>, 6>> bands; getTileableBands(getFunction(), &bands); for (auto &band : bands) { // Set up tile sizes; fill missing tile sizes at the end with default tile // size or clTileSize if one was provided. - SmallVector<unsigned, 6> tileSizes(band.size(), clTileSize); - std::copy(clTileSizes.begin(), - clTileSizes.begin() + std::min(clTileSizes.size(), band.size()), - tileSizes.begin()); - + SmallVector<unsigned, 6> tileSizes; + getTileSizes(band, &tileSizes); + if (llvm::DebugFlag) { + std::stringstream msg; + msg << "using tile sizes ["; + for (auto tSize : tileSizes) + msg << tSize << " "; + msg << "]\n"; + auto rootForOp = band[0]; + rootForOp->emitNote(msg.str()); + } if (failed(tileCodeGen(band, tileSizes))) return signalPassFailure(); } |

