Diffstat (limited to 'mlir/lib')
 mlir/lib/AffineOps/AffineOps.cpp             |  27
 mlir/lib/Analysis/Utils.cpp                  |  56
 mlir/lib/IR/AffineStructures.cpp             |   1
 mlir/lib/Transforms/DmaGeneration.cpp        | 118
 mlir/lib/Transforms/LoopTiling.cpp           |   7
 mlir/lib/Transforms/PipelineDataTransfer.cpp |   4
 6 files changed, 128 insertions(+), 85 deletions(-)
diff --git a/mlir/lib/AffineOps/AffineOps.cpp b/mlir/lib/AffineOps/AffineOps.cpp
index 0dea1441b08..1859a640a45 100644
--- a/mlir/lib/AffineOps/AffineOps.cpp
+++ b/mlir/lib/AffineOps/AffineOps.cpp
@@ -1047,18 +1047,19 @@ bool mlir::addAffineForOpDomain(ConstOpPointer<AffineForOp> forOp,
     LLVM_DEBUG(llvm::dbgs()
                << "Domain conservative: non-unit stride not handled\n");
 
+  int64_t step = forOp->getStep();
+
   // Adds a lower or upper bound when the bounds aren't constant.
   auto addLowerOrUpperBound = [&](bool lower) -> bool {
     auto operands =
         lower ? forOp->getLowerBoundOperands() : forOp->getUpperBoundOperands();
     for (const auto &operand : operands) {
-      unsigned loc;
-      if (!constraints->findId(*operand, &loc)) {
+      unsigned pos;
+      if (!constraints->findId(*operand, &pos)) {
         if (isValidSymbol(operand)) {
           constraints->addSymbolId(constraints->getNumSymbolIds(),
                                    const_cast<Value *>(operand));
-          loc =
-              constraints->getNumDimIds() + constraints->getNumSymbolIds() - 1;
+          pos = constraints->getNumDimAndSymbolIds() - 1;
           // Check if the symbol is a constant.
           if (auto *opInst = operand->getDefiningInst()) {
             if (auto constOp = opInst->dyn_cast<ConstantIndexOp>()) {
@@ -1068,17 +1069,22 @@ bool mlir::addAffineForOpDomain(ConstOpPointer<AffineForOp> forOp,
         } else {
           constraints->addDimId(constraints->getNumDimIds(),
                                 const_cast<Value *>(operand));
-          loc = constraints->getNumDimIds() - 1;
+          pos = constraints->getNumDimIds() - 1;
+          if (auto loop = getForInductionVarOwner(operand)) {
+            // Outer loop IVs could be used in forOp's bounds.
+            if (!addAffineForOpDomain(loop, constraints))
+              return false;
+          }
         }
       }
     }
     // Record positions of the operands in the constraint system.
     SmallVector<unsigned, 8> positions;
     for (const auto &operand : operands) {
-      unsigned loc;
-      if (!constraints->findId(*operand, &loc))
+      unsigned pos;
+      if (!constraints->findId(*operand, &pos))
         assert(0 && "expected to be found");
-      positions.push_back(loc);
+      positions.push_back(pos);
     }
 
     auto boundMap =
@@ -1106,7 +1112,7 @@ bool mlir::addAffineForOpDomain(ConstOpPointer<AffineForOp> forOp,
       ineq[constraints->getNumCols() - 1] =
           lower ? -flatExpr[flatExpr.size() - 1]
                 // Upper bound in flattenedExpr is an exclusive one.
-                : flatExpr[flatExpr.size() - 1] - 1;
+                : flatExpr[flatExpr.size() - 1] - step;
       constraints->addInequality(ineq);
     }
     return true;
@@ -1121,7 +1127,8 @@ bool mlir::addAffineForOpDomain(ConstOpPointer<AffineForOp> forOp,
   }
 
   if (forOp->hasConstantUpperBound()) {
-    constraints->addConstantUpperBound(pos, forOp->getConstantUpperBound() - 1);
+    constraints->addConstantUpperBound(pos,
+                                       forOp->getConstantUpperBound() - step);
     return true;
   }
   // Non-constant upper bound case.
diff --git a/mlir/lib/Analysis/Utils.cpp b/mlir/lib/Analysis/Utils.cpp
index 821f19df31d..3fbea075386 100644
--- a/mlir/lib/Analysis/Utils.cpp
+++ b/mlir/lib/Analysis/Utils.cpp
@@ -28,6 +28,7 @@
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/StandardOps/StandardOps.h"
+#include "llvm/ADT/DenseMap.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 
@@ -35,6 +36,8 @@
 
 using namespace mlir;
 
+using llvm::SmallDenseMap;
+
 /// Populates 'loops' with IVs of the loops surrounding 'inst' ordered from
 /// the outermost 'for' instruction to the innermost one.
 void mlir::getLoopIVs(const Instruction &inst,
@@ -133,6 +136,9 @@ bool MemRefRegion::compute(Instruction *inst, unsigned loopDepth,
 
   unsigned rank = access.getRank();
 
+  LLVM_DEBUG(llvm::dbgs() << "MemRefRegion::compute: " << *inst
+                          << "depth: " << loopDepth << "\n";);
+
   if (rank == 0) {
     SmallVector<OpPointer<AffineForOp>, 4> ivs;
     getLoopIVs(*inst, &ivs);
@@ -607,36 +613,41 @@ unsigned mlir::getNumCommonSurroundingLoops(const Instruction &A,
   return numCommonLoops;
 }
 
-Optional<int64_t>
-mlir::getMemoryFootprintBytes(ConstOpPointer<AffineForOp> forOp,
-                              int memorySpace) {
-  return getMemoryFootprintBytes(*forOp->getBody(), memorySpace);
-}
+static Optional<int64_t> getMemoryFootprintBytes(const Block &block,
+                                                 Block::const_iterator start,
+                                                 Block::const_iterator end,
+                                                 int memorySpace) {
+  SmallDenseMap<Value *, std::unique_ptr<MemRefRegion>, 4> regions;
 
-Optional<int64_t> mlir::getMemoryFootprintBytes(const Block &block,
-                                                int memorySpace) {
-  std::vector<std::unique_ptr<MemRefRegion>> regions;
+  // Cast away constness since the walker uses non-const versions; but we
+  // guarantee that the visitor callback isn't mutating opInst.
+  auto *cStart = reinterpret_cast<Block::iterator *>(&start);
+  auto *cEnd = reinterpret_cast<Block::iterator *>(&end);
 
   // Walk this 'for' instruction to gather all memory regions.
   bool error = false;
-  const_cast<Block *>(&block)->walk([&](Instruction *opInst) {
+  const_cast<Block *>(&block)->walk(*cStart, *cEnd, [&](Instruction *opInst) {
     if (!opInst->isa<LoadOp>() && !opInst->isa<StoreOp>()) {
       // Neither load nor a store op.
      return;
    }
 
-    // TODO(bondhugula): eventually, we need to be performing a union across
-    // all regions for a given memref instead of creating one region per
-    // memory op. This way we would be allocating O(num of memref's) sets
-    // instead of O(num of load/store op's).
+    // Compute the memref region symbolic in any IVs enclosing this block.
     auto region = std::make_unique<MemRefRegion>(opInst->getLoc());
-    if (!region->compute(opInst, /*loopDepth=*/0)) {
-      LLVM_DEBUG(llvm::dbgs() << "Error obtaining memory region\n");
-      // TODO: stop the walk if an error occurred.
+    if (!region->compute(opInst,
+                         /*loopDepth=*/getNestingDepth(*block.begin()))) {
+      opInst->emitError("Error obtaining memory region\n");
+      error = true;
+      return;
+    }
+    auto it = regions.find(region->memref);
+    if (it == regions.end()) {
+      regions[region->memref] = std::move(region);
+    } else if (!it->second->unionBoundingBox(*region)) {
+      opInst->emitError("Error performing a union on a memory region\n");
       error = true;
       return;
     }
-    regions.push_back(std::move(region));
   });
 
   if (error)
@@ -644,10 +655,19 @@ Optional<int64_t> mlir::getMemoryFootprintBytes(const Block &block,
 
   int64_t totalSizeInBytes = 0;
   for (const auto &region : regions) {
-    auto size = region->getRegionSize();
+    Optional<int64_t> size = region.second->getRegionSize();
     if (!size.hasValue())
       return None;
     totalSizeInBytes += size.getValue();
   }
   return totalSizeInBytes;
 }
+
+Optional<int64_t>
+mlir::getMemoryFootprintBytes(ConstOpPointer<AffineForOp> forOp,
+                              int memorySpace) {
+  auto *forInst = forOp->getInstruction();
+  return ::getMemoryFootprintBytes(
+      *forInst->getBlock(), Block::const_iterator(forInst),
+      std::next(Block::const_iterator(forInst)), memorySpace);
+}
diff --git a/mlir/lib/IR/AffineStructures.cpp b/mlir/lib/IR/AffineStructures.cpp
index 5179e935007..d043e78f059 100644
--- a/mlir/lib/IR/AffineStructures.cpp
+++ b/mlir/lib/IR/AffineStructures.cpp
@@ -2059,6 +2059,7 @@ bool FlatAffineConstraints::unionBoundingBox(
     auto extent = getConstantBoundOnDimSize(d, &lb, &lbDivisor);
     if (!extent.hasValue())
       // TODO(bondhugula): symbolic extents when necessary.
+      // TODO(bondhugula): handle union if a dimension is unbounded.
       return false;
 
     otherLb.clear();
diff --git a/mlir/lib/Transforms/DmaGeneration.cpp b/mlir/lib/Transforms/DmaGeneration.cpp
index 06aa3758712..abf693657be 100644
--- a/mlir/lib/Transforms/DmaGeneration.cpp
+++ b/mlir/lib/Transforms/DmaGeneration.cpp
@@ -49,14 +49,18 @@ static llvm::cl::opt<unsigned> clFastMemoryCapacity(
         "Set fast memory space capacity in KiB (default: unlimited)"),
     llvm::cl::cat(clOptionsCategory));
 
-static const unsigned kDefaultFastMemorySpace = 1;
-
 static llvm::cl::opt<unsigned> clFastMemorySpace(
-    "dma-fast-mem-space", llvm::cl::init(kDefaultFastMemorySpace),
+    "dma-fast-mem-space", llvm::cl::init(1),
     llvm::cl::desc(
         "Fast memory space identifier for DMA generation (default: 1)"),
     llvm::cl::cat(clOptionsCategory));
 
+static llvm::cl::opt<bool> clSkipNonUnitStrideLoop(
+    "dma-skip-non-unit-stride-loops", llvm::cl::Hidden, llvm::cl::init(false),
+    llvm::cl::desc("Testing purposes: avoid non-unit stride loop choice depths "
+                   "for DMA placement"),
+    llvm::cl::cat(clOptionsCategory));
+
 namespace {
 
 /// Generates DMAs for memref's living in 'slowMemorySpace' into newly created
@@ -76,7 +80,7 @@ struct DmaGeneration : public FunctionPass {
         fastMemCapacityBytes(fastMemCapacityBytes) {}
 
   PassResult runOnFunction(Function *f) override;
-  bool runOnBlock(Block *block, uint64_t consumedCapacityBytes);
+  bool runOnBlock(Block *block);
   uint64_t runOnBlock(Block::iterator begin, Block::iterator end);
 
   bool generateDma(const MemRefRegion &region, Block *block,
@@ -457,12 +461,10 @@ bool DmaGeneration::generateDma(const MemRefRegion &region, Block *block,
 /// `regions`; each region is either a sequence of one or more instructions
 /// starting and ending with a load or store op, or just a loop (which could
 /// have other loops nested within). Returns false on an error, true otherwise.
-bool DmaGeneration::runOnBlock(Block *block, uint64_t consumedCapacityBytes) {
+bool DmaGeneration::runOnBlock(Block *block) {
   if (block->empty())
     return true;
 
-  uint64_t priorConsumedCapacityBytes = consumedCapacityBytes;
-
   // Every loop in the block starts and ends a region. A contiguous sequence of
   // operation instructions starting and ending with a load/store op is also
   // identified as a region. Straightline code (contiguous chunks of operation
@@ -482,35 +484,37 @@ bool DmaGeneration::runOnBlock(Block *block, uint64_t consumedCapacityBytes) {
 
   for (auto it = curBegin; it != block->end(); ++it) {
     if (auto forOp = it->dyn_cast<AffineForOp>()) {
-      // We'll assume for now that loops with steps are tiled loops, and so DMAs
-      // are not performed for that depth, but only further inside.
-      // If the memory footprint of the 'for' loop is higher than fast
-      // memory capacity (when provided), we recurse to DMA at an inner level
-      // until we find a depth at which footprint fits in the capacity. If the
-      // footprint can't be calcuated, we assume for now it fits.
-
-      // Returns true if the footprint is known to exceed capacity.
       auto exceedsCapacity = [&](OpPointer<AffineForOp> forOp) {
-        Optional<int64_t> footprint;
-        return ((footprint = getMemoryFootprintBytes(forOp, 0)).hasValue() &&
-                consumedCapacityBytes +
-                        static_cast<uint64_t>(footprint.getValue()) >
+        Optional<int64_t> footprint =
+            getMemoryFootprintBytes(forOp,
+                                    /*memorySpace=*/0);
+        return (footprint.hasValue() &&
+                static_cast<uint64_t>(footprint.getValue()) >
                     fastMemCapacityBytes);
       };
 
-      if (forOp->getStep() != 1 || exceedsCapacity(forOp)) {
-        // We'll split and do the DMAs one or more levels inside for forInst
-        consumedCapacityBytes += runOnBlock(/*begin=*/curBegin, /*end=*/it);
+      // If the memory footprint of the 'for' loop is higher than fast
+      // memory capacity (when provided), we recurse to DMA at an inner level
+      // until we find a depth at which footprint fits in fast mem capacity. If
+      // the footprint can't be calculated, we assume for now it fits. Recurse
+      // inside if footprint for 'forOp' exceeds capacity, or when
+      // clSkipNonUnitStrideLoop is set and the step size is not one.
+      bool recurseInner = clSkipNonUnitStrideLoop ? forOp->getStep() != 1
+                                                  : exceedsCapacity(forOp);
+      if (recurseInner) {
+        // We'll recurse and do the DMAs at an inner level for 'forInst'.
+        runOnBlock(/*begin=*/curBegin, /*end=*/it);
         // Recurse onto the body of this loop.
-        runOnBlock(forOp->getBody(), consumedCapacityBytes);
+        runOnBlock(forOp->getBody());
         // The next region starts right after the 'for' instruction.
         curBegin = std::next(it);
       } else {
         // We have enough capacity, i.e., DMAs will be computed for the portion
-        // of the block until 'it', and for the 'for' loop. For the
-        // latter, they are placed just before this loop (for incoming DMAs) and
-        // right after (for outgoing ones).
-        consumedCapacityBytes += runOnBlock(/*begin=*/curBegin, /*end=*/it);
+        // of the block until 'it', and for 'it', which is 'forOp'. Note that
+        // for the latter, the DMAs are placed just before this loop (for
+        // incoming DMAs) and right after (for outgoing ones).
+        runOnBlock(/*begin=*/curBegin, /*end=*/it);
 
         // Inner loop DMAs have their own scope - we don't thus update consumed
         // capacity. The footprint check above guarantees this inner loop's
@@ -519,7 +523,7 @@ bool DmaGeneration::runOnBlock(Block *block, uint64_t consumedCapacityBytes) {
         curBegin = std::next(it);
       }
     } else if (!it->isa<LoadOp>() && !it->isa<StoreOp>()) {
-      consumedCapacityBytes += runOnBlock(/*begin=*/curBegin, /*end=*/it);
+      runOnBlock(/*begin=*/curBegin, /*end=*/it);
       curBegin = std::next(it);
     }
   }
@@ -528,29 +532,7 @@ bool DmaGeneration::runOnBlock(Block *block, uint64_t consumedCapacityBytes) {
   if (curBegin != block->end()) {
     // Can't be a terminator because it would have been skipped above.
     assert(!curBegin->isKnownTerminator() && "can't be a terminator");
-    consumedCapacityBytes +=
-        runOnBlock(/*begin=*/curBegin, /*end=*/block->end());
-  }
-
-  if (llvm::DebugFlag) {
-    uint64_t thisBlockDmaSizeBytes =
-        consumedCapacityBytes - priorConsumedCapacityBytes;
-    if (thisBlockDmaSizeBytes > 0) {
-      emitNoteForBlock(
-          *block,
-          Twine(llvm::divideCeil(thisBlockDmaSizeBytes, 1024)) +
-              " KiB of DMA buffers in fast memory space for this block\n");
-    }
-  }
-
-  if (consumedCapacityBytes > fastMemCapacityBytes) {
-    StringRef str = "Total size of all DMA buffers' for this block "
-                    "exceeds fast memory capacity\n";
-    if (auto *inst = block->getContainingInst())
-      inst->emitError(str);
-    else
-      block->getFunction()->emitError(str);
-    return false;
+    runOnBlock(/*begin=*/curBegin, /*end=*/block->end());
   }
 
   return true;
@@ -558,6 +540,9 @@ bool DmaGeneration::runOnBlock(Block *block, uint64_t consumedCapacityBytes) {
 
 /// Generates DMAs for a contiguous sequence of instructions in `block` in the
 /// iterator range [begin, end). Returns the total size of the DMA buffers used.
+// Since we generate alloc's and dealloc's for all DMA buffers (before and
+// after the range of instructions resp), all of the fast memory capacity is
+// assumed to be available.
 uint64_t DmaGeneration::runOnBlock(Block::iterator begin, Block::iterator end) {
   if (begin == end)
     return 0;
@@ -575,6 +560,9 @@ uint64_t DmaGeneration::runOnBlock(Block::iterator begin, Block::iterator end) {
   writeRegions.clear();
   fastBufferMap.clear();
 
+  // To check for errors when walking the block.
+  bool error = false;
+
   // Walk this range of instructions to gather all memory regions.
   block->walk(begin, end, [&](Instruction *opInst) {
     // Gather regions to allocate to buffers in faster memory space.
@@ -598,6 +586,7 @@ uint64_t DmaGeneration::runOnBlock(Block::iterator begin, Block::iterator end) {
       if (!getFullMemRefAsRegion(opInst, dmaDepth, region.get())) {
         LLVM_DEBUG(
             opInst->emitError("Non-constant memref sizes not yet supported"));
+        error = true;
         return;
       }
     }
@@ -628,16 +617,25 @@ uint64_t DmaGeneration::runOnBlock(Block::iterator begin, Block::iterator end) {
             LLVM_DEBUG(llvm::dbgs()
                        << "Memory region bounding box failed; "
                           "over-approximating to the entire memref\n");
+            // If the union fails, we will overapproximate.
             if (!getFullMemRefAsRegion(opInst, dmaDepth, region.get())) {
               LLVM_DEBUG(opInst->emitError(
                   "Non-constant memref sizes not yet supported"));
+              error = true;
+              return true;
             }
+            it->second->getConstraints()->clearAndCopyFrom(
+                *region->getConstraints());
           }
           return true;
         };
 
         bool existsInRead = updateRegion(readRegions);
+        if (error)
+          return;
         bool existsInWrite = updateRegion(writeRegions);
+        if (error)
+          return;
 
         // Finally add it to the region list.
        if (region->isWrite() && !existsInWrite) {
@@ -647,6 +645,12 @@ uint64_t DmaGeneration::runOnBlock(Block::iterator begin, Block::iterator end) {
     }
   });
 
+  if (error) {
+    begin->emitError(
+        "DMA generation failed for one or more memref's in this block\n");
+    return 0;
+  }
+
   uint64_t totalDmaBuffersSizeInBytes = 0;
   bool ret = true;
   auto processRegions =
@@ -677,12 +681,22 @@ uint64_t DmaGeneration::runOnBlock(Block::iterator begin, Block::iterator end) {
   // For a range of operation instructions, a note will be emitted at the
   // caller.
   OpPointer<AffineForOp> forOp;
+  uint64_t sizeInKib = llvm::divideCeil(totalDmaBuffersSizeInBytes, 1024);
   if (llvm::DebugFlag && (forOp = begin->dyn_cast<AffineForOp>())) {
     forOp->emitNote(
-        Twine(llvm::divideCeil(totalDmaBuffersSizeInBytes, 1024)) +
+        Twine(sizeInKib) +
         " KiB of DMA buffers in fast memory space for this block\n");
   }
 
+  if (totalDmaBuffersSizeInBytes > fastMemCapacityBytes) {
+    StringRef str = "Total size of all DMA buffers' for this block "
+                    "exceeds fast memory capacity\n";
+    if (auto *inst = block->getContainingInst())
+      inst->emitError(str);
+    else
+      block->getFunction()->emitError(str);
+  }
+
   return totalDmaBuffersSizeInBytes;
 }
 
@@ -691,7 +705,7 @@ PassResult DmaGeneration::runOnFunction(Function *f) {
   zeroIndex = topBuilder.create<ConstantIndexOp>(f->getLoc(), 0);
 
   for (auto &block : *f) {
-    runOnBlock(&block, /*consumedCapacityBytes=*/0);
+    runOnBlock(&block);
   }
   // This function never leaves the IR in an invalid state.
   return success();
diff --git a/mlir/lib/Transforms/LoopTiling.cpp b/mlir/lib/Transforms/LoopTiling.cpp
index b86516f31f7..44798dcee85 100644
--- a/mlir/lib/Transforms/LoopTiling.cpp
+++ b/mlir/lib/Transforms/LoopTiling.cpp
@@ -37,7 +37,7 @@ using namespace mlir;
 static llvm::cl::OptionCategory clOptionsCategory(DEBUG_TYPE " options");
 
 // List of tile sizes. If any of them aren't provided, they are filled with
-// clTileSize.
+// clTileSize / kDefaultTileSize.
 static llvm::cl::list<unsigned> clTileSizes(
     "tile-sizes",
     llvm::cl::desc(
@@ -50,8 +50,8 @@ namespace {
 struct LoopTiling : public FunctionPass {
   LoopTiling() : FunctionPass(&LoopTiling::passID) {}
   PassResult runOnFunction(Function *f) override;
-  constexpr static unsigned kDefaultTileSize = 4;
 
+  constexpr static unsigned kDefaultTileSize = 4;
   static char passID;
 };
 
@@ -158,7 +158,8 @@ static void constructTiledIndexSetHyperRect(
 UtilResult mlir::tileCodeGen(MutableArrayRef<OpPointer<AffineForOp>> band,
                              ArrayRef<unsigned> tileSizes) {
   assert(!band.empty());
-  assert(band.size() == tileSizes.size());
+  assert(band.size() == tileSizes.size() && "Incorrect number of tile sizes");
+
   // Check if the supplied for inst's are all successively nested.
   for (unsigned i = 1, e = band.size(); i < e; i++) {
     assert(band[i]->getInstruction()->getParentInst() ==
diff --git a/mlir/lib/Transforms/PipelineDataTransfer.cpp b/mlir/lib/Transforms/PipelineDataTransfer.cpp
index a943ac280d1..a85f428bde6 100644
--- a/mlir/lib/Transforms/PipelineDataTransfer.cpp
+++ b/mlir/lib/Transforms/PipelineDataTransfer.cpp
@@ -261,7 +261,7 @@ PassResult
 PipelineDataTransfer::runOnAffineForOp(OpPointer<AffineForOp> forOp) {
   auto mayBeConstTripCount = getConstantTripCount(forOp);
   if (!mayBeConstTripCount.hasValue()) {
-    LLVM_DEBUG(llvm::dbgs() << "unknown trip count loop\n");
+    LLVM_DEBUG(forOp->emitNote("unknown trip count loop"));
     return success();
   }
 
@@ -269,7 +269,7 @@ PipelineDataTransfer::runOnAffineForOp(OpPointer<AffineForOp> forOp) {
   findMatchingStartFinishInsts(forOp, startWaitPairs);
 
   if (startWaitPairs.empty()) {
-    LLVM_DEBUG(llvm::dbgs() << "No dma start/finish pairs\n";);
+    LLVM_DEBUG(forOp->emitNote("No dma start/finish pairs\n"));
     return success();
   }

