Diffstat (limited to 'mlir/lib')
-rw-r--r--  mlir/lib/AffineOps/AffineOps.cpp             |  27
-rw-r--r--  mlir/lib/Analysis/Utils.cpp                  |  56
-rw-r--r--  mlir/lib/IR/AffineStructures.cpp             |   1
-rw-r--r--  mlir/lib/Transforms/DmaGeneration.cpp        | 118
-rw-r--r--  mlir/lib/Transforms/LoopTiling.cpp           |   7
-rw-r--r--  mlir/lib/Transforms/PipelineDataTransfer.cpp |   4
6 files changed, 128 insertions(+), 85 deletions(-)
diff --git a/mlir/lib/AffineOps/AffineOps.cpp b/mlir/lib/AffineOps/AffineOps.cpp
index 0dea1441b08..1859a640a45 100644
--- a/mlir/lib/AffineOps/AffineOps.cpp
+++ b/mlir/lib/AffineOps/AffineOps.cpp
@@ -1047,18 +1047,19 @@ bool mlir::addAffineForOpDomain(ConstOpPointer<AffineForOp> forOp,
LLVM_DEBUG(llvm::dbgs()
<< "Domain conservative: non-unit stride not handled\n");
+ int64_t step = forOp->getStep();
+
// Adds a lower or upper bound when the bounds aren't constant.
auto addLowerOrUpperBound = [&](bool lower) -> bool {
auto operands =
lower ? forOp->getLowerBoundOperands() : forOp->getUpperBoundOperands();
for (const auto &operand : operands) {
- unsigned loc;
- if (!constraints->findId(*operand, &loc)) {
+ unsigned pos;
+ if (!constraints->findId(*operand, &pos)) {
if (isValidSymbol(operand)) {
constraints->addSymbolId(constraints->getNumSymbolIds(),
const_cast<Value *>(operand));
- loc =
- constraints->getNumDimIds() + constraints->getNumSymbolIds() - 1;
+ pos = constraints->getNumDimAndSymbolIds() - 1;
// Check if the symbol is a constant.
if (auto *opInst = operand->getDefiningInst()) {
if (auto constOp = opInst->dyn_cast<ConstantIndexOp>()) {
@@ -1068,17 +1069,22 @@ bool mlir::addAffineForOpDomain(ConstOpPointer<AffineForOp> forOp,
} else {
constraints->addDimId(constraints->getNumDimIds(),
const_cast<Value *>(operand));
- loc = constraints->getNumDimIds() - 1;
+ pos = constraints->getNumDimIds() - 1;
+ if (auto loop = getForInductionVarOwner(operand)) {
+ // Outer loop IVs could be used in forOp's bounds.
+ if (!addAffineForOpDomain(loop, constraints))
+ return false;
+ }
}
}
}
// Record positions of the operands in the constraint system.
SmallVector<unsigned, 8> positions;
for (const auto &operand : operands) {
- unsigned loc;
- if (!constraints->findId(*operand, &loc))
+ unsigned pos;
+ if (!constraints->findId(*operand, &pos))
assert(0 && "expected to be found");
- positions.push_back(loc);
+ positions.push_back(pos);
}
auto boundMap =
@@ -1106,7 +1112,7 @@ bool mlir::addAffineForOpDomain(ConstOpPointer<AffineForOp> forOp,
ineq[constraints->getNumCols() - 1] =
lower ? -flatExpr[flatExpr.size() - 1]
// Upper bound in flattenedExpr is an exclusive one.
- : flatExpr[flatExpr.size() - 1] - 1;
+ : flatExpr[flatExpr.size() - 1] - step;
constraints->addInequality(ineq);
}
return true;
@@ -1121,7 +1127,8 @@ bool mlir::addAffineForOpDomain(ConstOpPointer<AffineForOp> forOp,
}
if (forOp->hasConstantUpperBound()) {
- constraints->addConstantUpperBound(pos, forOp->getConstantUpperBound() - 1);
+ constraints->addConstantUpperBound(pos,
+ forOp->getConstantUpperBound() - step);
return true;
}
// Non-constant upper bound case.
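
Editor's note on the AffineOps.cpp hunks above: the loop domain construction becomes step-aware. Bound operands that are outer-loop IVs now get their own domains added recursively, and both the constant and non-constant upper bounds subtract the loop step instead of 1. A minimal sketch of the arithmetic behind the `- step` adjustment, in plain C++ rather than the MLIR API (lastInductionValue is an illustrative helper, not a function from this patch):

    #include <cassert>
    #include <cstdint>

    // Largest value the induction variable of
    // `for (iv = lb; iv < ub; iv += step)` actually takes.
    int64_t lastInductionValue(int64_t lb, int64_t ub, int64_t step) {
      assert(step >= 1 && ub > lb && "positive step, non-empty loop assumed");
      return lb + ((ub - 1 - lb) / step) * step;
    }

For a tile loop such as `for (iv = 0; iv < 128; iv += 32)` this returns 96, i.e. ub - step, so when the step divides the trip count evenly (the common case for tiled loops) the tightened inclusive bound loses no iterations.
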
diff --git a/mlir/lib/Analysis/Utils.cpp b/mlir/lib/Analysis/Utils.cpp
index 821f19df31d..3fbea075386 100644
--- a/mlir/lib/Analysis/Utils.cpp
+++ b/mlir/lib/Analysis/Utils.cpp
@@ -28,6 +28,7 @@
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/StandardOps/StandardOps.h"
+#include "llvm/ADT/DenseMap.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
@@ -35,6 +36,8 @@
using namespace mlir;
+using llvm::SmallDenseMap;
+
/// Populates 'loops' with IVs of the loops surrounding 'inst' ordered from
/// the outermost 'for' instruction to the innermost one.
void mlir::getLoopIVs(const Instruction &inst,
@@ -133,6 +136,9 @@ bool MemRefRegion::compute(Instruction *inst, unsigned loopDepth,
unsigned rank = access.getRank();
+ LLVM_DEBUG(llvm::dbgs() << "MemRefRegion::compute: " << *inst
+ << "depth: " << loopDepth << "\n";);
+
if (rank == 0) {
SmallVector<OpPointer<AffineForOp>, 4> ivs;
getLoopIVs(*inst, &ivs);
@@ -607,36 +613,41 @@ unsigned mlir::getNumCommonSurroundingLoops(const Instruction &A,
return numCommonLoops;
}
-Optional<int64_t>
-mlir::getMemoryFootprintBytes(ConstOpPointer<AffineForOp> forOp,
- int memorySpace) {
- return getMemoryFootprintBytes(*forOp->getBody(), memorySpace);
-}
+static Optional<int64_t> getMemoryFootprintBytes(const Block &block,
+ Block::const_iterator start,
+ Block::const_iterator end,
+ int memorySpace) {
+ SmallDenseMap<Value *, std::unique_ptr<MemRefRegion>, 4> regions;
-Optional<int64_t> mlir::getMemoryFootprintBytes(const Block &block,
- int memorySpace) {
- std::vector<std::unique_ptr<MemRefRegion>> regions;
+ // Cast away constness since the walker uses non-const versions; but we
+ // guarantee that the visitor callback isn't mutating opInst.
+ auto *cStart = reinterpret_cast<Block::iterator *>(&start);
+ auto *cEnd = reinterpret_cast<Block::iterator *>(&end);
// Walk this 'for' instruction to gather all memory regions.
bool error = false;
- const_cast<Block *>(&block)->walk([&](Instruction *opInst) {
+ const_cast<Block *>(&block)->walk(*cStart, *cEnd, [&](Instruction *opInst) {
if (!opInst->isa<LoadOp>() && !opInst->isa<StoreOp>()) {
// Neither load nor a store op.
return;
}
- // TODO(bondhugula): eventually, we need to be performing a union across
- // all regions for a given memref instead of creating one region per
- // memory op. This way we would be allocating O(num of memref's) sets
- // instead of O(num of load/store op's).
+ // Compute the memref region symbolic in any IVs enclosing this block.
auto region = std::make_unique<MemRefRegion>(opInst->getLoc());
- if (!region->compute(opInst, /*loopDepth=*/0)) {
- LLVM_DEBUG(llvm::dbgs() << "Error obtaining memory region\n");
- // TODO: stop the walk if an error occurred.
+ if (!region->compute(opInst,
+ /*loopDepth=*/getNestingDepth(*block.begin()))) {
+ opInst->emitError("Error obtaining memory region\n");
+ error = true;
+ return;
+ }
+ auto it = regions.find(region->memref);
+ if (it == regions.end()) {
+ regions[region->memref] = std::move(region);
+ } else if (!it->second->unionBoundingBox(*region)) {
+ opInst->emitError("Error performing a union on a memory region\n");
error = true;
return;
}
- regions.push_back(std::move(region));
});
if (error)
@@ -644,10 +655,19 @@ Optional<int64_t> mlir::getMemoryFootprintBytes(const Block &block,
int64_t totalSizeInBytes = 0;
for (const auto &region : regions) {
- auto size = region->getRegionSize();
+ Optional<int64_t> size = region.second->getRegionSize();
if (!size.hasValue())
return None;
totalSizeInBytes += size.getValue();
}
return totalSizeInBytes;
}
+
+Optional<int64_t>
+mlir::getMemoryFootprintBytes(ConstOpPointer<AffineForOp> forOp,
+ int memorySpace) {
+ auto *forInst = forOp->getInstruction();
+ return ::getMemoryFootprintBytes(
+ *forInst->getBlock(), Block::const_iterator(forInst),
+ std::next(Block::const_iterator(forInst)), memorySpace);
+}
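
Editor's note on the Utils.cpp hunks above: getMemoryFootprintBytes now keeps one region per memref in a SmallDenseMap and unions bounding boxes as load/store ops are visited, instead of storing one region per op, and the public entry point walks only the 'for' instruction itself. A hedged sketch of that accumulation pattern with simplified stand-in types (Region and footprintBytes here are illustrative, not MLIR declarations):

    #include <algorithm>
    #include <cstdint>
    #include <map>
    #include <optional>
    #include <string>
    #include <utility>
    #include <vector>

    struct Region {
      int64_t lb, ub; // 1-D bounding box, for illustration only
      bool unionBoundingBox(const Region &other) {
        lb = std::min(lb, other.lb);
        ub = std::max(ub, other.ub);
        return true; // the real routine can fail, e.g. on unbounded dims
      }
      int64_t sizeBytes() const { return ub - lb; }
    };

    // `accesses` pairs a memref name with the region touched by one op.
    std::optional<int64_t>
    footprintBytes(const std::vector<std::pair<std::string, Region>> &accesses) {
      std::map<std::string, Region> regions;
      for (const auto &access : accesses) {
        auto it = regions.find(access.first);
        if (it == regions.end())
          regions.emplace(access.first, access.second);
        else if (!it->second.unionBoundingBox(access.second))
          return std::nullopt; // mirror the early exit on union failure
      }
      int64_t total = 0;
      for (const auto &entry : regions)
        total += entry.second.sizeBytes();
      return total;
    }

This keeps the number of constraint sets at O(number of memrefs) rather than O(number of load/store ops), which is the point of the TODO that the patch resolves.
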
diff --git a/mlir/lib/IR/AffineStructures.cpp b/mlir/lib/IR/AffineStructures.cpp
index 5179e935007..d043e78f059 100644
--- a/mlir/lib/IR/AffineStructures.cpp
+++ b/mlir/lib/IR/AffineStructures.cpp
@@ -2059,6 +2059,7 @@ bool FlatAffineConstraints::unionBoundingBox(
auto extent = getConstantBoundOnDimSize(d, &lb, &lbDivisor);
if (!extent.hasValue())
// TODO(bondhugula): symbolic extents when necessary.
+ // TODO(bondhugula): handle union if a dimension is unbounded.
return false;
otherLb.clear();
diff --git a/mlir/lib/Transforms/DmaGeneration.cpp b/mlir/lib/Transforms/DmaGeneration.cpp
index 06aa3758712..abf693657be 100644
--- a/mlir/lib/Transforms/DmaGeneration.cpp
+++ b/mlir/lib/Transforms/DmaGeneration.cpp
@@ -49,14 +49,18 @@ static llvm::cl::opt<unsigned> clFastMemoryCapacity(
"Set fast memory space capacity in KiB (default: unlimited)"),
llvm::cl::cat(clOptionsCategory));
-static const unsigned kDefaultFastMemorySpace = 1;
-
static llvm::cl::opt<unsigned> clFastMemorySpace(
- "dma-fast-mem-space", llvm::cl::init(kDefaultFastMemorySpace),
+ "dma-fast-mem-space", llvm::cl::init(1),
llvm::cl::desc(
"Fast memory space identifier for DMA generation (default: 1)"),
llvm::cl::cat(clOptionsCategory));
+static llvm::cl::opt<bool> clSkipNonUnitStrideLoop(
+ "dma-skip-non-unit-stride-loops", llvm::cl::Hidden, llvm::cl::init(false),
+ llvm::cl::desc("Testing purposes: avoid non-unit stride loop choice depths "
+ "for DMA placement"),
+ llvm::cl::cat(clOptionsCategory));
+
namespace {
/// Generates DMAs for memref's living in 'slowMemorySpace' into newly created
@@ -76,7 +80,7 @@ struct DmaGeneration : public FunctionPass {
fastMemCapacityBytes(fastMemCapacityBytes) {}
PassResult runOnFunction(Function *f) override;
- bool runOnBlock(Block *block, uint64_t consumedCapacityBytes);
+ bool runOnBlock(Block *block);
uint64_t runOnBlock(Block::iterator begin, Block::iterator end);
bool generateDma(const MemRefRegion &region, Block *block,
@@ -457,12 +461,10 @@ bool DmaGeneration::generateDma(const MemRefRegion &region, Block *block,
/// `regions`; each region is either a sequence of one or more instructions
/// starting and ending with a load or store op, or just a loop (which could
/// have other loops nested within). Returns false on an error, true otherwise.
-bool DmaGeneration::runOnBlock(Block *block, uint64_t consumedCapacityBytes) {
+bool DmaGeneration::runOnBlock(Block *block) {
if (block->empty())
return true;
- uint64_t priorConsumedCapacityBytes = consumedCapacityBytes;
-
// Every loop in the block starts and ends a region. A contiguous sequence of
// operation instructions starting and ending with a load/store op is also
// identified as a region. Straightline code (contiguous chunks of operation
@@ -482,35 +484,37 @@ bool DmaGeneration::runOnBlock(Block *block, uint64_t consumedCapacityBytes) {
for (auto it = curBegin; it != block->end(); ++it) {
if (auto forOp = it->dyn_cast<AffineForOp>()) {
- // We'll assume for now that loops with steps are tiled loops, and so DMAs
- // are not performed for that depth, but only further inside.
- // If the memory footprint of the 'for' loop is higher than fast
- // memory capacity (when provided), we recurse to DMA at an inner level
- // until we find a depth at which footprint fits in the capacity. If the
- // footprint can't be calcuated, we assume for now it fits.
-
// Returns true if the footprint is known to exceed capacity.
auto exceedsCapacity = [&](OpPointer<AffineForOp> forOp) {
- Optional<int64_t> footprint;
- return ((footprint = getMemoryFootprintBytes(forOp, 0)).hasValue() &&
- consumedCapacityBytes +
- static_cast<uint64_t>(footprint.getValue()) >
+ Optional<int64_t> footprint =
+ getMemoryFootprintBytes(forOp,
+ /*memorySpace=*/0);
+ return (footprint.hasValue() &&
+ static_cast<uint64_t>(footprint.getValue()) >
fastMemCapacityBytes);
};
- if (forOp->getStep() != 1 || exceedsCapacity(forOp)) {
- // We'll split and do the DMAs one or more levels inside for forInst
- consumedCapacityBytes += runOnBlock(/*begin=*/curBegin, /*end=*/it);
+ // If the memory footprint of the 'for' loop is higher than fast
+ // memory capacity (when provided), we recurse to DMA at an inner level
+ // until we find a depth at which footprint fits in fast mem capacity. If
+ // the footprint can't be calculated, we assume for now it fits. Recurse
+ // inside if footprint for 'forOp' exceeds capacity, or when
+ // clSkipNonUnitStrideLoop is set and the step size is not one.
+ bool recurseInner = clSkipNonUnitStrideLoop ? forOp->getStep() != 1
+ : exceedsCapacity(forOp);
+ if (recurseInner) {
+ // We'll recurse and do the DMAs at an inner level for 'forInst'.
+ runOnBlock(/*begin=*/curBegin, /*end=*/it);
// Recurse onto the body of this loop.
- runOnBlock(forOp->getBody(), consumedCapacityBytes);
+ runOnBlock(forOp->getBody());
// The next region starts right after the 'for' instruction.
curBegin = std::next(it);
} else {
// We have enough capacity, i.e., DMAs will be computed for the portion
- // of the block until 'it', and for the 'for' loop. For the
- // latter, they are placed just before this loop (for incoming DMAs) and
- // right after (for outgoing ones).
- consumedCapacityBytes += runOnBlock(/*begin=*/curBegin, /*end=*/it);
+ // of the block until 'it', and for 'it', which is 'forOp'. Note that
+ // for the latter, the DMAs are placed just before this loop (for
+ // incoming DMAs) and right after (for outgoing ones).
+ runOnBlock(/*begin=*/curBegin, /*end=*/it);
// Inner loop DMAs have their own scope - we don't thus update consumed
// capacity. The footprint check above guarantees this inner loop's
@@ -519,7 +523,7 @@ bool DmaGeneration::runOnBlock(Block *block, uint64_t consumedCapacityBytes) {
curBegin = std::next(it);
}
} else if (!it->isa<LoadOp>() && !it->isa<StoreOp>()) {
- consumedCapacityBytes += runOnBlock(/*begin=*/curBegin, /*end=*/it);
+ runOnBlock(/*begin=*/curBegin, /*end=*/it);
curBegin = std::next(it);
}
}
@@ -528,29 +532,7 @@ bool DmaGeneration::runOnBlock(Block *block, uint64_t consumedCapacityBytes) {
if (curBegin != block->end()) {
// Can't be a terminator because it would have been skipped above.
assert(!curBegin->isKnownTerminator() && "can't be a terminator");
- consumedCapacityBytes +=
- runOnBlock(/*begin=*/curBegin, /*end=*/block->end());
- }
-
- if (llvm::DebugFlag) {
- uint64_t thisBlockDmaSizeBytes =
- consumedCapacityBytes - priorConsumedCapacityBytes;
- if (thisBlockDmaSizeBytes > 0) {
- emitNoteForBlock(
- *block,
- Twine(llvm::divideCeil(thisBlockDmaSizeBytes, 1024)) +
- " KiB of DMA buffers in fast memory space for this block\n");
- }
- }
-
- if (consumedCapacityBytes > fastMemCapacityBytes) {
- StringRef str = "Total size of all DMA buffers' for this block "
- "exceeds fast memory capacity\n";
- if (auto *inst = block->getContainingInst())
- inst->emitError(str);
- else
- block->getFunction()->emitError(str);
- return false;
+ runOnBlock(/*begin=*/curBegin, /*end=*/block->end());
}
return true;
@@ -558,6 +540,9 @@ bool DmaGeneration::runOnBlock(Block *block, uint64_t consumedCapacityBytes) {
/// Generates DMAs for a contiguous sequence of instructions in `block` in the
/// iterator range [begin, end). Returns the total size of the DMA buffers used.
+// Since we generate alloc's and dealloc's for all DMA buffers (before and
+// after the range of instructions resp), all of the fast memory capacity is
+// assumed to be available.
uint64_t DmaGeneration::runOnBlock(Block::iterator begin, Block::iterator end) {
if (begin == end)
return 0;
@@ -575,6 +560,9 @@ uint64_t DmaGeneration::runOnBlock(Block::iterator begin, Block::iterator end) {
writeRegions.clear();
fastBufferMap.clear();
+ // To check for errors when walking the block.
+ bool error = false;
+
// Walk this range of instructions to gather all memory regions.
block->walk(begin, end, [&](Instruction *opInst) {
// Gather regions to allocate to buffers in faster memory space.
@@ -598,6 +586,7 @@ uint64_t DmaGeneration::runOnBlock(Block::iterator begin, Block::iterator end) {
if (!getFullMemRefAsRegion(opInst, dmaDepth, region.get())) {
LLVM_DEBUG(
opInst->emitError("Non-constant memref sizes not yet supported"));
+ error = true;
return;
}
}
@@ -628,16 +617,25 @@ uint64_t DmaGeneration::runOnBlock(Block::iterator begin, Block::iterator end) {
LLVM_DEBUG(llvm::dbgs()
<< "Memory region bounding box failed; "
"over-approximating to the entire memref\n");
+ // If the union fails, we will overapproximate.
if (!getFullMemRefAsRegion(opInst, dmaDepth, region.get())) {
LLVM_DEBUG(opInst->emitError(
"Non-constant memref sizes not yet supported"));
+ error = true;
+ return true;
}
+ it->second->getConstraints()->clearAndCopyFrom(
+ *region->getConstraints());
}
return true;
};
bool existsInRead = updateRegion(readRegions);
+ if (error)
+ return;
bool existsInWrite = updateRegion(writeRegions);
+ if (error)
+ return;
// Finally add it to the region list.
if (region->isWrite() && !existsInWrite) {
@@ -647,6 +645,12 @@ uint64_t DmaGeneration::runOnBlock(Block::iterator begin, Block::iterator end) {
}
});
+ if (error) {
+ begin->emitError(
+ "DMA generation failed for one or more memref's in this block\n");
+ return 0;
+ }
+
uint64_t totalDmaBuffersSizeInBytes = 0;
bool ret = true;
auto processRegions =
@@ -677,12 +681,22 @@ uint64_t DmaGeneration::runOnBlock(Block::iterator begin, Block::iterator end) {
// For a range of operation instructions, a note will be emitted at the
// caller.
OpPointer<AffineForOp> forOp;
+ uint64_t sizeInKib = llvm::divideCeil(totalDmaBuffersSizeInBytes, 1024);
if (llvm::DebugFlag && (forOp = begin->dyn_cast<AffineForOp>())) {
forOp->emitNote(
- Twine(llvm::divideCeil(totalDmaBuffersSizeInBytes, 1024)) +
+ Twine(sizeInKib) +
" KiB of DMA buffers in fast memory space for this block\n");
}
+ if (totalDmaBuffersSizeInBytes > fastMemCapacityBytes) {
+ StringRef str = "Total size of all DMA buffers' for this block "
+ "exceeds fast memory capacity\n";
+ if (auto *inst = block->getContainingInst())
+ inst->emitError(str);
+ else
+ block->getFunction()->emitError(str);
+ }
+
return totalDmaBuffersSizeInBytes;
}
@@ -691,7 +705,7 @@ PassResult DmaGeneration::runOnFunction(Function *f) {
zeroIndex = topBuilder.create<ConstantIndexOp>(f->getLoc(), 0);
for (auto &block : *f) {
- runOnBlock(&block, /*consumedCapacityBytes=*/0);
+ runOnBlock(&block);
}
// This function never leaves the IR in an invalid state.
return success();
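
Editor's note on the DmaGeneration.cpp hunks above: the consumed-capacity accounting threaded through runOnBlock is dropped, since each [begin, end) range gets its own alloc/dealloc pair and can therefore assume the full fast-memory capacity; the size note and the capacity-exceeded error move into the per-range runOnBlock. The recursion choice at a 'for' loop reduces to roughly the predicate below (shouldRecurseIntoLoop is a hypothetical helper summarizing the hunk's logic, not a function added by this patch):

    #include <cstdint>
    #include <optional>

    // Recurse into the loop body instead of placing DMAs at this depth when
    // the footprint is known to exceed capacity, or, under the test-only
    // skip-non-unit-stride flag, when the step is not one. Unknown footprints
    // are assumed to fit, matching the comment in the hunk.
    bool shouldRecurseIntoLoop(std::optional<int64_t> footprintBytes,
                               uint64_t fastMemCapacityBytes, int64_t step,
                               bool skipNonUnitStrideLoops) {
      if (skipNonUnitStrideLoops)
        return step != 1;
      return footprintBytes.has_value() &&
             static_cast<uint64_t>(*footprintBytes) > fastMemCapacityBytes;
    }
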
diff --git a/mlir/lib/Transforms/LoopTiling.cpp b/mlir/lib/Transforms/LoopTiling.cpp
index b86516f31f7..44798dcee85 100644
--- a/mlir/lib/Transforms/LoopTiling.cpp
+++ b/mlir/lib/Transforms/LoopTiling.cpp
@@ -37,7 +37,7 @@ using namespace mlir;
static llvm::cl::OptionCategory clOptionsCategory(DEBUG_TYPE " options");
// List of tile sizes. If any of them aren't provided, they are filled with
-// clTileSize.
+// clTileSize / kDefaultTileSize.
static llvm::cl::list<unsigned> clTileSizes(
"tile-sizes",
llvm::cl::desc(
@@ -50,8 +50,8 @@ namespace {
struct LoopTiling : public FunctionPass {
LoopTiling() : FunctionPass(&LoopTiling::passID) {}
PassResult runOnFunction(Function *f) override;
- constexpr static unsigned kDefaultTileSize = 4;
+ constexpr static unsigned kDefaultTileSize = 4;
static char passID;
};
@@ -158,7 +158,8 @@ static void constructTiledIndexSetHyperRect(
UtilResult mlir::tileCodeGen(MutableArrayRef<OpPointer<AffineForOp>> band,
ArrayRef<unsigned> tileSizes) {
assert(!band.empty());
- assert(band.size() == tileSizes.size());
+ assert(band.size() == tileSizes.size() && "Incorrect number of tile sizes");
+
// Check if the supplied for inst's are all successively nested.
for (unsigned i = 1, e = band.size(); i < e; i++) {
assert(band[i]->getInstruction()->getParentInst() ==
diff --git a/mlir/lib/Transforms/PipelineDataTransfer.cpp b/mlir/lib/Transforms/PipelineDataTransfer.cpp
index a943ac280d1..a85f428bde6 100644
--- a/mlir/lib/Transforms/PipelineDataTransfer.cpp
+++ b/mlir/lib/Transforms/PipelineDataTransfer.cpp
@@ -261,7 +261,7 @@ PassResult
PipelineDataTransfer::runOnAffineForOp(OpPointer<AffineForOp> forOp) {
auto mayBeConstTripCount = getConstantTripCount(forOp);
if (!mayBeConstTripCount.hasValue()) {
- LLVM_DEBUG(llvm::dbgs() << "unknown trip count loop\n");
+ LLVM_DEBUG(forOp->emitNote("unknown trip count loop"));
return success();
}
@@ -269,7 +269,7 @@ PipelineDataTransfer::runOnAffineForOp(OpPointer<AffineForOp> forOp) {
findMatchingStartFinishInsts(forOp, startWaitPairs);
if (startWaitPairs.empty()) {
- LLVM_DEBUG(llvm::dbgs() << "No dma start/finish pairs\n";);
+ LLVM_DEBUG(forOp->emitNote("No dma start/finish pairs\n"));
return success();
}