| author | Uday Bondhugula <bondhugula@google.com> | 2018-10-12 14:54:54 -0700 |
|---|---|---|
| committer | jpienaar <jpienaar@google.com> | 2019-03-29 13:29:21 -0700 |
| commit | 86eac4618c06a54c1f6d95a8c9d94b15dda5e35b (patch) | |
| tree | a6574eb104b867e24814319c44b26d953527f4c2 /mlir/lib/Transforms/PipelineDataTransfer.cpp | |
| parent | 9e3b928e32285bf47d366d5685d2c65e616544cb (diff) | |
Create private exclusive / single use affine computation slice for an op stmt.
- add a utility to create a private / exclusive / single-use affine
computation slice for an op stmt (see the method doc comment); a single
multi-result affine_apply op is prepended to the op stmt to provide all
results needed for its operands as a function of loop iterators and symbols.
- use it for DMA pipelining (to create private slices for DMA start stmt's; see the usage sketch below);
resolve TODOs / feature request (b/117159533)
- move createComposedAffineApplyOp to Transforms/Utils; decouple it from
taking a memref as input and generalize it.
PiperOrigin-RevId: 216926818
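
For concreteness, here is the core of how the pipelining pass uses the new utility, adapted from the diff below; `opStmt` is the DMA start's OperationStmt and `opDelayMap` holds the per-statement shifts, both taken from the surrounding pass code:

```cpp
// Try to give the DMA start stmt a private, single-use affine computation
// slice: a single multi-result affine_apply prepended right before it that
// exclusively feeds its operands, so the slice can be shifted with it.
if (auto *slice = mlir::createAffineComputationSlice(opStmt)) {
  opDelayMap[slice] = 0;  // The private slice stays unshifted, like its DMA.
} else {
  // No slice was created: pin the affine_apply op's reachable from the
  // operands to shift 0 instead.
  SmallVector<OperationStmt *, 4> affineApplyStmts;
  SmallVector<MLValue *, 4> operands(opStmt->getOperands());
  getReachableAffineApplyOps(operands, affineApplyStmts);
  for (auto *op : affineApplyStmts)
    opDelayMap[op] = 0;
}
```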
Diffstat (limited to 'mlir/lib/Transforms/PipelineDataTransfer.cpp')
| -rw-r--r-- | mlir/lib/Transforms/PipelineDataTransfer.cpp | 96 |
1 file changed, 31 insertions(+), 65 deletions(-)
```diff
diff --git a/mlir/lib/Transforms/PipelineDataTransfer.cpp b/mlir/lib/Transforms/PipelineDataTransfer.cpp
index dd8b9a7615c..bb60d8e9d78 100644
--- a/mlir/lib/Transforms/PipelineDataTransfer.cpp
+++ b/mlir/lib/Transforms/PipelineDataTransfer.cpp
@@ -21,7 +21,7 @@
 #include "mlir/Transforms/Passes.h"
 
-#include "mlir/IR/AffineExpr.h"
+#include "mlir/Analysis/AffineAnalysis.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/StandardOps/StandardOps.h"
@@ -94,8 +94,7 @@ static bool doubleBuffer(MLValue *oldMemRef, ForStmt *forStmt) {
   auto *newMemRefType = doubleShape(cast<MemRefType>(oldMemRef->getType()));
 
   // Create and place the alloc at the top level.
-  auto *func = forStmt->getFunction();
-  MLFuncBuilder topBuilder(func, func->begin());
+  MLFuncBuilder topBuilder(forStmt->getFunction());
   auto *newMemRef = cast<MLValue>(
       topBuilder.create<AllocOp>(forStmt->getLoc(), newMemRefType)
           ->getResult());
@@ -105,13 +104,9 @@ static bool doubleBuffer(MLValue *oldMemRef, ForStmt *forStmt) {
       bInner.getAffineMap(/*dimCount=*/1, /*symbolCount=*/0, {d0 % 2}, {});
   auto ivModTwoOp =
       bInner.create<AffineApplyOp>(forStmt->getLoc(), modTwoMap, forStmt);
-  if (!replaceAllMemRefUsesWith(oldMemRef, newMemRef, ivModTwoOp->getResult(0)))
+  if (!replaceAllMemRefUsesWith(oldMemRef, newMemRef,
+                                cast<MLValue>(ivModTwoOp->getResult(0))))
     return false;
-  // We don't need ivMod2Op any more - this is cloned by
-  // replaceAllMemRefUsesWith wherever the memref replacement happens. Once
-  // b/117159533 is addressed, we'll eventually only need to pass
-  // ivModTwoOp->getResult(0) to replaceAllMemRefUsesWith.
-  cast<OperationStmt>(ivModTwoOp->getOperation())->eraseFromBlock();
   return true;
 }
@@ -169,16 +164,18 @@ PassResult PipelineDataTransfer::runOnMLFunction(MLFunction *f) {
   for (auto *dmaStartStmt : dmaStartStmts) {
     MLValue *oldMemRef = cast<MLValue>(dmaStartStmt->getOperand(
         getHigherMemRefPos(dmaStartStmt->getAs<DmaStartOp>())));
-    if (!doubleBuffer(oldMemRef, forStmt))
+    if (!doubleBuffer(oldMemRef, forStmt)) {
       return PassResult::Failure;
+    }
   }
 
   // Double the buffers for tag memref's.
   for (auto *dmaFinishStmt : dmaFinishStmts) {
     MLValue *oldTagMemRef = cast<MLValue>(
         dmaFinishStmt->getOperand(getTagMemRefPos(*dmaFinishStmt)));
-    if (!doubleBuffer(oldTagMemRef, forStmt))
+    if (!doubleBuffer(oldTagMemRef, forStmt)) {
       return PassResult::Failure;
+    }
   }
 
   // Collect all compute ops.
@@ -186,75 +183,43 @@ PassResult PipelineDataTransfer::runOnMLFunction(MLFunction *f) {
   computeOps.reserve(forStmt->getStatements().size());
   // Store delay for statement for later lookup for AffineApplyOp's.
   DenseMap<const Statement *, unsigned> opDelayMap;
-  for (const auto &stmt : *forStmt) {
+  for (auto &stmt : *forStmt) {
     auto *opStmt = dyn_cast<OperationStmt>(&stmt);
     if (!opStmt) {
       // All for and if stmt's are treated as pure compute operations.
-      // TODO(bondhugula): check whether such statements do not have any DMAs
-      // nested within.
       opDelayMap[&stmt] = 1;
     } else if (opStmt->is<DmaStartOp>()) {
       // DMA starts are not shifted.
-      opDelayMap[&stmt] = 0;
+      opDelayMap[opStmt] = 0;
+      // Set shifts for DMA start stmt's affine operand computation slices to 0.
+      if (auto *slice = mlir::createAffineComputationSlice(opStmt)) {
+        opDelayMap[slice] = 0;
+      } else {
+        // If a slice wasn't created, the reachable affine_apply op's from its
+        // operands are the ones that go with it.
+        SmallVector<OperationStmt *, 4> affineApplyStmts;
+        SmallVector<MLValue *, 4> operands(opStmt->getOperands());
+        getReachableAffineApplyOps(operands, affineApplyStmts);
+        for (auto *op : affineApplyStmts) {
+          opDelayMap[op] = 0;
+        }
+      }
     } else if (opStmt->is<DmaWaitOp>()) {
       // DMA finish op shifted by one.
-      opDelayMap[&stmt] = 1;
-    } else if (!opStmt->is<AffineApplyOp>()) {
-      // Compute op shifted by one.
-      opDelayMap[&stmt] = 1;
+      opDelayMap[opStmt] = 1;
+    } else {
+      // Everything else is a compute op; so shifted by one (op's supplying
+      // 'affine' operands to DMA start's have already been set right shifts.
+      opDelayMap[opStmt] = 1;
       computeOps.push_back(&stmt);
     }
-    // Shifts for affine apply op's determined later.
-  }
-
-  // Get the ancestor of a 'stmt' that lies in forStmt's block.
-  auto getAncestorInForBlock =
-      [&](const Statement *stmt, const StmtBlock &block) -> const Statement * {
-    // Traverse up the statement hierarchy starting from the owner of operand
-    // to find the ancestor statement that resides in the block of 'forStmt'.
-    while (stmt != nullptr && stmt->getBlock() != &block) {
-      stmt = stmt->getParentStmt();
-    }
-    return stmt;
-  };
-
-  // Determine delays for affine apply op's: look up delay from its consumer
-  // op. This code will be thrown away once we have a way to obtain indices
-  // through a composed affine_apply op. See TODO(b/117159533). Such a composed
-  // affine_apply will be used exclusively by a given memref deferencing op.
-  for (const auto &stmt : *forStmt) {
-    auto *opStmt = dyn_cast<OperationStmt>(&stmt);
-    // Skip statements that aren't affine apply ops.
-    if (!opStmt || !opStmt->is<AffineApplyOp>())
-      continue;
-    // Traverse uses of each result of the affine apply op.
-    for (auto *res : opStmt->getResults()) {
-      for (auto &use : res->getUses()) {
-        auto *ancestorInForBlock =
-            getAncestorInForBlock(use.getOwner(), *forStmt);
-        assert(ancestorInForBlock &&
-               "traversing parent should reach forStmt block");
-        auto *opCheck = dyn_cast<OperationStmt>(ancestorInForBlock);
-        if (!opCheck || opCheck->is<AffineApplyOp>())
-          continue;
-        assert(opDelayMap.find(ancestorInForBlock) != opDelayMap.end());
-        if (opDelayMap.find(&stmt) != opDelayMap.end()) {
-          // This is where we enforce all uses of this affine_apply to have
-          // the same shifts - so that we know what shift to use for the
-          // affine_apply to preserve semantics.
-          assert(opDelayMap[&stmt] == opDelayMap[ancestorInForBlock]);
-        } else {
-          // Obtain delay from its consumer.
-          opDelayMap[&stmt] = opDelayMap[ancestorInForBlock];
-        }
-      }
-    }
   }
 
   // Get delays stored in map.
   std::vector<uint64_t> delays(forStmt->getStatements().size());
   unsigned s = 0;
   for (const auto &stmt : *forStmt) {
+    assert(opDelayMap.find(&stmt) != opDelayMap.end());
     delays[s++] = opDelayMap[&stmt];
   }
 
@@ -263,8 +228,9 @@ PassResult PipelineDataTransfer::runOnMLFunction(MLFunction *f) {
     return PassResult::Failure;
   }
 
-  if (stmtBodySkew(forStmt, delays))
+  if (stmtBodySkew(forStmt, delays)) {
     return PassResult::Failure;
+  }
 
   return PassResult::Success;
 }
```
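
For intuition about what the pass's schedule achieves once buffers are doubled (`iv mod 2` indexing) and DMA waits and compute are shifted by one iteration, here is a minimal stand-alone C++ sketch. It is an illustration only: `dma_start`, `dma_wait`, and `compute` are hypothetical synchronous stand-ins for the asynchronous DMA and compute ops the pass actually schedules, and `tileSize <= 1024` is assumed.

```cpp
#include <cstddef>
#include <cstring>

// Hypothetical stand-ins for dma.start / dma.wait and the loop's compute op.
// The real ops are asynchronous; dma_wait() would complete the oldest
// outstanding transfer. Here they run synchronously so the sketch compiles.
static void dma_start(const float *src, float *dst, std::size_t n) {
  std::memcpy(dst, src, n * sizeof(float));
}
static void dma_wait() {}
static float compute(const float *buf, std::size_t n) {
  float sum = 0;
  for (std::size_t i = 0; i < n; ++i)
    sum += buf[i];
  return sum;
}

// The pipelined schedule: the transfer for tile i+1 is issued before the
// compute on tile i, so on real hardware the two overlap. buf's leading
// dimension is doubled and indexed by i % 2, mirroring the pass's d0 % 2 map.
float pipelined(const float *src, std::size_t numTiles, std::size_t tileSize) {
  if (numTiles == 0)
    return 0;
  float buf[2][1024];
  float acc = 0;
  dma_start(src, buf[0], tileSize);            // prologue: fetch tile 0
  for (std::size_t i = 0; i < numTiles; ++i) {
    if (i + 1 < numTiles)                      // dma.start keeps shift 0
      dma_start(src + (i + 1) * tileSize, buf[(i + 1) % 2], tileSize);
    dma_wait();                                // dma.wait shifted by one: tile i
    acc += compute(buf[i % 2], tileSize);      // compute shifted by one
  }
  return acc;
}
```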

