| author | Uday Bondhugula <bondhugula@google.com> | 2018-10-12 14:54:54 -0700 |
|---|---|---|
| committer | jpienaar <jpienaar@google.com> | 2019-03-29 13:29:21 -0700 |
| commit | 86eac4618c06a54c1f6d95a8c9d94b15dda5e35b (patch) | |
| tree | a6574eb104b867e24814319c44b26d953527f4c2 /mlir/lib | |
| parent | 9e3b928e32285bf47d366d5685d2c65e616544cb (diff) | |
| download | bcm5719-llvm-86eac4618c06a54c1f6d95a8c9d94b15dda5e35b.tar.gz bcm5719-llvm-86eac4618c06a54c1f6d95a8c9d94b15dda5e35b.zip | |
Create private exclusive / single use affine computation slice for an op stmt.
- add util to create a private / exclusive / single use affine
computation slice for an op stmt (see method doc comment); a single
multi-result affine_apply op is prepended to the op stmt to provide all
results needed for its operands as a function of loop iterators and symbols.
- use it for DMA pipelining (to create private slices for DMA start stmt's);
resolve TODOs/feature request (b/117159533)
- move createComposedAffineApplyOp to Transforms/Utils; free it from taking a
memref as input / generalize it.
PiperOrigin-RevId: 216926818
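To make the effect of the new utility concrete, here is a minimal before/after sketch at the IR level, adapted from the doc comment added to Utils.cpp in this change (the `"send"`/`"compute"` op names are illustrative placeholders from that comment, not real ops):

```mlir
// Before: "send" and "compute" share the same affine_apply result, so the
// pipelining pass cannot assign them different shifts/delays.
for %i = 0 to #map(%N)
  %idx = affine_apply (d0) -> (d0 mod 2) (%i)
  "send"(%idx, %A, ...)
  "compute"(%idx)

// After createAffineComputationSlice on the "compute" statement: it now has a
// private, single-use affine_apply providing its affine operands.
for %i = 0 to #map(%N)
  %idx = affine_apply (d0) -> (d0 mod 2) (%i)
  "send"(%idx, %A, ...)
  %idx_ = affine_apply (d0) -> (d0 mod 2) (%i)
  "compute"(%idx_)
```

With each memref-dereferencing op owning its own slice, stmtBodySkew can shift the DMA start and the compute op independently, which is what the DMA pipelining change below relies on.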
Diffstat (limited to 'mlir/lib')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | mlir/lib/Analysis/AffineAnalysis.cpp | 6 |
| -rw-r--r-- | mlir/lib/Transforms/ComposeAffineMaps.cpp | 53 |
| -rw-r--r-- | mlir/lib/Transforms/PipelineDataTransfer.cpp | 96 |
| -rw-r--r-- | mlir/lib/Transforms/Utils.cpp | 147 |
4 files changed, 177 insertions, 125 deletions
diff --git a/mlir/lib/Analysis/AffineAnalysis.cpp b/mlir/lib/Analysis/AffineAnalysis.cpp
index f332836180a..ee1641b575e 100644
--- a/mlir/lib/Analysis/AffineAnalysis.cpp
+++ b/mlir/lib/Analysis/AffineAnalysis.cpp
@@ -304,8 +304,8 @@ AffineExpr mlir::simplifyAffineExpr(AffineExpr expr, unsigned numDims,
 // TODO(andydavis) Add a method to AffineApplyOp which forward substitutes
 // the AffineApplyOp into any user AffineApplyOps.
 void mlir::getReachableAffineApplyOps(
-    const SmallVector<MLValue *, 4> &operands,
-    SmallVector<OperationStmt *, 4> *affineApplyOps) {
+    ArrayRef<MLValue *> operands,
+    SmallVectorImpl<OperationStmt *> &affineApplyOps) {
   struct State {
     // The ssa value for this node in the DFS traversal.
     MLValue *value;
@@ -329,7 +329,7 @@ void mlir::getReachableAffineApplyOps(
     if (auto affineApplyOp = opStmt->getAs<AffineApplyOp>()) {
       if (state.operandIndex == 0) {
         // Pre-Visit: Add 'opStmt' to reachable sequence.
-        affineApplyOps->push_back(opStmt);
+        affineApplyOps.push_back(opStmt);
       }
       if (state.operandIndex < opStmt->getNumOperands()) {
         // Visit: Add next 'affineApplyOp' operand to worklist.
diff --git a/mlir/lib/Transforms/ComposeAffineMaps.cpp b/mlir/lib/Transforms/ComposeAffineMaps.cpp
index 0aa593202e1..9e44dab1bff 100644
--- a/mlir/lib/Transforms/ComposeAffineMaps.cpp
+++ b/mlir/lib/Transforms/ComposeAffineMaps.cpp
@@ -66,6 +66,7 @@
 #include "mlir/StandardOps/StandardOps.h"
 #include "mlir/Transforms/Pass.h"
 #include "mlir/Transforms/Passes.h"
+#include "mlir/Transforms/Utils.h"
 #include "llvm/Support/CommandLine.h"
 
 using namespace mlir;
@@ -90,46 +91,6 @@ MLFunctionPass *mlir::createComposeAffineMapsPass() {
   return new ComposeAffineMaps();
 }
 
-// Creates and inserts into 'builder' a new AffineApplyOp with the number of
-// results equal to the rank of 'memrefType'. The AffineApplyOp is composed
-// with all other AffineApplyOps reachable from input paramter 'operands'.
-// The final results of the composed AffineApplyOp are returned in output
-// paramter 'results'.
-static void createComposedAffineApplyOp(
-    MLFuncBuilder *builder, Location *loc, MemRefType *memrefType,
-    const SmallVector<MLValue *, 4> &indices,
-    const SmallVector<OperationStmt *, 4> &affineApplyOps,
-    SmallVector<SSAValue *, 4> *results) {
-  // Get rank of memref type.
-  unsigned rank = memrefType->getRank();
-  assert(indices.size() == rank);
-  // Create identity map with same number of dimensions as 'memrefType'.
-  auto map = builder->getMultiDimIdentityMap(rank);
-  // Initialize AffineValueMap with identity map.
-  AffineValueMap valueMap(map, indices);
-
-  for (auto *opStmt : affineApplyOps) {
-    assert(opStmt->is<AffineApplyOp>());
-    auto affineApplyOp = opStmt->getAs<AffineApplyOp>();
-    // Forward substitute 'affineApplyOp' into 'valueMap'.
-    valueMap.forwardSubstitute(*affineApplyOp);
-  }
-  // Compose affine maps from all ancestor AffineApplyOps.
-  // Create new AffineApplyOp from 'valueMap'.
-  unsigned numOperands = valueMap.getNumOperands();
-  SmallVector<SSAValue *, 4> operands(numOperands);
-  for (unsigned i = 0; i < numOperands; ++i) {
-    operands[i] = valueMap.getOperand(i);
-  }
-  // Create new AffineApplyOp based on 'valueMap'.
-  auto affineApplyOp =
-      builder->create<AffineApplyOp>(loc, valueMap.getAffineMap(), operands);
-  results->resize(rank);
-  for (unsigned i = 0; i < rank; ++i) {
-    (*results)[i] = affineApplyOp->getResult(i);
-  }
-}
-
 PassResult ComposeAffineMaps::runOnMLFunction(MLFunction *f) {
   // Gather all loads, stores and affine apply ops.
   struct OpGatherer : public StmtWalker<OpGatherer> {
@@ -170,14 +131,14 @@ PassResult ComposeAffineMaps::runOnMLFunction(MLFunction *f) {
     // Gather sequnce of AffineApplyOps reachable from 'indices'.
     SmallVector<OperationStmt *, 4> affineApplyOps;
-    getReachableAffineApplyOps(indices, &affineApplyOps);
+    getReachableAffineApplyOps(indices, affineApplyOps);
     // Skip transforming 'loadOp' if there are no affine maps to compose.
     if (affineApplyOps.size() <= 1)
       continue;
 
     SmallVector<SSAValue *, 4> results;
-    createComposedAffineApplyOp(&builder, opStmt->getLoc(), memrefType, indices,
-                                affineApplyOps, &results);
+    createComposedAffineApplyOp(&builder, opStmt->getLoc(), indices,
+                                affineApplyOps, results);
     // Create new LoadOp with new affine apply op.
     auto *newLoadResult =
         builder.create<LoadOp>(opStmt->getLoc(), loadOp->getMemRef(), results)
@@ -203,14 +164,14 @@ PassResult ComposeAffineMaps::runOnMLFunction(MLFunction *f) {
     }
     // Gather sequnce of AffineApplyOps reachable from 'indices'.
     SmallVector<OperationStmt *, 4> affineApplyOps;
-    getReachableAffineApplyOps(indices, &affineApplyOps);
+    getReachableAffineApplyOps(indices, affineApplyOps);
     // Skip transforming 'storeOp' if there are no affine maps to compose.
     if (affineApplyOps.size() <= 1)
       continue;
 
     SmallVector<SSAValue *, 4> results;
-    createComposedAffineApplyOp(&builder, opStmt->getLoc(), memrefType, indices,
-                                affineApplyOps, &results);
+    createComposedAffineApplyOp(&builder, opStmt->getLoc(), indices,
+                                affineApplyOps, results);
     // Create new StoreOp with new affine apply op.
     builder.create<StoreOp>(opStmt->getLoc(), storeOp->getValueToStore(),
                             storeOp->getMemRef(), results);
diff --git a/mlir/lib/Transforms/PipelineDataTransfer.cpp b/mlir/lib/Transforms/PipelineDataTransfer.cpp
index dd8b9a7615c..bb60d8e9d78 100644
--- a/mlir/lib/Transforms/PipelineDataTransfer.cpp
+++ b/mlir/lib/Transforms/PipelineDataTransfer.cpp
@@ -21,7 +21,7 @@
 
 #include "mlir/Transforms/Passes.h"
 
-#include "mlir/IR/AffineExpr.h"
+#include "mlir/Analysis/AffineAnalysis.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/StandardOps/StandardOps.h"
@@ -94,8 +94,7 @@ static bool doubleBuffer(MLValue *oldMemRef, ForStmt *forStmt) {
   auto *newMemRefType = doubleShape(cast<MemRefType>(oldMemRef->getType()));
 
   // Create and place the alloc at the top level.
-  auto *func = forStmt->getFunction();
-  MLFuncBuilder topBuilder(func, func->begin());
+  MLFuncBuilder topBuilder(forStmt->getFunction());
   auto *newMemRef = cast<MLValue>(
       topBuilder.create<AllocOp>(forStmt->getLoc(), newMemRefType)
           ->getResult());
@@ -105,13 +104,9 @@ static bool doubleBuffer(MLValue *oldMemRef, ForStmt *forStmt) {
       bInner.getAffineMap(/*dimCount=*/1, /*symbolCount=*/0, {d0 % 2}, {});
   auto ivModTwoOp =
       bInner.create<AffineApplyOp>(forStmt->getLoc(), modTwoMap, forStmt);
-  if (!replaceAllMemRefUsesWith(oldMemRef, newMemRef, ivModTwoOp->getResult(0)))
+  if (!replaceAllMemRefUsesWith(oldMemRef, newMemRef,
+                                cast<MLValue>(ivModTwoOp->getResult(0))))
     return false;
-  // We don't need ivMod2Op any more - this is cloned by
-  // replaceAllMemRefUsesWith wherever the memref replacement happens. Once
-  // b/117159533 is addressed, we'll eventually only need to pass
-  // ivModTwoOp->getResult(0) to replaceAllMemRefUsesWith.
-  cast<OperationStmt>(ivModTwoOp->getOperation())->eraseFromBlock();
   return true;
 }
 
@@ -169,16 +164,18 @@ PassResult PipelineDataTransfer::runOnMLFunction(MLFunction *f) {
   for (auto *dmaStartStmt : dmaStartStmts) {
     MLValue *oldMemRef = cast<MLValue>(dmaStartStmt->getOperand(
         getHigherMemRefPos(dmaStartStmt->getAs<DmaStartOp>())));
-    if (!doubleBuffer(oldMemRef, forStmt))
+    if (!doubleBuffer(oldMemRef, forStmt)) {
      return PassResult::Failure;
+    }
   }
 
   // Double the buffers for tag memref's.
   for (auto *dmaFinishStmt : dmaFinishStmts) {
     MLValue *oldTagMemRef = cast<MLValue>(
         dmaFinishStmt->getOperand(getTagMemRefPos(*dmaFinishStmt)));
-    if (!doubleBuffer(oldTagMemRef, forStmt))
+    if (!doubleBuffer(oldTagMemRef, forStmt)) {
      return PassResult::Failure;
+    }
   }
 
   // Collect all compute ops.
@@ -186,75 +183,43 @@ PassResult PipelineDataTransfer::runOnMLFunction(MLFunction *f) {
   computeOps.reserve(forStmt->getStatements().size());
   // Store delay for statement for later lookup for AffineApplyOp's.
   DenseMap<const Statement *, unsigned> opDelayMap;
-  for (const auto &stmt : *forStmt) {
+  for (auto &stmt : *forStmt) {
     auto *opStmt = dyn_cast<OperationStmt>(&stmt);
     if (!opStmt) {
       // All for and if stmt's are treated as pure compute operations.
-      // TODO(bondhugula): check whether such statements do not have any DMAs
-      // nested within.
       opDelayMap[&stmt] = 1;
     } else if (opStmt->is<DmaStartOp>()) {
       // DMA starts are not shifted.
-      opDelayMap[&stmt] = 0;
+      opDelayMap[opStmt] = 0;
+      // Set shifts for DMA start stmt's affine operand computation slices to 0.
+      if (auto *slice = mlir::createAffineComputationSlice(opStmt)) {
+        opDelayMap[slice] = 0;
+      } else {
+        // If a slice wasn't created, the reachable affine_apply op's from its
+        // operands are the ones that go with it.
+        SmallVector<OperationStmt *, 4> affineApplyStmts;
+        SmallVector<MLValue *, 4> operands(opStmt->getOperands());
+        getReachableAffineApplyOps(operands, affineApplyStmts);
+        for (auto *op : affineApplyStmts) {
+          opDelayMap[op] = 0;
+        }
+      }
     } else if (opStmt->is<DmaWaitOp>()) {
       // DMA finish op shifted by one.
-      opDelayMap[&stmt] = 1;
-    } else if (!opStmt->is<AffineApplyOp>()) {
-      // Compute op shifted by one.
-      opDelayMap[&stmt] = 1;
+      opDelayMap[opStmt] = 1;
+    } else {
+      // Everything else is a compute op; so shifted by one (op's supplying
+      // 'affine' operands to DMA start's have already been set right shifts.
+      opDelayMap[opStmt] = 1;
       computeOps.push_back(&stmt);
     }
-    // Shifts for affine apply op's determined later.
-  }
-
-  // Get the ancestor of a 'stmt' that lies in forStmt's block.
-  auto getAncestorInForBlock =
-      [&](const Statement *stmt, const StmtBlock &block) -> const Statement * {
-    // Traverse up the statement hierarchy starting from the owner of operand to
-    // find the ancestor statement that resides in the block of 'forStmt'.
-    while (stmt != nullptr && stmt->getBlock() != &block) {
-      stmt = stmt->getParentStmt();
-    }
-    return stmt;
-  };
-
-  // Determine delays for affine apply op's: look up delay from its consumer op.
-  // This code will be thrown away once we have a way to obtain indices through
-  // a composed affine_apply op. See TODO(b/117159533). Such a composed
-  // affine_apply will be used exclusively by a given memref deferencing op.
-  for (const auto &stmt : *forStmt) {
-    auto *opStmt = dyn_cast<OperationStmt>(&stmt);
-    // Skip statements that aren't affine apply ops.
-    if (!opStmt || !opStmt->is<AffineApplyOp>())
-      continue;
-    // Traverse uses of each result of the affine apply op.
-    for (auto *res : opStmt->getResults()) {
-      for (auto &use : res->getUses()) {
-        auto *ancestorInForBlock =
-            getAncestorInForBlock(use.getOwner(), *forStmt);
-        assert(ancestorInForBlock &&
-               "traversing parent should reach forStmt block");
-        auto *opCheck = dyn_cast<OperationStmt>(ancestorInForBlock);
-        if (!opCheck || opCheck->is<AffineApplyOp>())
-          continue;
-        assert(opDelayMap.find(ancestorInForBlock) != opDelayMap.end());
-        if (opDelayMap.find(&stmt) != opDelayMap.end()) {
-          // This is where we enforce all uses of this affine_apply to have
-          // the same shifts - so that we know what shift to use for the
-          // affine_apply to preserve semantics.
-          assert(opDelayMap[&stmt] == opDelayMap[ancestorInForBlock]);
-        } else {
-          // Obtain delay from its consumer.
-          opDelayMap[&stmt] = opDelayMap[ancestorInForBlock];
-        }
-      }
-    }
   }
 
   // Get delays stored in map.
   std::vector<uint64_t> delays(forStmt->getStatements().size());
   unsigned s = 0;
   for (const auto &stmt : *forStmt) {
+    assert(opDelayMap.find(&stmt) != opDelayMap.end());
     delays[s++] = opDelayMap[&stmt];
   }
 
@@ -263,8 +228,9 @@ PassResult PipelineDataTransfer::runOnMLFunction(MLFunction *f) {
     return PassResult::Failure;
   }
 
-  if (stmtBodySkew(forStmt, delays))
+  if (stmtBodySkew(forStmt, delays)) {
     return PassResult::Failure;
+  }
 
   return PassResult::Success;
 }
diff --git a/mlir/lib/Transforms/Utils.cpp b/mlir/lib/Transforms/Utils.cpp
index 2e8f0d32736..62ef3ba225a 100644
--- a/mlir/lib/Transforms/Utils.cpp
+++ b/mlir/lib/Transforms/Utils.cpp
@@ -22,7 +22,8 @@
 
 #include "mlir/Transforms/Utils.h"
 
-#include "mlir/IR/AffineMap.h"
+#include "mlir/Analysis/AffineAnalysis.h"
+#include "mlir/Analysis/AffineStructures.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/StandardOps/StandardOps.h"
@@ -48,7 +49,7 @@ static bool isMemRefDereferencingOp(const Operation &op) {
 // TODO(mlir-team): extend this for SSAValue / CFGFunctions. Can also be easily
 // extended to add additional indices at any position.
 bool mlir::replaceAllMemRefUsesWith(MLValue *oldMemRef, MLValue *newMemRef,
-                                    ArrayRef<SSAValue *> extraIndices,
+                                    ArrayRef<MLValue *> extraIndices,
                                     AffineMap indexRemap) {
   unsigned newMemRefRank = cast<MemRefType>(newMemRef->getType())->getRank();
   (void)newMemRefRank; // unused in opt mode
@@ -101,24 +102,16 @@ bool mlir::replaceAllMemRefUsesWith(MLValue *oldMemRef, MLValue *newMemRef,
     operands.push_back(newMemRef);
 
     MLFuncBuilder builder(opStmt);
-    // Normally, we could just use extraIndices as operands, but we will
-    // clone it so that each op gets its own "private" index. See b/117159533.
     for (auto *extraIndex : extraIndices) {
-      OperationStmt::OperandMapTy operandMap;
       // TODO(mlir-team): An operation/SSA value should provide a method to
       // return the position of an SSA result in its defining
       // operation.
       assert(extraIndex->getDefiningStmt()->getNumResults() == 1 &&
              "single result op's expected to generate these indices");
-      // TODO: actually check if this is a result of an affine_apply op.
       assert((cast<MLValue>(extraIndex)->isValidDim() ||
              cast<MLValue>(extraIndex)->isValidSymbol()) &&
             "invalid memory op index");
-      auto *clonedExtraIndex =
-          cast<OperationStmt>(
-              builder.clone(*extraIndex->getDefiningStmt(), operandMap))
-              ->getResult(0);
-      operands.push_back(cast<MLValue>(clonedExtraIndex));
+      operands.push_back(cast<MLValue>(extraIndex));
    }
 
     // Construct new indices. The indices of a memref come right after it, i.e.,
@@ -163,3 +156,135 @@ bool mlir::replaceAllMemRefUsesWith(MLValue *oldMemRef, MLValue *newMemRef,
   }
   return true;
 }
+
+// Creates and inserts into 'builder' a new AffineApplyOp, with the number of
+// its results equal to the number of 'operands, as a composition
+// of all other AffineApplyOps reachable from input parameter 'operands'. If the
+// operands were drawing results from multiple affine apply ops, this also leads
+// to a collapse into a single affine apply op. The final results of the
+// composed AffineApplyOp are returned in output parameter 'results'.
+OperationStmt *
+mlir::createComposedAffineApplyOp(MLFuncBuilder *builder, Location *loc,
+                                  ArrayRef<MLValue *> operands,
+                                  ArrayRef<OperationStmt *> affineApplyOps,
+                                  SmallVectorImpl<SSAValue *> &results) {
+  // Create identity map with same number of dimensions as number of operands.
+  auto map = builder->getMultiDimIdentityMap(operands.size());
+  // Initialize AffineValueMap with identity map.
+  AffineValueMap valueMap(map, operands);
+
+  for (auto *opStmt : affineApplyOps) {
+    assert(opStmt->is<AffineApplyOp>());
+    auto affineApplyOp = opStmt->getAs<AffineApplyOp>();
+    // Forward substitute 'affineApplyOp' into 'valueMap'.
+    valueMap.forwardSubstitute(*affineApplyOp);
+  }
+  // Compose affine maps from all ancestor AffineApplyOps.
+  // Create new AffineApplyOp from 'valueMap'.
+  unsigned numOperands = valueMap.getNumOperands();
+  SmallVector<SSAValue *, 4> outOperands(numOperands);
+  for (unsigned i = 0; i < numOperands; ++i) {
+    outOperands[i] = valueMap.getOperand(i);
+  }
+  // Create new AffineApplyOp based on 'valueMap'.
+  auto affineApplyOp =
+      builder->create<AffineApplyOp>(loc, valueMap.getAffineMap(), outOperands);
+  results.resize(operands.size());
+  for (unsigned i = 0, e = operands.size(); i < e; ++i) {
+    results[i] = affineApplyOp->getResult(i);
+  }
+  return cast<OperationStmt>(affineApplyOp->getOperation());
+}
+
+/// Given an operation statement, inserts a new single affine apply operation,
+/// that is exclusively used by this operation statement, and that provides all
+/// operands that are results of an affine_apply as a function of loop iterators
+/// and program parameters and whose results are.
+///
+/// Before
+///
+/// for %i = 0 to #map(%N)
+///   %idx = affine_apply (d0) -> (d0 mod 2) (%i)
+///   "send"(%idx, %A, ...)
+///   "compute"(%idx)
+///
+/// After
+///
+/// for %i = 0 to #map(%N)
+///   %idx = affine_apply (d0) -> (d0 mod 2) (%i)
+///   "send"(%idx, %A, ...)
+///   %idx_ = affine_apply (d0) -> (d0 mod 2) (%i)
+///   "compute"(%idx_)
+///
+/// This allows applying different transformations on send and compute (for eg.
+/// different shifts/delays).
+///
+/// Returns nullptr if none of the operands were the result of an affine_apply
+/// and thus there was no affine computation slice to create. Returns the newly
+/// affine_apply operation statement otherwise.
+///
+///
+OperationStmt *mlir::createAffineComputationSlice(OperationStmt *opStmt) {
+  // Collect all operands that are results of affine apply ops.
+  SmallVector<MLValue *, 4> subOperands;
+  subOperands.reserve(opStmt->getNumOperands());
+  for (auto *operand : opStmt->getOperands()) {
+    auto *defStmt = operand->getDefiningStmt();
+    if (defStmt && defStmt->is<AffineApplyOp>()) {
+      subOperands.push_back(operand);
+    }
+  }
+
+  // Gather sequence of AffineApplyOps reachable from 'subOperands'.
+  SmallVector<OperationStmt *, 4> affineApplyOps;
+  getReachableAffineApplyOps(subOperands, affineApplyOps);
+  // Skip transforming if there are no affine maps to compose.
+  if (affineApplyOps.empty())
+    return nullptr;
+
+  // Check if all uses of the affine apply op's lie in this op stmt
+  // itself, in which case there would be nothing to do.
+  bool localized = true;
+  for (auto *op : affineApplyOps) {
+    for (auto *result : op->getResults()) {
+      for (auto &use : result->getUses()) {
+        if (use.getOwner() != opStmt) {
+          localized = false;
+          break;
+        }
+      }
+    }
+  }
+  if (localized)
+    return nullptr;
+
+  MLFuncBuilder builder(opStmt);
+  SmallVector<SSAValue *, 4> results;
+  auto *affineApplyStmt = createComposedAffineApplyOp(
+      &builder, opStmt->getLoc(), subOperands, affineApplyOps, results);
+  assert(results.size() == subOperands.size() &&
+         "number of results should be the same as the number of subOperands");
+
+  // Construct the new operands that include the results from the composed
+  // affine apply op above instead of existing ones (subOperands). So, they
+  // differ from opStmt's operands only for those operands in 'subOperands', for
+  // which they will be replaced by the corresponding one from 'results'.
+  SmallVector<MLValue *, 4> newOperands(opStmt->getOperands());
+  for (unsigned i = 0, e = newOperands.size(); i < e; i++) {
+    // Replace the subOperands from among the new operands.
+    unsigned j, f;
+    for (j = 0, f = subOperands.size(); j < f; j++) {
+      if (newOperands[i] == subOperands[j])
+        break;
+    }
+    if (j < subOperands.size()) {
+      newOperands[i] = cast<MLValue>(results[j]);
+    }
+  }
+
+  for (unsigned idx = 0; idx < newOperands.size(); idx++) {
+    opStmt->setOperand(idx, newOperands[idx]);
+  }
+
+  return affineApplyStmt;
+}

