| author | Uday Bondhugula <bondhugula@google.com> | 2018-10-12 14:54:54 -0700 |
|---|---|---|
| committer | jpienaar <jpienaar@google.com> | 2019-03-29 13:29:21 -0700 |
| commit | 86eac4618c06a54c1f6d95a8c9d94b15dda5e35b (patch) | |
| tree | a6574eb104b867e24814319c44b26d953527f4c2 /mlir | |
| parent | 9e3b928e32285bf47d366d5685d2c65e616544cb (diff) | |
Create a private / exclusive / single-use affine computation slice for an op stmt.
- add a utility to create a private / exclusive / single-use affine
computation slice for an op stmt (see the method doc comment); a single
multi-result affine_apply op is prepended to the op stmt to provide all
results needed for its operands as a function of loop iterators and symbols.
- use it for DMA pipelining (to create private slices for DMA start stmts);
resolves the TODOs / feature request (b/117159533).
- move createComposedAffineApplyOp to Transforms/Utils; free it from taking a
memref as input / generalize it.
PiperOrigin-RevId: 216926818
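The doc comment added in mlir/include/mlir/Transforms/Utils.h illustrates what the slice does. Roughly (a sketch in the same style and with the same hypothetical "send"/"compute" ops as that comment):

```mlir
// Before: send and compute share one index computation.
for %i = 0 to #map(%N)
  %idx = affine_apply (d0) -> (d0 mod 2) (%i)
  "send"(%idx, %A, ...)
  %v = "compute"(%idx, ...)

// After createAffineComputationSlice on the compute stmt: it gets a private
// copy of the affine computation, so send and compute can later be
// shifted/delayed independently.
for %i = 0 to #map(%N)
  %idx = affine_apply (d0) -> (d0 mod 2) (%i)
  "send"(%idx, %A, ...)
  %idx_ = affine_apply (d0) -> (d0 mod 2) (%i)
  %v = "compute"(%idx_, ...)
```

If every affine_apply feeding the op stmt is already used by that stmt alone, there is nothing to privatize and the utility returns nullptr without inserting anything.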
Diffstat (limited to 'mlir')
| -rw-r--r-- | mlir/include/mlir/Analysis/AffineAnalysis.h | 5 |
| -rw-r--r-- | mlir/include/mlir/Transforms/Utils.h | 47 |
| -rw-r--r-- | mlir/lib/Analysis/AffineAnalysis.cpp | 6 |
| -rw-r--r-- | mlir/lib/Transforms/ComposeAffineMaps.cpp | 53 |
| -rw-r--r-- | mlir/lib/Transforms/PipelineDataTransfer.cpp | 96 |
| -rw-r--r-- | mlir/lib/Transforms/Utils.cpp | 147 |
| -rw-r--r-- | mlir/test/Transforms/pipeline-data-transfer.mlir | 46 |
7 files changed, 248 insertions, 152 deletions
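In the updated pipeline-data-transfer test below, the slice shows up as a single two-result affine_apply (#map0) used only by the dma_start, while the loads/stores keep their own single-result applies (#map2). Abridged from the new CHECK lines (a sketch, not the full expected output):

```mlir
#map0 = (d0) -> (d0 mod 2, d0 mod 2)

// One private affine_apply supplies both the double-buffer index and the
// tag-buffer index of the DMA start, which keeps shift 0 while the compute
// ops are shifted by one iteration.
%6 = affine_apply #map0(%i0)
dma_start %2[%i0], %1[%6#0, %i0], %c128, %0[%6#1, %c0_0]
    : memref<256xf32, (d0) -> (d0)>, memref<2x32xf32>, memref<2x1xf32>
```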
diff --git a/mlir/include/mlir/Analysis/AffineAnalysis.h b/mlir/include/mlir/Analysis/AffineAnalysis.h
index 9394b8de23e..95cb61e16b7 100644
--- a/mlir/include/mlir/Analysis/AffineAnalysis.h
+++ b/mlir/include/mlir/Analysis/AffineAnalysis.h
@@ -24,6 +24,7 @@
 #ifndef MLIR_ANALYSIS_AFFINE_ANALYSIS_H
 #define MLIR_ANALYSIS_AFFINE_ANALYSIS_H
 
+#include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/Optional.h"
 #include "llvm/ADT/SmallVector.h"
 
@@ -45,8 +46,8 @@ AffineExpr simplifyAffineExpr(AffineExpr expr, unsigned numDims,
 /// 'affineApplyOps', which are reachable via a search starting from 'operands',
 /// and ending at operands which are not defined by AffineApplyOps.
 void getReachableAffineApplyOps(
-    const llvm::SmallVector<MLValue *, 4> &operands,
-    llvm::SmallVector<OperationStmt *, 4> *affineApplyOps);
+    llvm::ArrayRef<MLValue *> operands,
+    llvm::SmallVectorImpl<OperationStmt *> &affineApplyOps);
 
 } // end namespace mlir
 
diff --git a/mlir/include/mlir/Transforms/Utils.h b/mlir/include/mlir/Transforms/Utils.h
index e36e1adc390..bf3367cc179 100644
--- a/mlir/include/mlir/Transforms/Utils.h
+++ b/mlir/include/mlir/Transforms/Utils.h
@@ -30,7 +30,10 @@
 
 namespace mlir {
 
+class Location;
+class MLFuncBuilder;
 class MLValue;
+class OperationStmt;
 class SSAValue;
 
 /// Replace all uses of oldMemRef with newMemRef while optionally remapping the
@@ -42,8 +45,50 @@ class SSAValue;
 // TODO(mlir-team): extend this for SSAValue / CFGFunctions. Can also be easily
 // extended to add additional indices at any position.
 bool replaceAllMemRefUsesWith(MLValue *oldMemRef, MLValue *newMemRef,
-                              llvm::ArrayRef<SSAValue *> extraIndices,
+                              llvm::ArrayRef<MLValue *> extraIndices,
                               AffineMap indexRemap = AffineMap::Invalid());
+
+/// Creates and inserts into 'builder' a new AffineApplyOp, with the number of
+/// its results equal to the number of operands, as a composition
+/// of all other AffineApplyOps reachable from input parameter 'operands'. If
+/// different operands were drawing results from multiple affine apply ops,
+/// these will also be collected into a single (multi-result) affine apply op.
+/// The final results of the composed AffineApplyOp are returned in output
+/// parameter 'results'. Returns the affine apply op created.
+OperationStmt *
+createComposedAffineApplyOp(MLFuncBuilder *builder, Location *loc,
+                            ArrayRef<MLValue *> operands,
+                            ArrayRef<OperationStmt *> affineApplyOps,
+                            SmallVectorImpl<SSAValue *> &results);
+
+/// Given an operation statement, inserts a new single affine apply operation,
+/// that is exclusively used by this operation statement, and that provides all
+/// operands that are results of an affine_apply as a function of loop iterators
+/// and program parameters and whose results are.
+///
+/// Before
+///
+/// for %i = 0 to #map(%N)
+///   %idx = affine_apply (d0) -> (d0 mod 2) (%i)
+///   send %A[%idx], ...
+///   %v = "compute"(%idx, ...)
+///
+/// After
+///
+/// for %i = 0 to #map(%N)
+///   %idx = affine_apply (d0) -> (d0 mod 2) (%i)
+///   send %A[%idx], ...
+///   %idx_ = affine_apply (d0) -> (d0 mod 2) (%i)
+///   %v = "compute"(%idx_, ...)
+
+/// This allows the application of different transformations on send and
+/// compute (for eg. / different shifts/delays)
+///
+/// Returns nullptr if none of the operands were the result of an affine_apply
+/// and thus there was no affine computation slice to create. Returns the newly
+/// affine_apply operation statement otherwise.
+OperationStmt *createAffineComputationSlice(OperationStmt *opStmt);
+
 } // end namespace mlir
 
 #endif // MLIR_TRANSFORMS_UTILS_H
diff --git a/mlir/lib/Analysis/AffineAnalysis.cpp b/mlir/lib/Analysis/AffineAnalysis.cpp
index f332836180a..ee1641b575e 100644
--- a/mlir/lib/Analysis/AffineAnalysis.cpp
+++ b/mlir/lib/Analysis/AffineAnalysis.cpp
@@ -304,8 +304,8 @@ AffineExpr mlir::simplifyAffineExpr(AffineExpr expr, unsigned numDims,
 // TODO(andydavis) Add a method to AffineApplyOp which forward substitutes
 // the AffineApplyOp into any user AffineApplyOps.
 void mlir::getReachableAffineApplyOps(
-    const SmallVector<MLValue *, 4> &operands,
-    SmallVector<OperationStmt *, 4> *affineApplyOps) {
+    ArrayRef<MLValue *> operands,
+    SmallVectorImpl<OperationStmt *> &affineApplyOps) {
   struct State {
     // The ssa value for this node in the DFS traversal.
     MLValue *value;
@@ -329,7 +329,7 @@ void mlir::getReachableAffineApplyOps(
       if (auto affineApplyOp = opStmt->getAs<AffineApplyOp>()) {
         if (state.operandIndex == 0) {
           // Pre-Visit: Add 'opStmt' to reachable sequence.
-          affineApplyOps->push_back(opStmt);
+          affineApplyOps.push_back(opStmt);
         }
         if (state.operandIndex < opStmt->getNumOperands()) {
           // Visit: Add next 'affineApplyOp' operand to worklist.
diff --git a/mlir/lib/Transforms/ComposeAffineMaps.cpp b/mlir/lib/Transforms/ComposeAffineMaps.cpp
index 0aa593202e1..9e44dab1bff 100644
--- a/mlir/lib/Transforms/ComposeAffineMaps.cpp
+++ b/mlir/lib/Transforms/ComposeAffineMaps.cpp
@@ -66,6 +66,7 @@
 #include "mlir/StandardOps/StandardOps.h"
 #include "mlir/Transforms/Pass.h"
 #include "mlir/Transforms/Passes.h"
+#include "mlir/Transforms/Utils.h"
 #include "llvm/Support/CommandLine.h"
 
 using namespace mlir;
 
@@ -90,46 +91,6 @@ MLFunctionPass *mlir::createComposeAffineMapsPass() {
   return new ComposeAffineMaps();
 }
 
-// Creates and inserts into 'builder' a new AffineApplyOp with the number of
-// results equal to the rank of 'memrefType'. The AffineApplyOp is composed
-// with all other AffineApplyOps reachable from input paramter 'operands'.
-// The final results of the composed AffineApplyOp are returned in output
-// paramter 'results'.
-static void createComposedAffineApplyOp(
-    MLFuncBuilder *builder, Location *loc, MemRefType *memrefType,
-    const SmallVector<MLValue *, 4> &indices,
-    const SmallVector<OperationStmt *, 4> &affineApplyOps,
-    SmallVector<SSAValue *, 4> *results) {
-  // Get rank of memref type.
-  unsigned rank = memrefType->getRank();
-  assert(indices.size() == rank);
-  // Create identity map with same number of dimensions as 'memrefType'.
-  auto map = builder->getMultiDimIdentityMap(rank);
-  // Initialize AffineValueMap with identity map.
-  AffineValueMap valueMap(map, indices);
-
-  for (auto *opStmt : affineApplyOps) {
-    assert(opStmt->is<AffineApplyOp>());
-    auto affineApplyOp = opStmt->getAs<AffineApplyOp>();
-    // Forward substitute 'affineApplyOp' into 'valueMap'.
-    valueMap.forwardSubstitute(*affineApplyOp);
-  }
-  // Compose affine maps from all ancestor AffineApplyOps.
-  // Create new AffineApplyOp from 'valueMap'.
-  unsigned numOperands = valueMap.getNumOperands();
-  SmallVector<SSAValue *, 4> operands(numOperands);
-  for (unsigned i = 0; i < numOperands; ++i) {
-    operands[i] = valueMap.getOperand(i);
-  }
-  // Create new AffineApplyOp based on 'valueMap'.
-  auto affineApplyOp =
-      builder->create<AffineApplyOp>(loc, valueMap.getAffineMap(), operands);
-  results->resize(rank);
-  for (unsigned i = 0; i < rank; ++i) {
-    (*results)[i] = affineApplyOp->getResult(i);
-  }
-}
-
 PassResult ComposeAffineMaps::runOnMLFunction(MLFunction *f) {
   // Gather all loads, stores and affine apply ops.
   struct OpGatherer : public StmtWalker<OpGatherer> {
@@ -170,14 +131,14 @@ PassResult ComposeAffineMaps::runOnMLFunction(MLFunction *f) {
 
     // Gather sequnce of AffineApplyOps reachable from 'indices'.
     SmallVector<OperationStmt *, 4> affineApplyOps;
-    getReachableAffineApplyOps(indices, &affineApplyOps);
+    getReachableAffineApplyOps(indices, affineApplyOps);
     // Skip transforming 'loadOp' if there are no affine maps to compose.
     if (affineApplyOps.size() <= 1)
       continue;
 
     SmallVector<SSAValue *, 4> results;
-    createComposedAffineApplyOp(&builder, opStmt->getLoc(), memrefType, indices,
-                                affineApplyOps, &results);
+    createComposedAffineApplyOp(&builder, opStmt->getLoc(), indices,
+                                affineApplyOps, results);
     // Create new LoadOp with new affine apply op.
     auto *newLoadResult =
         builder.create<LoadOp>(opStmt->getLoc(), loadOp->getMemRef(), results)
@@ -203,14 +164,14 @@ PassResult ComposeAffineMaps::runOnMLFunction(MLFunction *f) {
     }
     // Gather sequnce of AffineApplyOps reachable from 'indices'.
     SmallVector<OperationStmt *, 4> affineApplyOps;
-    getReachableAffineApplyOps(indices, &affineApplyOps);
+    getReachableAffineApplyOps(indices, affineApplyOps);
     // Skip transforming 'storeOp' if there are no affine maps to compose.
     if (affineApplyOps.size() <= 1)
      continue;
 
     SmallVector<SSAValue *, 4> results;
-    createComposedAffineApplyOp(&builder, opStmt->getLoc(), memrefType, indices,
-                                affineApplyOps, &results);
+    createComposedAffineApplyOp(&builder, opStmt->getLoc(), indices,
+                                affineApplyOps, results);
     // Create new StoreOp with new affine apply op.
     builder.create<StoreOp>(opStmt->getLoc(), storeOp->getValueToStore(),
                             storeOp->getMemRef(), results);
diff --git a/mlir/lib/Transforms/PipelineDataTransfer.cpp b/mlir/lib/Transforms/PipelineDataTransfer.cpp
index dd8b9a7615c..bb60d8e9d78 100644
--- a/mlir/lib/Transforms/PipelineDataTransfer.cpp
+++ b/mlir/lib/Transforms/PipelineDataTransfer.cpp
@@ -21,7 +21,7 @@
 
 #include "mlir/Transforms/Passes.h"
 
-#include "mlir/IR/AffineExpr.h"
+#include "mlir/Analysis/AffineAnalysis.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/StandardOps/StandardOps.h"
@@ -94,8 +94,7 @@ static bool doubleBuffer(MLValue *oldMemRef, ForStmt *forStmt) {
   auto *newMemRefType = doubleShape(cast<MemRefType>(oldMemRef->getType()));
 
   // Create and place the alloc at the top level.
-  auto *func = forStmt->getFunction();
-  MLFuncBuilder topBuilder(func, func->begin());
+  MLFuncBuilder topBuilder(forStmt->getFunction());
   auto *newMemRef = cast<MLValue>(
       topBuilder.create<AllocOp>(forStmt->getLoc(), newMemRefType)
           ->getResult());
@@ -105,13 +104,9 @@ static bool doubleBuffer(MLValue *oldMemRef, ForStmt *forStmt) {
       bInner.getAffineMap(/*dimCount=*/1, /*symbolCount=*/0, {d0 % 2}, {});
   auto ivModTwoOp =
       bInner.create<AffineApplyOp>(forStmt->getLoc(), modTwoMap, forStmt);
-  if (!replaceAllMemRefUsesWith(oldMemRef, newMemRef, ivModTwoOp->getResult(0)))
+  if (!replaceAllMemRefUsesWith(oldMemRef, newMemRef,
+                                cast<MLValue>(ivModTwoOp->getResult(0))))
     return false;
-  // We don't need ivMod2Op any more - this is cloned by
-  // replaceAllMemRefUsesWith wherever the memref replacement happens. Once
-  // b/117159533 is addressed, we'll eventually only need to pass
-  // ivModTwoOp->getResult(0) to replaceAllMemRefUsesWith.
-  cast<OperationStmt>(ivModTwoOp->getOperation())->eraseFromBlock();
   return true;
 }
 
@@ -169,16 +164,18 @@ PassResult PipelineDataTransfer::runOnMLFunction(MLFunction *f) {
   for (auto *dmaStartStmt : dmaStartStmts) {
     MLValue *oldMemRef = cast<MLValue>(dmaStartStmt->getOperand(
         getHigherMemRefPos(dmaStartStmt->getAs<DmaStartOp>())));
-    if (!doubleBuffer(oldMemRef, forStmt))
+    if (!doubleBuffer(oldMemRef, forStmt)) {
       return PassResult::Failure;
+    }
   }
 
   // Double the buffers for tag memref's.
   for (auto *dmaFinishStmt : dmaFinishStmts) {
     MLValue *oldTagMemRef = cast<MLValue>(
         dmaFinishStmt->getOperand(getTagMemRefPos(*dmaFinishStmt)));
-    if (!doubleBuffer(oldTagMemRef, forStmt))
+    if (!doubleBuffer(oldTagMemRef, forStmt)) {
       return PassResult::Failure;
+    }
   }
 
   // Collect all compute ops.
@@ -186,75 +183,43 @@ PassResult PipelineDataTransfer::runOnMLFunction(MLFunction *f) {
   computeOps.reserve(forStmt->getStatements().size());
   // Store delay for statement for later lookup for AffineApplyOp's.
   DenseMap<const Statement *, unsigned> opDelayMap;
-  for (const auto &stmt : *forStmt) {
+  for (auto &stmt : *forStmt) {
     auto *opStmt = dyn_cast<OperationStmt>(&stmt);
     if (!opStmt) {
       // All for and if stmt's are treated as pure compute operations.
-      // TODO(bondhugula): check whether such statements do not have any DMAs
-      // nested within.
       opDelayMap[&stmt] = 1;
     } else if (opStmt->is<DmaStartOp>()) {
       // DMA starts are not shifted.
-      opDelayMap[&stmt] = 0;
+      opDelayMap[opStmt] = 0;
+      // Set shifts for DMA start stmt's affine operand computation slices to 0.
+      if (auto *slice = mlir::createAffineComputationSlice(opStmt)) {
+        opDelayMap[slice] = 0;
+      } else {
+        // If a slice wasn't created, the reachable affine_apply op's from its
+        // operands are the ones that go with it.
+        SmallVector<OperationStmt *, 4> affineApplyStmts;
+        SmallVector<MLValue *, 4> operands(opStmt->getOperands());
+        getReachableAffineApplyOps(operands, affineApplyStmts);
+        for (auto *op : affineApplyStmts) {
+          opDelayMap[op] = 0;
+        }
+      }
     } else if (opStmt->is<DmaWaitOp>()) {
       // DMA finish op shifted by one.
-      opDelayMap[&stmt] = 1;
-    } else if (!opStmt->is<AffineApplyOp>()) {
-      // Compute op shifted by one.
-      opDelayMap[&stmt] = 1;
+      opDelayMap[opStmt] = 1;
+    } else {
+      // Everything else is a compute op; so shifted by one (op's supplying
+      // 'affine' operands to DMA start's have already been set right shifts.
+      opDelayMap[opStmt] = 1;
       computeOps.push_back(&stmt);
     }
-    // Shifts for affine apply op's determined later.
-  }
-
-  // Get the ancestor of a 'stmt' that lies in forStmt's block.
-  auto getAncestorInForBlock =
-      [&](const Statement *stmt, const StmtBlock &block) -> const Statement * {
-    // Traverse up the statement hierarchy starting from the owner of operand to
-    // find the ancestor statement that resides in the block of 'forStmt'.
-    while (stmt != nullptr && stmt->getBlock() != &block) {
-      stmt = stmt->getParentStmt();
-    }
-    return stmt;
-  };
-
-  // Determine delays for affine apply op's: look up delay from its consumer op.
-  // This code will be thrown away once we have a way to obtain indices through
-  // a composed affine_apply op. See TODO(b/117159533). Such a composed
-  // affine_apply will be used exclusively by a given memref deferencing op.
-  for (const auto &stmt : *forStmt) {
-    auto *opStmt = dyn_cast<OperationStmt>(&stmt);
-    // Skip statements that aren't affine apply ops.
-    if (!opStmt || !opStmt->is<AffineApplyOp>())
-      continue;
-    // Traverse uses of each result of the affine apply op.
-    for (auto *res : opStmt->getResults()) {
-      for (auto &use : res->getUses()) {
-        auto *ancestorInForBlock =
-            getAncestorInForBlock(use.getOwner(), *forStmt);
-        assert(ancestorInForBlock &&
-               "traversing parent should reach forStmt block");
-        auto *opCheck = dyn_cast<OperationStmt>(ancestorInForBlock);
-        if (!opCheck || opCheck->is<AffineApplyOp>())
-          continue;
-        assert(opDelayMap.find(ancestorInForBlock) != opDelayMap.end());
-        if (opDelayMap.find(&stmt) != opDelayMap.end()) {
-          // This is where we enforce all uses of this affine_apply to have
-          // the same shifts - so that we know what shift to use for the
-          // affine_apply to preserve semantics.
-          assert(opDelayMap[&stmt] == opDelayMap[ancestorInForBlock]);
-        } else {
-          // Obtain delay from its consumer.
-          opDelayMap[&stmt] = opDelayMap[ancestorInForBlock];
-        }
-      }
-    }
   }
 
   // Get delays stored in map.
   std::vector<uint64_t> delays(forStmt->getStatements().size());
   unsigned s = 0;
   for (const auto &stmt : *forStmt) {
+    assert(opDelayMap.find(&stmt) != opDelayMap.end());
     delays[s++] = opDelayMap[&stmt];
   }
 
@@ -263,8 +228,9 @@ PassResult PipelineDataTransfer::runOnMLFunction(MLFunction *f) {
     return PassResult::Failure;
   }
 
-  if (stmtBodySkew(forStmt, delays))
+  if (stmtBodySkew(forStmt, delays)) {
     return PassResult::Failure;
+  }
 
   return PassResult::Success;
 }
diff --git a/mlir/lib/Transforms/Utils.cpp b/mlir/lib/Transforms/Utils.cpp
index 2e8f0d32736..62ef3ba225a 100644
--- a/mlir/lib/Transforms/Utils.cpp
+++ b/mlir/lib/Transforms/Utils.cpp
@@ -22,7 +22,8 @@
 
 #include "mlir/Transforms/Utils.h"
 
-#include "mlir/IR/AffineMap.h"
+#include "mlir/Analysis/AffineAnalysis.h"
+#include "mlir/Analysis/AffineStructures.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/StandardOps/StandardOps.h"
@@ -48,7 +49,7 @@ static bool isMemRefDereferencingOp(const Operation &op) {
 // TODO(mlir-team): extend this for SSAValue / CFGFunctions. Can also be easily
 // extended to add additional indices at any position.
 bool mlir::replaceAllMemRefUsesWith(MLValue *oldMemRef, MLValue *newMemRef,
-                                    ArrayRef<SSAValue *> extraIndices,
+                                    ArrayRef<MLValue *> extraIndices,
                                     AffineMap indexRemap) {
   unsigned newMemRefRank = cast<MemRefType>(newMemRef->getType())->getRank();
   (void)newMemRefRank; // unused in opt mode
@@ -101,24 +102,16 @@ bool mlir::replaceAllMemRefUsesWith(MLValue *oldMemRef, MLValue *newMemRef,
     operands.push_back(newMemRef);
 
     MLFuncBuilder builder(opStmt);
-    // Normally, we could just use extraIndices as operands, but we will
-    // clone it so that each op gets its own "private" index. See b/117159533.
     for (auto *extraIndex : extraIndices) {
-      OperationStmt::OperandMapTy operandMap;
       // TODO(mlir-team): An operation/SSA value should provide a method to
       // return the position of an SSA result in its defining
       // operation.
       assert(extraIndex->getDefiningStmt()->getNumResults() == 1 &&
             "single result op's expected to generate these indices");
-      // TODO: actually check if this is a result of an affine_apply op.
       assert((cast<MLValue>(extraIndex)->isValidDim() ||
              cast<MLValue>(extraIndex)->isValidSymbol()) &&
             "invalid memory op index");
-      auto *clonedExtraIndex =
-          cast<OperationStmt>(
-              builder.clone(*extraIndex->getDefiningStmt(), operandMap))
-              ->getResult(0);
-      operands.push_back(cast<MLValue>(clonedExtraIndex));
+      operands.push_back(cast<MLValue>(extraIndex));
     }
 
     // Construct new indices. The indices of a memref come right after it, i.e.,
@@ -163,3 +156,135 @@ bool mlir::replaceAllMemRefUsesWith(MLValue *oldMemRef, MLValue *newMemRef,
   }
   return true;
 }
+
+// Creates and inserts into 'builder' a new AffineApplyOp, with the number of
+// its results equal to the number of 'operands, as a composition
+// of all other AffineApplyOps reachable from input parameter 'operands'. If the
+// operands were drawing results from multiple affine apply ops, this also leads
+// to a collapse into a single affine apply op. The final results of the
+// composed AffineApplyOp are returned in output parameter 'results'.
+OperationStmt *
+mlir::createComposedAffineApplyOp(MLFuncBuilder *builder, Location *loc,
+                                  ArrayRef<MLValue *> operands,
+                                  ArrayRef<OperationStmt *> affineApplyOps,
+                                  SmallVectorImpl<SSAValue *> &results) {
+  // Create identity map with same number of dimensions as number of operands.
+  auto map = builder->getMultiDimIdentityMap(operands.size());
+  // Initialize AffineValueMap with identity map.
+  AffineValueMap valueMap(map, operands);
+
+  for (auto *opStmt : affineApplyOps) {
+    assert(opStmt->is<AffineApplyOp>());
+    auto affineApplyOp = opStmt->getAs<AffineApplyOp>();
+    // Forward substitute 'affineApplyOp' into 'valueMap'.
+    valueMap.forwardSubstitute(*affineApplyOp);
+  }
+  // Compose affine maps from all ancestor AffineApplyOps.
+  // Create new AffineApplyOp from 'valueMap'.
+  unsigned numOperands = valueMap.getNumOperands();
+  SmallVector<SSAValue *, 4> outOperands(numOperands);
+  for (unsigned i = 0; i < numOperands; ++i) {
+    outOperands[i] = valueMap.getOperand(i);
+  }
+  // Create new AffineApplyOp based on 'valueMap'.
+  auto affineApplyOp =
+      builder->create<AffineApplyOp>(loc, valueMap.getAffineMap(), outOperands);
+  results.resize(operands.size());
+  for (unsigned i = 0, e = operands.size(); i < e; ++i) {
+    results[i] = affineApplyOp->getResult(i);
+  }
+  return cast<OperationStmt>(affineApplyOp->getOperation());
+}
+
+/// Given an operation statement, inserts a new single affine apply operation,
+/// that is exclusively used by this operation statement, and that provides all
+/// operands that are results of an affine_apply as a function of loop iterators
+/// and program parameters and whose results are.
+///
+/// Before
+///
+/// for %i = 0 to #map(%N)
+///   %idx = affine_apply (d0) -> (d0 mod 2) (%i)
+///   "send"(%idx, %A, ...)
+///   "compute"(%idx)
+///
+/// After
+///
+/// for %i = 0 to #map(%N)
+///   %idx = affine_apply (d0) -> (d0 mod 2) (%i)
+///   "send"(%idx, %A, ...)
+///   %idx_ = affine_apply (d0) -> (d0 mod 2) (%i)
+///   "compute"(%idx_)
+///
+/// This allows applying different transformations on send and compute (for eg.
+/// different shifts/delays).
+///
+/// Returns nullptr if none of the operands were the result of an affine_apply
+/// and thus there was no affine computation slice to create. Returns the newly
+/// affine_apply operation statement otherwise.
+///
+///
+OperationStmt *mlir::createAffineComputationSlice(OperationStmt *opStmt) {
+  // Collect all operands that are results of affine apply ops.
+  SmallVector<MLValue *, 4> subOperands;
+  subOperands.reserve(opStmt->getNumOperands());
+  for (auto *operand : opStmt->getOperands()) {
+    auto *defStmt = operand->getDefiningStmt();
+    if (defStmt && defStmt->is<AffineApplyOp>()) {
+      subOperands.push_back(operand);
+    }
+  }
+
+  // Gather sequence of AffineApplyOps reachable from 'subOperands'.
+  SmallVector<OperationStmt *, 4> affineApplyOps;
+  getReachableAffineApplyOps(subOperands, affineApplyOps);
+  // Skip transforming if there are no affine maps to compose.
+  if (affineApplyOps.empty())
+    return nullptr;
+
+  // Check if all uses of the affine apply op's lie in this op stmt
+  // itself, in which case there would be nothing to do.
+  bool localized = true;
+  for (auto *op : affineApplyOps) {
+    for (auto *result : op->getResults()) {
+      for (auto &use : result->getUses()) {
+        if (use.getOwner() != opStmt) {
+          localized = false;
+          break;
+        }
+      }
+    }
+  }
+  if (localized)
+    return nullptr;
+
+  MLFuncBuilder builder(opStmt);
+  SmallVector<SSAValue *, 4> results;
+  auto *affineApplyStmt = createComposedAffineApplyOp(
+      &builder, opStmt->getLoc(), subOperands, affineApplyOps, results);
+  assert(results.size() == subOperands.size() &&
+         "number of results should be the same as the number of subOperands");
+
+  // Construct the new operands that include the results from the composed
+  // affine apply op above instead of existing ones (subOperands). So, they
+  // differ from opStmt's operands only for those operands in 'subOperands', for
+  // which they will be replaced by the corresponding one from 'results'.
+  SmallVector<MLValue *, 4> newOperands(opStmt->getOperands());
+  for (unsigned i = 0, e = newOperands.size(); i < e; i++) {
+    // Replace the subOperands from among the new operands.
+    unsigned j, f;
+    for (j = 0, f = subOperands.size(); j < f; j++) {
+      if (newOperands[i] == subOperands[j])
+        break;
+    }
+    if (j < subOperands.size()) {
+      newOperands[i] = cast<MLValue>(results[j]);
+    }
+  }
+
+  for (unsigned idx = 0; idx < newOperands.size(); idx++) {
+    opStmt->setOperand(idx, newOperands[idx]);
+  }
+
+  return affineApplyStmt;
+}
diff --git a/mlir/test/Transforms/pipeline-data-transfer.mlir b/mlir/test/Transforms/pipeline-data-transfer.mlir
index 13abd43a2dc..015c59b9944 100644
--- a/mlir/test/Transforms/pipeline-data-transfer.mlir
+++ b/mlir/test/Transforms/pipeline-data-transfer.mlir
@@ -1,6 +1,8 @@
 // RUN: mlir-opt %s -pipeline-data-transfer | FileCheck %s
-// CHECK: #map0 = (d0) -> (d0 mod 2)
+
+// CHECK: #map0 = (d0) -> (d0 mod 2, d0 mod 2)
 // CHECK-NEXT: #map1 = (d0) -> (d0 - 1)
+// CHECK-NEXT: #map2 = (d0) -> (d0 mod 2)
 // CHECK-NEXT: mlfunc @loop_nest_dma() {
 // CHECK-NEXT:   %c8 = constant 8 : index
 // CHECK-NEXT:   %c0 = constant 0 : index
@@ -12,34 +14,30 @@
 // CHECK-NEXT:   %c0_0 = constant 0 : index
 // CHECK-NEXT:   %c128 = constant 128 : index
 // CHECK-NEXT:   %5 = affine_apply #map0(%c0)
-// CHECK-NEXT:   %6 = affine_apply #map0(%c0)
-// CHECK-NEXT:   dma_start %2[%c0], %1[%5, %c0], %c128, %0[%6, %c0_0] : memref<256xf32, (d0) -> (d0)>, memref<2x32xf32>, memref<2x1xf32>
+// CHECK-NEXT:   dma_start %2[%c0], %1[%5#0, %c0], %c128, %0[%5#1, %c0_0] : memref<256xf32, (d0) -> (d0)>, memref<2x32xf32>, memref<2x1xf32>
 // CHECK-NEXT:   for %i0 = 1 to 7 {
-// CHECK-NEXT:     %7 = affine_apply #map0(%i0)
-// CHECK-NEXT:     %8 = affine_apply #map0(%i0)
-// CHECK-NEXT:     dma_start %2[%i0], %1[%7, %i0], %c128, %0[%8, %c0_0] : memref<256xf32, (d0) -> (d0)>, memref<2x32xf32>, memref<2x1xf32>
-// CHECK-NEXT:     %9 = affine_apply #map1(%i0)
-// CHECK-NEXT:     %10 = affine_apply #map0(%9)
-// CHECK-NEXT:     dma_wait %0[%10, %c0_0] : memref<2x1xf32>
-// CHECK-NEXT:     %11 = affine_apply #map0(%9)
-// CHECK-NEXT:     %12 = load %1[%11, %9] : memref<2x32xf32>
-// CHECK-NEXT:     %13 = "compute"(%12) : (f32) -> f32
-// CHECK-NEXT:     %14 = affine_apply #map0(%9)
-// CHECK-NEXT:     store %13, %1[%14, %9] : memref<2x32xf32>
+// CHECK-NEXT:     %6 = affine_apply #map0(%i0)
+// CHECK-NEXT:     dma_start %2[%i0], %1[%6#0, %i0], %c128, %0[%6#1, %c0_0] : memref<256xf32, (d0) -> (d0)>, memref<2x32xf32>, memref<2x1xf32>
+// CHECK-NEXT:     %7 = affine_apply #map1(%i0)
+// CHECK-NEXT:     %8 = affine_apply #map2(%7)
+// CHECK-NEXT:     %9 = affine_apply #map2(%7)
+// CHECK-NEXT:     dma_wait %0[%8, %c0_0] : memref<2x1xf32>
+// CHECK-NEXT:     %10 = load %1[%9, %7] : memref<2x32xf32>
+// CHECK-NEXT:     %11 = "compute"(%10) : (f32) -> f32
+// CHECK-NEXT:     store %11, %1[%9, %7] : memref<2x32xf32>
 // CHECK-NEXT:     for %i1 = 0 to 127 {
-// CHECK-NEXT:       "do_more_compute"(%9, %i1) : (index, index) -> ()
+// CHECK-NEXT:       "do_more_compute"(%7, %i1) : (index, index) -> ()
 // CHECK-NEXT:     }
 // CHECK-NEXT:   }
-// CHECK-NEXT:   %15 = affine_apply #map1(%c8)
-// CHECK-NEXT:   %16 = affine_apply #map0(%15)
-// CHECK-NEXT:   dma_wait %0[%16, %c0_0] : memref<2x1xf32>
-// CHECK-NEXT:   %17 = affine_apply #map0(%15)
-// CHECK-NEXT:   %18 = load %1[%17, %15] : memref<2x32xf32>
-// CHECK-NEXT:   %19 = "compute"(%18) : (f32) -> f32
-// CHECK-NEXT:   %20 = affine_apply #map0(%15)
-// CHECK-NEXT:   store %19, %1[%20, %15] : memref<2x32xf32>
+// CHECK-NEXT:   %12 = affine_apply #map1(%c8)
+// CHECK-NEXT:   %13 = affine_apply #map2(%12)
+// CHECK-NEXT:   %14 = affine_apply #map2(%12)
+// CHECK-NEXT:   dma_wait %0[%13, %c0_0] : memref<2x1xf32>
+// CHECK-NEXT:   %15 = load %1[%14, %12] : memref<2x32xf32>
+// CHECK-NEXT:   %16 = "compute"(%15) : (f32) -> f32
+// CHECK-NEXT:   store %16, %1[%14, %12] : memref<2x32xf32>
 // CHECK-NEXT:   for %i2 = 0 to 127 {
-// CHECK-NEXT:     "do_more_compute"(%15, %i2) : (index, index) -> ()
+// CHECK-NEXT:     "do_more_compute"(%12, %i2) : (index, index) -> ()
 // CHECK-NEXT:   }
 // CHECK-NEXT:   return
 
 mlfunc @loop_nest_dma() {

