| author | Uday Bondhugula <bondhugula@google.com> | 2018-10-12 14:54:54 -0700 |
|---|---|---|
| committer | jpienaar <jpienaar@google.com> | 2019-03-29 13:29:21 -0700 |
| commit | 86eac4618c06a54c1f6d95a8c9d94b15dda5e35b (patch) | |
| tree | a6574eb104b867e24814319c44b26d953527f4c2 /mlir | |
| parent | 9e3b928e32285bf47d366d5685d2c65e616544cb (diff) | |
Create a private / exclusive / single-use affine computation slice for an op stmt.
- add a utility to create a private / exclusive / single-use affine
computation slice for an op stmt (see the method doc comment); a single
multi-result affine_apply op is prepended to the op stmt to provide all
results needed for its operands as a function of loop iterators and symbols.
- use it for DMA pipelining (to create private slices for DMA start stmts);
resolves the TODOs / feature request (b/117159533).
- move createComposedAffineApplyOp to Transforms/Utils; free it from taking a
memref as input / generalize it.
PiperOrigin-RevId: 216926818
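The doc comment added in mlir/include/mlir/Transforms/Utils.h illustrates what the slice does. Roughly (a sketch in the same style and with the same hypothetical "send"/"compute" ops as that comment):

```mlir
// Before: send and compute share one index computation.
for %i = 0 to #map(%N)
  %idx = affine_apply (d0) -> (d0 mod 2) (%i)
  "send"(%idx, %A, ...)
  %v = "compute"(%idx, ...)

// After createAffineComputationSlice on the compute stmt: it gets a private
// copy of the affine computation, so send and compute can later be
// shifted/delayed independently.
for %i = 0 to #map(%N)
  %idx = affine_apply (d0) -> (d0 mod 2) (%i)
  "send"(%idx, %A, ...)
  %idx_ = affine_apply (d0) -> (d0 mod 2) (%i)
  %v = "compute"(%idx_, ...)
```

If every affine_apply feeding the op stmt is already used by that stmt alone, there is nothing to privatize and the utility returns nullptr without inserting anything.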
Diffstat (limited to 'mlir')
| -rw-r--r-- | mlir/include/mlir/Analysis/AffineAnalysis.h | 5 |
| -rw-r--r-- | mlir/include/mlir/Transforms/Utils.h | 47 |
| -rw-r--r-- | mlir/lib/Analysis/AffineAnalysis.cpp | 6 |
| -rw-r--r-- | mlir/lib/Transforms/ComposeAffineMaps.cpp | 53 |
| -rw-r--r-- | mlir/lib/Transforms/PipelineDataTransfer.cpp | 96 |
| -rw-r--r-- | mlir/lib/Transforms/Utils.cpp | 147 |
| -rw-r--r-- | mlir/test/Transforms/pipeline-data-transfer.mlir | 46 |
7 files changed, 248 insertions, 152 deletions
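In the updated pipeline-data-transfer test below, the slice shows up as a single two-result affine_apply (#map0) used only by the dma_start, while the loads/stores keep their own single-result applies (#map2). Abridged from the new CHECK lines (a sketch, not the full expected output):

```mlir
#map0 = (d0) -> (d0 mod 2, d0 mod 2)

// One private affine_apply supplies both the double-buffer index and the
// tag-buffer index of the DMA start, which keeps shift 0 while the compute
// ops are shifted by one iteration.
%6 = affine_apply #map0(%i0)
dma_start %2[%i0], %1[%6#0, %i0], %c128, %0[%6#1, %c0_0]
    : memref<256xf32, (d0) -> (d0)>, memref<2x32xf32>, memref<2x1xf32>
```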
diff --git a/mlir/include/mlir/Analysis/AffineAnalysis.h b/mlir/include/mlir/Analysis/AffineAnalysis.h
index 9394b8de23e..95cb61e16b7 100644
--- a/mlir/include/mlir/Analysis/AffineAnalysis.h
+++ b/mlir/include/mlir/Analysis/AffineAnalysis.h
@@ -24,6 +24,7 @@
 #ifndef MLIR_ANALYSIS_AFFINE_ANALYSIS_H
 #define MLIR_ANALYSIS_AFFINE_ANALYSIS_H
 
+#include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/Optional.h"
 #include "llvm/ADT/SmallVector.h"
 
@@ -45,8 +46,8 @@ AffineExpr simplifyAffineExpr(AffineExpr expr, unsigned numDims,
 /// 'affineApplyOps', which are reachable via a search starting from 'operands',
 /// and ending at operands which are not defined by AffineApplyOps.
 void getReachableAffineApplyOps(
-    const llvm::SmallVector<MLValue *, 4> &operands,
-    llvm::SmallVector<OperationStmt *, 4> *affineApplyOps);
+    llvm::ArrayRef<MLValue *> operands,
+    llvm::SmallVectorImpl<OperationStmt *> &affineApplyOps);
 
 } // end namespace mlir
 
diff --git a/mlir/include/mlir/Transforms/Utils.h b/mlir/include/mlir/Transforms/Utils.h
index e36e1adc390..bf3367cc179 100644
--- a/mlir/include/mlir/Transforms/Utils.h
+++ b/mlir/include/mlir/Transforms/Utils.h
@@ -30,7 +30,10 @@
 
 namespace mlir {
 
+class Location;
+class MLFuncBuilder;
 class MLValue;
+class OperationStmt;
 class SSAValue;
 
 /// Replace all uses of oldMemRef with newMemRef while optionally remapping the
@@ -42,8 +45,50 @@ class SSAValue;
 // TODO(mlir-team): extend this for SSAValue / CFGFunctions. Can also be easily
 // extended to add additional indices at any position.
 bool replaceAllMemRefUsesWith(MLValue *oldMemRef, MLValue *newMemRef,
-                              llvm::ArrayRef<SSAValue *> extraIndices,
+                              llvm::ArrayRef<MLValue *> extraIndices,
                               AffineMap indexRemap = AffineMap::Invalid());
+
+/// Creates and inserts into 'builder' a new AffineApplyOp, with the number of
+/// its results equal to the number of operands, as a composition
+/// of all other AffineApplyOps reachable from input parameter 'operands'. If
+/// different operands were drawing results from multiple affine apply ops,
+/// these will also be collected into a single (multi-result) affine apply op.
+/// The final results of the composed AffineApplyOp are returned in output
+/// parameter 'results'. Returns the affine apply op created.
+OperationStmt *
+createComposedAffineApplyOp(MLFuncBuilder *builder, Location *loc,
+                            ArrayRef<MLValue *> operands,
+                            ArrayRef<OperationStmt *> affineApplyOps,
+                            SmallVectorImpl<SSAValue *> &results);
+
+/// Given an operation statement, inserts a new single affine apply operation,
+/// that is exclusively used by this operation statement, and that provides all
+/// operands that are results of an affine_apply as a function of loop iterators
+/// and program parameters and whose results are.
+///
+/// Before
+///
+/// for %i = 0 to #map(%N)
+///   %idx = affine_apply (d0) -> (d0 mod 2) (%i)
+///   send %A[%idx], ...
+///   %v = "compute"(%idx, ...)
+///
+/// After
+///
+/// for %i = 0 to #map(%N)
+///   %idx = affine_apply (d0) -> (d0 mod 2) (%i)
+///   send %A[%idx], ...
+///   %idx_ = affine_apply (d0) -> (d0 mod 2) (%i)
+///   %v = "compute"(%idx_, ...)
+
+/// This allows the application of different transformations on send and
+/// compute (for eg. / different shifts/delays)
+///
+/// Returns nullptr if none of the operands were the result of an affine_apply
+/// and thus there was no affine computation slice to create. Returns the newly
+/// affine_apply operation statement otherwise.
+OperationStmt *createAffineComputationSlice(OperationStmt *opStmt);
+
 } // end namespace mlir
 
 #endif // MLIR_TRANSFORMS_UTILS_H
diff --git a/mlir/lib/Analysis/AffineAnalysis.cpp b/mlir/lib/Analysis/AffineAnalysis.cpp
index f332836180a..ee1641b575e 100644
--- a/mlir/lib/Analysis/AffineAnalysis.cpp
+++ b/mlir/lib/Analysis/AffineAnalysis.cpp
@@ -304,8 +304,8 @@ AffineExpr mlir::simplifyAffineExpr(AffineExpr expr, unsigned numDims,
 // TODO(andydavis) Add a method to AffineApplyOp which forward substitutes
 // the AffineApplyOp into any user AffineApplyOps.
 void mlir::getReachableAffineApplyOps(
-    const SmallVector<MLValue *, 4> &operands,
-    SmallVector<OperationStmt *, 4> *affineApplyOps) {
+    ArrayRef<MLValue *> operands,
+    SmallVectorImpl<OperationStmt *> &affineApplyOps) {
   struct State {
     // The ssa value for this node in the DFS traversal.
     MLValue *value;
@@ -329,7 +329,7 @@ void mlir::getReachableAffineApplyOps(
       if (auto affineApplyOp = opStmt->getAs<AffineApplyOp>()) {
         if (state.operandIndex == 0) {
           // Pre-Visit: Add 'opStmt' to reachable sequence.
-          affineApplyOps->push_back(opStmt);
+          affineApplyOps.push_back(opStmt);
         }
         if (state.operandIndex < opStmt->getNumOperands()) {
           // Visit: Add next 'affineApplyOp' operand to worklist.
diff --git a/mlir/lib/Transforms/ComposeAffineMaps.cpp b/mlir/lib/Transforms/ComposeAffineMaps.cpp
index 0aa593202e1..9e44dab1bff 100644
--- a/mlir/lib/Transforms/ComposeAffineMaps.cpp
+++ b/mlir/lib/Transforms/ComposeAffineMaps.cpp
@@ -66,6 +66,7 @@
 #include "mlir/StandardOps/StandardOps.h"
 #include "mlir/Transforms/Pass.h"
 #include "mlir/Transforms/Passes.h"
+#include "mlir/Transforms/Utils.h"
 #include "llvm/Support/CommandLine.h"
 
 using namespace mlir;
 
@@ -90,46 +91,6 @@ MLFunctionPass *mlir::createComposeAffineMapsPass() {
   return new ComposeAffineMaps();
 }
 
-// Creates and inserts into 'builder' a new AffineApplyOp with the number of
-// results equal to the rank of 'memrefType'. The AffineApplyOp is composed
-// with all other AffineApplyOps reachable from input paramter 'operands'.
-// The final results of the composed AffineApplyOp are returned in output
-// paramter 'results'.
-static void createComposedAffineApplyOp(
-    MLFuncBuilder *builder, Location *loc, MemRefType *memrefType,
-    const SmallVector<MLValue *, 4> &indices,
-    const SmallVector<OperationStmt *, 4> &affineApplyOps,
-    SmallVector<SSAValue *, 4> *results) {
-  // Get rank of memref type.
-  unsigned rank = memrefType->getRank();
-  assert(indices.size() == rank);
-  // Create identity map with same number of dimensions as 'memrefType'.
-  auto map = builder->getMultiDimIdentityMap(rank);
-  // Initialize AffineValueMap with identity map.
-  AffineValueMap valueMap(map, indices);
-
-  for (auto *opStmt : affineApplyOps) {
-    assert(opStmt->is<AffineApplyOp>());
-    auto affineApplyOp = opStmt->getAs<AffineApplyOp>();
-    // Forward substitute 'affineApplyOp' into 'valueMap'.
-    valueMap.forwardSubstitute(*affineApplyOp);
-  }
-  // Compose affine maps from all ancestor AffineApplyOps.
-  // Create new AffineApplyOp from 'valueMap'.
-  unsigned numOperands = valueMap.getNumOperands();
-  SmallVector<SSAValue *, 4> operands(numOperands);
-  for (unsigned i = 0; i < numOperands; ++i) {
-    operands[i] = valueMap.getOperand(i);
-  }
-  // Create new AffineApplyOp based on 'valueMap'.
-  auto affineApplyOp =
-      builder->create<AffineApplyOp>(loc, valueMap.getAffineMap(), operands);
-  results->resize(rank);
-  for (unsigned i = 0; i < rank; ++i) {
-    (*results)[i] = affineApplyOp->getResult(i);
-  }
-}
-
 PassResult ComposeAffineMaps::runOnMLFunction(MLFunction *f) {
   // Gather all loads, stores and affine apply ops.
   struct OpGatherer : public StmtWalker<OpGatherer> {
@@ -170,14 +131,14 @@ PassResult ComposeAffineMaps::runOnMLFunction(MLFunction *f) {
 
     // Gather sequnce of AffineApplyOps reachable from 'indices'.
     SmallVector<OperationStmt *, 4> affineApplyOps;
-    getReachableAffineApplyOps(indices, &affineApplyOps);
+    getReachableAffineApplyOps(indices, affineApplyOps);
     // Skip transforming 'loadOp' if there are no affine maps to compose.
     if (affineApplyOps.size() <= 1)
       continue;
 
     SmallVector<SSAValue *, 4> results;
-    createComposedAffineApplyOp(&builder, opStmt->getLoc(), memrefType, indices,
-                                affineApplyOps, &results);
+    createComposedAffineApplyOp(&builder, opStmt->getLoc(), indices,
+                                affineApplyOps, results);
     // Create new LoadOp with new affine apply op.
     auto *newLoadResult =
         builder.create<LoadOp>(opStmt->getLoc(), loadOp->getMemRef(), results)
@@ -203,14 +164,14 @@ PassResult ComposeAffineMaps::runOnMLFunction(MLFunction *f) {
     }
     // Gather sequnce of AffineApplyOps reachable from 'indices'.
     SmallVector<OperationStmt *, 4> affineApplyOps;
-    getReachableAffineApplyOps(indices, &affineApplyOps);
+    getReachableAffineApplyOps(indices, affineApplyOps);
     // Skip transforming 'storeOp' if there are no affine maps to compose.
     if (affineApplyOps.size() <= 1)
      continue;
 
     SmallVector<SSAValue *, 4> results;
-    createComposedAffineApplyOp(&builder, opStmt->getLoc(), memrefType, indices,
-                                affineApplyOps, &results);
+    createComposedAffineApplyOp(&builder, opStmt->getLoc(), indices,
+                                affineApplyOps, results);
     // Create new StoreOp with new affine apply op.
     builder.create<StoreOp>(opStmt->getLoc(), storeOp->getValueToStore(),
                             storeOp->getMemRef(), results);
diff --git a/mlir/lib/Transforms/PipelineDataTransfer.cpp b/mlir/lib/Transforms/PipelineDataTransfer.cpp
index dd8b9a7615c..bb60d8e9d78 100644
--- a/mlir/lib/Transforms/PipelineDataTransfer.cpp
+++ b/mlir/lib/Transforms/PipelineDataTransfer.cpp
@@ -21,7 +21,7 @@
 
 #include "mlir/Transforms/Passes.h"
 
-#include "mlir/IR/AffineExpr.h"
+#include "mlir/Analysis/AffineAnalysis.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/StandardOps/StandardOps.h"
@@ -94,8 +94,7 @@ static bool doubleBuffer(MLValue *oldMemRef, ForStmt *forStmt) {
   auto *newMemRefType = doubleShape(cast<MemRefType>(oldMemRef->getType()));
 
   // Create and place the alloc at the top level.
-  auto *func = forStmt->getFunction();
-  MLFuncBuilder topBuilder(func, func->begin());
+  MLFuncBuilder topBuilder(forStmt->getFunction());
   auto *newMemRef = cast<MLValue>(
       topBuilder.create<AllocOp>(forStmt->getLoc(), newMemRefType)
           ->getResult());
@@ -105,13 +104,9 @@ static bool doubleBuffer(MLValue *oldMemRef, ForStmt *forStmt) {
       bInner.getAffineMap(/*dimCount=*/1, /*symbolCount=*/0, {d0 % 2}, {});
   auto ivModTwoOp =
       bInner.create<AffineApplyOp>(forStmt->getLoc(), modTwoMap, forStmt);
-  if (!replaceAllMemRefUsesWith(oldMemRef, newMemRef, ivModTwoOp->getResult(0)))
+  if (!replaceAllMemRefUsesWith(oldMemRef, newMemRef,
+                                cast<MLValue>(ivModTwoOp->getResult(0))))
     return false;
-  // We don't need ivMod2Op any more - this is cloned by
-  // replaceAllMemRefUsesWith wherever the memref replacement happens. Once
-  // b/117159533 is addressed, we'll eventually only need to pass
-  // ivModTwoOp->getResult(0) to replaceAllMemRefUsesWith.
-  cast<OperationStmt>(ivModTwoOp->getOperation())->eraseFromBlock();
   return true;
 }
 
@@ -169,16 +164,18 @@ PassResult PipelineDataTransfer::runOnMLFunction(MLFunction *f) {
   for (auto *dmaStartStmt : dmaStartStmts) {
     MLValue *oldMemRef = cast<MLValue>(dmaStartStmt->getOperand(
         getHigherMemRefPos(dmaStartStmt->getAs<DmaStartOp>())));
-    if (!doubleBuffer(oldMemRef, forStmt))
+    if (!doubleBuffer(oldMemRef, forStmt)) {
       return PassResult::Failure;
+    }
   }
 
   // Double the buffers for tag memref's.
   for (auto *dmaFinishStmt : dmaFinishStmts) {
     MLValue *oldTagMemRef = cast<MLValue>(
         dmaFinishStmt->getOperand(getTagMemRefPos(*dmaFinishStmt)));
-    if (!doubleBuffer(oldTagMemRef, forStmt))
+    if (!doubleBuffer(oldTagMemRef, forStmt)) {
       return PassResult::Failure;
+    }
   }
 
   // Collect all compute ops.
@@ -186,75 +183,43 @@ PassResult PipelineDataTransfer::runOnMLFunction(MLFunction *f) {
   computeOps.reserve(forStmt->getStatements().size());
   // Store delay for statement for later lookup for AffineApplyOp's.
   DenseMap<const Statement *, unsigned> opDelayMap;
-  for (const auto &stmt : *forStmt) {
+  for (auto &stmt : *forStmt) {
     auto *opStmt = dyn_cast<OperationStmt>(&stmt);
     if (!opStmt) {
       // All for and if stmt's are treated as pure compute operations.
-      // TODO(bondhugula): check whether such statements do not have any DMAs
-      // nested within.
       opDelayMap[&stmt] = 1;
     } else if (opStmt->is<DmaStartOp>()) {
       // DMA starts are not shifted.
-      opDelayMap[&stmt] = 0;
+      opDelayMap[opStmt] = 0;
+      // Set shifts for DMA start stmt's affine operand computation slices to 0.
+      if (auto *slice = mlir::createAffineComputationSlice(opStmt)) {
+        opDelayMap[slice] = 0;
+      } else {
+        // If a slice wasn't created, the reachable affine_apply op's from its
+        // operands are the ones that go with it.
+        SmallVector<OperationStmt *, 4> affineApplyStmts;
+        SmallVector<MLValue *, 4> operands(opStmt->getOperands());
+        getReachableAffineApplyOps(operands, affineApplyStmts);
+        for (auto *op : affineApplyStmts) {
+          opDelayMap[op] = 0;
+        }
+      }
     } else if (opStmt->is<DmaWaitOp>()) {
       // DMA finish op shifted by one.
-      opDelayMap[&stmt] = 1;
-    } else if (!opStmt->is<AffineApplyOp>()) {
-      // Compute op shifted by one.
-      opDelayMap[&stmt] = 1;
+      opDelayMap[opStmt] = 1;
+    } else {
+      // Everything else is a compute op; so shifted by one (op's supplying
+      // 'affine' operands to DMA start's have already been set right shifts.
+      opDelayMap[opStmt] = 1;
       computeOps.push_back(&stmt);
     }
-    // Shifts for affine apply op's determined later.
-  }
-
-  // Get the ancestor of a 'stmt' that lies in forStmt's block.
-  auto getAncestorInForBlock =
-      [&](const Statement *stmt, const StmtBlock &block) -> const Statement * {
-    // Traverse up the statement hierarchy starting from the owner of operand to
-    // find the ancestor statement that resides in the block of 'forStmt'.
-    while (stmt != nullptr && stmt->getBlock() != &block) {
-      stmt = stmt->getParentStmt();
-    }
-    return stmt;
-  };
-
-  // Determine delays for affine apply op's: look up delay from its consumer op.
-  // This code will be thrown away once we have a way to obtain indices through
-  // a composed affine_apply op. See TODO(b/117159533). Such a composed
-  // affine_apply will be used exclusively by a given memref deferencing op.
-  for (const auto &stmt : *forStmt) {
-    auto *opStmt = dyn_cast<OperationStmt>(&stmt);
-    // Skip statements that aren't affine apply ops.
-    if (!opStmt || !opStmt->is<AffineApplyOp>())
-      continue;
-    // Traverse uses of each result of the affine apply op.
-    for (auto *res : opStmt->getResults()) {
-      for (auto &use : res->getUses()) {
-        auto *ancestorInForBlock =
-            getAncestorInForBlock(use.getOwner(), *forStmt);
-        assert(ancestorInForBlock &&
-               "traversing parent should reach forStmt block");
-        auto *opCheck = dyn_cast<OperationStmt>(ancestorInForBlock);
-        if (!opCheck || opCheck->is<AffineApplyOp>())
-          continue;
-        assert(opDelayMap.find(ancestorInForBlock) != opDelayMap.end());
-        if (opDelayMap.find(&stmt) != opDelayMap.end()) {
-          // This is where we enforce all uses of this affine_apply to have
-          // the same shifts - so that we know what shift to use for the
-          // affine_apply to preserve semantics.
-          assert(opDelayMap[&stmt] == opDelayMap[ancestorInForBlock]);
-        } else {
-          // Obtain delay from its consumer.
-          opDelayMap[&stmt] = opDelayMap[ancestorInForBlock];
-        }
-      }
-    }
   }
 
   // Get delays stored in map.
   std::vector<uint64_t> delays(forStmt->getStatements().size());
   unsigned s = 0;
   for (const auto &stmt : *forStmt) {
+    assert(opDelayMap.find(&stmt) != opDelayMap.end());
     delays[s++] = opDelayMap[&stmt];
   }
 
@@ -263,8 +228,9 @@ PassResult PipelineDataTransfer::runOnMLFunction(MLFunction *f) {
     return PassResult::Failure;
   }
 
-  if (stmtBodySkew(forStmt, delays))
+  if (stmtBodySkew(forStmt, delays)) {
     return PassResult::Failure;
+  }
 
   return PassResult::Success;
 }
diff --git a/mlir/lib/Transforms/Utils.cpp b/mlir/lib/Transforms/Utils.cpp
index 2e8f0d32736..62ef3ba225a 100644
--- a/mlir/lib/Transforms/Utils.cpp
+++ b/mlir/lib/Transforms/Utils.cpp
@@ -22,7 +22,8 @@
 
 #include "mlir/Transforms/Utils.h"
 
-#include "mlir/IR/AffineMap.h"
+#include "mlir/Analysis/AffineAnalysis.h"
+#include "mlir/Analysis/AffineStructures.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/StandardOps/StandardOps.h"
@@ -48,7 +49,7 @@ static bool isMemRefDereferencingOp(const Operation &op) {
 // TODO(mlir-team): extend this for SSAValue / CFGFunctions. Can also be easily
 // extended to add additional indices at any position.
 bool mlir::replaceAllMemRefUsesWith(MLValue *oldMemRef, MLValue *newMemRef,
-                                    ArrayRef<SSAValue *> extraIndices,
+                                    ArrayRef<MLValue *> extraIndices,
                                     AffineMap indexRemap) {
   unsigned newMemRefRank = cast<MemRefType>(newMemRef->getType())->getRank();
   (void)newMemRefRank; // unused in opt mode
@@ -101,24 +102,16 @@ bool mlir::replaceAllMemRefUsesWith(MLValue *oldMemRef, MLValue *newMemRef,
     operands.push_back(newMemRef);
 
     MLFuncBuilder builder(opStmt);
-    // Normally, we could just use extraIndices as operands, but we will
-    // clone it so that each op gets its own "private" index. See b/117159533.
     for (auto *extraIndex : extraIndices) {
-      OperationStmt::OperandMapTy operandMap;
       // TODO(mlir-team): An operation/SSA value should provide a method to
       // return the position of an SSA result in its defining
       // operation.
       assert(extraIndex->getDefiningStmt()->getNumResults() == 1 &&
             "single result op's expected to generate these indices");
-      // TODO: actually check if this is a result of an affine_apply op.
       assert((cast<MLValue>(extraIndex)->isValidDim() ||
              cast<MLValue>(extraIndex)->isValidSymbol()) &&
             "invalid memory op index");
-      auto *clonedExtraIndex =
-          cast<OperationStmt>(
-              builder.clone(*extraIndex->getDefiningStmt(), operandMap))
-              ->getResult(0);
-      operands.push_back(cast<MLValue>(clonedExtraIndex));
+      operands.push_back(cast<MLValue>(extraIndex));
     }
 
     // Construct new indices. The indices of a memref come right after it, i.e.,
@@ -163,3 +156,135 @@ bool mlir::replaceAllMemRefUsesWith(MLValue *oldMemRef, MLValue *newMemRef,
   }
   return true;
 }
+
+// Creates and inserts into 'builder' a new AffineApplyOp, with the number of
+// its results equal to the number of 'operands, as a composition
+// of all other AffineApplyOps reachable from input parameter 'operands'. If the
+// operands were drawing results from multiple affine apply ops, this also leads
+// to a collapse into a single affine apply op. The final results of the
+// composed AffineApplyOp are returned in output parameter 'results'.
+OperationStmt *
+mlir::createComposedAffineApplyOp(MLFuncBuilder *builder, Location *loc,
+                                  ArrayRef<MLValue *> operands,
+                                  ArrayRef<OperationStmt *> affineApplyOps,
+                                  SmallVectorImpl<SSAValue *> &results) {
+  // Create identity map with same number of dimensions as number of operands.
+  auto map = builder->getMultiDimIdentityMap(operands.size());
+  // Initialize AffineValueMap with identity map.
+  AffineValueMap valueMap(map, operands);
+
+  for (auto *opStmt : affineApplyOps) {
+    assert(opStmt->is<AffineApplyOp>());
+    auto affineApplyOp = opStmt->getAs<AffineApplyOp>();
+    // Forward substitute 'affineApplyOp' into 'valueMap'.
+    valueMap.forwardSubstitute(*affineApplyOp);
+  }
+  // Compose affine maps from all ancestor AffineApplyOps.
+  // Create new AffineApplyOp from 'valueMap'.
+  unsigned numOperands = valueMap.getNumOperands();
+  SmallVector<SSAValue *, 4> outOperands(numOperands);
+  for (unsigned i = 0; i < numOperands; ++i) {
+    outOperands[i] = valueMap.getOperand(i);
+  }
+  // Create new AffineApplyOp based on 'valueMap'.
+  auto affineApplyOp =
+      builder->create<AffineApplyOp>(loc, valueMap.getAffineMap(), outOperands);
+  results.resize(operands.size());
+  for (unsigned i = 0, e = operands.size(); i < e; ++i) {
+    results[i] = affineApplyOp->getResult(i);
+  }
+  return cast<OperationStmt>(affineApplyOp->getOperation());
+}
+
+/// Given an operation statement, inserts a new single affine apply operation,
+/// that is exclusively used by this operation statement, and that provides all
+/// operands that are results of an affine_apply as a function of loop iterators
+/// and program parameters and whose results are.
+///
+/// Before
+///
+/// for %i = 0 to #map(%N)
+///   %idx = affine_apply (d0) -> (d0 mod 2) (%i)
+///   "send"(%idx, %A, ...)
+///   "compute"(%idx)
+///
+/// After
+///
+/// for %i = 0 to #map(%N)
+///   %idx = affine_apply (d0) -> (d0 mod 2) (%i)
+///   "send"(%idx, %A, ...)
+///   %idx_ = affine_apply (d0) -> (d0 mod 2) (%i)
+///   "compute"(%idx_)
+///
+/// This allows applying different transformations on send and compute (for eg.
+/// different shifts/delays).
+///
+/// Returns nullptr if none of the operands were the result of an affine_apply
+/// and thus there was no affine computation slice to create. Returns the newly
+/// affine_apply operation statement otherwise.
+///
+///
+OperationStmt *mlir::createAffineComputationSlice(OperationStmt *opStmt) {
+  // Collect all operands that are results of affine apply ops.
+  SmallVector<MLValue *, 4> subOperands;
+  subOperands.reserve(opStmt->getNumOperands());
+  for (auto *operand : opStmt->getOperands()) {
+    auto *defStmt = operand->getDefiningStmt();
+    if (defStmt && defStmt->is<AffineApplyOp>()) {
+      subOperands.push_back(operand);
+    }
+  }
+
+  // Gather sequence of AffineApplyOps reachable from 'subOperands'.
+  SmallVector<OperationStmt *, 4> affineApplyOps;
+  getReachableAffineApplyOps(subOperands, affineApplyOps);
+  // Skip transforming if there are no affine maps to compose.
+  if (affineApplyOps.empty())
+    return nullptr;
+
+  // Check if all uses of the affine apply op's lie in this op stmt
+  // itself, in which case there would be nothing to do.
+  bool localized = true;
+  for (auto *op : affineApplyOps) {
+    for (auto *result : op->getResults()) {
+      for (auto &use : result->getUses()) {
+        if (use.getOwner() != opStmt) {
+          localized = false;
+          break;
+        }
+      }
+    }
+  }
+  if (localized)
+    return nullptr;
+
+  MLFuncBuilder builder(opStmt);
+  SmallVector<SSAValue *, 4> results;
+  auto *affineApplyStmt = createComposedAffineApplyOp(
+      &builder, opStmt->getLoc(), subOperands, affineApplyOps, results);
+  assert(results.size() == subOperands.size() &&
+         "number of results should be the same as the number of subOperands");
+
+  // Construct the new operands that include the results from the composed
+  // affine apply op above instead of existing ones (subOperands). So, they
+  // differ from opStmt's operands only for those operands in 'subOperands', for
+  // which they will be replaced by the corresponding one from 'results'.
+  SmallVector<MLValue *, 4> newOperands(opStmt->getOperands());
+  for (unsigned i = 0, e = newOperands.size(); i < e; i++) {
+    // Replace the subOperands from among the new operands.
+    unsigned j, f;
+    for (j = 0, f = subOperands.size(); j < f; j++) {
+      if (newOperands[i] == subOperands[j])
+        break;
+    }
+    if (j < subOperands.size()) {
+      newOperands[i] = cast<MLValue>(results[j]);
+    }
+  }
+
+  for (unsigned idx = 0; idx < newOperands.size(); idx++) {
+    opStmt->setOperand(idx, newOperands[idx]);
+  }
+
+  return affineApplyStmt;
+}
diff --git a/mlir/test/Transforms/pipeline-data-transfer.mlir b/mlir/test/Transforms/pipeline-data-transfer.mlir
index 13abd43a2dc..015c59b9944 100644
--- a/mlir/test/Transforms/pipeline-data-transfer.mlir
+++ b/mlir/test/Transforms/pipeline-data-transfer.mlir
@@ -1,6 +1,8 @@
 // RUN: mlir-opt %s -pipeline-data-transfer | FileCheck %s
-// CHECK: #map0 = (d0) -> (d0 mod 2)
+
+// CHECK: #map0 = (d0) -> (d0 mod 2, d0 mod 2)
 // CHECK-NEXT: #map1 = (d0) -> (d0 - 1)
+// CHECK-NEXT: #map2 = (d0) -> (d0 mod 2)
 // CHECK-NEXT: mlfunc @loop_nest_dma() {
 // CHECK-NEXT:   %c8 = constant 8 : index
 // CHECK-NEXT:   %c0 = constant 0 : index
@@ -12,34 +14,30 @@
 // CHECK-NEXT:   %c0_0 = constant 0 : index
 // CHECK-NEXT:   %c128 = constant 128 : index
 // CHECK-NEXT:   %5 = affine_apply #map0(%c0)
-// CHECK-NEXT:   %6 = affine_apply #map0(%c0)
-// CHECK-NEXT:   dma_start %2[%c0], %1[%5, %c0], %c128, %0[%6, %c0_0] : memref<256xf32, (d0) -> (d0)>, memref<2x32xf32>, memref<2x1xf32>
+// CHECK-NEXT:   dma_start %2[%c0], %1[%5#0, %c0], %c128, %0[%5#1, %c0_0] : memref<256xf32, (d0) -> (d0)>, memref<2x32xf32>, memref<2x1xf32>
 // CHECK-NEXT:   for %i0 = 1 to 7 {
-// CHECK-NEXT:     %7 = affine_apply #map0(%i0)
-// CHECK-NEXT:     %8 = affine_apply #map0(%i0)
-// CHECK-NEXT:     dma_start %2[%i0], %1[%7, %i0], %c128, %0[%8, %c0_0] : memref<256xf32, (d0) -> (d0)>, memref<2x32xf32>, memref<2x1xf32>
-// CHECK-NEXT:     %9 = affine_apply #map1(%i0)
-// CHECK-NEXT:     %10 = affine_apply #map0(%9)
-// CHECK-NEXT:     dma_wait %0[%10, %c0_0] : memref<2x1xf32>
-// CHECK-NEXT:     %11 = affine_apply #map0(%9)
-// CHECK-NEXT:     %12 = load %1[%11, %9] : memref<2x32xf32>
-// CHECK-NEXT:     %13 = "compute"(%12) : (f32) -> f32
-// CHECK-NEXT:     %14 = affine_apply #map0(%9)
-// CHECK-NEXT:     store %13, %1[%14, %9] : memref<2x32xf32>
+// CHECK-NEXT:     %6 = affine_apply #map0(%i0)
+// CHECK-NEXT:     dma_start %2[%i0], %1[%6#0, %i0], %c128, %0[%6#1, %c0_0] : memref<256xf32, (d0) -> (d0)>, memref<2x32xf32>, memref<2x1xf32>
+// CHECK-NEXT:     %7 = affine_apply #map1(%i0)
+// CHECK-NEXT:     %8 = affine_apply #map2(%7)
+// CHECK-NEXT:     %9 = affine_apply #map2(%7)
+// CHECK-NEXT:     dma_wait %0[%8, %c0_0] : memref<2x1xf32>
+// CHECK-NEXT:     %10 = load %1[%9, %7] : memref<2x32xf32>
+// CHECK-NEXT:     %11 = "compute"(%10) : (f32) -> f32
+// CHECK-NEXT:     store %11, %1[%9, %7] : memref<2x32xf32>
 // CHECK-NEXT:     for %i1 = 0 to 127 {
-// CHECK-NEXT:       "do_more_compute"(%9, %i1) : (index, index) -> ()
+// CHECK-NEXT:       "do_more_compute"(%7, %i1) : (index, index) -> ()
 // CHECK-NEXT:     }
 // CHECK-NEXT:   }
-// CHECK-NEXT:   %15 = affine_apply #map1(%c8)
-// CHECK-NEXT:   %16 = affine_apply #map0(%15)
-// CHECK-NEXT:   dma_wait %0[%16, %c0_0] : memref<2x1xf32>
-// CHECK-NEXT:   %17 = affine_apply #map0(%15)
-// CHECK-NEXT:   %18 = load %1[%17, %15] : memref<2x32xf32>
-// CHECK-NEXT:   %19 = "compute"(%18) : (f32) -> f32
-// CHECK-NEXT:   %20 = affine_apply #map0(%15)
-// CHECK-NEXT:   store %19, %1[%20, %15] : memref<2x32xf32>
+// CHECK-NEXT:   %12 = affine_apply #map1(%c8)
+// CHECK-NEXT:   %13 = affine_apply #map2(%12)
+// CHECK-NEXT:   %14 = affine_apply #map2(%12)
+// CHECK-NEXT:   dma_wait %0[%13, %c0_0] : memref<2x1xf32>
+// CHECK-NEXT:   %15 = load %1[%14, %12] : memref<2x32xf32>
+// CHECK-NEXT:   %16 = "compute"(%15) : (f32) -> f32
+// CHECK-NEXT:   store %16, %1[%14, %12] : memref<2x32xf32>
 // CHECK-NEXT:   for %i2 = 0 to 127 {
-// CHECK-NEXT:     "do_more_compute"(%15, %i2) : (index, index) -> ()
+// CHECK-NEXT:     "do_more_compute"(%12, %i2) : (index, index) -> ()
 // CHECK-NEXT:   }
 // CHECK-NEXT:   return
 
 mlfunc @loop_nest_dma() {

