| author | Uday Bondhugula <bondhugula@google.com> | 2019-02-04 07:58:42 -0800 |
|---|---|---|
| committer | jpienaar <jpienaar@google.com> | 2019-03-29 16:09:52 -0700 |
| commit | b26900dce55c93043e8f84580df4a1bec65408be (patch) | |
| tree | e752280cbe8f34910fb950f4104f35357f844a5f | |
| parent | 870d7783503962a7043b2654ab82a9d4f4f1a961 (diff) | |
Update dma-generate pass to (1) work on blocks of instructions (instead of just
loops), (2) take into account fast memory space capacity and lower 'dmaDepth'
to fit, (3) add location information for debug info / errors
- change dma-generate pass to work on blocks of instructions (start/end
  iterators) instead of 'for' loops; complete TODOs - this allows DMA
  generation for straight-line blocks of operation instructions interspersed
  between loops
- take fast memory capacity into account: check whether the memory footprint
  fits in the fastMemoryCapacity parameter, and recurse / lower the depth at
  which DMA generation is performed until it does fit in the provided memory
- add location information to MemRefRegion; errors about insufficient fast
  memory capacity and debug info w.r.t. DMA generation now show location
  information
- allow the DMA generation pass to be instantiated with a fast memory capacity
  option (in addition to the command-line flag); a usage sketch appears after
  the example output below
- change getMemRefRegion to return unique_ptrs (see the first sketch below)
- add a 'Block'-based overload of getMemoryFootprintBytes alongside the
  'AffineForOp' one
- other helper methods; add a postDomInstFilter option to
  replaceAllMemRefUsesWith; switch from forOp->walkOps to the new Block::walk
  / Block::walkPostOrder methods
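To illustrate the new APIs mentioned above, here is a minimal usage sketch (not
part of the patch) combining the reworked getMemRefRegion() with the new ranged
Block::walk(), using the signatures declared in this change; the helper name
collectLoadStoreRegions and the StandardOps include path are assumptions.

```cpp
#include "mlir/Analysis/Utils.h"
#include "mlir/IR/Block.h"
#include "mlir/StandardOps/StandardOps.h" // assumed header for LoadOp/StoreOp at this revision

using namespace mlir;

// Hypothetical helper: walk [begin, end) of 'block' and compute the memory
// region accessed by each load/store, symbolic in the loops surrounding the
// range (similar in spirit to runOnBlock in DmaGeneration.cpp).
static void collectLoadStoreRegions(Block *block, Block::iterator begin,
                                    Block::iterator end, unsigned dmaDepth) {
  block->walk(begin, end, [&](Instruction *opInst) {
    if (!opInst->isa<LoadOp>() && !opInst->isa<StoreOp>())
      return;
    // getMemRefRegion now returns a unique_ptr; nullptr signals a case that
    // is not yet implemented (e.g. semi-affine accesses).
    auto region = getMemRefRegion(opInst, dmaDepth);
    if (!region) {
      // The region carries a Location, so diagnostics point at the source.
      opInst->emitNote("could not compute memref region");
      return;
    }
    // Each region records the memref, whether it is a write, a Location, and
    // a FlatAffineConstraints set describing the accessed data space.
    region->getConstraints()->dump();
  });
}
```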
Example output:
$ mlir-opt -dma-generate -dma-fast-mem-capacity=1 /tmp/single.mlir
/tmp/single.mlir:9:13: error: Total size of all DMA buffers' for this block exceeds fast memory capacity
for %i3 = (d0) -> (d0)(%i1) to (d0) -> (d0 + 32)(%i1) {
^
$ mlir-opt -debug-only=dma-generate -dma-generate -dma-fast-mem-capacity=400 /tmp/single.mlir
/tmp/single.mlir:9:13: note: 8 KiB of DMA buffers in fast memory space for this block
for %i3 = (d0) -> (d0)(%i1) to (d0) -> (d0 + 32)(%i1) {
PiperOrigin-RevId: 232297044
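As referenced in the bullet list above, a hedged sketch of instantiating the
pass with an explicit fast-memory capacity (instead of the
-dma-fast-mem-capacity flag), using the createDmaGenerationPass() signature
added by this change; the wrapper function and the 64 KiB figure are purely
illustrative.

```cpp
#include "mlir/Transforms/Passes.h"

using namespace mlir;

// Build the pass for a target with a 64 KiB fast memory (scratchpad) at
// memory space 1; footprints exceeding the budget trigger recursion to a
// deeper DMA placement, or an error with location info if nothing fits.
FunctionPass *createDmaPassFor64KiBScratchpad() {
  return createDmaGenerationPass(/*slowMemorySpace=*/0,
                                 /*fastMemorySpace=*/1,
                                 /*minDmaTransferSize=*/1024,
                                 /*fastMemCapacityBytes=*/64 * 1024);
}
```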
| -rw-r--r-- | mlir/include/mlir/Analysis/Utils.h | 21 |
| -rw-r--r-- | mlir/include/mlir/IR/Block.h | 13 |
| -rw-r--r-- | mlir/include/mlir/Transforms/Passes.h | 8 |
| -rw-r--r-- | mlir/include/mlir/Transforms/Utils.h | 34 |
| -rw-r--r-- | mlir/lib/Analysis/AffineStructures.cpp | 2 |
| -rw-r--r-- | mlir/lib/Analysis/Utils.cpp | 51 |
| -rw-r--r-- | mlir/lib/IR/Block.cpp | 31 |
| -rw-r--r-- | mlir/lib/Transforms/DmaGeneration.cpp | 343 |
| -rw-r--r-- | mlir/lib/Transforms/LoopFusion.cpp | 7 |
| -rw-r--r-- | mlir/lib/Transforms/MemRefDataFlowOpt.cpp | 5 |
| -rw-r--r-- | mlir/test/Transforms/dma-generate.mlir | 74 |
12 files changed, 451 insertions, 153 deletions
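For orientation before the full diff, here is a condensed sketch of the
capacity check that drives the recursion in DmaGeneration.cpp's runOnBlock.
It is simplified from the patch and not a drop-in replacement; the AffineOps
include path is an assumption.

```cpp
#include "mlir/AffineOps/AffineOps.h" // assumed header for AffineForOp at this revision
#include "mlir/Analysis/Utils.h"

using namespace mlir;

// Returns true if DMA generation should descend into the loop body because
// the loop's footprint in the slow memory space (0) would not fit in the
// remaining fast-memory budget.
static bool mustRecurse(OpPointer<AffineForOp> forOp,
                        uint64_t consumedCapacityBytes,
                        uint64_t fastMemCapacityBytes) {
  Optional<int64_t> footprint =
      getMemoryFootprintBytes(forOp, /*memorySpace=*/0);
  // If the footprint can't be computed, assume for now that it fits.
  if (!footprint.hasValue())
    return false;
  return consumedCapacityBytes + static_cast<uint64_t>(footprint.getValue()) >
         fastMemCapacityBytes;
}
```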
diff --git a/mlir/include/mlir/Analysis/Utils.h b/mlir/include/mlir/Analysis/Utils.h index 65af6d7b1f2..54549fc8ef8 100644 --- a/mlir/include/mlir/Analysis/Utils.h +++ b/mlir/include/mlir/Analysis/Utils.h @@ -27,6 +27,7 @@ #include "mlir/Analysis/AffineStructures.h" #include "mlir/IR/AffineMap.h" +#include "mlir/IR/Location.h" #include "mlir/Support/LLVM.h" #include "llvm/ADT/SmallVector.h" #include <memory> @@ -35,8 +36,10 @@ namespace mlir { class AffineForOp; template <typename T> class ConstOpPointer; +class Block; class FlatAffineConstraints; class Instruction; +class Location; class MemRefAccess; template <typename T> class OpPointer; class Instruction; @@ -73,6 +76,9 @@ unsigned getNestingDepth(const Instruction &stmt); // The last field is a 2-d FlatAffineConstraints symbolic in %i. // struct MemRefRegion { + MemRefRegion(Value *memref, Location loc, bool write) + : memref(memref), write(write), loc(loc) {} + FlatAffineConstraints *getConstraints() { return &cst; } const FlatAffineConstraints *getConstraints() const { return &cst; } bool isWrite() const { return write; } @@ -108,10 +114,13 @@ struct MemRefRegion { /// Memref that this region corresponds to. Value *memref; -private: /// Read or write. bool write; + /// If there is more than one load/store op associated with the region, the + /// location information would correspond to one of those op's. + Location loc; + /// Region (data space) of the memref accessed. This set will thus have at /// least as many dimensional identifiers as the shape dimensionality of the /// memref, and these are the leading dimensions of the set appearing in that @@ -125,7 +134,7 @@ private: /// Computes the memory region accessed by this memref with the region /// represented as constraints symbolic/parameteric in 'loopDepth' loops -/// surrounding opInst. Returns false if this fails due to yet unimplemented +/// surrounding opInst. Returns nullptr if this fails due to yet unimplemented /// cases. The computed region's 'cst' field has exactly as many dimensional /// identifiers as the rank of the memref, and *potentially* additional symbolic /// identifiers which could include any of the loop IVs surrounding opInst up @@ -142,8 +151,8 @@ private: /// {memref = %A, write = false, {%i <= m0 <= %i + 7} } /// The last field is a 2-d FlatAffineConstraints symbolic in %i. /// -bool getMemRefRegion(Instruction *opInst, unsigned loopDepth, - MemRefRegion *region); +std::unique_ptr<MemRefRegion> getMemRefRegion(Instruction *opInst, + unsigned loopDepth); /// Returns the size of memref data in bytes if it's statically shaped, None /// otherwise. @@ -199,8 +208,12 @@ insertBackwardComputationSlice(Instruction *srcOpInst, Instruction *dstOpInst, unsigned dstLoopDepth, ComputationSliceState *sliceState); +/// Gets the memory footprint of all data touched in the specified memory space +/// in bytes; if the memory space is unspecified, considers all memory spaces. Optional<int64_t> getMemoryFootprintBytes(ConstOpPointer<AffineForOp> forOp, int memorySpace = -1); +Optional<int64_t> getMemoryFootprintBytes(const Block &block, + int memorySpace = -1); } // end namespace mlir diff --git a/mlir/include/mlir/IR/Block.h b/mlir/include/mlir/IR/Block.h index d0982630a5a..479f15d1603 100644 --- a/mlir/include/mlir/IR/Block.h +++ b/mlir/include/mlir/IR/Block.h @@ -311,6 +311,19 @@ public: return &Block::instructions; } + /// Walk the operation instructions of this block in preorder, calling the + /// callback for each operation. 
+ void walk(std::function<void(Instruction *)> callback); + + /// Walk the operation instructions in this block in postorder, calling the + /// callback for each operation. + void walkPostOrder(std::function<void(Instruction *)> callback); + + /// Walk the operation instructions in the specified [begin, end) range of + /// this block, calling the callback for each operation. + void walk(Block::iterator begin, Block::iterator end, + std::function<void(Instruction *)> callback); + void print(raw_ostream &os) const; void dump() const; diff --git a/mlir/include/mlir/Transforms/Passes.h b/mlir/include/mlir/Transforms/Passes.h index 3269ac1fdc5..d4aa8a67600 100644 --- a/mlir/include/mlir/Transforms/Passes.h +++ b/mlir/include/mlir/Transforms/Passes.h @@ -24,6 +24,7 @@ #define MLIR_TRANSFORMS_PASSES_H #include "mlir/Support/LLVM.h" +#include <limits> namespace mlir { @@ -91,9 +92,10 @@ FunctionPass *createLoopTilingPass(); /// Promotes all accessed memref regions to the specified faster memory space /// while generating DMAs to move data. -FunctionPass *createDmaGenerationPass(unsigned lowMemorySpace, - unsigned highMemorySpace, - int minDmaTransferSize = 1024); +FunctionPass *createDmaGenerationPass( + unsigned slowMemorySpace, unsigned fastMemorySpace, + int minDmaTransferSize = 1024, + uint64_t fastMemCapacityBytes = std::numeric_limits<uint64_t>::max()); /// Creates a pass to lower VectorTransferReadOp and VectorTransferWriteOp. FunctionPass *createLowerVectorTransfersPass(); diff --git a/mlir/include/mlir/Transforms/Utils.h b/mlir/include/mlir/Transforms/Utils.h index 784e68a5ab3..581c668a154 100644 --- a/mlir/include/mlir/Transforms/Utils.h +++ b/mlir/include/mlir/Transforms/Utils.h @@ -42,19 +42,24 @@ class Function; /// Replaces all uses of oldMemRef with newMemRef while optionally remapping the /// old memref's indices using the supplied affine map, 'indexRemap'. The new /// memref could be of a different shape or rank. 'extraIndices' provides -/// additional access indices to be added to the start. 'indexRemap' remaps -/// indices of the old memref access to a new set of indices that are used to -/// index the memref. Additional input operands to indexRemap can be optionally -/// provided, and they are added at the start of its input list. 'indexRemap' is -/// expected to have only dimensional inputs, and the number of its inputs equal -/// to extraOperands.size() plus rank of the memref. 'extraOperands' is an -/// optional argument that corresponds to additional operands (inputs) for -/// indexRemap at the beginning of its input list. An additional optional -/// argument 'domInstFilter' restricts the replacement to only those operations -/// that are dominated by the former. Returns true on success and false if the -/// replacement is not possible (whenever a memref is used as an operand in a -/// non-deferencing scenario). See comments at function definition for an -/// example. +/// additional access indices to be added to the start. +/// +/// 'indexRemap' remaps indices of the old memref access to a new set of indices +/// that are used to index the memref. Additional input operands to indexRemap +/// can be optionally provided, and they are added at the start of its input +/// list. 'indexRemap' is expected to have only dimensional inputs, and the +/// number of its inputs equal to extraOperands.size() plus rank of the memref. +/// 'extraOperands' is an optional argument that corresponds to additional +/// operands (inputs) for indexRemap at the beginning of its input list. 
+/// +/// 'domInstFilter', if non-null, restricts the replacement to only those +/// operations that are dominated by the former; similarly, `postDomInstFilter` +/// restricts replacement to only those operations that are postdominated by it. +/// +/// Returns true on success and false if the replacement is not possible +/// (whenever a memref is used as an operand in a non-deferencing scenario). See +/// comments at function definition for an example. +// // Ex: to replace load %A[%i, %j] with load %Abuf[%t mod 2, %ii - %i, %j]: // The SSA value corresponding to '%t mod 2' should be in 'extraIndices', and // index remap will perform (%i, %j) -> (%ii - %i, %j), i.e., indexRemap = (d0, @@ -66,7 +71,8 @@ bool replaceAllMemRefUsesWith(const Value *oldMemRef, Value *newMemRef, ArrayRef<Value *> extraIndices = {}, AffineMap indexRemap = AffineMap(), ArrayRef<Value *> extraOperands = {}, - const Instruction *domInstFilter = nullptr); + const Instruction *domInstFilter = nullptr, + const Instruction *postDomInstFilter = nullptr); /// Creates and inserts into 'builder' a new AffineApplyOp, with the number of /// its results equal to the number of operands, as a composition diff --git a/mlir/lib/Analysis/AffineStructures.cpp b/mlir/lib/Analysis/AffineStructures.cpp index 9d1f7481115..468e79b8545 100644 --- a/mlir/lib/Analysis/AffineStructures.cpp +++ b/mlir/lib/Analysis/AffineStructures.cpp @@ -2130,7 +2130,7 @@ bool FlatAffineConstraints::unionBoundingBox( // Identify max. auto uRes = compareBounds(ub, otherUb); - if (uRes == BoundCmpResult::Greater || res == BoundCmpResult::Equal) { + if (uRes == BoundCmpResult::Greater || uRes == BoundCmpResult::Equal) { maxUb = ub; } else if (uRes == BoundCmpResult::Less) { maxUb = otherUb; diff --git a/mlir/lib/Analysis/Utils.cpp b/mlir/lib/Analysis/Utils.cpp index 24361ac621f..652aaab0e1b 100644 --- a/mlir/lib/Analysis/Utils.cpp +++ b/mlir/lib/Analysis/Utils.cpp @@ -122,25 +122,26 @@ bool MemRefRegion::unionBoundingBox(const MemRefRegion &other) { // // TODO(bondhugula): extend this to any other memref dereferencing ops // (dma_start, dma_wait). -bool mlir::getMemRefRegion(Instruction *opInst, unsigned loopDepth, - MemRefRegion *region) { +std::unique_ptr<MemRefRegion> mlir::getMemRefRegion(Instruction *opInst, + unsigned loopDepth) { unsigned rank; + std::unique_ptr<MemRefRegion> region; SmallVector<Value *, 4> indices; if (auto loadOp = opInst->dyn_cast<LoadOp>()) { rank = loadOp->getMemRefType().getRank(); indices.reserve(rank); indices.append(loadOp->getIndices().begin(), loadOp->getIndices().end()); - region->memref = loadOp->getMemRef(); - region->setWrite(false); + region = std::make_unique<MemRefRegion>(loadOp->getMemRef(), + loadOp->getLoc(), false); } else if (auto storeOp = opInst->dyn_cast<StoreOp>()) { rank = storeOp->getMemRefType().getRank(); indices.reserve(rank); indices.append(storeOp->getIndices().begin(), storeOp->getIndices().end()); - region->memref = storeOp->getMemRef(); - region->setWrite(true); + region = std::make_unique<MemRefRegion>(storeOp->getMemRef(), + storeOp->getLoc(), true); } else { assert(false && "expected load or store op"); - return false; + return nullptr; } // Build the constraints for this region. 
@@ -153,13 +154,15 @@ bool mlir::getMemRefRegion(Instruction *opInst, unsigned loopDepth, SmallVector<Value *, 8> regionSymbols = extractForInductionVars(ivs); regionCst->reset(0, loopDepth, 0, regionSymbols); - return true; + return region; } FuncBuilder b(opInst); auto idMap = b.getMultiDimIdentityMap(rank); // Initialize 'accessValueMap' and compose with reachable AffineApplyOps. fullyComposeAffineMapAndOperands(&idMap, &indices); + // Remove any duplicates. + canonicalizeMapAndOperands(&idMap, &indices); AffineValueMap accessValueMap(idMap, indices); AffineMap accessMap = accessValueMap.getAffineMap(); @@ -180,7 +183,7 @@ bool mlir::getMemRefRegion(Instruction *opInst, unsigned loopDepth, // TODO(bondhugula): rewrite this to use getInstIndexSet; this way // conditionals will be handled when the latter supports it. if (!regionCst->addAffineForOpDomain(loop)) - return false; + return nullptr; } else { // Has to be a valid symbol. auto *symbol = accessValueMap.getOperand(i); @@ -198,7 +201,7 @@ bool mlir::getMemRefRegion(Instruction *opInst, unsigned loopDepth, if (!regionCst->composeMap(&accessValueMap)) { LLVM_DEBUG(llvm::dbgs() << "getMemRefRegion: compose affine map failed\n"); LLVM_DEBUG(accessValueMap.getAffineMap().dump()); - return false; + return nullptr; } // Eliminate any loop IVs other than the outermost 'loopDepth' IVs, on which @@ -233,7 +236,7 @@ bool mlir::getMemRefRegion(Instruction *opInst, unsigned loopDepth, LLVM_DEBUG(llvm::dbgs() << "Memory region:\n"); LLVM_DEBUG(region->getConstraints()->dump()); - return true; + return region; } // TODO(mlir-team): improve/complete this when we have target data. @@ -278,19 +281,20 @@ bool mlir::boundCheckLoadOrStoreOp(LoadOrStoreOpPointer loadOrStoreOp, "argument should be either a LoadOp or a StoreOp"); Instruction *opInst = loadOrStoreOp->getInstruction(); - MemRefRegion region; - if (!getMemRefRegion(opInst, /*loopDepth=*/0, ®ion)) + + auto region = getMemRefRegion(opInst, /*loopDepth=*/0); + if (!region) return false; LLVM_DEBUG(llvm::dbgs() << "Memory region"); - LLVM_DEBUG(region.getConstraints()->dump()); + LLVM_DEBUG(region->getConstraints()->dump()); bool outOfBounds = false; unsigned rank = loadOrStoreOp->getMemRefType().getRank(); // For each dimension, check for out of bounds. for (unsigned r = 0; r < rank; r++) { - FlatAffineConstraints ucst(*region.getConstraints()); + FlatAffineConstraints ucst(*region->getConstraints()); // Intersect memory region with constraint capturing out of bounds (both out // of upper and out of lower), and check if the constraint system is @@ -310,7 +314,7 @@ bool mlir::boundCheckLoadOrStoreOp(LoadOrStoreOpPointer loadOrStoreOp, } // Check for a negative index. - FlatAffineConstraints lcst(*region.getConstraints()); + FlatAffineConstraints lcst(*region->getConstraints()); std::fill(ineq.begin(), ineq.end(), 0); // d_i <= -1; lcst.addConstantUpperBound(r, -1); @@ -519,8 +523,8 @@ MemRefAccess::MemRefAccess(Instruction *loadOrStoreOpInst) { /// Returns the nesting depth of this statement, i.e., the number of loops /// surrounding this statement. 
-unsigned mlir::getNestingDepth(const Instruction &stmt) { - const Instruction *currInst = &stmt; +unsigned mlir::getNestingDepth(const Instruction &inst) { + const Instruction *currInst = &inst; unsigned depth = 0; while ((currInst = currInst->getParentInst())) { if (currInst->isa<AffineForOp>()) @@ -577,11 +581,16 @@ static Optional<int64_t> getRegionSize(const MemRefRegion ®ion) { Optional<int64_t> mlir::getMemoryFootprintBytes(ConstOpPointer<AffineForOp> forOp, int memorySpace) { + return getMemoryFootprintBytes(*forOp->getBody(), memorySpace); +} + +Optional<int64_t> mlir::getMemoryFootprintBytes(const Block &block, + int memorySpace) { std::vector<std::unique_ptr<MemRefRegion>> regions; // Walk this 'for' instruction to gather all memory regions. bool error = false; - const_cast<AffineForOp &>(*forOp).walkOps([&](Instruction *opInst) { + const_cast<Block *>(&block)->walk([&](Instruction *opInst) { if (!opInst->isa<LoadOp>() && !opInst->isa<StoreOp>()) { // Neither load nor a store op. return; @@ -591,8 +600,8 @@ mlir::getMemoryFootprintBytes(ConstOpPointer<AffineForOp> forOp, // all regions for a given memref instead of creating one region per // memory op. This way we would be allocating O(num of memref's) sets // instead of O(num of load/store op's). - auto region = std::make_unique<MemRefRegion>(); - if (!getMemRefRegion(opInst, 0, region.get())) { + auto region = getMemRefRegion(opInst, 0); + if (!region) { LLVM_DEBUG(llvm::dbgs() << "Error obtaining memory region\n"); // TODO: stop the walk if an error occurred. error = true; diff --git a/mlir/lib/IR/Block.cpp b/mlir/lib/IR/Block.cpp index 81e70e2b139..698494144ce 100644 --- a/mlir/lib/IR/Block.cpp +++ b/mlir/lib/IR/Block.cpp @@ -256,6 +256,37 @@ Block *Block::splitBlock(iterator splitBefore) { return newBB; } +void Block::walk(std::function<void(OperationInst *)> callback) { + walk(begin(), end(), callback); +} + +void Block::walk(Block::iterator begin, Block::iterator end, + std::function<void(OperationInst *)> callback) { + struct Walker : public InstWalker<Walker> { + std::function<void(OperationInst *)> const &callback; + Walker(std::function<void(OperationInst *)> const &callback) + : callback(callback) {} + + void visitOperationInst(OperationInst *opInst) { callback(opInst); } + }; + + Walker w(callback); + w.walk(begin, end); +} + +void Block::walkPostOrder(std::function<void(OperationInst *)> callback) { + struct Walker : public InstWalker<Walker> { + std::function<void(OperationInst *)> const &callback; + Walker(std::function<void(OperationInst *)> const &callback) + : callback(callback) {} + + void visitOperationInst(OperationInst *opInst) { callback(opInst); } + }; + + Walker v(callback); + v.walkPostOrder(begin(), end()); +} + //===----------------------------------------------------------------------===// // BlockList //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Transforms/DmaGeneration.cpp b/mlir/lib/Transforms/DmaGeneration.cpp index 83ec726ec2a..2bbb32036c2 100644 --- a/mlir/lib/Transforms/DmaGeneration.cpp +++ b/mlir/lib/Transforms/DmaGeneration.cpp @@ -47,7 +47,7 @@ static llvm::cl::opt<unsigned> clFastMemorySpace( llvm::cl::desc("Set fast memory space id for DMA generation"), llvm::cl::cat(clOptionsCategory)); -static llvm::cl::opt<uint64_t> clFastMemoryCapacity( +static llvm::cl::opt<unsigned> clFastMemoryCapacity( "dma-fast-mem-capacity", llvm::cl::Hidden, llvm::cl::desc("Set fast memory space capacity in KiB"), llvm::cl::cat(clOptionsCategory)); @@ 
-57,25 +57,28 @@ namespace { /// Generates DMAs for memref's living in 'slowMemorySpace' into newly created /// buffers in 'fastMemorySpace', and replaces memory operations to the former /// by the latter. Only load op's handled for now. -/// TODO(bondhugula): extend this to store op's. +// TODO(bondhugula): We currently can't generate DMAs correctly when stores are +// strided. Check for strided stores. +// TODO(mlir-team): we don't insert dealloc's for the DMA buffers; this is thus +// natural only for scoped allocations. struct DmaGeneration : public FunctionPass { - explicit DmaGeneration(unsigned slowMemorySpace = 0, - unsigned fastMemorySpaceArg = 1, - int minDmaTransferSize = 1024) + explicit DmaGeneration( + unsigned slowMemorySpace = 0, unsigned fastMemorySpace = 1, + int minDmaTransferSize = 1024, + uint64_t fastMemCapacityBytes = std::numeric_limits<uint64_t>::max()) : FunctionPass(&DmaGeneration::passID), slowMemorySpace(slowMemorySpace), - minDmaTransferSize(minDmaTransferSize) { - if (clFastMemorySpace.getNumOccurrences() > 0) { - fastMemorySpace = clFastMemorySpace; - } else { - fastMemorySpace = fastMemorySpaceArg; - } - } + fastMemorySpace(fastMemorySpace), + minDmaTransferSize(minDmaTransferSize), + fastMemCapacityBytes(fastMemCapacityBytes) {} PassResult runOnFunction(Function *f) override; - void runOnAffineForOp(OpPointer<AffineForOp> forOp); + bool runOnBlock(Block *block, uint64_t consumedCapacityBytes); + uint64_t runOnBlock(Block::iterator begin, Block::iterator end); - bool generateDma(const MemRefRegion ®ion, OpPointer<AffineForOp> forOp, - uint64_t *sizeInBytes); + bool generateDma(const MemRefRegion ®ion, Block *block, + Block::iterator begin, Block::iterator end, + uint64_t *sizeInBytes, Block::iterator *nBegin, + Block::iterator *nEnd); // List of memory regions to DMA for. We need a map vector to have a // guaranteed iteration order to write test cases. CHECK-DAG doesn't help here @@ -93,6 +96,8 @@ struct DmaGeneration : public FunctionPass { unsigned fastMemorySpace; // Minimum DMA transfer size supported by the target in bytes. const int minDmaTransferSize; + // Capacity of the faster memory space. + uint64_t fastMemCapacityBytes; // Constant zero index to avoid too many duplicates. Value *zeroIndex = nullptr; @@ -110,9 +115,10 @@ char DmaGeneration::passID = 0; /// TODO(bondhugula): extend this to store op's. FunctionPass *mlir::createDmaGenerationPass(unsigned slowMemorySpace, unsigned fastMemorySpace, - int minDmaTransferSize) { - return new DmaGeneration(slowMemorySpace, fastMemorySpace, - minDmaTransferSize); + int minDmaTransferSize, + uint64_t fastMemCapacityBytes) { + return new DmaGeneration(slowMemorySpace, fastMemorySpace, minDmaTransferSize, + fastMemCapacityBytes); } // Info comprising stride and number of elements transferred every stride. @@ -192,26 +198,48 @@ static bool getFullMemRefAsRegion(OperationInst *opInst, return true; } -// Creates a buffer in the faster memory space for the specified region; -// generates a DMA from the lower memory space to this one, and replaces all -// loads to load from that buffer. Returns false if DMAs could not be generated -// due to yet unimplemented cases. 
-bool DmaGeneration::generateDma(const MemRefRegion ®ion, - OpPointer<AffineForOp> forOp, - uint64_t *sizeInBytes) { - auto *forInst = forOp->getInstruction(); +static void emitNoteForBlock(const Block &block, const Twine &message) { + auto *inst = block.getContainingInst(); + if (!inst) { + block.getFunction()->emitNote(message); + } else { + inst->emitNote(message); + } +} + +/// Creates a buffer in the faster memory space for the specified region; +/// generates a DMA from the lower memory space to this one, and replaces all +/// loads to load from that buffer. Returns false if DMAs could not be generated +/// due to yet unimplemented cases. `begin` and `end` specify the insertion +/// points where the incoming DMAs and outgoing DMAs, respectively, should +/// be inserted (the insertion happens right before the insertion point). Since +/// `begin` can itself be invalidated due to the memref rewriting done from this +/// method, the output argument `nBegin` is set to its replacement (set +/// to `begin` if no invalidation happens). Since outgoing DMAs are inserted at +/// `end`, the output argument `nEnd` is set to the one following the original +/// end (since the latter could have been invalidated/replaced). `sizeInBytes` +/// is set to the size of the DMA buffer allocated. +bool DmaGeneration::generateDma(const MemRefRegion ®ion, Block *block, + Block::iterator begin, Block::iterator end, + uint64_t *sizeInBytes, Block::iterator *nBegin, + Block::iterator *nEnd) { + *nBegin = begin; + *nEnd = end; + + if (begin == end) + return true; // DMAs for read regions are going to be inserted just before the for loop. - FuncBuilder prologue(forInst); + FuncBuilder prologue(block, begin); // DMAs for write regions are going to be inserted just after the for loop. - FuncBuilder epilogue(forInst->getBlock(), - std::next(Block::iterator(forInst))); + FuncBuilder epilogue(block, end); FuncBuilder *b = region.isWrite() ? &epilogue : &prologue; // Builder to create constants at the top level. - FuncBuilder top(forInst->getFunction()); + auto *func = block->getFunction(); + FuncBuilder top(func); - auto loc = forInst->getLoc(); + auto loc = region.loc; auto *memref = region.memref; auto memRefType = memref->getType().cast<MemRefType>(); @@ -310,21 +338,17 @@ bool DmaGeneration::generateDma(const MemRefRegion ®ion, auto fastMemRefType = top.getMemRefType( fastBufferShape, memRefType.getElementType(), {}, fastMemorySpace); - LLVM_DEBUG(llvm::dbgs() << "Creating a new buffer of type: "); - LLVM_DEBUG(fastMemRefType.dump(); llvm::dbgs() << "\n"); - // Create the fast memory space buffer just before the 'for' instruction. fastMemRef = prologue.create<AllocOp>(loc, fastMemRefType)->getResult(); // Record it. fastBufferMap[memref] = fastMemRef; // fastMemRefType is a constant shaped memref. *sizeInBytes = getMemRefSizeInBytes(fastMemRefType).getValue(); - LLVM_DEBUG(llvm::dbgs() << "Creating a new buffer of type "; + LLVM_DEBUG(emitNoteForBlock(*block, "Creating DMA buffer of type "); fastMemRefType.dump(); llvm::dbgs() - << " and size " << Twine(llvm::divideCeil(*sizeInBytes, 1024)) + << " and of size " << Twine(llvm::divideCeil(*sizeInBytes, 1024)) << " KiB\n";); - } else { // Reuse the one already created. 
fastMemRef = fastBufferMap[memref]; @@ -336,9 +360,6 @@ bool DmaGeneration::generateDma(const MemRefRegion ®ion, auto numElementsSSA = top.create<ConstantIndexOp>(loc, numElements.getValue()); - // TODO(bondhugula): check for transfer sizes not being a multiple of - // minDmaTransferSize and handle them appropriately. - SmallVector<StrideInfo, 4> strideInfos; getMultiLevelStrides(region, fastBufferShape, &strideInfos); @@ -357,6 +378,12 @@ bool DmaGeneration::generateDma(const MemRefRegion ®ion, top.create<ConstantIndexOp>(loc, strideInfos[0].numEltPerStride); } + // Record the last instruction just before the point where we insert the + // outgoing DMAs. We later do the memref replacement later only in [begin, + // postDomFilter] so that the original memref's in the DMA ops themselves + // don't get replaced. + auto postDomFilter = std::prev(end); + if (!region.isWrite()) { // DMA non-blocking read from original buffer to fast buffer. b->create<DmaStartOp>(loc, memref, memIndices, fastMemRef, bufIndices, @@ -364,9 +391,13 @@ bool DmaGeneration::generateDma(const MemRefRegion ®ion, numEltPerStride); } else { // DMA non-blocking write from fast buffer to the original memref. - b->create<DmaStartOp>(loc, fastMemRef, bufIndices, memref, memIndices, - numElementsSSA, tagMemRef, zeroIndex, stride, - numEltPerStride); + auto op = b->create<DmaStartOp>(loc, fastMemRef, bufIndices, memref, + memIndices, numElementsSSA, tagMemRef, + zeroIndex, stride, numEltPerStride); + // Since new ops are being appended (for outgoing DMAs), adjust the end to + // mark end of range of the original. + if (*nEnd == end) + *nEnd = Block::iterator(op->getInstruction()); } // Matching DMA wait to block on completion; tag always has a 0 index. @@ -389,45 +420,151 @@ bool DmaGeneration::generateDma(const MemRefRegion ®ion, remapExprs.push_back(dimExpr - offsets[i]); } auto indexRemap = b->getAffineMap(outerIVs.size() + rank, 0, remapExprs, {}); - // *Only* those uses within the body of 'forOp' are replaced. + + // Record the begin since it may be invalidated by memref replacement. + Block::iterator prev; + bool wasAtStartOfBlock = (begin == block->begin()); + if (!wasAtStartOfBlock) + prev = std::prev(begin); + + // *Only* those uses within the range [begin, end) of 'block' are replaced. replaceAllMemRefUsesWith(memref, fastMemRef, /*extraIndices=*/{}, indexRemap, /*extraOperands=*/outerIVs, - /*domInstFilter=*/&*forOp->getBody()->begin()); + /*domInstFilter=*/&*begin, + /*postDomInstFilter=*/&*postDomFilter); + + *nBegin = wasAtStartOfBlock ? block->begin() : std::next(prev); + return true; } -// TODO(bondhugula): make this run on a Block instead of a 'for' inst. -void DmaGeneration::runOnAffineForOp(OpPointer<AffineForOp> forOp) { - // For now (for testing purposes), we'll run this on the outermost among 'for' - // inst's with unit stride, i.e., right at the top of the tile if tiling has - // been done. In the future, the DMA generation has to be done at a level - // where the generated data fits in a higher level of the memory hierarchy; so - // the pass has to be instantiated with additional information that we aren't - // provided with at the moment. - if (forOp->getStep() != 1) { - auto *forBody = forOp->getBody(); - if (forBody->empty()) - return; - if (auto innerFor = - cast<OperationInst>(forBody->front()).dyn_cast<AffineForOp>()) { - runOnAffineForOp(innerFor); +/// Generate DMAs for this block. 
The block is partitioned into separate +/// `regions`; each region is either a sequence of one or more instructions +/// starting and ending with a load or store op, or just a loop (which could +/// have other loops nested within). Returns false on an error, true otherwise. +bool DmaGeneration::runOnBlock(Block *block, uint64_t consumedCapacityBytes) { + block->dump(); + if (block->empty()) + return true; + + uint64_t priorConsumedCapacityBytes = consumedCapacityBytes; + + // Every loop in the block starts and ends a region. A contiguous sequence of + // operation instructions starting and ending with a load/store op is also + // identified as a region. Straightline code (contiguous chunks of operation + // instructions) are always assumed to not exhaust memory. As a result, this + // approach is conservative in some cases at the moment, we do a check later + // and report an error with location info. + // TODO(bondhugula): An 'if' instruction is being treated similar to an + // operation instruction. 'if''s could have 'for's in them; treat them + // separately. + + // Get to the first load, store, or for op. + auto curBegin = + std::find_if(block->begin(), block->end(), [&](const Instruction &inst) { + return inst.isa<LoadOp>() || inst.isa<StoreOp>() || + inst.isa<AffineForOp>(); + }); + + for (auto it = curBegin; it != block->end(); ++it) { + if (auto forOp = it->dyn_cast<AffineForOp>()) { + // We'll assume for now that loops with steps are tiled loops, and so DMAs + // are not performed for that depth, but only further inside. + // If the memory footprint of the 'for' loop is higher than fast memory + // capacity (when provided), we recurse to DMA at an inner level until + // we find a depth at which footprint fits in the capacity. If the + // footprint can't be calcuated, we assume for now it fits. + + // Returns true if the footprint is known to exceed capacity. + auto exceedsCapacity = [&](OpPointer<AffineForOp> forOp) { + Optional<int64_t> footprint; + return ((footprint = getMemoryFootprintBytes(forOp, 0)).hasValue() && + consumedCapacityBytes + + static_cast<uint64_t>(footprint.getValue()) > + fastMemCapacityBytes); + }; + + if (forOp->getStep() != 1 || exceedsCapacity(forOp)) { + // We'll split and do the DMAs one or more levels inside for forInst + consumedCapacityBytes += runOnBlock(/*begin=*/curBegin, /*end=*/it); + // Recurse onto the body of this loop. + runOnBlock(forOp->getBody(), consumedCapacityBytes); + // The next region starts right after the 'for' instruction. + curBegin = std::next(it); + } else { + // We have enough capacity, i.e., DMAs will be computed for the portion + // of the block until 'it', and for the 'for' loop. For the latter, they + // are placed just before this loop (for incoming DMAs) and right after + // (for outgoing ones). + consumedCapacityBytes += runOnBlock(/*begin=*/curBegin, /*end=*/it); + + // Inner loop DMAs have their own scope - we don't thus update consumed + // capacity. The footprint check above guarantees this inner loop's + // footprint fits. + runOnBlock(/*begin=*/it, /*end=*/std::next(it)); + curBegin = std::next(it); + } + } else if (!it->isa<LoadOp>() && !it->isa<StoreOp>()) { + consumedCapacityBytes += runOnBlock(/*begin=*/curBegin, /*end=*/it); + curBegin = std::next(it); } - return; } - // DMAs will be generated for this depth, i.e., for all data accessed by this - // loop. - unsigned dmaDepth = getNestingDepth(*forOp->getInstruction()); + // Generate the DMA for the final region. 
+ if (curBegin != block->end()) { + // Can't be a terminator because it would have been skipped above. + assert(!curBegin->isTerminator() && "can't be a terminator"); + consumedCapacityBytes += + runOnBlock(/*begin=*/curBegin, /*end=*/block->end()); + } + + if (llvm::DebugFlag) { + uint64_t thisBlockDmaSizeBytes = + consumedCapacityBytes - priorConsumedCapacityBytes; + if (thisBlockDmaSizeBytes > 0) { + emitNoteForBlock( + *block, + Twine(llvm::divideCeil(thisBlockDmaSizeBytes, 1024)) + + " KiB of DMA buffers in fast memory space for this block\n"); + } + } + + if (consumedCapacityBytes > fastMemCapacityBytes) { + StringRef str = "Total size of all DMA buffers' for this block " + "exceeds fast memory capacity\n"; + if (auto *inst = block->getContainingInst()) + inst->emitError(str); + else + block->getFunction()->emitError(str); + return false; + } + + return true; +} + +/// Generates DMAs for a contiguous sequence of instructions in `block` in the +/// iterator range [begin, end). Returns the total size of the DMA buffers used. +uint64_t DmaGeneration::runOnBlock(Block::iterator begin, Block::iterator end) { + if (begin == end) + return 0; + + assert(begin->getBlock() == std::prev(end)->getBlock() && + "Inconsistent args"); + + Block *block = begin->getBlock(); + + // DMAs will be generated for this depth, i.e., symbolic in all loops + // surrounding the region of this block. + unsigned dmaDepth = getNestingDepth(*begin); readRegions.clear(); writeRegions.clear(); fastBufferMap.clear(); - // Walk this 'for' instruction to gather all memory regions. - forOp->walkOps([&](OperationInst *opInst) { - // Gather regions to promote to buffers in faster memory space. - // TODO(bondhugula): handle store op's; only load's handled for now. + // Walk this range of instructions to gather all memory regions. + block->walk(begin, end, [&](OperationInst *opInst) { + // Gather regions to allocate to buffers in faster memory space. if (auto loadOp = opInst->dyn_cast<LoadOp>()) { if (loadOp->getMemRefType().getMemorySpace() != slowMemorySpace) return; @@ -439,18 +576,15 @@ void DmaGeneration::runOnAffineForOp(OpPointer<AffineForOp> forOp) { return; } - // TODO(bondhugula): eventually, we need to be performing a union across - // all regions for a given memref instead of creating one region per - // memory op. This way we would be allocating O(num of memref's) sets - // instead of O(num of load/store op's). - auto region = std::make_unique<MemRefRegion>(); - if (!getMemRefRegion(opInst, dmaDepth, region.get())) { + // Compute the MemRefRegion accessed. + auto region = getMemRefRegion(opInst, dmaDepth); + if (!region) { LLVM_DEBUG(llvm::dbgs() << "Error obtaining memory region: semi-affine maps?\n"); LLVM_DEBUG(llvm::dbgs() << "over-approximating to the entire memref\n"); if (!getFullMemRefAsRegion(opInst, dmaDepth, region.get())) { LLVM_DEBUG( - forOp->emitError("Non-constant memref sizes not yet supported")); + opInst->emitError("Non-constant memref sizes not yet supported")); return; } } @@ -477,12 +611,12 @@ void DmaGeneration::runOnAffineForOp(OpPointer<AffineForOp> forOp) { return false; // Perform a union with the existing region. 
- if (!(*it).second->unionBoundingBox(*region)) { + if (!it->second->unionBoundingBox(*region)) { LLVM_DEBUG(llvm::dbgs() - << "Memory region bounding box failed" + << "Memory region bounding box failed; " "over-approximating to the entire memref\n"); if (!getFullMemRefAsRegion(opInst, dmaDepth, region.get())) { - LLVM_DEBUG(forOp->emitError( + LLVM_DEBUG(opInst->emitError( "Non-constant memref sizes not yet supported")); } } @@ -500,48 +634,59 @@ void DmaGeneration::runOnAffineForOp(OpPointer<AffineForOp> forOp) { } }); - uint64_t totalSizeInBytes = 0; - + uint64_t totalDmaBuffersSizeInBytes = 0; bool ret = true; auto processRegions = [&](const SmallMapVector<Value *, std::unique_ptr<MemRefRegion>, 4> ®ions) { for (const auto ®ionEntry : regions) { uint64_t sizeInBytes; - bool iRet = generateDma(*regionEntry.second, forOp, &sizeInBytes); - if (iRet) - totalSizeInBytes += sizeInBytes; + Block::iterator nBegin, nEnd; + bool iRet = generateDma(*regionEntry.second, block, begin, end, + &sizeInBytes, &nBegin, &nEnd); + if (iRet) { + begin = nBegin; + end = nEnd; + totalDmaBuffersSizeInBytes += sizeInBytes; + } ret = ret & iRet; } }; processRegions(readRegions); processRegions(writeRegions); + if (!ret) { - forOp->emitError("DMA generation failed for one or more memref's\n"); - return; + begin->emitError( + "DMA generation failed for one or more memref's in this block\n"); + return totalDmaBuffersSizeInBytes; } - LLVM_DEBUG(llvm::dbgs() << Twine(llvm::divideCeil(totalSizeInBytes, 1024)) - << " KiB of DMA buffers in fast memory space\n";); - - if (clFastMemoryCapacity && totalSizeInBytes > clFastMemoryCapacity) { - // TODO(bondhugula): selecting the DMA depth so that the result DMA buffers - // fit in fast memory is a TODO - not complex. - forOp->emitError( - "Total size of all DMA buffers' exceeds memory capacity\n"); + + // For a range of operation instructions, a note will be emitted at the + // caller. + OpPointer<AffineForOp> forOp; + if (llvm::DebugFlag && (forOp = begin->dyn_cast<AffineForOp>())) { + forOp->emitNote( + Twine(llvm::divideCeil(totalDmaBuffersSizeInBytes, 1024)) + + " KiB of DMA buffers in fast memory space for this block\n"); } + + return totalDmaBuffersSizeInBytes; } PassResult DmaGeneration::runOnFunction(Function *f) { FuncBuilder topBuilder(f); - zeroIndex = topBuilder.create<ConstantIndexOp>(f->getLoc(), 0); + if (clFastMemorySpace.getNumOccurrences() > 0) { + fastMemorySpace = clFastMemorySpace; + } + + if (clFastMemoryCapacity.getNumOccurrences() > 0) { + fastMemCapacityBytes = clFastMemoryCapacity * 1024; + } + for (auto &block : *f) { - for (auto &inst : block) { - if (auto forOp = cast<OperationInst>(inst).dyn_cast<AffineForOp>()) { - runOnAffineForOp(forOp); - } - } + runOnBlock(&block, /*consumedCapacityBytes=*/0); } // This function never leaves the IR in an invalid state. return success(); diff --git a/mlir/lib/Transforms/LoopFusion.cpp b/mlir/lib/Transforms/LoopFusion.cpp index 5091e3ceb33..162e0e3b7f6 100644 --- a/mlir/lib/Transforms/LoopFusion.cpp +++ b/mlir/lib/Transforms/LoopFusion.cpp @@ -929,8 +929,7 @@ static Value *createPrivateMemRef(OpPointer<AffineForOp> forOp, unsigned rank = oldMemRefType.getRank(); // Compute MemRefRegion for 'srcStoreOpInst' at depth 'dstLoopDepth'. 
- MemRefRegion region; - getMemRefRegion(srcStoreOpInst, dstLoopDepth, ®ion); + auto region = getMemRefRegion(srcStoreOpInst, dstLoopDepth); SmallVector<int64_t, 4> newShape; std::vector<SmallVector<int64_t, 4>> lbs; SmallVector<int64_t, 8> lbDivisors; @@ -938,11 +937,11 @@ static Value *createPrivateMemRef(OpPointer<AffineForOp> forOp, // Query 'region' for 'newShape' and lower bounds of MemRefRegion accessed // by 'srcStoreOpInst' at depth 'dstLoopDepth'. Optional<int64_t> numElements = - region.getConstantBoundingSizeAndShape(&newShape, &lbs, &lbDivisors); + region->getConstantBoundingSizeAndShape(&newShape, &lbs, &lbDivisors); assert(numElements.hasValue() && "non-constant number of elts in local buffer"); - const FlatAffineConstraints *cst = region.getConstraints(); + const FlatAffineConstraints *cst = region->getConstraints(); // 'outerIVs' holds the values that this memory region is symbolic/paramteric // on; this would correspond to loop IVs surrounding the level at which the // slice is being materialized. diff --git a/mlir/lib/Transforms/MemRefDataFlowOpt.cpp b/mlir/lib/Transforms/MemRefDataFlowOpt.cpp index 4191a9cc279..e6ce273b532 100644 --- a/mlir/lib/Transforms/MemRefDataFlowOpt.cpp +++ b/mlir/lib/Transforms/MemRefDataFlowOpt.cpp @@ -178,9 +178,8 @@ void MemRefDataFlowOpt::visitOperationInst(OperationInst *opInst) { // is trivially loading from a single location at that depth; so there // isn't a need to call isRangeOneToOne. if (getNestingDepth(*storeOpInst) < loadOpDepth) { - MemRefRegion region; - getMemRefRegion(loadOpInst, nsLoops, ®ion); - if (!region.getConstraints()->isRangeOneToOne( + auto region = getMemRefRegion(loadOpInst, nsLoops); + if (!region->getConstraints()->isRangeOneToOne( /*start=*/0, /*limit=*/loadOp->getMemRefType().getRank())) break; } diff --git a/mlir/lib/Transforms/Utils/Utils.cpp b/mlir/lib/Transforms/Utils/Utils.cpp index 819f1a59b6f..732062a8b97 100644 --- a/mlir/lib/Transforms/Utils/Utils.cpp +++ b/mlir/lib/Transforms/Utils/Utils.cpp @@ -48,7 +48,8 @@ bool mlir::replaceAllMemRefUsesWith(const Value *oldMemRef, Value *newMemRef, ArrayRef<Value *> extraIndices, AffineMap indexRemap, ArrayRef<Value *> extraOperands, - const Instruction *domInstFilter) { + const Instruction *domInstFilter, + const Instruction *postDomInstFilter) { unsigned newMemRefRank = newMemRef->getType().cast<MemRefType>().getRank(); (void)newMemRefRank; // unused in opt mode unsigned oldMemRefRank = oldMemRef->getType().cast<MemRefType>().getRank(); @@ -66,9 +67,14 @@ bool mlir::replaceAllMemRefUsesWith(const Value *oldMemRef, Value *newMemRef, newMemRef->getType().cast<MemRefType>().getElementType()); std::unique_ptr<DominanceInfo> domInfo; + std::unique_ptr<PostDominanceInfo> postDomInfo; if (domInstFilter) domInfo = std::make_unique<DominanceInfo>(domInstFilter->getFunction()); + if (postDomInstFilter) + postDomInfo = + std::make_unique<PostDominanceInfo>(postDomInstFilter->getFunction()); + // The ops where memref replacement succeeds are replaced with new ones. SmallVector<OperationInst *, 8> opsToErase; @@ -81,6 +87,11 @@ bool mlir::replaceAllMemRefUsesWith(const Value *oldMemRef, Value *newMemRef, if (domInstFilter && !domInfo->dominates(domInstFilter, opInst)) continue; + // Skip this use if it's not post-dominated by postDomInstFilter. + if (postDomInstFilter && + !postDomInfo->postDominates(postDomInstFilter, opInst)) + continue; + // Check if the memref was used in a non-deferencing context. 
It is fine for // the memref to be used in a non-deferencing way outside of the region // where this replacement is happening. @@ -167,7 +178,7 @@ bool mlir::replaceAllMemRefUsesWith(const Value *oldMemRef, Value *newMemRef, res->replaceAllUsesWith(repOp->getResult(r++)); } // Collect and erase at the end since one of these op's could be - // domInstFilter! + // domInstFilter or postDomInstFilter as well! opsToErase.push_back(opInst); } diff --git a/mlir/test/Transforms/dma-generate.mlir b/mlir/test/Transforms/dma-generate.mlir index 9096fe8b097..cdc7441b14e 100644 --- a/mlir/test/Transforms/dma-generate.mlir +++ b/mlir/test/Transforms/dma-generate.mlir @@ -262,7 +262,7 @@ func @dma_unknown_size(%arg0: memref<?x?xf32>) { // size -- not yet implemented. // CHECK: %2 = load %arg0[%i0, %i1] : memref<?x?xf32> load %arg0[%i, %j] : memref<? x ? x f32> - // expected-error@-6 {{DMA generation failed for one or more memref's}} + // expected-error@-6 {{DMA generation failed for one or more memref's in this block}} } } return @@ -282,7 +282,7 @@ func @dma_memref_3d(%arg0: memref<1024x1024x1024xf32>) { // not yet implemented. // CHECK: %5 = load %arg0[%2, %3, %4] : memref<1024x1024x1024xf32> %v = load %arg0[%idx, %idy, %idz] : memref<1024 x 1024 x 1024 x f32> - // expected-error@-10 {{DMA generation failed for one or more memref's}} + // expected-error@-10 {{DMA generation failed for one or more memref's in this block}} } } } @@ -359,3 +359,73 @@ func @multi_load_store_union() { // CHECK-NEXT: dma_wait %3[%c0], %c170372 : memref<1xi32> // CHECK-NEXT: return // CHECK-NEXT:} + +// ----- + +// CHECK-DAG: [[MAP_MINUS_ONE:#map[0-9]+]] = (d0) -> (d0 - 1) + +// CHECK-LABEL: func @dma_loop_straightline_interspersed() { +func @dma_loop_straightline_interspersed() { + %c0 = constant 0 : index + %c255 = constant 255 : index + %A = alloc() : memref<256 x f32> + %v = load %A[%c0] : memref<256 x f32> + for %i = 1 to 255 { + load %A[%i] : memref<256 x f32> + } + %l = load %A[%c255] : memref<256 x f32> + store %l, %A[%c0] : memref<256 x f32> + return +} +// There are three regions here - the 'load' preceding the loop, the loop +// itself, and the instructions appearing after the loop. 
+// CHECK: %0 = alloc() : memref<256xf32> +// CHECK-NEXT: %1 = alloc() : memref<1xf32, 1> +// CHECK-NEXT: %2 = alloc() : memref<1xi32> +// CHECK-NEXT: dma_start %0[%c0], %1[%c0], %c1_1, %2[%c0] : memref<256xf32>, memref<1xf32, 1>, memref<1xi32> +// CHECK-NEXT: dma_wait %2[%c0], %c1_1 : memref<1xi32> +// CHECK-NEXT: %3 = load %1[%c0_2] : memref<1xf32, 1> +// CHECK-NEXT: %4 = alloc() : memref<254xf32, 1> +// CHECK-NEXT: %5 = alloc() : memref<1xi32> +// CHECK-NEXT: dma_start %0[%c1_0], %4[%c0], %c254, %5[%c0] : memref<256xf32>, memref<254xf32, 1>, memref<1xi32> +// CHECK-NEXT: dma_wait %5[%c0], %c254 : memref<1xi32> +// CHECK-NEXT: for %i0 = 1 to 255 { +// CHECK-NEXT: %6 = affine_apply [[MAP_MINUS_ONE]](%i0) +// CHECK-NEXT: %7 = load %4[%6] : memref<254xf32, 1> +// CHECK-NEXT: } +// CHECK-NEXT: %8 = alloc() : memref<256xf32, 1> +// CHECK-NEXT: %9 = alloc() : memref<1xi32> +// CHECK-NEXT: dma_start %0[%c0], %8[%c0], %c256, %9[%c0] : memref<256xf32>, memref<256xf32, 1>, memref<1xi32> +// CHECK-NEXT: dma_wait %9[%c0], %c256 : memref<1xi32> +// CHECK-NEXT: %10 = alloc() : memref<1xi32> +// CHECK-NEXT: %11 = load %8[%c255] : memref<256xf32, 1> +// CHECK-NEXT: store %11, %8[%c0_2] : memref<256xf32, 1> +// CHECK-NEXT: dma_start %8[%c0], %0[%c0], %c1, %10[%c0] : memref<256xf32, 1>, memref<256xf32>, memref<1xi32> +// CHECK-NEXT: dma_wait %10[%c0], %c1 : memref<1xi32> +// CHECK-NEXT: return + +// ----- + +// CHECK-LABEL: func @dma_mixed_loop_blocks() { +func @dma_mixed_loop_blocks() { + %c0 = constant 0 : index + %A = alloc() : memref<256 x 256 x vector<8 x f32>> + for %i = 0 to 256 { + %v = load %A[%c0, %c0] : memref<256 x 256 x vector<8 x f32>> + "foo"(%v) : (vector<8 x f32>) -> () + for %j = 0 to 256 { + %w = load %A[%i, %j] : memref<256 x 256 x vector<8 x f32>> + "bar"(%w) : (vector<8 x f32>) -> () + } + } + return +} +// CHECK-DAG: [[MEM:%[0-9]+]] = alloc() : memref<256x256xvector<8xf32>> +// CHECK-DAG: [[BUF:%[0-9]+]] = alloc() : memref<256x256xvector<8xf32>, 1> +// CHECK-DAG: [[TAG:%[0-9]+]] = alloc() : memref<1xi32> +// CHECK: dma_start [[MEM]][%c0, %c0], [[BUF]][%c0, %c0], %c65536, [[TAG]][%c0] : memref<256x256xvector<8xf32>>, memref<256x256xvector<8xf32>, 1>, memref<1xi32> +// CHECK-NEXT: dma_wait [[TAG]][%c0], %c65536 : memref<1xi32> +// CHECK-NEXT: for %i0 = 0 to 256 { +// CHECK-NEXT: %3 = load [[BUF]][%c0_0, %c0_0] : memref<256x256xvector<8xf32>, 1> +// CHECK: for %i1 = 0 to 256 { +// CHECK-NEXT: %4 = load [[BUF]][%i0, %i1] : memref<256x256xvector<8xf32>, 1> |

