Diffstat (limited to 'mlir/lib')
 mlir/lib/AffineOps/AffineOps.cpp             |  27
 mlir/lib/Analysis/Utils.cpp                  |  56
 mlir/lib/IR/AffineStructures.cpp             |   1
 mlir/lib/Transforms/DmaGeneration.cpp        | 118
 mlir/lib/Transforms/LoopTiling.cpp           |   7
 mlir/lib/Transforms/PipelineDataTransfer.cpp |   4
 6 files changed, 128 insertions(+), 85 deletions(-)
diff --git a/mlir/lib/AffineOps/AffineOps.cpp b/mlir/lib/AffineOps/AffineOps.cpp
index 0dea1441b08..1859a640a45 100644
--- a/mlir/lib/AffineOps/AffineOps.cpp
+++ b/mlir/lib/AffineOps/AffineOps.cpp
@@ -1047,18 +1047,19 @@ bool mlir::addAffineForOpDomain(ConstOpPointer<AffineForOp> forOp,
     LLVM_DEBUG(llvm::dbgs()
                << "Domain conservative: non-unit stride not handled\n");
 
+  int64_t step = forOp->getStep();
+
   // Adds a lower or upper bound when the bounds aren't constant.
   auto addLowerOrUpperBound = [&](bool lower) -> bool {
     auto operands =
         lower ? forOp->getLowerBoundOperands() : forOp->getUpperBoundOperands();
     for (const auto &operand : operands) {
-      unsigned loc;
-      if (!constraints->findId(*operand, &loc)) {
+      unsigned pos;
+      if (!constraints->findId(*operand, &pos)) {
         if (isValidSymbol(operand)) {
           constraints->addSymbolId(constraints->getNumSymbolIds(),
                                    const_cast<Value *>(operand));
-          loc =
-              constraints->getNumDimIds() + constraints->getNumSymbolIds() - 1;
+          pos = constraints->getNumDimAndSymbolIds() - 1;
           // Check if the symbol is a constant.
           if (auto *opInst = operand->getDefiningInst()) {
             if (auto constOp = opInst->dyn_cast<ConstantIndexOp>()) {
@@ -1068,17 +1069,22 @@ bool mlir::addAffineForOpDomain(ConstOpPointer<AffineForOp> forOp,
         } else {
           constraints->addDimId(constraints->getNumDimIds(),
                                 const_cast<Value *>(operand));
-          loc = constraints->getNumDimIds() - 1;
+          pos = constraints->getNumDimIds() - 1;
+          if (auto loop = getForInductionVarOwner(operand)) {
+            // Outer loop IVs could be used in forOp's bounds.
+            if (!addAffineForOpDomain(loop, constraints))
+              return false;
+          }
         }
       }
     }
     // Record positions of the operands in the constraint system.
     SmallVector<unsigned, 8> positions;
     for (const auto &operand : operands) {
-      unsigned loc;
-      if (!constraints->findId(*operand, &loc))
+      unsigned pos;
+      if (!constraints->findId(*operand, &pos))
         assert(0 && "expected to be found");
-      positions.push_back(loc);
+      positions.push_back(pos);
     }
 
     auto boundMap =
@@ -1106,7 +1112,7 @@ bool mlir::addAffineForOpDomain(ConstOpPointer<AffineForOp> forOp,
       ineq[constraints->getNumCols() - 1] =
           lower ? -flatExpr[flatExpr.size() - 1]
                 // Upper bound in flattenedExpr is an exclusive one.
-                : flatExpr[flatExpr.size() - 1] - 1;
+                : flatExpr[flatExpr.size() - 1] - step;
       constraints->addInequality(ineq);
     }
     return true;
@@ -1121,7 +1127,8 @@ bool mlir::addAffineForOpDomain(ConstOpPointer<AffineForOp> forOp,
   }
 
   if (forOp->hasConstantUpperBound()) {
-    constraints->addConstantUpperBound(pos, forOp->getConstantUpperBound() - 1);
+    constraints->addConstantUpperBound(pos,
+                                       forOp->getConstantUpperBound() - step);
     return true;
   }
   // Non-constant upper bound case.
diff --git a/mlir/lib/Analysis/Utils.cpp b/mlir/lib/Analysis/Utils.cpp
index 821f19df31d..3fbea075386 100644
--- a/mlir/lib/Analysis/Utils.cpp
+++ b/mlir/lib/Analysis/Utils.cpp
@@ -28,6 +28,7 @@
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/StandardOps/StandardOps.h"
+#include "llvm/ADT/DenseMap.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 
@@ -35,6 +36,8 @@
 
 using namespace mlir;
 
+using llvm::SmallDenseMap;
+
 /// Populates 'loops' with IVs of the loops surrounding 'inst' ordered from
 /// the outermost 'for' instruction to the innermost one.
 void mlir::getLoopIVs(const Instruction &inst,
@@ -133,6 +136,9 @@ bool MemRefRegion::compute(Instruction *inst, unsigned loopDepth,
 
   unsigned rank = access.getRank();
 
+  LLVM_DEBUG(llvm::dbgs() << "MemRefRegion::compute: " << *inst
+                          << "depth: " << loopDepth << "\n";);
+
   if (rank == 0) {
     SmallVector<OpPointer<AffineForOp>, 4> ivs;
     getLoopIVs(*inst, &ivs);
@@ -607,36 +613,41 @@ unsigned mlir::getNumCommonSurroundingLoops(const Instruction &A,
   return numCommonLoops;
 }
 
-Optional<int64_t>
-mlir::getMemoryFootprintBytes(ConstOpPointer<AffineForOp> forOp,
-                              int memorySpace) {
-  return getMemoryFootprintBytes(*forOp->getBody(), memorySpace);
-}
+static Optional<int64_t> getMemoryFootprintBytes(const Block &block,
+                                                 Block::const_iterator start,
+                                                 Block::const_iterator end,
+                                                 int memorySpace) {
+  SmallDenseMap<Value *, std::unique_ptr<MemRefRegion>, 4> regions;
 
-Optional<int64_t> mlir::getMemoryFootprintBytes(const Block &block,
-                                                int memorySpace) {
-  std::vector<std::unique_ptr<MemRefRegion>> regions;
+  // Cast away constness since the walker uses non-const versions; but we
+  // guarantee that the visitor callback isn't mutating opInst.
+  auto *cStart = reinterpret_cast<Block::iterator *>(&start);
+  auto *cEnd = reinterpret_cast<Block::iterator *>(&end);
 
   // Walk this 'for' instruction to gather all memory regions.
   bool error = false;
-  const_cast<Block *>(&block)->walk([&](Instruction *opInst) {
+  const_cast<Block *>(&block)->walk(*cStart, *cEnd, [&](Instruction *opInst) {
     if (!opInst->isa<LoadOp>() && !opInst->isa<StoreOp>()) {
       // Neither load nor a store op.
      return;
    }
 
-    // TODO(bondhugula): eventually, we need to be performing a union across
-    // all regions for a given memref instead of creating one region per
-    // memory op. This way we would be allocating O(num of memref's) sets
-    // instead of O(num of load/store op's).
+    // Compute the memref region symbolic in any IVs enclosing this block.
     auto region = std::make_unique<MemRefRegion>(opInst->getLoc());
-    if (!region->compute(opInst, /*loopDepth=*/0)) {
-      LLVM_DEBUG(llvm::dbgs() << "Error obtaining memory region\n");
-      // TODO: stop the walk if an error occurred.
+    if (!region->compute(opInst,
+                         /*loopDepth=*/getNestingDepth(*block.begin()))) {
+      opInst->emitError("Error obtaining memory region\n");
+      error = true;
+      return;
+    }
+    auto it = regions.find(region->memref);
+    if (it == regions.end()) {
+      regions[region->memref] = std::move(region);
+    } else if (!it->second->unionBoundingBox(*region)) {
+      opInst->emitError("Error performing a union on a memory region\n");
       error = true;
       return;
     }
-    regions.push_back(std::move(region));
   });
 
   if (error)
@@ -644,10 +655,19 @@ Optional<int64_t> mlir::getMemoryFootprintBytes(const Block &block,
 
   int64_t totalSizeInBytes = 0;
   for (const auto &region : regions) {
-    auto size = region->getRegionSize();
+    Optional<int64_t> size = region.second->getRegionSize();
     if (!size.hasValue())
       return None;
     totalSizeInBytes += size.getValue();
   }
   return totalSizeInBytes;
 }
+
+Optional<int64_t>
+mlir::getMemoryFootprintBytes(ConstOpPointer<AffineForOp> forOp,
+                              int memorySpace) {
+  auto *forInst = forOp->getInstruction();
+  return ::getMemoryFootprintBytes(
+      *forInst->getBlock(), Block::const_iterator(forInst),
+      std::next(Block::const_iterator(forInst)), memorySpace);
+}
diff --git a/mlir/lib/IR/AffineStructures.cpp b/mlir/lib/IR/AffineStructures.cpp
index 5179e935007..d043e78f059 100644
--- a/mlir/lib/IR/AffineStructures.cpp
+++ b/mlir/lib/IR/AffineStructures.cpp
@@ -2059,6 +2059,7 @@ bool FlatAffineConstraints::unionBoundingBox(
     auto extent = getConstantBoundOnDimSize(d, &lb, &lbDivisor);
     if (!extent.hasValue())
       // TODO(bondhugula): symbolic extents when necessary.
+      // TODO(bondhugula): handle union if a dimension is unbounded.
       return false;
 
     otherLb.clear();
diff --git a/mlir/lib/Transforms/DmaGeneration.cpp b/mlir/lib/Transforms/DmaGeneration.cpp
index 06aa3758712..abf693657be 100644
--- a/mlir/lib/Transforms/DmaGeneration.cpp
+++ b/mlir/lib/Transforms/DmaGeneration.cpp
@@ -49,14 +49,18 @@ static llvm::cl::opt<unsigned> clFastMemoryCapacity(
         "Set fast memory space capacity in KiB (default: unlimited)"),
     llvm::cl::cat(clOptionsCategory));
 
-static const unsigned kDefaultFastMemorySpace = 1;
-
 static llvm::cl::opt<unsigned> clFastMemorySpace(
-    "dma-fast-mem-space", llvm::cl::init(kDefaultFastMemorySpace),
+    "dma-fast-mem-space", llvm::cl::init(1),
     llvm::cl::desc(
         "Fast memory space identifier for DMA generation (default: 1)"),
     llvm::cl::cat(clOptionsCategory));
 
+static llvm::cl::opt<bool> clSkipNonUnitStrideLoop(
+    "dma-skip-non-unit-stride-loops", llvm::cl::Hidden, llvm::cl::init(false),
+    llvm::cl::desc("Testing purposes: avoid non-unit stride loop choice depths "
+                   "for DMA placement"),
+    llvm::cl::cat(clOptionsCategory));
+
 namespace {
 
 /// Generates DMAs for memref's living in 'slowMemorySpace' into newly created
@@ -76,7 +80,7 @@ struct DmaGeneration : public FunctionPass {
         fastMemCapacityBytes(fastMemCapacityBytes) {}
 
   PassResult runOnFunction(Function *f) override;
-  bool runOnBlock(Block *block, uint64_t consumedCapacityBytes);
+  bool runOnBlock(Block *block);
   uint64_t runOnBlock(Block::iterator begin, Block::iterator end);
 
   bool generateDma(const MemRefRegion &region, Block *block,
@@ -457,12 +461,10 @@ bool DmaGeneration::generateDma(const MemRefRegion &region, Block *block,
 /// `regions`; each region is either a sequence of one or more instructions
 /// starting and ending with a load or store op, or just a loop (which could
 /// have other loops nested within). Returns false on an error, true otherwise.
-bool DmaGeneration::runOnBlock(Block *block, uint64_t consumedCapacityBytes) {
+bool DmaGeneration::runOnBlock(Block *block) {
   if (block->empty())
     return true;
 
-  uint64_t priorConsumedCapacityBytes = consumedCapacityBytes;
-
   // Every loop in the block starts and ends a region. A contiguous sequence of
   // operation instructions starting and ending with a load/store op is also
   // identified as a region. Straightline code (contiguous chunks of operation
@@ -482,35 +484,37 @@ bool DmaGeneration::runOnBlock(Block *block, uint64_t consumedCapacityBytes) {
 
   for (auto it = curBegin; it != block->end(); ++it) {
     if (auto forOp = it->dyn_cast<AffineForOp>()) {
-      // We'll assume for now that loops with steps are tiled loops, and so DMAs
-      // are not performed for that depth, but only further inside.
-      // If the memory footprint of the 'for' loop is higher than fast
-      // memory capacity (when provided), we recurse to DMA at an inner level
-      // until we find a depth at which footprint fits in the capacity. If the
-      // footprint can't be calcuated, we assume for now it fits.
-
-      // Returns true if the footprint is known to exceed capacity.
       auto exceedsCapacity = [&](OpPointer<AffineForOp> forOp) {
-        Optional<int64_t> footprint;
-        return ((footprint = getMemoryFootprintBytes(forOp, 0)).hasValue() &&
-                consumedCapacityBytes +
-                        static_cast<uint64_t>(footprint.getValue()) >
+        Optional<int64_t> footprint =
+            getMemoryFootprintBytes(forOp,
+                                    /*memorySpace=*/0);
+        return (footprint.hasValue() &&
+                static_cast<uint64_t>(footprint.getValue()) >
                     fastMemCapacityBytes);
       };
 
-      if (forOp->getStep() != 1 || exceedsCapacity(forOp)) {
-        // We'll split and do the DMAs one or more levels inside for forInst
-        consumedCapacityBytes += runOnBlock(/*begin=*/curBegin, /*end=*/it);
+      // If the memory footprint of the 'for' loop is higher than fast
+      // memory capacity (when provided), we recurse to DMA at an inner level
+      // until we find a depth at which footprint fits in fast mem capacity. If
+      // the footprint can't be calculated, we assume for now it fits. Recurse
+      // inside if footprint for 'forOp' exceeds capacity, or when
+      // clSkipNonUnitStrideLoop is set and the step size is not one.
+      bool recurseInner = clSkipNonUnitStrideLoop ? forOp->getStep() != 1
+                                                  : exceedsCapacity(forOp);
+      if (recurseInner) {
+        // We'll recurse and do the DMAs at an inner level for 'forInst'.
+        runOnBlock(/*begin=*/curBegin, /*end=*/it);
         // Recurse onto the body of this loop.
-        runOnBlock(forOp->getBody(), consumedCapacityBytes);
+        runOnBlock(forOp->getBody());
         // The next region starts right after the 'for' instruction.
         curBegin = std::next(it);
       } else {
         // We have enough capacity, i.e., DMAs will be computed for the portion
-        // of the block until 'it', and for the 'for' loop. For the
-        // latter, they are placed just before this loop (for incoming DMAs) and
-        // right after (for outgoing ones).
-        consumedCapacityBytes += runOnBlock(/*begin=*/curBegin, /*end=*/it);
+        // of the block until 'it', and for 'it', which is 'forOp'. Note that
+        // for the latter, the DMAs are placed just before this loop (for
+        // incoming DMAs) and right after (for outgoing ones).
+        runOnBlock(/*begin=*/curBegin, /*end=*/it);
 
         // Inner loop DMAs have their own scope - we don't thus update consumed
         // capacity. The footprint check above guarantees this inner loop's
@@ -519,7 +523,7 @@ bool DmaGeneration::runOnBlock(Block *block, uint64_t consumedCapacityBytes) {
         curBegin = std::next(it);
       }
     } else if (!it->isa<LoadOp>() && !it->isa<StoreOp>()) {
-      consumedCapacityBytes += runOnBlock(/*begin=*/curBegin, /*end=*/it);
+      runOnBlock(/*begin=*/curBegin, /*end=*/it);
       curBegin = std::next(it);
     }
   }
@@ -528,29 +532,7 @@ bool DmaGeneration::runOnBlock(Block *block, uint64_t consumedCapacityBytes) {
   if (curBegin != block->end()) {
     // Can't be a terminator because it would have been skipped above.
     assert(!curBegin->isKnownTerminator() && "can't be a terminator");
-    consumedCapacityBytes +=
-        runOnBlock(/*begin=*/curBegin, /*end=*/block->end());
-  }
-
-  if (llvm::DebugFlag) {
-    uint64_t thisBlockDmaSizeBytes =
-        consumedCapacityBytes - priorConsumedCapacityBytes;
-    if (thisBlockDmaSizeBytes > 0) {
-      emitNoteForBlock(
-          *block,
-          Twine(llvm::divideCeil(thisBlockDmaSizeBytes, 1024)) +
-              " KiB of DMA buffers in fast memory space for this block\n");
-    }
-  }
-
-  if (consumedCapacityBytes > fastMemCapacityBytes) {
-    StringRef str = "Total size of all DMA buffers' for this block "
-                    "exceeds fast memory capacity\n";
-    if (auto *inst = block->getContainingInst())
-      inst->emitError(str);
-    else
-      block->getFunction()->emitError(str);
-    return false;
+    runOnBlock(/*begin=*/curBegin, /*end=*/block->end());
   }
 
   return true;
@@ -558,6 +540,9 @@ bool DmaGeneration::runOnBlock(Block *block, uint64_t consumedCapacityBytes) {
 
 /// Generates DMAs for a contiguous sequence of instructions in `block` in the
 /// iterator range [begin, end). Returns the total size of the DMA buffers used.
+// Since we generate alloc's and dealloc's for all DMA buffers (before and
+// after the range of instructions resp), all of the fast memory capacity is
+// assumed to be available.
 uint64_t DmaGeneration::runOnBlock(Block::iterator begin, Block::iterator end) {
   if (begin == end)
     return 0;
@@ -575,6 +560,9 @@ uint64_t DmaGeneration::runOnBlock(Block::iterator begin, Block::iterator end) {
   writeRegions.clear();
   fastBufferMap.clear();
 
+  // To check for errors when walking the block.
+  bool error = false;
+
   // Walk this range of instructions to gather all memory regions.
   block->walk(begin, end, [&](Instruction *opInst) {
     // Gather regions to allocate to buffers in faster memory space.
@@ -598,6 +586,7 @@ uint64_t DmaGeneration::runOnBlock(Block::iterator begin, Block::iterator end) {
       if (!getFullMemRefAsRegion(opInst, dmaDepth, region.get())) {
         LLVM_DEBUG(
             opInst->emitError("Non-constant memref sizes not yet supported"));
+        error = true;
         return;
       }
     }
@@ -628,16 +617,25 @@ uint64_t DmaGeneration::runOnBlock(Block::iterator begin, Block::iterator end) {
             LLVM_DEBUG(llvm::dbgs()
                        << "Memory region bounding box failed; "
                           "over-approximating to the entire memref\n");
+            // If the union fails, we will overapproximate.
             if (!getFullMemRefAsRegion(opInst, dmaDepth, region.get())) {
               LLVM_DEBUG(opInst->emitError(
                   "Non-constant memref sizes not yet supported"));
+              error = true;
+              return true;
             }
+            it->second->getConstraints()->clearAndCopyFrom(
+                *region->getConstraints());
           }
           return true;
         };
 
         bool existsInRead = updateRegion(readRegions);
+        if (error)
+          return;
         bool existsInWrite = updateRegion(writeRegions);
+        if (error)
+          return;
 
         // Finally add it to the region list.
        if (region->isWrite() && !existsInWrite) {
@@ -647,6 +645,12 @@ uint64_t DmaGeneration::runOnBlock(Block::iterator begin, Block::iterator end) {
     }
   });
 
+  if (error) {
+    begin->emitError(
+        "DMA generation failed for one or more memref's in this block\n");
+    return 0;
+  }
+
   uint64_t totalDmaBuffersSizeInBytes = 0;
   bool ret = true;
   auto processRegions =
@@ -677,12 +681,22 @@ uint64_t DmaGeneration::runOnBlock(Block::iterator begin, Block::iterator end) {
   // For a range of operation instructions, a note will be emitted at the
   // caller.
   OpPointer<AffineForOp> forOp;
+  uint64_t sizeInKib = llvm::divideCeil(totalDmaBuffersSizeInBytes, 1024);
   if (llvm::DebugFlag && (forOp = begin->dyn_cast<AffineForOp>())) {
     forOp->emitNote(
-        Twine(llvm::divideCeil(totalDmaBuffersSizeInBytes, 1024)) +
+        Twine(sizeInKib) +
         " KiB of DMA buffers in fast memory space for this block\n");
   }
 
+  if (totalDmaBuffersSizeInBytes > fastMemCapacityBytes) {
+    StringRef str = "Total size of all DMA buffers' for this block "
+                    "exceeds fast memory capacity\n";
+    if (auto *inst = block->getContainingInst())
+      inst->emitError(str);
+    else
+      block->getFunction()->emitError(str);
+  }
+
   return totalDmaBuffersSizeInBytes;
 }
 
@@ -691,7 +705,7 @@ PassResult DmaGeneration::runOnFunction(Function *f) {
   zeroIndex = topBuilder.create<ConstantIndexOp>(f->getLoc(), 0);
 
   for (auto &block : *f) {
-    runOnBlock(&block, /*consumedCapacityBytes=*/0);
+    runOnBlock(&block);
   }
   // This function never leaves the IR in an invalid state.
   return success();
diff --git a/mlir/lib/Transforms/LoopTiling.cpp b/mlir/lib/Transforms/LoopTiling.cpp
index b86516f31f7..44798dcee85 100644
--- a/mlir/lib/Transforms/LoopTiling.cpp
+++ b/mlir/lib/Transforms/LoopTiling.cpp
@@ -37,7 +37,7 @@ using namespace mlir;
 static llvm::cl::OptionCategory clOptionsCategory(DEBUG_TYPE " options");
 
 // List of tile sizes. If any of them aren't provided, they are filled with
-// clTileSize.
+// clTileSize / kDefaultTileSize.
 static llvm::cl::list<unsigned> clTileSizes(
     "tile-sizes",
     llvm::cl::desc(
@@ -50,8 +50,8 @@ namespace {
 struct LoopTiling : public FunctionPass {
   LoopTiling() : FunctionPass(&LoopTiling::passID) {}
   PassResult runOnFunction(Function *f) override;
-  constexpr static unsigned kDefaultTileSize = 4;
 
+  constexpr static unsigned kDefaultTileSize = 4;
   static char passID;
 };
 
@@ -158,7 +158,8 @@ static void constructTiledIndexSetHyperRect(
 UtilResult mlir::tileCodeGen(MutableArrayRef<OpPointer<AffineForOp>> band,
                              ArrayRef<unsigned> tileSizes) {
   assert(!band.empty());
-  assert(band.size() == tileSizes.size());
+  assert(band.size() == tileSizes.size() && "Incorrect number of tile sizes");
+
   // Check if the supplied for inst's are all successively nested.
   for (unsigned i = 1, e = band.size(); i < e; i++) {
     assert(band[i]->getInstruction()->getParentInst() ==
diff --git a/mlir/lib/Transforms/PipelineDataTransfer.cpp b/mlir/lib/Transforms/PipelineDataTransfer.cpp
index a943ac280d1..a85f428bde6 100644
--- a/mlir/lib/Transforms/PipelineDataTransfer.cpp
+++ b/mlir/lib/Transforms/PipelineDataTransfer.cpp
@@ -261,7 +261,7 @@ PassResult
 PipelineDataTransfer::runOnAffineForOp(OpPointer<AffineForOp> forOp) {
   auto mayBeConstTripCount = getConstantTripCount(forOp);
   if (!mayBeConstTripCount.hasValue()) {
-    LLVM_DEBUG(llvm::dbgs() << "unknown trip count loop\n");
+    LLVM_DEBUG(forOp->emitNote("unknown trip count loop"));
     return success();
   }
 
@@ -269,7 +269,7 @@ PipelineDataTransfer::runOnAffineForOp(OpPointer<AffineForOp> forOp) {
   findMatchingStartFinishInsts(forOp, startWaitPairs);
 
   if (startWaitPairs.empty()) {
-    LLVM_DEBUG(llvm::dbgs() << "No dma start/finish pairs\n";);
+    LLVM_DEBUG(forOp->emitNote("No dma start/finish pairs\n"));
     return success();
   }

