Refactor the LowerVectorTransfers pass to use the RewritePattern infra - NFC

This is step 1/n in refactoring infrastructure along the Vector dialect to make it ready for retargetability and composable progressive lowering. PiperOrigin-RevId: 280529784
author: Nicolas Vasilache <ntv@google.com> 2019-11-14 15:39:36 -0800
committer: A. Unique TensorFlower <gardener@tensorflow.org> 2019-11-14 15:40:07 -0800
commit: 0b271b7dfe285064b8b237d18bfc923212e7a77b (patch)
tree: 3783613205ec9f87106a6fa7f730e22f97d91745 /mlir/lib/Transforms
parent: a78bd84cf84c00914f48781fa0c561cbb6bdf847 (diff)
download: bcm5719-llvm-0b271b7dfe285064b8b237d18bfc923212e7a77b.tar.gz
bcm5719-llvm-0b271b7dfe285064b8b237d18bfc923212e7a77b.zip
2 files changed, 0 insertions, 385 deletions
diff --git a/mlir/lib/Transforms/CMakeLists.txt b/mlir/lib/Transforms/CMakeLists.txt
index de7801bf215..304e0547edb 100644
--- a/mlir/lib/Transforms/CMakeLists.txt
+++ b/mlir/lib/Transforms/CMakeLists.txt
@@ -13,7 +13,6 @@ add_llvm_library(MLIRTransforms
   LoopTiling.cpp
   LoopUnrollAndJam.cpp
   LoopUnroll.cpp
-  LowerVectorTransfers.cpp
   MaterializeVectors.cpp
   MemRefDataFlowOpt.cpp
   PipelineDataTransfer.cpp
diff --git a/mlir/lib/Transforms/LowerVectorTransfers.cpp b/mlir/lib/Transforms/LowerVectorTransfers.cpp
deleted file mode 100644
index 57dd18dac0f..00000000000
--- a/mlir/lib/Transforms/LowerVectorTransfers.cpp
+++ /dev/null
@@ -1,384 +0,0 @@
-//===- LowerVectorTransfers.cpp - LowerVectorTransfers Pass Impl ----------===//
-//
-// Copyright 2019 The MLIR Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-// =============================================================================
-//
-// This file implements target-dependent lowering of vector transfer operations.
-//
-//===----------------------------------------------------------------------===//
-
-#include <type_traits>
-
-#include "mlir/Analysis/AffineAnalysis.h"
-#include "mlir/Analysis/NestedMatcher.h"
-#include "mlir/Analysis/Utils.h"
-#include "mlir/Analysis/VectorAnalysis.h"
-#include "mlir/Dialect/LoopOps/LoopOps.h"
-#include "mlir/Dialect/StandardOps/Ops.h"
-#include "mlir/Dialect/VectorOps/VectorOps.h"
-#include "mlir/EDSC/Builders.h"
-#include "mlir/EDSC/Helpers.h"
-#include "mlir/IR/AffineExpr.h"
-#include "mlir/IR/AffineMap.h"
-#include "mlir/IR/Attributes.h"
-#include "mlir/IR/Builders.h"
-#include "mlir/IR/Location.h"
-#include "mlir/IR/Matchers.h"
-#include "mlir/IR/OperationSupport.h"
-#include "mlir/IR/PatternMatch.h"
-#include "mlir/IR/Types.h"
-#include "mlir/Pass/Pass.h"
-#include "mlir/Support/Functional.h"
-#include "mlir/Transforms/Passes.h"
-
-/// Implements lowering of VectorTransferReadOp and VectorTransferWriteOp to a
-/// proper abstraction for the hardware.
-///
-/// For now, we only emit a simple loop nest that performs clipped pointwise
-/// copies from a remote to a locally allocated memory.
-///
-/// Consider the case:
-///
-/// ```mlir {.mlir}
-///    // Read the slice `%A[%i0, %i1:%i1+256, %i2:%i2+32]` into
-///    // vector<32x256xf32> and pad with %f0 to handle the boundary case:
-///    %f0 = constant 0.0f : f32
-///    loop.for %i0 = 0 to %0 {
-///      loop.for %i1 = 0 to %1 step %c256 {
-///        loop.for %i2 = 0 to %2 step %c32 {
-///          %v = vector.transfer_read %A[%i0, %i1, %i2], (%f0)
-///               {permutation_map: (d0, d1, d2) -> (d2, d1)} :
-///               memref<?x?x?xf32>, vector<32x256xf32>
-///    }}}
-/// ```
-///
-/// The rewriters construct loop and indices that access MemRef A in a pattern
-/// resembling the following (while guaranteeing an always full-tile
-/// abstraction):
-///
-/// ```mlir {.mlir}
-///    loop.for %d2 = 0 to %c256 {
-///      loop.for %d1 = 0 to %c32 {
-///        %s = %A[%i0, %i1 + %d1, %i2 + %d2] : f32
-///        %tmp[%d2, %d1] = %s
-///      }
-///    }
-/// ```
-///
-/// In the current state, only a clipping transfer is implemented by `clip`,
-/// which creates individual indexing expressions of the form:
-///
-/// ```mlir-dsc
-///    SELECT(i + ii < zero, zero, SELECT(i + ii < N, i + ii, N - one))
-/// ```
-
-using namespace mlir;
-using vector::VectorTransferReadOp;
-using vector::VectorTransferWriteOp;
-
-#define DEBUG_TYPE "affine-lower-vector-transfers"
-
-namespace {
-
-/// Lowers VectorTransferOp into a combination of:
-///   1. local memory allocation;
-///   2. perfect loop nest over:
-///      a. scalar load/stores from local buffers (viewed as a scalar memref);
-///      a. scalar store/load to original memref (with clipping).
-///   3. vector_load/store
-///   4. local memory deallocation.
-/// Minor variations occur depending on whether a VectorTransferReadOp or
-/// a VectorTransferWriteOp is rewritten.
-template <typename VectorTransferOpTy>
-struct VectorTransferRewriter : public RewritePattern {
-  explicit VectorTransferRewriter(MLIRContext *context)
-      : RewritePattern(VectorTransferOpTy::getOperationName(), 1, context) {}
-
-  /// Used for staging the transfer in a local scalar buffer.
-  MemRefType tmpMemRefType(VectorTransferOpTy transfer) const {
-    auto vectorType = transfer.getVectorType();
-    return MemRefType::get(vectorType.getShape(), vectorType.getElementType(),
-                           {}, 0);
-  }
-
-  /// Performs the rewrite.
-  PatternMatchResult matchAndRewrite(Operation *op,
-                                     PatternRewriter &rewriter) const override;
-};
-
-/// Analyzes the `transfer` to find an access dimension along the fastest remote
-/// MemRef dimension. If such a dimension with coalescing properties is found,
-/// `pivs` and `vectorView` are swapped so that the invocation of
-/// LoopNestBuilder captures it in the innermost loop.
-template <typename VectorTransferOpTy>
-void coalesceCopy(VectorTransferOpTy transfer,
-                  SmallVectorImpl<edsc::ValueHandle *> *pivs,
-                  edsc::VectorView *vectorView) {
-  // rank of the remote memory access, coalescing behavior occurs on the
-  // innermost memory dimension.
-  auto remoteRank = transfer.getMemRefType().getRank();
-  // Iterate over the results expressions of the permutation map to determine
-  // the loop order for creating pointwise copies between remote and local
-  // memories.
-  int coalescedIdx = -1;
-  auto exprs = transfer.permutation_map().getResults();
-  for (auto en : llvm::enumerate(exprs)) {
-    auto dim = en.value().template dyn_cast<AffineDimExpr>();
-    if (!dim) {
-      continue;
-    }
-    auto memRefDim = dim.getPosition();
-    if (memRefDim == remoteRank - 1) {
-      // memRefDim has coalescing properties, it should be swapped in the last
-      // position.
-      assert(coalescedIdx == -1 && "Unexpected > 1 coalesced indices");
-      coalescedIdx = en.index();
-    }
-  }
-  if (coalescedIdx >= 0) {
-    std::swap(pivs->back(), (*pivs)[coalescedIdx]);
-    vectorView->swapRanges(pivs->size() - 1, coalescedIdx);
-  }
-}
-
-/// Emits remote memory accesses that are clipped to the boundaries of the
-/// MemRef.
-template <typename VectorTransferOpTy>
-llvm::SmallVector<edsc::ValueHandle, 8> clip(VectorTransferOpTy transfer,
-                                             edsc::MemRefView &view,
-                                             ArrayRef<edsc::IndexHandle> ivs) {
-  using namespace mlir::edsc;
-  using namespace edsc::op;
-  using edsc::intrinsics::select;
-
-  IndexHandle zero(index_t(0)), one(index_t(1));
-  llvm::SmallVector<edsc::ValueHandle, 8> memRefAccess(transfer.indices());
-  llvm::SmallVector<edsc::ValueHandle, 8> clippedScalarAccessExprs(
-      memRefAccess.size(), edsc::IndexHandle());
-
-  // Indices accessing to remote memory are clipped and their expressions are
-  // returned in clippedScalarAccessExprs.
-  for (unsigned memRefDim = 0; memRefDim < clippedScalarAccessExprs.size();
-       ++memRefDim) {
-    // Linear search on a small number of entries.
-    int loopIndex = -1;
-    auto exprs = transfer.permutation_map().getResults();
-    for (auto en : llvm::enumerate(exprs)) {
-      auto expr = en.value();
-      auto dim = expr.template dyn_cast<AffineDimExpr>();
-      // Sanity check.
-      assert(
-          (dim || expr.template cast<AffineConstantExpr>().getValue() == 0) &&
-          "Expected dim or 0 in permutationMap");
-      if (dim && memRefDim == dim.getPosition()) {
-        loopIndex = en.index();
-        break;
-      }
-    }
-
-    // We cannot distinguish atm between unrolled dimensions that implement
-    // the "always full" tile abstraction and need clipping from the other
-    // ones. So we conservatively clip everything.
-    auto N = view.ub(memRefDim);
-    auto i = memRefAccess[memRefDim];
-    if (loopIndex < 0) {
-      auto N_minus_1 = N - one;
-      auto select_1 = select(i < N, i, N_minus_1);
-      clippedScalarAccessExprs[memRefDim] = select(i < zero, zero, select_1);
-    } else {
-      auto ii = ivs[loopIndex];
-      auto i_plus_ii = i + ii;
-      auto N_minus_1 = N - one;
-      auto select_1 = select(i_plus_ii < N, i_plus_ii, N_minus_1);
-      clippedScalarAccessExprs[memRefDim] =
-          select(i_plus_ii < zero, zero, select_1);
-    }
-  }
-
-  return clippedScalarAccessExprs;
-}
-
-/// Lowers VectorTransferReadOp into a combination of:
-///   1. local memory allocation;
-///   2. perfect loop nest over:
-///      a. scalar load from local buffers (viewed as a scalar memref);
-///      a. scalar store to original memref (with clipping).
-///   3. vector_load from local buffer (viewed as a memref<1 x vector>);
-///   4. local memory deallocation.
-///
-/// Lowers the data transfer part of a VectorTransferReadOp while ensuring no
-/// out-of-bounds accesses are possible. Out-of-bounds behavior is handled by
-/// clipping. This means that a given value in memory can be read multiple
-/// times and concurrently.
-///
-/// Important notes about clipping and "full-tiles only" abstraction:
-/// =================================================================
-/// When using clipping for dealing with boundary conditions, the same edge
-/// value will appear multiple times (a.k.a edge padding). This is fine if the
-/// subsequent vector operations are all data-parallel but **is generally
-/// incorrect** in the presence of reductions or extract operations.
-///
-/// More generally, clipping is a scalar abstraction that is expected to work
-/// fine as a baseline for CPUs and GPUs but not for vector_load and DMAs.
-/// To deal with real vector_load and DMAs, a "padded allocation + view"
-/// abstraction with the ability to read out-of-memref-bounds (but still within
-/// the allocated region) is necessary.
-///
-/// Whether using scalar loops or vector_load/DMAs to perform the transfer,
-/// junk values will be materialized in the vectors and generally need to be
-/// filtered out and replaced by the "neutral element". This neutral element is
-/// op-dependent so, in the future, we expect to create a vector filter and
-/// apply it to a splatted constant vector with the proper neutral element at
-/// each ssa-use. This filtering is not necessary for pure data-parallel
-/// operations.
-///
-/// In the case of vector_store/DMAs, Read-Modify-Write will be required, which
-/// also have concurrency implications. Note that by using clipped scalar stores
-/// in the presence of data-parallel only operations, we generate code that
-/// writes the same value multiple time on the edge locations.
-///
-/// TODO(ntv): implement alternatives to clipping.
-/// TODO(ntv): support non-data-parallel operations.
-
-/// Performs the rewrite.
-template <>
-PatternMatchResult
-VectorTransferRewriter<VectorTransferReadOp>::matchAndRewrite(
-    Operation *op, PatternRewriter &rewriter) const {
-  using namespace mlir::edsc;
-  using namespace mlir::edsc::op;
-  using namespace mlir::edsc::intrinsics;
-  using IndexedValue =
-      TemplatedIndexedValue<intrinsics::std_load, intrinsics::std_store>;
-
-  VectorTransferReadOp transfer = cast<VectorTransferReadOp>(op);
-
-  // 1. Setup all the captures.
-  ScopedContext scope(rewriter, transfer.getLoc());
-  IndexedValue remote(transfer.memref());
-  MemRefView view(transfer.memref());
-  VectorView vectorView(transfer.vector());
-  SmallVector<IndexHandle, 8> ivs = makeIndexHandles(vectorView.rank());
-  SmallVector<ValueHandle *, 8> pivs =
-      makeIndexHandlePointers(MutableArrayRef<IndexHandle>(ivs));
-  coalesceCopy(transfer, &pivs, &vectorView);
-
-  auto lbs = vectorView.getLbs();
-  auto ubs = vectorView.getUbs();
-  SmallVector<ValueHandle, 8> steps;
-  steps.reserve(vectorView.getSteps().size());
-  for (auto step : vectorView.getSteps())
-    steps.push_back(constant_index(step));
-
-  // 2. Emit alloc-copy-load-dealloc.
-  ValueHandle tmp = alloc(tmpMemRefType(transfer));
-  IndexedValue local(tmp);
-  ValueHandle vec = vector_type_cast(tmp);
-  LoopNestBuilder(pivs, lbs, ubs, steps)([&] {
-    // Computes clippedScalarAccessExprs in the loop nest scope (ivs exist).
-    local(ivs) = remote(clip(transfer, view, ivs));
-  });
-  ValueHandle vectorValue = std_load(vec);
-  (dealloc(tmp)); // vexing parse
-
-  // 3. Propagate.
-  rewriter.replaceOp(op, vectorValue.getValue());
-  return matchSuccess();
-}
-
-/// Lowers VectorTransferWriteOp into a combination of:
-///   1. local memory allocation;
-///   2. vector_store to local buffer (viewed as a memref<1 x vector>);
-///   3. perfect loop nest over:
-///      a. scalar load from local buffers (viewed as a scalar memref);
-///      a. scalar store to original memref (with clipping).
-///   4. local memory deallocation.
-///
-/// More specifically, lowers the data transfer part while ensuring no
-/// out-of-bounds accesses are possible. Out-of-bounds behavior is handled by
-/// clipping. This means that a given value in memory can be written to multiple
-/// times and concurrently.
-///
-/// See `Important notes about clipping and full-tiles only abstraction` in the
-/// description of `readClipped` above.
-///
-/// TODO(ntv): implement alternatives to clipping.
-/// TODO(ntv): support non-data-parallel operations.
-template <>
-PatternMatchResult
-VectorTransferRewriter<VectorTransferWriteOp>::matchAndRewrite(
-    Operation *op, PatternRewriter &rewriter) const {
-  using namespace mlir::edsc;
-  using namespace mlir::edsc::op;
-  using namespace mlir::edsc::intrinsics;
-  using IndexedValue =
-      TemplatedIndexedValue<intrinsics::std_load, intrinsics::std_store>;
-
-  VectorTransferWriteOp transfer = cast<VectorTransferWriteOp>(op);
-
-  // 1. Setup all the captures.
-  ScopedContext scope(rewriter, transfer.getLoc());
-  IndexedValue remote(transfer.memref());
-  MemRefView view(transfer.memref());
-  ValueHandle vectorValue(transfer.vector());
-  VectorView vectorView(transfer.vector());
-  SmallVector<IndexHandle, 8> ivs = makeIndexHandles(vectorView.rank());
-  SmallVector<ValueHandle *, 8> pivs = makeIndexHandlePointers(ivs);
-  coalesceCopy(transfer, &pivs, &vectorView);
-
-  auto lbs = vectorView.getLbs();
-  auto ubs = vectorView.getUbs();
-  SmallVector<ValueHandle, 8> steps;
-  steps.reserve(vectorView.getSteps().size());
-  for (auto step : vectorView.getSteps())
-    steps.push_back(constant_index(step));
-
-  // 2. Emit alloc-store-copy-dealloc.
-  ValueHandle tmp = alloc(tmpMemRefType(transfer));
-  IndexedValue local(tmp);
-  ValueHandle vec = vector_type_cast(tmp);
-  std_store(vectorValue, vec);
-  LoopNestBuilder(pivs, lbs, ubs, steps)([&] {
-    // Computes clippedScalarAccessExprs in the loop nest scope (ivs exist).
-    remote(clip(transfer, view, ivs)) = local(ivs);
-  });
-  (dealloc(tmp)); // vexing parse...
-
-  rewriter.eraseOp(op);
-  return matchSuccess();
-}
-
-struct LowerVectorTransfersPass
-    : public FunctionPass<LowerVectorTransfersPass> {
-  void runOnFunction() override {
-    OwningRewritePatternList patterns;
-    auto *context = &getContext();
-    patterns.insert<VectorTransferRewriter<vector::VectorTransferReadOp>,
-                    VectorTransferRewriter<vector::VectorTransferWriteOp>>(
-        context);
-    applyPatternsGreedily(getFunction(), patterns);
-  }
-};
-
-} // end anonymous namespace
-
-std::unique_ptr<OpPassBase<FuncOp>> mlir::createLowerVectorTransfersPass() {
-  return std::make_unique<LowerVectorTransfersPass>();
-}
-
-static PassRegistration<LowerVectorTransfersPass>
-    pass("affine-lower-vector-transfers",
-         "Materializes vector transfer ops to a "
-         "proper abstraction for the hardware");
author	Nicolas Vasilache <ntv@google.com>	2019-11-14 15:39:36 -0800
committer	A. Unique TensorFlower <gardener@tensorflow.org>	2019-11-14 15:40:07 -0800
commit	0b271b7dfe285064b8b237d18bfc923212e7a77b (patch)
tree	3783613205ec9f87106a6fa7f730e22f97d91745 /mlir/lib/Transforms
parent	a78bd84cf84c00914f48781fa0c561cbb6bdf847 (diff)
download	bcm5719-llvm-0b271b7dfe285064b8b237d18bfc923212e7a77b.tar.gz bcm5719-llvm-0b271b7dfe285064b8b237d18bfc923212e7a77b.zip