diff options
| author | Nicolas Vasilache <ntv@google.com> | 2019-11-14 15:39:36 -0800 |
|---|---|---|
| committer | A. Unique TensorFlower <gardener@tensorflow.org> | 2019-11-14 15:40:07 -0800 |
| commit | 0b271b7dfe285064b8b237d18bfc923212e7a77b (patch) | |
| tree | 3783613205ec9f87106a6fa7f730e22f97d91745 /mlir/lib/Transforms | |
| parent | a78bd84cf84c00914f48781fa0c561cbb6bdf847 (diff) | |
| download | bcm5719-llvm-0b271b7dfe285064b8b237d18bfc923212e7a77b.tar.gz bcm5719-llvm-0b271b7dfe285064b8b237d18bfc923212e7a77b.zip | |
Refactor the LowerVectorTransfers pass to use the RewritePattern infra - NFC
This is step 1/n in refactoring infrastructure along the Vector dialect to make it ready for retargetability and composable progressive lowering.
PiperOrigin-RevId: 280529784
Diffstat (limited to 'mlir/lib/Transforms')
| -rw-r--r-- | mlir/lib/Transforms/CMakeLists.txt | 1 | ||||
| -rw-r--r-- | mlir/lib/Transforms/LowerVectorTransfers.cpp | 384 |
2 files changed, 0 insertions, 385 deletions
diff --git a/mlir/lib/Transforms/CMakeLists.txt b/mlir/lib/Transforms/CMakeLists.txt index de7801bf215..304e0547edb 100644 --- a/mlir/lib/Transforms/CMakeLists.txt +++ b/mlir/lib/Transforms/CMakeLists.txt @@ -13,7 +13,6 @@ add_llvm_library(MLIRTransforms LoopTiling.cpp LoopUnrollAndJam.cpp LoopUnroll.cpp - LowerVectorTransfers.cpp MaterializeVectors.cpp MemRefDataFlowOpt.cpp PipelineDataTransfer.cpp diff --git a/mlir/lib/Transforms/LowerVectorTransfers.cpp b/mlir/lib/Transforms/LowerVectorTransfers.cpp deleted file mode 100644 index 57dd18dac0f..00000000000 --- a/mlir/lib/Transforms/LowerVectorTransfers.cpp +++ /dev/null @@ -1,384 +0,0 @@ -//===- LowerVectorTransfers.cpp - LowerVectorTransfers Pass Impl ----------===// -// -// Copyright 2019 The MLIR Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ============================================================================= -// -// This file implements target-dependent lowering of vector transfer operations. -// -//===----------------------------------------------------------------------===// - -#include <type_traits> - -#include "mlir/Analysis/AffineAnalysis.h" -#include "mlir/Analysis/NestedMatcher.h" -#include "mlir/Analysis/Utils.h" -#include "mlir/Analysis/VectorAnalysis.h" -#include "mlir/Dialect/LoopOps/LoopOps.h" -#include "mlir/Dialect/StandardOps/Ops.h" -#include "mlir/Dialect/VectorOps/VectorOps.h" -#include "mlir/EDSC/Builders.h" -#include "mlir/EDSC/Helpers.h" -#include "mlir/IR/AffineExpr.h" -#include "mlir/IR/AffineMap.h" -#include "mlir/IR/Attributes.h" -#include "mlir/IR/Builders.h" -#include "mlir/IR/Location.h" -#include "mlir/IR/Matchers.h" -#include "mlir/IR/OperationSupport.h" -#include "mlir/IR/PatternMatch.h" -#include "mlir/IR/Types.h" -#include "mlir/Pass/Pass.h" -#include "mlir/Support/Functional.h" -#include "mlir/Transforms/Passes.h" - -/// Implements lowering of VectorTransferReadOp and VectorTransferWriteOp to a -/// proper abstraction for the hardware. -/// -/// For now, we only emit a simple loop nest that performs clipped pointwise -/// copies from a remote to a locally allocated memory. -/// -/// Consider the case: -/// -/// ```mlir {.mlir} -/// // Read the slice `%A[%i0, %i1:%i1+256, %i2:%i2+32]` into -/// // vector<32x256xf32> and pad with %f0 to handle the boundary case: -/// %f0 = constant 0.0f : f32 -/// loop.for %i0 = 0 to %0 { -/// loop.for %i1 = 0 to %1 step %c256 { -/// loop.for %i2 = 0 to %2 step %c32 { -/// %v = vector.transfer_read %A[%i0, %i1, %i2], (%f0) -/// {permutation_map: (d0, d1, d2) -> (d2, d1)} : -/// memref<?x?x?xf32>, vector<32x256xf32> -/// }}} -/// ``` -/// -/// The rewriters construct loop and indices that access MemRef A in a pattern -/// resembling the following (while guaranteeing an always full-tile -/// abstraction): -/// -/// ```mlir {.mlir} -/// loop.for %d2 = 0 to %c256 { -/// loop.for %d1 = 0 to %c32 { -/// %s = %A[%i0, %i1 + %d1, %i2 + %d2] : f32 -/// %tmp[%d2, %d1] = %s -/// } -/// } -/// ``` -/// -/// In the current state, only a clipping transfer is implemented by `clip`, -/// which creates individual indexing expressions of the form: -/// -/// ```mlir-dsc -/// SELECT(i + ii < zero, zero, SELECT(i + ii < N, i + ii, N - one)) -/// ``` - -using namespace mlir; -using vector::VectorTransferReadOp; -using vector::VectorTransferWriteOp; - -#define DEBUG_TYPE "affine-lower-vector-transfers" - -namespace { - -/// Lowers VectorTransferOp into a combination of: -/// 1. local memory allocation; -/// 2. perfect loop nest over: -/// a. scalar load/stores from local buffers (viewed as a scalar memref); -/// a. scalar store/load to original memref (with clipping). -/// 3. vector_load/store -/// 4. local memory deallocation. -/// Minor variations occur depending on whether a VectorTransferReadOp or -/// a VectorTransferWriteOp is rewritten. -template <typename VectorTransferOpTy> -struct VectorTransferRewriter : public RewritePattern { - explicit VectorTransferRewriter(MLIRContext *context) - : RewritePattern(VectorTransferOpTy::getOperationName(), 1, context) {} - - /// Used for staging the transfer in a local scalar buffer. - MemRefType tmpMemRefType(VectorTransferOpTy transfer) const { - auto vectorType = transfer.getVectorType(); - return MemRefType::get(vectorType.getShape(), vectorType.getElementType(), - {}, 0); - } - - /// Performs the rewrite. - PatternMatchResult matchAndRewrite(Operation *op, - PatternRewriter &rewriter) const override; -}; - -/// Analyzes the `transfer` to find an access dimension along the fastest remote -/// MemRef dimension. If such a dimension with coalescing properties is found, -/// `pivs` and `vectorView` are swapped so that the invocation of -/// LoopNestBuilder captures it in the innermost loop. -template <typename VectorTransferOpTy> -void coalesceCopy(VectorTransferOpTy transfer, - SmallVectorImpl<edsc::ValueHandle *> *pivs, - edsc::VectorView *vectorView) { - // rank of the remote memory access, coalescing behavior occurs on the - // innermost memory dimension. - auto remoteRank = transfer.getMemRefType().getRank(); - // Iterate over the results expressions of the permutation map to determine - // the loop order for creating pointwise copies between remote and local - // memories. - int coalescedIdx = -1; - auto exprs = transfer.permutation_map().getResults(); - for (auto en : llvm::enumerate(exprs)) { - auto dim = en.value().template dyn_cast<AffineDimExpr>(); - if (!dim) { - continue; - } - auto memRefDim = dim.getPosition(); - if (memRefDim == remoteRank - 1) { - // memRefDim has coalescing properties, it should be swapped in the last - // position. - assert(coalescedIdx == -1 && "Unexpected > 1 coalesced indices"); - coalescedIdx = en.index(); - } - } - if (coalescedIdx >= 0) { - std::swap(pivs->back(), (*pivs)[coalescedIdx]); - vectorView->swapRanges(pivs->size() - 1, coalescedIdx); - } -} - -/// Emits remote memory accesses that are clipped to the boundaries of the -/// MemRef. -template <typename VectorTransferOpTy> -llvm::SmallVector<edsc::ValueHandle, 8> clip(VectorTransferOpTy transfer, - edsc::MemRefView &view, - ArrayRef<edsc::IndexHandle> ivs) { - using namespace mlir::edsc; - using namespace edsc::op; - using edsc::intrinsics::select; - - IndexHandle zero(index_t(0)), one(index_t(1)); - llvm::SmallVector<edsc::ValueHandle, 8> memRefAccess(transfer.indices()); - llvm::SmallVector<edsc::ValueHandle, 8> clippedScalarAccessExprs( - memRefAccess.size(), edsc::IndexHandle()); - - // Indices accessing to remote memory are clipped and their expressions are - // returned in clippedScalarAccessExprs. - for (unsigned memRefDim = 0; memRefDim < clippedScalarAccessExprs.size(); - ++memRefDim) { - // Linear search on a small number of entries. - int loopIndex = -1; - auto exprs = transfer.permutation_map().getResults(); - for (auto en : llvm::enumerate(exprs)) { - auto expr = en.value(); - auto dim = expr.template dyn_cast<AffineDimExpr>(); - // Sanity check. - assert( - (dim || expr.template cast<AffineConstantExpr>().getValue() == 0) && - "Expected dim or 0 in permutationMap"); - if (dim && memRefDim == dim.getPosition()) { - loopIndex = en.index(); - break; - } - } - - // We cannot distinguish atm between unrolled dimensions that implement - // the "always full" tile abstraction and need clipping from the other - // ones. So we conservatively clip everything. - auto N = view.ub(memRefDim); - auto i = memRefAccess[memRefDim]; - if (loopIndex < 0) { - auto N_minus_1 = N - one; - auto select_1 = select(i < N, i, N_minus_1); - clippedScalarAccessExprs[memRefDim] = select(i < zero, zero, select_1); - } else { - auto ii = ivs[loopIndex]; - auto i_plus_ii = i + ii; - auto N_minus_1 = N - one; - auto select_1 = select(i_plus_ii < N, i_plus_ii, N_minus_1); - clippedScalarAccessExprs[memRefDim] = - select(i_plus_ii < zero, zero, select_1); - } - } - - return clippedScalarAccessExprs; -} - -/// Lowers VectorTransferReadOp into a combination of: -/// 1. local memory allocation; -/// 2. perfect loop nest over: -/// a. scalar load from local buffers (viewed as a scalar memref); -/// a. scalar store to original memref (with clipping). -/// 3. vector_load from local buffer (viewed as a memref<1 x vector>); -/// 4. local memory deallocation. -/// -/// Lowers the data transfer part of a VectorTransferReadOp while ensuring no -/// out-of-bounds accesses are possible. Out-of-bounds behavior is handled by -/// clipping. This means that a given value in memory can be read multiple -/// times and concurrently. -/// -/// Important notes about clipping and "full-tiles only" abstraction: -/// ================================================================= -/// When using clipping for dealing with boundary conditions, the same edge -/// value will appear multiple times (a.k.a edge padding). This is fine if the -/// subsequent vector operations are all data-parallel but **is generally -/// incorrect** in the presence of reductions or extract operations. -/// -/// More generally, clipping is a scalar abstraction that is expected to work -/// fine as a baseline for CPUs and GPUs but not for vector_load and DMAs. -/// To deal with real vector_load and DMAs, a "padded allocation + view" -/// abstraction with the ability to read out-of-memref-bounds (but still within -/// the allocated region) is necessary. -/// -/// Whether using scalar loops or vector_load/DMAs to perform the transfer, -/// junk values will be materialized in the vectors and generally need to be -/// filtered out and replaced by the "neutral element". This neutral element is -/// op-dependent so, in the future, we expect to create a vector filter and -/// apply it to a splatted constant vector with the proper neutral element at -/// each ssa-use. This filtering is not necessary for pure data-parallel -/// operations. -/// -/// In the case of vector_store/DMAs, Read-Modify-Write will be required, which -/// also have concurrency implications. Note that by using clipped scalar stores -/// in the presence of data-parallel only operations, we generate code that -/// writes the same value multiple time on the edge locations. -/// -/// TODO(ntv): implement alternatives to clipping. -/// TODO(ntv): support non-data-parallel operations. - -/// Performs the rewrite. -template <> -PatternMatchResult -VectorTransferRewriter<VectorTransferReadOp>::matchAndRewrite( - Operation *op, PatternRewriter &rewriter) const { - using namespace mlir::edsc; - using namespace mlir::edsc::op; - using namespace mlir::edsc::intrinsics; - using IndexedValue = - TemplatedIndexedValue<intrinsics::std_load, intrinsics::std_store>; - - VectorTransferReadOp transfer = cast<VectorTransferReadOp>(op); - - // 1. Setup all the captures. - ScopedContext scope(rewriter, transfer.getLoc()); - IndexedValue remote(transfer.memref()); - MemRefView view(transfer.memref()); - VectorView vectorView(transfer.vector()); - SmallVector<IndexHandle, 8> ivs = makeIndexHandles(vectorView.rank()); - SmallVector<ValueHandle *, 8> pivs = - makeIndexHandlePointers(MutableArrayRef<IndexHandle>(ivs)); - coalesceCopy(transfer, &pivs, &vectorView); - - auto lbs = vectorView.getLbs(); - auto ubs = vectorView.getUbs(); - SmallVector<ValueHandle, 8> steps; - steps.reserve(vectorView.getSteps().size()); - for (auto step : vectorView.getSteps()) - steps.push_back(constant_index(step)); - - // 2. Emit alloc-copy-load-dealloc. - ValueHandle tmp = alloc(tmpMemRefType(transfer)); - IndexedValue local(tmp); - ValueHandle vec = vector_type_cast(tmp); - LoopNestBuilder(pivs, lbs, ubs, steps)([&] { - // Computes clippedScalarAccessExprs in the loop nest scope (ivs exist). - local(ivs) = remote(clip(transfer, view, ivs)); - }); - ValueHandle vectorValue = std_load(vec); - (dealloc(tmp)); // vexing parse - - // 3. Propagate. - rewriter.replaceOp(op, vectorValue.getValue()); - return matchSuccess(); -} - -/// Lowers VectorTransferWriteOp into a combination of: -/// 1. local memory allocation; -/// 2. vector_store to local buffer (viewed as a memref<1 x vector>); -/// 3. perfect loop nest over: -/// a. scalar load from local buffers (viewed as a scalar memref); -/// a. scalar store to original memref (with clipping). -/// 4. local memory deallocation. -/// -/// More specifically, lowers the data transfer part while ensuring no -/// out-of-bounds accesses are possible. Out-of-bounds behavior is handled by -/// clipping. This means that a given value in memory can be written to multiple -/// times and concurrently. -/// -/// See `Important notes about clipping and full-tiles only abstraction` in the -/// description of `readClipped` above. -/// -/// TODO(ntv): implement alternatives to clipping. -/// TODO(ntv): support non-data-parallel operations. -template <> -PatternMatchResult -VectorTransferRewriter<VectorTransferWriteOp>::matchAndRewrite( - Operation *op, PatternRewriter &rewriter) const { - using namespace mlir::edsc; - using namespace mlir::edsc::op; - using namespace mlir::edsc::intrinsics; - using IndexedValue = - TemplatedIndexedValue<intrinsics::std_load, intrinsics::std_store>; - - VectorTransferWriteOp transfer = cast<VectorTransferWriteOp>(op); - - // 1. Setup all the captures. - ScopedContext scope(rewriter, transfer.getLoc()); - IndexedValue remote(transfer.memref()); - MemRefView view(transfer.memref()); - ValueHandle vectorValue(transfer.vector()); - VectorView vectorView(transfer.vector()); - SmallVector<IndexHandle, 8> ivs = makeIndexHandles(vectorView.rank()); - SmallVector<ValueHandle *, 8> pivs = makeIndexHandlePointers(ivs); - coalesceCopy(transfer, &pivs, &vectorView); - - auto lbs = vectorView.getLbs(); - auto ubs = vectorView.getUbs(); - SmallVector<ValueHandle, 8> steps; - steps.reserve(vectorView.getSteps().size()); - for (auto step : vectorView.getSteps()) - steps.push_back(constant_index(step)); - - // 2. Emit alloc-store-copy-dealloc. - ValueHandle tmp = alloc(tmpMemRefType(transfer)); - IndexedValue local(tmp); - ValueHandle vec = vector_type_cast(tmp); - std_store(vectorValue, vec); - LoopNestBuilder(pivs, lbs, ubs, steps)([&] { - // Computes clippedScalarAccessExprs in the loop nest scope (ivs exist). - remote(clip(transfer, view, ivs)) = local(ivs); - }); - (dealloc(tmp)); // vexing parse... - - rewriter.eraseOp(op); - return matchSuccess(); -} - -struct LowerVectorTransfersPass - : public FunctionPass<LowerVectorTransfersPass> { - void runOnFunction() override { - OwningRewritePatternList patterns; - auto *context = &getContext(); - patterns.insert<VectorTransferRewriter<vector::VectorTransferReadOp>, - VectorTransferRewriter<vector::VectorTransferWriteOp>>( - context); - applyPatternsGreedily(getFunction(), patterns); - } -}; - -} // end anonymous namespace - -std::unique_ptr<OpPassBase<FuncOp>> mlir::createLowerVectorTransfersPass() { - return std::make_unique<LowerVectorTransfersPass>(); -} - -static PassRegistration<LowerVectorTransfersPass> - pass("affine-lower-vector-transfers", - "Materializes vector transfer ops to a " - "proper abstraction for the hardware"); |

