diff options
author | Alex Zinenko <zinenko@google.com> | 2020-01-07 20:00:54 +0100 |
---|---|---|
committer | Alex Zinenko <zinenko@google.com> | 2020-01-09 10:06:00 +0100 |
commit | 08778d8c4fd8a6519c7f27bfa6b09c47262cb844 (patch) | |
tree | 195cfbe336a349d0406283006228c0f8ed4e4cbb /mlir/lib/Dialect/GPU/Transforms/MemoryPromotion.cpp | |
parent | e93e0d413f3afa1df5c5f88df546bebcd1183155 (diff) | |
download | bcm5719-llvm-08778d8c4fd8a6519c7f27bfa6b09c47262cb844.tar.gz bcm5719-llvm-08778d8c4fd8a6519c7f27bfa6b09c47262cb844.zip |
[mlir][GPU] introduce utilities for promotion to workgroup memory
Introduce a set of functions that promote a memref argument of a `gpu.func` to
workgroup memory using memory attribution. The promotion boils down to
additional loops performing the copy from the original argument to the
attributed memory in the beginning of the function, and back at the end of the
function using all available threads. The loop bounds are specified so as to
adapt to any size of the workgroup. These utilities are intended to compose
with other existing utilities (loop coalescing and tiling) in cases where the
distribution of work across threads is uneven, e.g. copying a 2D memref with
only the threads along the "x" dimension. Similarly, specialization of the
kernel to specific launch sizes should be implemented as a separate pass
combining constant propagation and canonicalization.
Introduce a simple attribute-driven pass to test the promotion transformation
since we don't have a heuristic at the moment.
Differential revision: https://reviews.llvm.org/D71904
Diffstat (limited to 'mlir/lib/Dialect/GPU/Transforms/MemoryPromotion.cpp')
-rw-r--r-- | mlir/lib/Dialect/GPU/Transforms/MemoryPromotion.cpp | 173 |
1 files changed, 173 insertions, 0 deletions
diff --git a/mlir/lib/Dialect/GPU/Transforms/MemoryPromotion.cpp b/mlir/lib/Dialect/GPU/Transforms/MemoryPromotion.cpp new file mode 100644 index 00000000000..f01a430a216 --- /dev/null +++ b/mlir/lib/Dialect/GPU/Transforms/MemoryPromotion.cpp @@ -0,0 +1,173 @@ +//===- MemoryPromotion.cpp - Utilities for moving data across GPU memories ===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements utilities that allow one to create IR moving the data +// across different levels of the GPU memory hierarchy. +// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/GPU/MemoryPromotion.h" +#include "mlir/Dialect/GPU/GPUDialect.h" +#include "mlir/Dialect/LoopOps/LoopOps.h" +#include "mlir/EDSC/Builders.h" +#include "mlir/EDSC/Helpers.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Support/Functional.h" +#include "mlir/Transforms/LoopUtils.h" + +using namespace mlir; +using namespace mlir::gpu; + +/// Returns the textual name of a GPU dimension. +static StringRef getDimName(unsigned dim) { + if (dim == 0) + return "x"; + if (dim == 1) + return "y"; + if (dim == 2) + return "z"; + + llvm_unreachable("dimension ID overflow"); +} + +/// Emits the (imperfect) loop nest performing the copy between "from" and "to" +/// values using the bounds derived from the "from" value. Emits at least +/// GPUDialect::getNumWorkgroupDimensions() loops, completing the nest with +/// single-iteration loops. Maps the innermost loops to thread dimensions, in +/// reverse order to enable access coalescing in the innermost loop. +static void insertCopyLoops(OpBuilder &builder, Location loc, + edsc::MemRefView &bounds, Value from, Value to) { + // Create EDSC handles for bounds. 
+ unsigned rank = bounds.rank(); + SmallVector<edsc::ValueHandle, 4> lbs, ubs, steps; + + // Make sure we have enough loops to use all thread dimensions, these trivial + // loops should be outermost and therefore inserted first. + if (rank < GPUDialect::getNumWorkgroupDimensions()) { + unsigned extraLoops = GPUDialect::getNumWorkgroupDimensions() - rank; + edsc::ValueHandle zero = edsc::intrinsics::constant_index(0); + edsc::ValueHandle one = edsc::intrinsics::constant_index(1); + lbs.resize(extraLoops, zero); + ubs.resize(extraLoops, one); + steps.resize(extraLoops, one); + } + + // Add existing bonuds. + lbs.append(bounds.getLbs().begin(), bounds.getLbs().end()); + ubs.append(bounds.getUbs().begin(), bounds.getUbs().end()); + + // Emit constant operations for steps. + steps.reserve(lbs.size()); + llvm::transform( + bounds.getSteps(), std::back_inserter(steps), + [](int64_t step) { return edsc::intrinsics::constant_index(step); }); + + // Obtain thread identifiers and block sizes, necessary to map to them. + auto indexType = builder.getIndexType(); + SmallVector<Value, 3> threadIds, blockDims; + for (unsigned i = 0; i < 3; ++i) { + auto dimName = builder.getStringAttr(getDimName(i)); + threadIds.push_back( + builder.create<gpu::ThreadIdOp>(loc, indexType, dimName)); + blockDims.push_back( + builder.create<gpu::BlockDimOp>(loc, indexType, dimName)); + } + + // Produce the loop nest with copies. + auto ivs = edsc::makeIndexHandles(lbs.size()); + auto ivPtrs = + edsc::makeHandlePointers(MutableArrayRef<edsc::IndexHandle>(ivs)); + edsc::LoopNestBuilder(ivPtrs, lbs, ubs, steps)([&]() { + auto activeIvs = llvm::makeArrayRef(ivs).take_back(rank); + edsc::StdIndexedValue fromHandle(from), toHandle(to); + toHandle(activeIvs) = fromHandle(activeIvs); + }); + + // Map the innermost loops to threads in reverse order. 
+ for (auto en : + llvm::enumerate(llvm::reverse(llvm::makeArrayRef(ivs).take_back( + GPUDialect::getNumWorkgroupDimensions())))) { + auto loop = cast<loop::ForOp>( + en.value().getValue().getParentRegion()->getParentOp()); + mapLoopToProcessorIds(loop, {threadIds[en.index()]}, + {blockDims[en.index()]}); + } +} + +/// Emits the loop nests performing the copy to the designated location in the +/// beginning of the region, and from the designated location immediately before +/// the terminator of the first block of the region. The region is expected to +/// have one block. This boils down to the following structure +/// +/// ^bb(...): +/// <loop-bound-computation> +/// for %arg0 = ... to ... step ... { +/// ... +/// for %argN = <thread-id-x> to ... step <block-dim-x> { +/// %0 = load %from[%arg0, ..., %argN] +/// store %0, %to[%arg0, ..., %argN] +/// } +/// ... +/// } +/// gpu.barrier +/// <... original body ...> +/// gpu.barrier +/// for %arg0 = ... to ... step ... { +/// ... +/// for %argN = <thread-id-x> to ... step <block-dim-x> { +/// %1 = load %to[%arg0, ..., %argN] +/// store %1, %from[%arg0, ..., %argN] +/// } +/// ... +/// } +/// +/// Inserts the barriers unconditionally since different threads may be copying +/// values and reading them. An analysis would be required to eliminate barriers +/// in case where value is only used by the thread that copies it. Both copies +/// are inserted unconditionally, an analysis would be required to only copy +/// live-in and live-out values when necessary. This copies the entire memref +/// pointed to by "from". In case a smaller block would be sufficient, the +/// caller can create a subview of the memref and promote it instead. 
+static void insertCopies(Region ®ion, Location loc, Value from, Value to) { + auto fromType = from.getType().cast<MemRefType>(); + auto toType = to.getType().cast<MemRefType>(); + (void)fromType; + (void)toType; + assert(fromType.getShape() == toType.getShape()); + assert(fromType.getRank() != 0); + assert(has_single_element(region) && + "unstructured control flow not supported"); + + OpBuilder builder(region.getContext()); + builder.setInsertionPointToStart(®ion.front()); + + edsc::ScopedContext edscContext(builder, loc); + edsc::MemRefView fromView(from); + insertCopyLoops(builder, loc, fromView, from, to); + builder.create<gpu::BarrierOp>(loc); + + builder.setInsertionPoint(®ion.front().back()); + builder.create<gpu::BarrierOp>(loc); + insertCopyLoops(builder, loc, fromView, to, from); +} + +/// Promotes a function argument to workgroup memory in the given function. The +/// copies will be inserted in the beginning and in the end of the function. +void mlir::promoteToWorkgroupMemory(GPUFuncOp op, unsigned arg) { + Value value = op.getArgument(arg); + auto type = value.getType().dyn_cast<MemRefType>(); + assert(type && type.hasStaticShape() && "can only promote memrefs"); + + Value attribution = + op.addWorkgroupAttribution(type.getShape(), type.getElementType()); + + // Replace the uses first since only the original uses are currently present. + // Then insert the copies. + value.replaceAllUsesWith(attribution); + insertCopies(op.getBody(), op.getLoc(), value, attribution); +} |