Diffstat (limited to 'mlir/lib/Dialect/GPU/Transforms/MemoryPromotion.cpp')
-rw-r--r--  mlir/lib/Dialect/GPU/Transforms/MemoryPromotion.cpp  173
1 file changed, 173 insertions(+), 0 deletions(-)
diff --git a/mlir/lib/Dialect/GPU/Transforms/MemoryPromotion.cpp b/mlir/lib/Dialect/GPU/Transforms/MemoryPromotion.cpp
new file mode 100644
index 00000000000..f01a430a216
--- /dev/null
+++ b/mlir/lib/Dialect/GPU/Transforms/MemoryPromotion.cpp
@@ -0,0 +1,173 @@
+//===- MemoryPromotion.cpp - Utilities for moving data across GPU memories ===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements utilities that allow one to create IR moving the data
+// across different levels of the GPU memory hierarchy.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/GPU/MemoryPromotion.h"
+#include "mlir/Dialect/GPU/GPUDialect.h"
+#include "mlir/Dialect/LoopOps/LoopOps.h"
+#include "mlir/EDSC/Builders.h"
+#include "mlir/EDSC/Helpers.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Support/Functional.h"
+#include "mlir/Transforms/LoopUtils.h"
+
+using namespace mlir;
+using namespace mlir::gpu;
+
+/// Returns the textual name of a GPU dimension.
+static StringRef getDimName(unsigned dim) {
+  if (dim == 0)
+    return "x";
+  if (dim == 1)
+    return "y";
+  if (dim == 2)
+    return "z";
+
+  llvm_unreachable("dimension ID overflow");
+}
+
+/// Emits the (imperfect) loop nest performing the copy between "from" and "to"
+/// values using the bounds derived from the "from" value. Emits at least
+/// GPUDialect::getNumWorkgroupDimensions() loops, completing the nest with
+/// single-iteration loops. Maps the innermost loops to thread dimensions, in
+/// reverse order to enable access coalescing in the innermost loop.
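+///
+/// For illustration (an assumed rank-1 example, shown before the thread
+/// mapping is applied): copying a memref with a single dimension of upper
+/// bound %n yields two leading single-iteration loops so that all three
+/// thread dimensions can later be mapped, roughly:
+///
+///   for %i = 0 to 1 step 1 {
+///     for %j = 0 to 1 step 1 {
+///       for %k = 0 to %n step 1 {
+///         %0 = load %from[%k]
+///         store %0, %to[%k]
+///       }
+///     }
+///   }
+///
+/// mapLoopToProcessorIds then rewrites each of these loops to start at the
+/// corresponding thread id and step by the block dimension, as shown in the
+/// comment on insertCopies below.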
+static void insertCopyLoops(OpBuilder &builder, Location loc,
+                            edsc::MemRefView &bounds, Value from, Value to) {
+  // Create EDSC handles for bounds.
+  unsigned rank = bounds.rank();
+  SmallVector<edsc::ValueHandle, 4> lbs, ubs, steps;
+
+  // Make sure we have enough loops to use all thread dimensions; these
+  // trivial loops should be outermost and therefore inserted first.
+  if (rank < GPUDialect::getNumWorkgroupDimensions()) {
+    unsigned extraLoops = GPUDialect::getNumWorkgroupDimensions() - rank;
+    edsc::ValueHandle zero = edsc::intrinsics::constant_index(0);
+    edsc::ValueHandle one = edsc::intrinsics::constant_index(1);
+    lbs.resize(extraLoops, zero);
+    ubs.resize(extraLoops, one);
+    steps.resize(extraLoops, one);
+  }
+
+  // Add the existing bounds.
+  lbs.append(bounds.getLbs().begin(), bounds.getLbs().end());
+  ubs.append(bounds.getUbs().begin(), bounds.getUbs().end());
+
+  // Emit constant operations for steps.
+  steps.reserve(lbs.size());
+  llvm::transform(
+      bounds.getSteps(), std::back_inserter(steps),
+      [](int64_t step) { return edsc::intrinsics::constant_index(step); });
+
+  // Obtain thread identifiers and block sizes, necessary to map to them.
+  auto indexType = builder.getIndexType();
+  SmallVector<Value, 3> threadIds, blockDims;
+  for (unsigned i = 0; i < 3; ++i) {
+    auto dimName = builder.getStringAttr(getDimName(i));
+    threadIds.push_back(
+        builder.create<gpu::ThreadIdOp>(loc, indexType, dimName));
+    blockDims.push_back(
+        builder.create<gpu::BlockDimOp>(loc, indexType, dimName));
+  }
+
+  // Produce the loop nest with copies.
+  auto ivs = edsc::makeIndexHandles(lbs.size());
+  auto ivPtrs =
+      edsc::makeHandlePointers(MutableArrayRef<edsc::IndexHandle>(ivs));
+  edsc::LoopNestBuilder(ivPtrs, lbs, ubs, steps)([&]() {
+    auto activeIvs = llvm::makeArrayRef(ivs).take_back(rank);
+    edsc::StdIndexedValue fromHandle(from), toHandle(to);
+    toHandle(activeIvs) = fromHandle(activeIvs);
+  });
+
+  // Map the innermost loops to threads in reverse order.
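+  // Reversing pairs the innermost loop, which indexes the fastest-varying
+  // memref dimension, with thread dimension "x", so that consecutive threads
+  // touch consecutive memory locations and their accesses can be coalesced.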
+  for (auto en :
+       llvm::enumerate(llvm::reverse(llvm::makeArrayRef(ivs).take_back(
+           GPUDialect::getNumWorkgroupDimensions())))) {
+    auto loop = cast<loop::ForOp>(
+        en.value().getValue().getParentRegion()->getParentOp());
+    mapLoopToProcessorIds(loop, {threadIds[en.index()]},
+                          {blockDims[en.index()]});
+  }
+}
+
+/// Emits the loop nests performing the copy to the designated location at the
+/// beginning of the region, and from the designated location immediately
+/// before the terminator of the first block of the region. The region is
+/// expected to have one block. This boils down to the following structure:
+///
+///   ^bb(...):
+///     <loop-bound-computation>
+///     for %arg0 = ... to ... step ... {
+///       ...
+///       for %argN = <thread-id-x> to ... step <block-dim-x> {
+///         %0 = load %from[%arg0, ..., %argN]
+///         store %0, %to[%arg0, ..., %argN]
+///       }
+///       ...
+///     }
+///     gpu.barrier
+///     <... original body ...>
+///     gpu.barrier
+///     for %arg0 = ... to ... step ... {
+///       ...
+///       for %argN = <thread-id-x> to ... step <block-dim-x> {
+///         %1 = load %to[%arg0, ..., %argN]
+///         store %1, %from[%arg0, ..., %argN]
+///       }
+///       ...
+///     }
+///
+/// Inserts the barriers unconditionally since different threads may be copying
+/// values and reading them. An analysis would be required to eliminate
+/// barriers in cases where a value is only used by the thread that copies it.
+/// Both copies are inserted unconditionally; an analysis would be required to
+/// copy only live-in and live-out values when necessary. This copies the
+/// entire memref pointed to by "from". If a smaller block would be sufficient,
+/// the caller can create a subview of the memref and promote that instead.
+static void insertCopies(Region &region, Location loc, Value from, Value to) {
+  auto fromType = from.getType().cast<MemRefType>();
+  auto toType = to.getType().cast<MemRefType>();
+  (void)fromType;
+  (void)toType;
+  assert(fromType.getShape() == toType.getShape());
+  assert(fromType.getRank() != 0);
+  assert(has_single_element(region) &&
+         "unstructured control flow not supported");
+
+  OpBuilder builder(region.getContext());
+  builder.setInsertionPointToStart(&region.front());
+
+  edsc::ScopedContext edscContext(builder, loc);
+  edsc::MemRefView fromView(from);
+  insertCopyLoops(builder, loc, fromView, from, to);
+  builder.create<gpu::BarrierOp>(loc);
+
+  builder.setInsertionPoint(&region.front().back());
+  builder.create<gpu::BarrierOp>(loc);
+  insertCopyLoops(builder, loc, fromView, to, from);
+}
+
+/// Promotes a function argument to workgroup memory in the given function. The
+/// copies will be inserted at the beginning and at the end of the function.
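+///
+/// For illustration (a hypothetical driver, not provided here), a pass could
+/// promote the first statically shaped memref argument of every GPU function
+/// nested under an assumed ModuleOp `module` along these lines:
+///
+///   module.walk([](gpu::GPUFuncOp func) {
+///     if (func.getNumArguments() == 0)
+///       return;
+///     auto type = func.getArgument(0).getType().dyn_cast<MemRefType>();
+///     if (type && type.hasStaticShape())
+///       promoteToWorkgroupMemory(func, /*arg=*/0);
+///   });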
+void mlir::promoteToWorkgroupMemory(GPUFuncOp op, unsigned arg) {
+  Value value = op.getArgument(arg);
+  auto type = value.getType().dyn_cast<MemRefType>();
+  assert(type && type.hasStaticShape() && "can only promote memrefs");
+
+  Value attribution =
+      op.addWorkgroupAttribution(type.getShape(), type.getElementType());
+
+  // Replace the uses first since only the original uses are currently present.
+  // Then insert the copies.
+  value.replaceAllUsesWith(attribution);
+  insertCopies(op.getBody(), op.getLoc(), value, attribution);
+}