Diffstat (limited to 'mlir/lib/Dialect/GPU/Transforms/MemoryPromotion.cpp')
-rw-r--r--  mlir/lib/Dialect/GPU/Transforms/MemoryPromotion.cpp  173
1 file changed, 173 insertions(+), 0 deletions(-)
diff --git a/mlir/lib/Dialect/GPU/Transforms/MemoryPromotion.cpp b/mlir/lib/Dialect/GPU/Transforms/MemoryPromotion.cpp
new file mode 100644
index 00000000000..f01a430a216
--- /dev/null
+++ b/mlir/lib/Dialect/GPU/Transforms/MemoryPromotion.cpp
@@ -0,0 +1,173 @@
+//===- MemoryPromotion.cpp - Utilities for moving data across GPU memories ===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements utilities that allow one to create IR moving the data
+// across different levels of the GPU memory hierarchy.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/GPU/MemoryPromotion.h"
+#include "mlir/Dialect/GPU/GPUDialect.h"
+#include "mlir/Dialect/LoopOps/LoopOps.h"
+#include "mlir/EDSC/Builders.h"
+#include "mlir/EDSC/Helpers.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Support/Functional.h"
+#include "mlir/Transforms/LoopUtils.h"
+
+using namespace mlir;
+using namespace mlir::gpu;
+
+/// Returns the textual name of a GPU dimension.
+static StringRef getDimName(unsigned dim) {
+  if (dim == 0)
+    return "x";
+  if (dim == 1)
+    return "y";
+  if (dim == 2)
+    return "z";
+
+  llvm_unreachable("dimension ID overflow");
+}
+
+/// Emits the (imperfect) loop nest performing the copy between "from" and "to"
+/// values using the bounds derived from the "from" value. Emits at least
+/// GPUDialect::getNumWorkgroupDimensions() loops, completing the nest with
+/// single-iteration loops. Maps the innermost loops to thread dimensions, in
+/// reverse order to enable access coalescing in the innermost loop.
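+///
+/// For illustration (an assumed rank-1 example, shown before the thread
+/// mapping is applied): copying a memref with a single dimension of upper
+/// bound %n yields two leading single-iteration loops so that all three
+/// thread dimensions can later be mapped, roughly:
+///
+///   for %i = 0 to 1 step 1 {
+///     for %j = 0 to 1 step 1 {
+///       for %k = 0 to %n step 1 {
+///         %0 = load %from[%k]
+///         store %0, %to[%k]
+///       }
+///     }
+///   }
+///
+/// mapLoopToProcessorIds then rewrites each of these loops to start at the
+/// corresponding thread id and step by the block dimension, as shown in the
+/// comment on insertCopies below.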
+static void insertCopyLoops(OpBuilder &builder, Location loc,
+                            edsc::MemRefView &bounds, Value from, Value to) {
+  // Create EDSC handles for bounds.
+  unsigned rank = bounds.rank();
+  SmallVector<edsc::ValueHandle, 4> lbs, ubs, steps;
+
+  // Make sure we have enough loops to use all thread dimensions; these
+  // trivial loops should be outermost and therefore inserted first.
+  if (rank < GPUDialect::getNumWorkgroupDimensions()) {
+    unsigned extraLoops = GPUDialect::getNumWorkgroupDimensions() - rank;
+    edsc::ValueHandle zero = edsc::intrinsics::constant_index(0);
+    edsc::ValueHandle one = edsc::intrinsics::constant_index(1);
+    lbs.resize(extraLoops, zero);
+    ubs.resize(extraLoops, one);
+    steps.resize(extraLoops, one);
+  }
+
+  // Add the existing bounds.
+  lbs.append(bounds.getLbs().begin(), bounds.getLbs().end());
+  ubs.append(bounds.getUbs().begin(), bounds.getUbs().end());
+
+  // Emit constant operations for steps.
+  steps.reserve(lbs.size());
+  llvm::transform(
+      bounds.getSteps(), std::back_inserter(steps),
+      [](int64_t step) { return edsc::intrinsics::constant_index(step); });
+
+  // Obtain thread identifiers and block sizes, necessary to map to them.
+  auto indexType = builder.getIndexType();
+  SmallVector<Value, 3> threadIds, blockDims;
+  for (unsigned i = 0; i < 3; ++i) {
+    auto dimName = builder.getStringAttr(getDimName(i));
+    threadIds.push_back(
+        builder.create<gpu::ThreadIdOp>(loc, indexType, dimName));
+    blockDims.push_back(
+        builder.create<gpu::BlockDimOp>(loc, indexType, dimName));
+  }
+
+  // Produce the loop nest with copies.
+  auto ivs = edsc::makeIndexHandles(lbs.size());
+  auto ivPtrs =
+      edsc::makeHandlePointers(MutableArrayRef<edsc::IndexHandle>(ivs));
+  edsc::LoopNestBuilder(ivPtrs, lbs, ubs, steps)([&]() {
+    auto activeIvs = llvm::makeArrayRef(ivs).take_back(rank);
+    edsc::StdIndexedValue fromHandle(from), toHandle(to);
+    toHandle(activeIvs) = fromHandle(activeIvs);
+  });
+
+  // Map the innermost loops to threads in reverse order.
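+  // Reversing pairs the innermost loop, which indexes the fastest-varying
+  // memref dimension, with thread dimension "x", so that consecutive threads
+  // touch consecutive memory locations and their accesses can be coalesced.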
+  for (auto en :
+       llvm::enumerate(llvm::reverse(llvm::makeArrayRef(ivs).take_back(
+           GPUDialect::getNumWorkgroupDimensions())))) {
+    auto loop = cast<loop::ForOp>(
+        en.value().getValue().getParentRegion()->getParentOp());
+    mapLoopToProcessorIds(loop, {threadIds[en.index()]},
+                          {blockDims[en.index()]});
+  }
+}
+
+/// Emits the loop nests performing the copy to the designated location at the
+/// beginning of the region, and from the designated location immediately
+/// before the terminator of the first block of the region. The region is
+/// expected to have one block. This boils down to the following structure:
+///
+///   ^bb(...):
+///     <loop-bound-computation>
+///     for %arg0 = ... to ... step ... {
+///       ...
+///       for %argN = <thread-id-x> to ... step <block-dim-x> {
+///         %0 = load %from[%arg0, ..., %argN]
+///         store %0, %to[%arg0, ..., %argN]
+///       }
+///       ...
+///     }
+///     gpu.barrier
+///     <... original body ...>
+///     gpu.barrier
+///     for %arg0 = ... to ... step ... {
+///       ...
+///       for %argN = <thread-id-x> to ... step <block-dim-x> {
+///         %1 = load %to[%arg0, ..., %argN]
+///         store %1, %from[%arg0, ..., %argN]
+///       }
+///       ...
+///     }
+///
+/// Inserts the barriers unconditionally since different threads may be copying
+/// values and reading them. An analysis would be required to eliminate
+/// barriers in cases where a value is only used by the thread that copies it.
+/// Both copies are inserted unconditionally; an analysis would be required to
+/// copy only live-in and live-out values when necessary. This copies the
+/// entire memref pointed to by "from". If a smaller block would be sufficient,
+/// the caller can create a subview of the memref and promote that instead.
+static void insertCopies(Region &region, Location loc, Value from, Value to) {
+  auto fromType = from.getType().cast<MemRefType>();
+  auto toType = to.getType().cast<MemRefType>();
+  (void)fromType;
+  (void)toType;
+  assert(fromType.getShape() == toType.getShape());
+  assert(fromType.getRank() != 0);
+  assert(has_single_element(region) &&
+         "unstructured control flow not supported");
+
+  OpBuilder builder(region.getContext());
+  builder.setInsertionPointToStart(&region.front());
+
+  edsc::ScopedContext edscContext(builder, loc);
+  edsc::MemRefView fromView(from);
+  insertCopyLoops(builder, loc, fromView, from, to);
+  builder.create<gpu::BarrierOp>(loc);
+
+  builder.setInsertionPoint(&region.front().back());
+  builder.create<gpu::BarrierOp>(loc);
+  insertCopyLoops(builder, loc, fromView, to, from);
+}
+
+/// Promotes a function argument to workgroup memory in the given function. The
+/// copies will be inserted at the beginning and at the end of the function.
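+///
+/// For illustration (a hypothetical driver, not provided here), a pass could
+/// promote the first statically shaped memref argument of every GPU function
+/// nested under an assumed ModuleOp `module` along these lines:
+///
+///   module.walk([](gpu::GPUFuncOp func) {
+///     if (func.getNumArguments() == 0)
+///       return;
+///     auto type = func.getArgument(0).getType().dyn_cast<MemRefType>();
+///     if (type && type.hasStaticShape())
+///       promoteToWorkgroupMemory(func, /*arg=*/0);
+///   });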
+void mlir::promoteToWorkgroupMemory(GPUFuncOp op, unsigned arg) {
+  Value value = op.getArgument(arg);
+  auto type = value.getType().dyn_cast<MemRefType>();
+  assert(type && type.hasStaticShape() && "can only promote memrefs");
+
+  Value attribution =
+      op.addWorkgroupAttribution(type.getShape(), type.getElementType());
+
+  // Replace the uses first since only the original uses are currently present.
+  // Then insert the copies.
+  value.replaceAllUsesWith(attribution);
+  insertCopies(op.getBody(), op.getLoc(), value, attribution);
+}