author    Alex Zinenko <zinenko@google.com>  2020-01-07 20:00:54 +0100
committer Alex Zinenko <zinenko@google.com>  2020-01-09 10:06:00 +0100
commit    08778d8c4fd8a6519c7f27bfa6b09c47262cb844 (patch)
tree      195cfbe336a349d0406283006228c0f8ed4e4cbb /mlir
parent    e93e0d413f3afa1df5c5f88df546bebcd1183155 (diff)
[mlir][GPU] introduce utilities for promotion to workgroup memory
Introduce a set of functions that promote a memref argument of a `gpu.func` to workgroup memory using memory attribution. The promotion boils down to additional loops performing the copy from the original argument to the attributed memory at the beginning of the function, and back at the end of the function, using all available threads. The loop bounds are specified so as to adapt to any size of the workgroup.

These utilities are intended to compose with other existing utilities (loop coalescing and tiling) in cases where the distribution of work across threads is uneven, e.g. copying a 2D memref with only the threads along the "x" dimension. Similarly, specialization of the kernel to specific launch sizes should be implemented as a separate pass combining constant propagation and canonicalization.

Introduce a simple attribute-driven pass to test the promotion transformation, since we do not have a heuristic at the moment.

Differential Revision: https://reviews.llvm.org/D71904
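As a minimal before/after sketch (the kernel and value names are hypothetical; the memref shape is borrowed from the test file in this patch), promoting the only argument of

    gpu.func @kernel(%arg0: memref<5x4xf32>) kernel {
      "use"(%arg0) : (memref<5x4xf32>) -> ()
      gpu.return
    }

would yield a function with a workgroup attribution, thread-distributed copy loops, and barriers around the original body:

    gpu.func @kernel(%arg0: memref<5x4xf32>)
        workgroup(%promoted : memref<5x4xf32, 3>) kernel {
      // Loop nest copying %arg0 into %promoted, mapped to threads.
      gpu.barrier
      "use"(%promoted) : (memref<5x4xf32, 3>) -> ()
      gpu.barrier
      // Loop nest copying %promoted back into %arg0.
      gpu.return
    }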
Diffstat (limited to 'mlir')
-rw-r--r--  mlir/include/mlir/Dialect/GPU/GPUDialect.h          |   5
-rw-r--r--  mlir/include/mlir/Dialect/GPU/GPUOps.td             |   4
-rw-r--r--  mlir/include/mlir/Dialect/GPU/MemoryPromotion.h     |  29
-rw-r--r--  mlir/include/mlir/IR/Block.h                        |   5
-rw-r--r--  mlir/lib/Dialect/GPU/CMakeLists.txt                 |  20
-rw-r--r--  mlir/lib/Dialect/GPU/IR/GPUDialect.cpp              |  18
-rw-r--r--  mlir/lib/Dialect/GPU/Transforms/MemoryPromotion.cpp | 173
-rw-r--r--  mlir/lib/IR/Block.cpp                               |  14
-rw-r--r--  mlir/test/Dialect/GPU/promotion.mlir                | 119
-rw-r--r--  mlir/test/lib/Transforms/CMakeLists.txt             |   3
-rw-r--r--  mlir/test/lib/Transforms/TestGpuMemoryPromotion.cpp |  40
11 files changed, 428 insertions, 2 deletions
diff --git a/mlir/include/mlir/Dialect/GPU/GPUDialect.h b/mlir/include/mlir/Dialect/GPU/GPUDialect.h
index 1776ff71980..a21b5148772 100644
--- a/mlir/include/mlir/Dialect/GPU/GPUDialect.h
+++ b/mlir/include/mlir/Dialect/GPU/GPUDialect.h
@@ -53,6 +53,11 @@ public:
/// 'gpu.kernel' attribute.
static bool isKernel(Operation *op);
+ /// Returns the number of workgroup (thread, block) dimensions supported in
+ /// the GPU dialect.
+ // TODO(zinenko,herhut): consider generalizing this.
+ static unsigned getNumWorkgroupDimensions() { return 3; }
+
/// Returns the numeric value used to identify the workgroup memory address
/// space.
static unsigned getWorkgroupAddressSpace() { return 3; }
diff --git a/mlir/include/mlir/Dialect/GPU/GPUOps.td b/mlir/include/mlir/Dialect/GPU/GPUOps.td
index b5b93e9b553..766ddbf202c 100644
--- a/mlir/include/mlir/Dialect/GPU/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/GPUOps.td
@@ -117,6 +117,10 @@ def GPU_GPUFuncOp : GPU_Op<"func", [FunctionLike, IsolatedFromAbove, Symbol]> {
];
let extraClassDeclaration = [{
+ /// Adds a workgroup attribution of the MemRef type with the given shape and
+ /// element type.
+ Value addWorkgroupAttribution(ArrayRef<int64_t> shape, Type elementType);
+
/// Returns `true` if the GPU function defined by this Op is a kernel, i.e.
/// it is intended to be launched from host.
bool isKernel() {
diff --git a/mlir/include/mlir/Dialect/GPU/MemoryPromotion.h b/mlir/include/mlir/Dialect/GPU/MemoryPromotion.h
new file mode 100644
index 00000000000..09c1371708f
--- /dev/null
+++ b/mlir/include/mlir/Dialect/GPU/MemoryPromotion.h
@@ -0,0 +1,29 @@
+//===- MemoryPromotion.h - Utilities for moving data across GPU -*- C++ -*-===//
+//
+// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This header file declares the utility functions that generate IR copying
+// the data between different levels of memory hierarchy.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_DIALECT_GPU_MEMORYPROMOTION_H
+#define MLIR_DIALECT_GPU_MEMORYPROMOTION_H
+
+namespace mlir {
+
+namespace gpu {
+class GPUFuncOp;
+}
+
+/// Promotes a function argument to workgroup memory in the given function. The
+/// copies will be inserted at the beginning and at the end of the function.
+void promoteToWorkgroupMemory(gpu::GPUFuncOp op, unsigned arg);
+
+} // end namespace mlir
+
+#endif // MLIR_DIALECT_GPU_MEMORYPROMOTION_H
diff --git a/mlir/include/mlir/IR/Block.h b/mlir/include/mlir/IR/Block.h
index c868148f95e..2d3eb18d729 100644
--- a/mlir/include/mlir/IR/Block.h
+++ b/mlir/include/mlir/IR/Block.h
@@ -79,6 +79,11 @@ public:
/// Add one value to the argument list.
BlockArgument addArgument(Type type);
+ /// Insert one value at the position in the argument list indicated by the
+ /// given iterator. The existing arguments are shifted. The block is expected
+ /// not to have predecessors.
+ BlockArgument insertArgument(args_iterator it, Type type);
+
/// Add one argument to the argument list for each type specified in the list.
iterator_range<args_iterator> addArguments(ArrayRef<Type> types);
diff --git a/mlir/lib/Dialect/GPU/CMakeLists.txt b/mlir/lib/Dialect/GPU/CMakeLists.txt
index 6fe45ba49ef..dbf05ac6ace 100644
--- a/mlir/lib/Dialect/GPU/CMakeLists.txt
+++ b/mlir/lib/Dialect/GPU/CMakeLists.txt
@@ -2,9 +2,25 @@ add_llvm_library(MLIRGPU
IR/GPUDialect.cpp
IR/DialectRegistration.cpp
Transforms/KernelOutlining.cpp
+ Transforms/MemoryPromotion.cpp
ADDITIONAL_HEADER_DIRS
${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/GPU
)
-add_dependencies(MLIRGPU MLIRGPUOpsIncGen MLIRIR MLIRLLVMIR LLVMSupport)
-target_link_libraries(MLIRGPU MLIRIR MLIRLLVMIR MLIRStandardOps LLVMSupport)
+add_dependencies(MLIRGPU
+ MLIRGPUOpsIncGen
+ MLIREDSC
+ MLIRIR
+ MLIRLLVMIR
+ MLIRLoopOps
+ MLIRSupport
+ MLIRTransformUtils
+ LLVMSupport)
+target_link_libraries(MLIRGPU
+ MLIREDSC
+ MLIRIR
+ MLIRLLVMIR
+ MLIRLoopOps
+ MLIRSupport
+ MLIRTransformUtils
+ LLVMSupport)
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
index bda8032fc21..32d7fae65d9 100644
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -593,6 +593,24 @@ LogicalResult verify(LaunchFuncOp op) {
// GPUFuncOp
//===----------------------------------------------------------------------===//
+/// Adds a workgroup attribution to the function of the MemRef type with the
+/// given shape and element type.
+Value GPUFuncOp::addWorkgroupAttribution(ArrayRef<int64_t> shape,
+ Type elementType) {
+ unsigned pos = getNumFuncArguments() + getNumWorkgroupAttributions();
+ Block &bodyBlock = body().front();
+ Value attribution = bodyBlock.insertArgument(
+ std::next(bodyBlock.args_begin(), pos),
+ MemRefType::get(shape, elementType, /*affineMapComposition=*/{},
+ GPUDialect::getWorkgroupAddressSpace()));
+ auto numWorkgroupBuffersAttr =
+ getAttrOfType<IntegerAttr>(getNumWorkgroupAttributionsAttrName());
+ setAttr(getNumWorkgroupAttributionsAttrName(),
+ IntegerAttr::get(numWorkgroupBuffersAttr.getType(),
+ numWorkgroupBuffersAttr.getValue() + 1));
+ return attribution;
+}
+
void GPUFuncOp::build(Builder *builder, OperationState &result, StringRef name,
FunctionType type, ArrayRef<Type> workgroupAttributions,
ArrayRef<Type> privateAttributions,
diff --git a/mlir/lib/Dialect/GPU/Transforms/MemoryPromotion.cpp b/mlir/lib/Dialect/GPU/Transforms/MemoryPromotion.cpp
new file mode 100644
index 00000000000..f01a430a216
--- /dev/null
+++ b/mlir/lib/Dialect/GPU/Transforms/MemoryPromotion.cpp
@@ -0,0 +1,173 @@
+//===- MemoryPromotion.cpp - Utilities for moving data across GPU memories ===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements utilities that allow one to create IR moving the data
+// across different levels of the GPU memory hierarchy.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/GPU/MemoryPromotion.h"
+#include "mlir/Dialect/GPU/GPUDialect.h"
+#include "mlir/Dialect/LoopOps/LoopOps.h"
+#include "mlir/EDSC/Builders.h"
+#include "mlir/EDSC/Helpers.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Support/Functional.h"
+#include "mlir/Transforms/LoopUtils.h"
+
+using namespace mlir;
+using namespace mlir::gpu;
+
+/// Returns the textual name of a GPU dimension.
+static StringRef getDimName(unsigned dim) {
+ if (dim == 0)
+ return "x";
+ if (dim == 1)
+ return "y";
+ if (dim == 2)
+ return "z";
+
+ llvm_unreachable("dimension ID overflow");
+}
+
+/// Emits the (imperfect) loop nest performing the copy between "from" and "to"
+/// values using the bounds derived from the "from" value. Emits at least
+/// GPUDialect::getNumWorkgroupDimensions() loops, completing the nest with
+/// single-iteration loops. Maps the innermost loops to thread dimensions, in
+/// reverse order to enable access coalescing in the innermost loop.
+static void insertCopyLoops(OpBuilder &builder, Location loc,
+ edsc::MemRefView &bounds, Value from, Value to) {
+ // Create EDSC handles for bounds.
+ unsigned rank = bounds.rank();
+ SmallVector<edsc::ValueHandle, 4> lbs, ubs, steps;
+
+ // Make sure we have enough loops to use all thread dimensions; these trivial
+ // loops should be outermost and are therefore inserted first.
+ if (rank < GPUDialect::getNumWorkgroupDimensions()) {
+ unsigned extraLoops = GPUDialect::getNumWorkgroupDimensions() - rank;
+ edsc::ValueHandle zero = edsc::intrinsics::constant_index(0);
+ edsc::ValueHandle one = edsc::intrinsics::constant_index(1);
+ lbs.resize(extraLoops, zero);
+ ubs.resize(extraLoops, one);
+ steps.resize(extraLoops, one);
+ }
+
+ // Add existing bounds.
+ lbs.append(bounds.getLbs().begin(), bounds.getLbs().end());
+ ubs.append(bounds.getUbs().begin(), bounds.getUbs().end());
+
+ // Emit constant operations for steps.
+ steps.reserve(lbs.size());
+ llvm::transform(
+ bounds.getSteps(), std::back_inserter(steps),
+ [](int64_t step) { return edsc::intrinsics::constant_index(step); });
+
+ // Obtain thread identifiers and block sizes, necessary to map to them.
+ auto indexType = builder.getIndexType();
+ SmallVector<Value, 3> threadIds, blockDims;
+ for (unsigned i = 0; i < 3; ++i) {
+ auto dimName = builder.getStringAttr(getDimName(i));
+ threadIds.push_back(
+ builder.create<gpu::ThreadIdOp>(loc, indexType, dimName));
+ blockDims.push_back(
+ builder.create<gpu::BlockDimOp>(loc, indexType, dimName));
+ }
+
+ // Produce the loop nest with copies.
+ auto ivs = edsc::makeIndexHandles(lbs.size());
+ auto ivPtrs =
+ edsc::makeHandlePointers(MutableArrayRef<edsc::IndexHandle>(ivs));
+ edsc::LoopNestBuilder(ivPtrs, lbs, ubs, steps)([&]() {
+ auto activeIvs = llvm::makeArrayRef(ivs).take_back(rank);
+ edsc::StdIndexedValue fromHandle(from), toHandle(to);
+ toHandle(activeIvs) = fromHandle(activeIvs);
+ });
+
+ // Map the innermost loops to threads in reverse order.
+ for (auto en :
+ llvm::enumerate(llvm::reverse(llvm::makeArrayRef(ivs).take_back(
+ GPUDialect::getNumWorkgroupDimensions())))) {
+ auto loop = cast<loop::ForOp>(
+ en.value().getValue().getParentRegion()->getParentOp());
+ mapLoopToProcessorIds(loop, {threadIds[en.index()]},
+ {blockDims[en.index()]});
+ }
+}
+
+/// Emits the loop nests performing the copy to the designated location at the
+/// beginning of the region, and from the designated location immediately
+/// before the terminator of the first block of the region. The region is
+/// expected to have one block. This boils down to the following structure:
+///
+/// ^bb(...):
+/// <loop-bound-computation>
+/// for %arg0 = ... to ... step ... {
+/// ...
+/// for %argN = <thread-id-x> to ... step <block-dim-x> {
+/// %0 = load %from[%arg0, ..., %argN]
+/// store %0, %to[%arg0, ..., %argN]
+/// }
+/// ...
+/// }
+/// gpu.barrier
+/// <... original body ...>
+/// gpu.barrier
+/// for %arg0 = ... to ... step ... {
+/// ...
+/// for %argN = <thread-id-x> to ... step <block-dim-x> {
+/// %1 = load %to[%arg0, ..., %argN]
+/// store %1, %from[%arg0, ..., %argN]
+/// }
+/// ...
+/// }
+///
+/// Inserts the barriers unconditionally since different threads may be copying
+/// values and reading them. An analysis would be required to eliminate barriers
+/// in cases where a value is only used by the thread that copies it. Both
+/// copies are inserted unconditionally; an analysis would be required to only
+/// copy live-in and live-out values when necessary. This copies the entire
+/// memref pointed to by "from". In case a smaller block would be sufficient,
+/// the caller can create a subview of the memref and promote it instead.
+static void insertCopies(Region &region, Location loc, Value from, Value to) {
+ auto fromType = from.getType().cast<MemRefType>();
+ auto toType = to.getType().cast<MemRefType>();
+ (void)fromType;
+ (void)toType;
+ assert(fromType.getShape() == toType.getShape());
+ assert(fromType.getRank() != 0);
+ assert(has_single_element(region) &&
+ "unstructured control flow not supported");
+
+ OpBuilder builder(region.getContext());
+ builder.setInsertionPointToStart(&region.front());
+
+ edsc::ScopedContext edscContext(builder, loc);
+ edsc::MemRefView fromView(from);
+ insertCopyLoops(builder, loc, fromView, from, to);
+ builder.create<gpu::BarrierOp>(loc);
+
+ builder.setInsertionPoint(&region.front().back());
+ builder.create<gpu::BarrierOp>(loc);
+ insertCopyLoops(builder, loc, fromView, to, from);
+}
+
+/// Promotes a function argument to workgroup memory in the given function. The
+/// copies will be inserted at the beginning and at the end of the function.
+void mlir::promoteToWorkgroupMemory(GPUFuncOp op, unsigned arg) {
+ Value value = op.getArgument(arg);
+ auto type = value.getType().dyn_cast<MemRefType>();
+ assert(type && type.hasStaticShape() && "can only promote memrefs");
+
+ Value attribution =
+ op.addWorkgroupAttribution(type.getShape(), type.getElementType());
+
+ // Replace the uses first since only the original uses are currently present.
+ // Then insert the copies.
+ value.replaceAllUsesWith(attribution);
+ insertCopies(op.getBody(), op.getLoc(), value, attribution);
+}
diff --git a/mlir/lib/IR/Block.cpp b/mlir/lib/IR/Block.cpp
index b0ada9981a8..2757c505555 100644
--- a/mlir/lib/IR/Block.cpp
+++ b/mlir/lib/IR/Block.cpp
@@ -179,6 +179,20 @@ void Block::eraseArgument(unsigned index, bool updatePredTerms) {
}
}
+/// Insert one value at the given position in the argument list. The existing
+/// arguments are shifted. The block is expected not to have predecessors.
+BlockArgument Block::insertArgument(args_iterator it, Type type) {
+ assert(llvm::empty(getPredecessors()) &&
+ "cannot insert arguments to blocks with predecessors");
+
+ // Use the args_iterator (on the BlockArgListType) to compute the insertion
+ // iterator in the underlying argument storage.
+ size_t distance = std::distance(args_begin(), it);
+ auto arg = BlockArgument::create(type, this);
+ arguments.insert(std::next(arguments.begin(), distance), arg);
+ return arg;
+}
+
//===----------------------------------------------------------------------===//
// Terminator management
//===----------------------------------------------------------------------===//
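For illustration, a minimal usage sketch of the new Block::insertArgument API (a hypothetical snippet, not part of the patch; `funcOp` and `builder` are assumed to be a GPUFuncOp and an OpBuilder in scope, mirroring the call in GPUFuncOp::addWorkgroupAttribution above):

    // Insert an index-typed argument right after the function arguments,
    // shifting any later arguments; the block must have no predecessors.
    Block &entry = funcOp.getBody().front();
    unsigned pos = funcOp.getNumFuncArguments();
    BlockArgument newArg = entry.insertArgument(
        std::next(entry.args_begin(), pos), builder.getIndexType());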
diff --git a/mlir/test/Dialect/GPU/promotion.mlir b/mlir/test/Dialect/GPU/promotion.mlir
new file mode 100644
index 00000000000..c06174e0fcd
--- /dev/null
+++ b/mlir/test/Dialect/GPU/promotion.mlir
@@ -0,0 +1,119 @@
+// RUN: mlir-opt -test-gpu-memory-promotion -split-input-file %s | FileCheck %s
+
+module @foo attributes {gpu.kernel_module} {
+ // Verify that the attribution was indeed introduced
+ // CHECK-LABEL: @memref3d
+ // CHECK-SAME: (%[[arg:.*]]: memref<5x4xf32>
+ // CHECK-SAME: workgroup(%[[promoted:.*]] : memref<5x4xf32, 3>)
+ gpu.func @memref3d(%arg0: memref<5x4xf32> {gpu.test_promote_workgroup}) kernel {
+ // Verify that loop bounds are emitted; the order does not matter.
+ // CHECK-DAG: %[[c1:.*]] = constant 1
+ // CHECK-DAG: %[[c4:.*]] = constant 4
+ // CHECK-DAG: %[[c5:.*]] = constant 5
+ // CHECK-DAG: %[[tx:.*]] = "gpu.thread_id"() {dimension = "x"}
+ // CHECK-DAG: %[[ty:.*]] = "gpu.thread_id"() {dimension = "y"}
+ // CHECK-DAG: %[[tz:.*]] = "gpu.thread_id"() {dimension = "z"}
+ // CHECK-DAG: %[[bdx:.*]] = "gpu.block_dim"() {dimension = "x"}
+ // CHECK-DAG: %[[bdy:.*]] = "gpu.block_dim"() {dimension = "y"}
+ // CHECK-DAG: %[[bdz:.*]] = "gpu.block_dim"() {dimension = "z"}
+
+ // Verify that loops for the copy are emitted. We only check the number of
+ // loops here since their bounds are produced by mapLoopToProcessorIds,
+ // tested separately.
+ // CHECK: loop.for %[[i0:.*]] =
+ // CHECK: loop.for %[[i1:.*]] =
+ // CHECK: loop.for %[[i2:.*]] =
+
+ // Verify that the copy is emitted and uses only the last two loops.
+ // CHECK: %[[v:.*]] = load %[[arg]][%[[i1]], %[[i2]]]
+ // CHECK: store %[[v]], %[[promoted]][%[[i1]], %[[i2]]]
+
+ // Verify that the use has been rewritten.
+ // CHECK: "use"(%[[promoted]]) : (memref<5x4xf32, 3>)
+ "use"(%arg0) : (memref<5x4xf32>) -> ()
+
+
+ // Verify that loops for the copy are emitted. We only check the number of
+ // loops here since their bounds are produced by mapLoopToProcessorIds,
+ // tested separately.
+ // CHECK: loop.for %[[i0:.*]] =
+ // CHECK: loop.for %[[i1:.*]] =
+ // CHECK: loop.for %[[i2:.*]] =
+
+ // Verify that the copy is emitted and uses only the last two loops.
+ // CHECK: %[[v:.*]] = load %[[promoted]][%[[i1]], %[[i2]]]
+ // CHECK: store %[[v]], %[[arg]][%[[i1]], %[[i2]]]
+ gpu.return
+ }
+}
+
+// -----
+
+module @foo attributes {gpu.kernel_module} {
+ // Verify that the attribution was indeed introduced
+ // CHECK-LABEL: @memref5d
+ // CHECK-SAME: (%[[arg:.*]]: memref<8x7x6x5x4xf32>
+ // CHECK-SAME: workgroup(%[[promoted:.*]] : memref<8x7x6x5x4xf32, 3>)
+ gpu.func @memref5d(%arg0: memref<8x7x6x5x4xf32> {gpu.test_promote_workgroup}) kernel {
+ // Verify that loop bounds are emitted; the order does not matter.
+ // CHECK-DAG: %[[c0:.*]] = constant 0
+ // CHECK-DAG: %[[c1:.*]] = constant 1
+ // CHECK-DAG: %[[c4:.*]] = constant 4
+ // CHECK-DAG: %[[c5:.*]] = constant 5
+ // CHECK-DAG: %[[c6:.*]] = constant 6
+ // CHECK-DAG: %[[c7:.*]] = constant 7
+ // CHECK-DAG: %[[c8:.*]] = constant 8
+ // CHECK-DAG: %[[tx:.*]] = "gpu.thread_id"() {dimension = "x"}
+ // CHECK-DAG: %[[ty:.*]] = "gpu.thread_id"() {dimension = "y"}
+ // CHECK-DAG: %[[tz:.*]] = "gpu.thread_id"() {dimension = "z"}
+ // CHECK-DAG: %[[bdx:.*]] = "gpu.block_dim"() {dimension = "x"}
+ // CHECK-DAG: %[[bdy:.*]] = "gpu.block_dim"() {dimension = "y"}
+ // CHECK-DAG: %[[bdz:.*]] = "gpu.block_dim"() {dimension = "z"}
+
+ // Verify that loops for the copy are emitted.
+ // CHECK: loop.for %[[i0:.*]] =
+ // CHECK: loop.for %[[i1:.*]] =
+ // CHECK: loop.for %[[i2:.*]] =
+ // CHECK: loop.for %[[i3:.*]] =
+ // CHECK: loop.for %[[i4:.*]] =
+
+ // Verify that the copy is emitted.
+ // CHECK: %[[v:.*]] = load %[[arg]][%[[i0]], %[[i1]], %[[i2]], %[[i3]], %[[i4]]]
+ // CHECK: store %[[v]], %[[promoted]][%[[i0]], %[[i1]], %[[i2]], %[[i3]], %[[i4]]]
+
+ // Verify that the use has been rewritten.
+ // CHECK: "use"(%[[promoted]]) : (memref<8x7x6x5x4xf32, 3>)
+ "use"(%arg0) : (memref<8x7x6x5x4xf32>) -> ()
+
+ // Verify that loops for the copy are emitted.
+ // CHECK: loop.for %[[i0:.*]] =
+ // CHECK: loop.for %[[i1:.*]] =
+ // CHECK: loop.for %[[i2:.*]] =
+ // CHECK: loop.for %[[i3:.*]] =
+ // CHECK: loop.for %[[i4:.*]] =
+
+ // Verify that the copy is emitted.
+ // CHECK: %[[v:.*]] = load %[[promoted]][%[[i0]], %[[i1]], %[[i2]], %[[i3]], %[[i4]]]
+ // CHECK: store %[[v]], %[[arg]][%[[i0]], %[[i1]], %[[i2]], %[[i3]], %[[i4]]]
+ gpu.return
+ }
+}
+
+// -----
+
+module @foo attributes {gpu.kernel_module} {
+ // Check that attribution insertion works fine.
+ // CHECK-LABEL: @insert
+ // CHECK-SAME: (%{{.*}}: memref<4xf32>
+ // CHECK-SAME: workgroup(%{{.*}}: memref<1x1xf64, 3>
+ // CHECK-SAME: %[[wg2:.*]] : memref<4xf32, 3>)
+ // CHECK-SAME: private(%{{.*}}: memref<1x1xi64, 5>)
+ gpu.func @insert(%arg0: memref<4xf32> {gpu.test_promote_workgroup})
+ workgroup(%arg1: memref<1x1xf64, 3>)
+ private(%arg2: memref<1x1xi64, 5>)
+ kernel {
+ // CHECK: "use"(%[[wg2]])
+ "use"(%arg0) : (memref<4xf32>) -> ()
+ gpu.return
+ }
+}
diff --git a/mlir/test/lib/Transforms/CMakeLists.txt b/mlir/test/lib/Transforms/CMakeLists.txt
index b6338e1d167..ac4a4930e5a 100644
--- a/mlir/test/lib/Transforms/CMakeLists.txt
+++ b/mlir/test/lib/Transforms/CMakeLists.txt
@@ -2,6 +2,7 @@ add_llvm_library(MLIRTestTransforms
TestCallGraph.cpp
TestConstantFold.cpp
TestLoopFusion.cpp
+ TestGpuMemoryPromotion.cpp
TestInlining.cpp
TestLinalgTransforms.cpp
TestLiveness.cpp
@@ -26,6 +27,8 @@ add_dependencies(MLIRTestTransforms MLIRTestVectorTransformPatternsIncGen)
target_link_libraries(MLIRTestTransforms
MLIRAffineOps
MLIRAnalysis
+ MLIREDSC
+ MLIRGPU
MLIRLoopOps
MLIRPass
MLIRTestDialect
diff --git a/mlir/test/lib/Transforms/TestGpuMemoryPromotion.cpp b/mlir/test/lib/Transforms/TestGpuMemoryPromotion.cpp
new file mode 100644
index 00000000000..ee0291827fa
--- /dev/null
+++ b/mlir/test/lib/Transforms/TestGpuMemoryPromotion.cpp
@@ -0,0 +1,40 @@
+//===- TestGpuMemoryPromotion.cpp - Test pass for GPU promotion ----------===//
+//
+// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the pass testing the utilities for moving data across
+// different levels of the GPU memory hierarchy.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/GPU/GPUDialect.h"
+#include "mlir/Dialect/GPU/MemoryPromotion.h"
+#include "mlir/IR/Attributes.h"
+#include "mlir/Pass/Pass.h"
+
+using namespace mlir;
+
+namespace {
+/// Simple pass for testing the promotion to workgroup memory in GPU functions.
+/// Promotes all arguments with "gpu.test_promote_workgroup" attribute. This
+/// does not check whether the promotion is legal (e.g., amount of memory used)
+/// or beneficial (e.g., makes previously uncoalesced loads coalesced).
+class TestGpuMemoryPromotionPass
+ : public OperationPass<TestGpuMemoryPromotionPass, gpu::GPUFuncOp> {
+ void runOnOperation() override {
+ gpu::GPUFuncOp op = getOperation();
+ for (unsigned i = 0, e = op.getNumArguments(); i < e; ++i) {
+ if (op.getArgAttrOfType<UnitAttr>(i, "gpu.test_promote_workgroup"))
+ promoteToWorkgroupMemory(op, i);
+ }
+ }
+};
+} // end namespace
+
+static PassRegistration<TestGpuMemoryPromotionPass> registration(
+ "test-gpu-memory-promotion",
+ "Promotes the annotated arguments of gpu.func to workgroup memory.");