 mlir/include/mlir/Dialect/GPU/GPUDialect.h          |   5
 mlir/include/mlir/Dialect/GPU/GPUOps.td             |   4
 mlir/include/mlir/Dialect/GPU/MemoryPromotion.h     |  29
 mlir/include/mlir/IR/Block.h                        |   5
 mlir/lib/Dialect/GPU/CMakeLists.txt                 |  20
 mlir/lib/Dialect/GPU/IR/GPUDialect.cpp              |  18
 mlir/lib/Dialect/GPU/Transforms/MemoryPromotion.cpp | 173
 mlir/lib/IR/Block.cpp                               |  14
 mlir/test/Dialect/GPU/promotion.mlir                | 119
 mlir/test/lib/Transforms/CMakeLists.txt             |   3
 mlir/test/lib/Transforms/TestGpuMemoryPromotion.cpp |  40
 11 files changed, 428 insertions(+), 2 deletions(-)
diff --git a/mlir/include/mlir/Dialect/GPU/GPUDialect.h b/mlir/include/mlir/Dialect/GPU/GPUDialect.h
index 1776ff71980..a21b5148772 100644
--- a/mlir/include/mlir/Dialect/GPU/GPUDialect.h
+++ b/mlir/include/mlir/Dialect/GPU/GPUDialect.h
@@ -53,6 +53,11 @@ public:
/// 'gpu.kernel' attribute.
static bool isKernel(Operation *op);
+ /// Returns the number of workgroup (thread, block) dimensions supported in
+ /// the GPU dialect.
+ // TODO(zinenko,herhut): consider generalizing this.
+ static unsigned getNumWorkgroupDimensions() { return 3; }
+
/// Returns the numeric value used to identify the workgroup memory address
/// space.
static unsigned getWorkgroupAddressSpace() { return 3; }
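A minimal sketch (not part of the patch) of how the two static helpers above can be combined to build a buffer type living in workgroup memory. The helper name, element type, and uniform size are hypothetical; the MemRefType::get overload is the same one used later in this commit.

static MemRefType makeWorkgroupBufferType(Builder &builder, int64_t size) {
  // One dimension per GPU workgroup dimension, all with the same illustrative size.
  SmallVector<int64_t, 3> shape(gpu::GPUDialect::getNumWorkgroupDimensions(), size);
  // Place the buffer in the GPU workgroup (shared) address space.
  return MemRefType::get(shape, builder.getF32Type(),
                         /*affineMapComposition=*/{},
                         gpu::GPUDialect::getWorkgroupAddressSpace());
}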
diff --git a/mlir/include/mlir/Dialect/GPU/GPUOps.td b/mlir/include/mlir/Dialect/GPU/GPUOps.td
index b5b93e9b553..766ddbf202c 100644
--- a/mlir/include/mlir/Dialect/GPU/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/GPUOps.td
@@ -117,6 +117,10 @@ def GPU_GPUFuncOp : GPU_Op<"func", [FunctionLike, IsolatedFromAbove, Symbol]> {
];
let extraClassDeclaration = [{
+ /// Adds a workgroup attribution of the MemRef type with the given shape and
+ /// element type.
+ Value addWorkgroupAttribution(ArrayRef<int64_t> shape, Type elementType);
+
/// Returns `true` if the GPU function defined by this Op is a kernel, i.e.
/// it is intended to be launched from host.
bool isKernel() {
diff --git a/mlir/include/mlir/Dialect/GPU/MemoryPromotion.h b/mlir/include/mlir/Dialect/GPU/MemoryPromotion.h
new file mode 100644
index 00000000000..09c1371708f
--- /dev/null
+++ b/mlir/include/mlir/Dialect/GPU/MemoryPromotion.h
@@ -0,0 +1,29 @@
+//===- MemoryPromotion.h - Utilities for moving data across GPU -*- C++ -*-===//
+//
+// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This header file declares the utility functions that generate IR copying
+// the data between different levels of memory hierarchy.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_DIALECT_GPU_MEMORYPROMOTION_H
+#define MLIR_DIALECT_GPU_MEMORYPROMOTION_H
+
+namespace mlir {
+
+namespace gpu {
+class GPUFuncOp;
+}
+
+/// Promotes a function argument to workgroup memory in the given function. The
+/// copies will be inserted at the beginning and at the end of the function.
+void promoteToWorkgroupMemory(gpu::GPUFuncOp op, unsigned arg);
+
+} // end namespace mlir
+
+#endif // MLIR_DIALECT_GPU_MEMORYPROMOTION_H
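A hedged sketch (not part of the patch) of the intended call pattern: promote every statically shaped memref argument of a gpu.func. The helper name is hypothetical; the static-shape check mirrors the precondition asserted in the implementation further down in this commit.

static void promoteAllStaticMemRefArgs(gpu::GPUFuncOp func) {
  for (unsigned i = 0, e = func.getNumArguments(); i < e; ++i) {
    // promoteToWorkgroupMemory only supports statically shaped memrefs.
    auto memrefType = func.getArgument(i).getType().dyn_cast<MemRefType>();
    if (memrefType && memrefType.hasStaticShape())
      promoteToWorkgroupMemory(func, i);
  }
}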
diff --git a/mlir/include/mlir/IR/Block.h b/mlir/include/mlir/IR/Block.h
index c868148f95e..2d3eb18d729 100644
--- a/mlir/include/mlir/IR/Block.h
+++ b/mlir/include/mlir/IR/Block.h
@@ -79,6 +79,11 @@ public:
/// Add one value to the argument list.
BlockArgument addArgument(Type type);
+ /// Insert one value at the position in the argument list indicated by the
+ /// given iterator. The existing arguments are shifted. The block is expected
+ /// not to have predecessors.
+ BlockArgument insertArgument(args_iterator it, Type type);
+
/// Add one argument to the argument list for each type specified in the list.
iterator_range<args_iterator> addArguments(ArrayRef<Type> types);
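A small sketch (not part of the patch) of the new Block API, assuming a block with no predecessors: insert an index-typed argument in front of the existing ones. The wrapper function is hypothetical.

// Prepend an argument of index type; existing arguments shift right.
static BlockArgument prependIndexArgument(Block &block, Builder &builder) {
  return block.insertArgument(block.args_begin(), builder.getIndexType());
}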
diff --git a/mlir/lib/Dialect/GPU/CMakeLists.txt b/mlir/lib/Dialect/GPU/CMakeLists.txt
index 6fe45ba49ef..dbf05ac6ace 100644
--- a/mlir/lib/Dialect/GPU/CMakeLists.txt
+++ b/mlir/lib/Dialect/GPU/CMakeLists.txt
@@ -2,9 +2,25 @@ add_llvm_library(MLIRGPU
IR/GPUDialect.cpp
IR/DialectRegistration.cpp
Transforms/KernelOutlining.cpp
+ Transforms/MemoryPromotion.cpp
ADDITIONAL_HEADER_DIRS
${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/GPU
)
-add_dependencies(MLIRGPU MLIRGPUOpsIncGen MLIRIR MLIRLLVMIR LLVMSupport)
-target_link_libraries(MLIRGPU MLIRIR MLIRLLVMIR MLIRStandardOps LLVMSupport)
+add_dependencies(MLIRGPU
+ MLIRGPUOpsIncGen
+ MLIREDSC
+ MLIRIR
+ MLIRLLVMIR
+ MLIRLoopOps
+ MLIRSupport
+ MLIRTransformUtils
+ LLVMSupport)
+target_link_libraries(MLIRGPU
+ MLIREDSC
+ MLIRIR
+ MLIRLLVMIR
+ MLIRLoopOps
+ MLIRSupport
+ MLIRTransformUtils
+ LLVMSupport)
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
index bda8032fc21..32d7fae65d9 100644
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -593,6 +593,24 @@ LogicalResult verify(LaunchFuncOp op) {
// GPUFuncOp
//===----------------------------------------------------------------------===//
+/// Adds a workgroup attribution to "op" of the MemRef type with the given shape
+/// and element type.
+Value GPUFuncOp::addWorkgroupAttribution(ArrayRef<int64_t> shape,
+ Type elementType) {
+ unsigned pos = getNumFuncArguments() + getNumWorkgroupAttributions();
+ Block &bodyBlock = body().front();
+ Value attribution = bodyBlock.insertArgument(
+ std::next(bodyBlock.args_begin(), pos),
+ MemRefType::get(shape, elementType, /*affineMapComposition=*/{},
+ GPUDialect::getWorkgroupAddressSpace()));
+ auto numWorkgroupBuffersAttr =
+ getAttrOfType<IntegerAttr>(getNumWorkgroupAttributionsAttrName());
+ setAttr(getNumWorkgroupAttributionsAttrName(),
+ IntegerAttr::get(numWorkgroupBuffersAttr.getType(),
+ numWorkgroupBuffersAttr.getValue() + 1));
+ return attribution;
+}
+
void GPUFuncOp::build(Builder *builder, OperationState &result, StringRef name,
FunctionType type, ArrayRef<Type> workgroupAttributions,
ArrayRef<Type> privateAttributions,
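A brief sketch (not part of the patch) of how a transform could use the new addWorkgroupAttribution method; the tile shape and helper name are hypothetical. This is the same pattern promoteToWorkgroupMemory follows in the file below.

static Value addTileBuffer(gpu::GPUFuncOp funcOp, Type elementType) {
  // Request a fixed 32x32 workgroup-memory buffer; real sizes are target-dependent.
  return funcOp.addWorkgroupAttribution({32, 32}, elementType);
}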
diff --git a/mlir/lib/Dialect/GPU/Transforms/MemoryPromotion.cpp b/mlir/lib/Dialect/GPU/Transforms/MemoryPromotion.cpp
new file mode 100644
index 00000000000..f01a430a216
--- /dev/null
+++ b/mlir/lib/Dialect/GPU/Transforms/MemoryPromotion.cpp
@@ -0,0 +1,173 @@
+//===- MemoryPromotion.cpp - Utilities for moving data across GPU memories ===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements utilities that allow one to create IR moving the data
+// across different levels of the GPU memory hierarchy.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/GPU/MemoryPromotion.h"
+#include "mlir/Dialect/GPU/GPUDialect.h"
+#include "mlir/Dialect/LoopOps/LoopOps.h"
+#include "mlir/EDSC/Builders.h"
+#include "mlir/EDSC/Helpers.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Support/Functional.h"
+#include "mlir/Transforms/LoopUtils.h"
+
+using namespace mlir;
+using namespace mlir::gpu;
+
+/// Returns the textual name of a GPU dimension.
+static StringRef getDimName(unsigned dim) {
+ if (dim == 0)
+ return "x";
+ if (dim == 1)
+ return "y";
+ if (dim == 2)
+ return "z";
+
+ llvm_unreachable("dimension ID overflow");
+}
+
+/// Emits the (imperfect) loop nest performing the copy between "from" and "to"
+/// values using the bounds derived from the "from" value. Emits at least
+/// GPUDialect::getNumWorkgroupDimensions() loops, completing the nest with
+/// single-iteration loops. Maps the innermost loops to thread dimensions, in
+/// reverse order to enable access coalescing in the innermost loop.
+static void insertCopyLoops(OpBuilder &builder, Location loc,
+ edsc::MemRefView &bounds, Value from, Value to) {
+ // Create EDSC handles for bounds.
+ unsigned rank = bounds.rank();
+ SmallVector<edsc::ValueHandle, 4> lbs, ubs, steps;
+
+ // Make sure we have enough loops to use all thread dimensions; these trivial
+ // loops should be outermost and therefore inserted first.
+ if (rank < GPUDialect::getNumWorkgroupDimensions()) {
+ unsigned extraLoops = GPUDialect::getNumWorkgroupDimensions() - rank;
+ edsc::ValueHandle zero = edsc::intrinsics::constant_index(0);
+ edsc::ValueHandle one = edsc::intrinsics::constant_index(1);
+ lbs.resize(extraLoops, zero);
+ ubs.resize(extraLoops, one);
+ steps.resize(extraLoops, one);
+ }
+
+ // Add existing bounds.
+ lbs.append(bounds.getLbs().begin(), bounds.getLbs().end());
+ ubs.append(bounds.getUbs().begin(), bounds.getUbs().end());
+
+ // Emit constant operations for steps.
+ steps.reserve(lbs.size());
+ llvm::transform(
+ bounds.getSteps(), std::back_inserter(steps),
+ [](int64_t step) { return edsc::intrinsics::constant_index(step); });
+
+ // Obtain thread identifiers and block sizes, necessary to map to them.
+ auto indexType = builder.getIndexType();
+ SmallVector<Value, 3> threadIds, blockDims;
+ for (unsigned i = 0; i < 3; ++i) {
+ auto dimName = builder.getStringAttr(getDimName(i));
+ threadIds.push_back(
+ builder.create<gpu::ThreadIdOp>(loc, indexType, dimName));
+ blockDims.push_back(
+ builder.create<gpu::BlockDimOp>(loc, indexType, dimName));
+ }
+
+ // Produce the loop nest with copies.
+ auto ivs = edsc::makeIndexHandles(lbs.size());
+ auto ivPtrs =
+ edsc::makeHandlePointers(MutableArrayRef<edsc::IndexHandle>(ivs));
+ edsc::LoopNestBuilder(ivPtrs, lbs, ubs, steps)([&]() {
+ auto activeIvs = llvm::makeArrayRef(ivs).take_back(rank);
+ edsc::StdIndexedValue fromHandle(from), toHandle(to);
+ toHandle(activeIvs) = fromHandle(activeIvs);
+ });
+
+ // Map the innermost loops to threads in reverse order.
+ for (auto en :
+ llvm::enumerate(llvm::reverse(llvm::makeArrayRef(ivs).take_back(
+ GPUDialect::getNumWorkgroupDimensions())))) {
+ auto loop = cast<loop::ForOp>(
+ en.value().getValue().getParentRegion()->getParentOp());
+ mapLoopToProcessorIds(loop, {threadIds[en.index()]},
+ {blockDims[en.index()]});
+ }
+}
+
+/// Emits the loop nests performing the copy to the designated location at the
+/// beginning of the region, and from the designated location immediately before
+/// the terminator of the first block of the region. The region is expected to
+/// have one block. This boils down to the following structure
+///
+/// ^bb(...):
+/// <loop-bound-computation>
+/// for %arg0 = ... to ... step ... {
+/// ...
+/// for %argN = <thread-id-x> to ... step <block-dim-x> {
+/// %0 = load %from[%arg0, ..., %argN]
+/// store %0, %to[%arg0, ..., %argN]
+/// }
+/// ...
+/// }
+/// gpu.barrier
+/// <... original body ...>
+/// gpu.barrier
+/// for %arg0 = ... to ... step ... {
+/// ...
+/// for %argN = <thread-id-x> to ... step <block-dim-x> {
+/// %1 = load %to[%arg0, ..., %argN]
+/// store %1, %from[%arg0, ..., %argN]
+/// }
+/// ...
+/// }
+///
+/// Inserts the barriers unconditionally since different threads may be copying
+/// values and reading them. An analysis would be required to eliminate barriers
+/// in the case where a value is only used by the thread that copies it. Both
+/// copies are inserted unconditionally; an analysis would be required to only copy
+/// live-in and live-out values when necessary. This copies the entire memref
+/// pointed to by "from". In case a smaller block would be sufficient, the
+/// caller can create a subview of the memref and promote it instead.
+static void insertCopies(Region &region, Location loc, Value from, Value to) {
+ auto fromType = from.getType().cast<MemRefType>();
+ auto toType = to.getType().cast<MemRefType>();
+ (void)fromType;
+ (void)toType;
+ assert(fromType.getShape() == toType.getShape());
+ assert(fromType.getRank() != 0);
+ assert(has_single_element(region) &&
+ "unstructured control flow not supported");
+
+ OpBuilder builder(region.getContext());
+ builder.setInsertionPointToStart(&region.front());
+
+ edsc::ScopedContext edscContext(builder, loc);
+ edsc::MemRefView fromView(from);
+ insertCopyLoops(builder, loc, fromView, from, to);
+ builder.create<gpu::BarrierOp>(loc);
+
+ builder.setInsertionPoint(&region.front().back());
+ builder.create<gpu::BarrierOp>(loc);
+ insertCopyLoops(builder, loc, fromView, to, from);
+}
+
+/// Promotes a function argument to workgroup memory in the given function. The
+/// copies will be inserted at the beginning and at the end of the function.
+void mlir::promoteToWorkgroupMemory(GPUFuncOp op, unsigned arg) {
+ Value value = op.getArgument(arg);
+ auto type = value.getType().dyn_cast<MemRefType>();
+ assert(type && type.hasStaticShape() && "can only promote memrefs");
+
+ Value attribution =
+ op.addWorkgroupAttribution(type.getShape(), type.getElementType());
+
+ // Replace the uses first since only the original uses are currently present.
+ // Then insert the copies.
+ value.replaceAllUsesWith(attribution);
+ insertCopies(op.getBody(), op.getLoc(), value, attribution);
+}
diff --git a/mlir/lib/IR/Block.cpp b/mlir/lib/IR/Block.cpp
index b0ada9981a8..2757c505555 100644
--- a/mlir/lib/IR/Block.cpp
+++ b/mlir/lib/IR/Block.cpp
@@ -179,6 +179,20 @@ void Block::eraseArgument(unsigned index, bool updatePredTerms) {
}
}
+/// Insert one value at the given position in the argument list. The existing
+/// arguments are shifted. The block is expected not to have predecessors.
+BlockArgument Block::insertArgument(args_iterator it, Type type) {
+ assert(llvm::empty(getPredecessors()) &&
+ "cannot insert arguments to blocks with predecessors");
+
+ // Use the args_iterator (on the BlockArgListType) to compute the insertion
+ // iterator in the underlying argument storage.
+ size_t distance = std::distance(args_begin(), it);
+ auto arg = BlockArgument::create(type, this);
+ arguments.insert(std::next(arguments.begin(), distance), arg);
+ return arg;
+}
+
//===----------------------------------------------------------------------===//
// Terminator management
//===----------------------------------------------------------------------===//
diff --git a/mlir/test/Dialect/GPU/promotion.mlir b/mlir/test/Dialect/GPU/promotion.mlir
new file mode 100644
index 00000000000..c06174e0fcd
--- /dev/null
+++ b/mlir/test/Dialect/GPU/promotion.mlir
@@ -0,0 +1,119 @@
+// RUN: mlir-opt -test-gpu-memory-promotion -split-input-file %s | FileCheck %s
+
+module @foo attributes {gpu.kernel_module} {
+ // Verify that the attribution was indeed introduced
+ // CHECK-LABEL: @memref3d
+ // CHECK-SAME: (%[[arg:.*]]: memref<5x4xf32>
+ // CHECK-SAME: workgroup(%[[promoted:.*]] : memref<5x4xf32, 3>)
+ gpu.func @memref3d(%arg0: memref<5x4xf32> {gpu.test_promote_workgroup}) kernel {
+ // Verify that loop bounds are emitted, the order does not matter.
+ // CHECK-DAG: %[[c1:.*]] = constant 1
+ // CHECK-DAG: %[[c4:.*]] = constant 4
+ // CHECK-DAG: %[[c5:.*]] = constant 5
+ // CHECK-DAG: %[[tx:.*]] = "gpu.thread_id"() {dimension = "x"}
+ // CHECK-DAG: %[[ty:.*]] = "gpu.thread_id"() {dimension = "y"}
+ // CHECK-DAG: %[[tz:.*]] = "gpu.thread_id"() {dimension = "z"}
+ // CHECK-DAG: %[[bdx:.*]] = "gpu.block_dim"() {dimension = "x"}
+ // CHECK-DAG: %[[bdy:.*]] = "gpu.block_dim"() {dimension = "y"}
+ // CHECK-DAG: %[[bdz:.*]] = "gpu.block_dim"() {dimension = "z"}
+
+ // Verify that loops for the copy are emitted. We only check the number of
+ // loops here since their bounds are produced by mapLoopToProcessorIds,
+ // tested separately.
+ // CHECK: loop.for %[[i0:.*]] =
+ // CHECK: loop.for %[[i1:.*]] =
+ // CHECK: loop.for %[[i2:.*]] =
+
+ // Verify that the copy is emitted and uses only the last two loops.
+ // CHECK: %[[v:.*]] = load %[[arg]][%[[i1]], %[[i2]]]
+ // CHECK: store %[[v]], %[[promoted]][%[[i1]], %[[i2]]]
+
+ // Verify that the use has been rewritten.
+ // CHECK: "use"(%[[promoted]]) : (memref<5x4xf32, 3>)
+ "use"(%arg0) : (memref<5x4xf32>) -> ()
+
+
+ // Verify that loops for the copy are emitted. We only check the number of
+ // loops here since their bounds are produced by mapLoopToProcessorIds,
+ // tested separately.
+ // CHECK: loop.for %[[i0:.*]] =
+ // CHECK: loop.for %[[i1:.*]] =
+ // CHECK: loop.for %[[i2:.*]] =
+
+ // Verify that the copy is emitted and uses only the last two loops.
+ // CHECK: %[[v:.*]] = load %[[promoted]][%[[i1]], %[[i2]]]
+ // CHECK: store %[[v]], %[[arg]][%[[i1]], %[[i2]]]
+ gpu.return
+ }
+}
+
+// -----
+
+module @foo attributes {gpu.kernel_module} {
+ // Verify that the attribution was indeed introduced
+ // CHECK-LABEL: @memref5d
+ // CHECK-SAME: (%[[arg:.*]]: memref<8x7x6x5x4xf32>
+ // CHECK-SAME: workgroup(%[[promoted:.*]] : memref<8x7x6x5x4xf32, 3>)
+ gpu.func @memref5d(%arg0: memref<8x7x6x5x4xf32> {gpu.test_promote_workgroup}) kernel {
+ // Verify that loop bounds are emitted, the order does not matter.
+ // CHECK-DAG: %[[c0:.*]] = constant 0
+ // CHECK-DAG: %[[c1:.*]] = constant 1
+ // CHECK-DAG: %[[c4:.*]] = constant 4
+ // CHECK-DAG: %[[c5:.*]] = constant 5
+ // CHECK-DAG: %[[c6:.*]] = constant 6
+ // CHECK-DAG: %[[c7:.*]] = constant 7
+ // CHECK-DAG: %[[c8:.*]] = constant 8
+ // CHECK-DAG: %[[tx:.*]] = "gpu.thread_id"() {dimension = "x"}
+ // CHECK-DAG: %[[ty:.*]] = "gpu.thread_id"() {dimension = "y"}
+ // CHECK-DAG: %[[tz:.*]] = "gpu.thread_id"() {dimension = "z"}
+ // CHECK-DAG: %[[bdx:.*]] = "gpu.block_dim"() {dimension = "x"}
+ // CHECK-DAG: %[[bdy:.*]] = "gpu.block_dim"() {dimension = "y"}
+ // CHECK-DAG: %[[bdz:.*]] = "gpu.block_dim"() {dimension = "z"}
+
+ // Verify that loops for the copy are emitted.
+ // CHECK: loop.for %[[i0:.*]] =
+ // CHECK: loop.for %[[i1:.*]] =
+ // CHECK: loop.for %[[i2:.*]] =
+ // CHECK: loop.for %[[i3:.*]] =
+ // CHECK: loop.for %[[i4:.*]] =
+
+ // Verify that the copy is emitted.
+ // CHECK: %[[v:.*]] = load %[[arg]][%[[i0]], %[[i1]], %[[i2]], %[[i3]], %[[i4]]]
+ // CHECK: store %[[v]], %[[promoted]][%[[i0]], %[[i1]], %[[i2]], %[[i3]], %[[i4]]]
+
+ // Verify that the use has been rewritten.
+ // CHECK: "use"(%[[promoted]]) : (memref<8x7x6x5x4xf32, 3>)
+ "use"(%arg0) : (memref<8x7x6x5x4xf32>) -> ()
+
+ // Verify that loops for the copy are emitted.
+ // CHECK: loop.for %[[i0:.*]] =
+ // CHECK: loop.for %[[i1:.*]] =
+ // CHECK: loop.for %[[i2:.*]] =
+ // CHECK: loop.for %[[i3:.*]] =
+ // CHECK: loop.for %[[i4:.*]] =
+
+ // Verify that the copy is emitted.
+ // CHECK: %[[v:.*]] = load %[[promoted]][%[[i0]], %[[i1]], %[[i2]], %[[i3]], %[[i4]]]
+ // CHECK: store %[[v]], %[[arg]][%[[i0]], %[[i1]], %[[i2]], %[[i3]], %[[i4]]]
+ gpu.return
+ }
+}
+
+// -----
+
+module @foo attributes {gpu.kernel_module} {
+ // Check that attribution insertion works fine.
+ // CHECK-LABEL: @insert
+ // CHECK-SAME: (%{{.*}}: memref<4xf32>
+ // CHECK-SAME: workgroup(%{{.*}}: memref<1x1xf64, 3>
+ // CHECK-SAME: %[[wg2:.*]] : memref<4xf32, 3>)
+ // CHECK-SAME: private(%{{.*}}: memref<1x1xi64, 5>)
+ gpu.func @insert(%arg0: memref<4xf32> {gpu.test_promote_workgroup})
+ workgroup(%arg1: memref<1x1xf64, 3>)
+ private(%arg2: memref<1x1xi64, 5>)
+ kernel {
+ // CHECK: "use"(%[[wg2]])
+ "use"(%arg0) : (memref<4xf32>) -> ()
+ gpu.return
+ }
+}
diff --git a/mlir/test/lib/Transforms/CMakeLists.txt b/mlir/test/lib/Transforms/CMakeLists.txt
index b6338e1d167..ac4a4930e5a 100644
--- a/mlir/test/lib/Transforms/CMakeLists.txt
+++ b/mlir/test/lib/Transforms/CMakeLists.txt
@@ -2,6 +2,7 @@ add_llvm_library(MLIRTestTransforms
TestCallGraph.cpp
TestConstantFold.cpp
TestLoopFusion.cpp
+ TestGpuMemoryPromotion.cpp
TestInlining.cpp
TestLinalgTransforms.cpp
TestLiveness.cpp
@@ -26,6 +27,8 @@ add_dependencies(MLIRTestTransforms MLIRTestVectorTransformPatternsIncGen)
target_link_libraries(MLIRTestTransforms
MLIRAffineOps
MLIRAnalysis
+ MLIREDSC
+ MLIRGPU
MLIRLoopOps
MLIRPass
MLIRTestDialect
diff --git a/mlir/test/lib/Transforms/TestGpuMemoryPromotion.cpp b/mlir/test/lib/Transforms/TestGpuMemoryPromotion.cpp
new file mode 100644
index 00000000000..ee0291827fa
--- /dev/null
+++ b/mlir/test/lib/Transforms/TestGpuMemoryPromotion.cpp
@@ -0,0 +1,40 @@
+//===- TestGPUMemoryPromotionPass.cpp - Test pass for GPU promotion -------===//
+//
+// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the pass testing the utilities for moving data across
+// different levels of the GPU memory hierarchy.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/GPU/GPUDialect.h"
+#include "mlir/Dialect/GPU/MemoryPromotion.h"
+#include "mlir/IR/Attributes.h"
+#include "mlir/Pass/Pass.h"
+
+using namespace mlir;
+
+namespace {
+/// Simple pass for testing the promotion to workgroup memory in GPU functions.
+/// Promotes all arguments with "gpu.test_promote_workgroup" attribute. This
+/// does not check whether the promotion is legal (e.g., amount of memory used)
+/// or beneficial (e.g., makes previously uncoalesced loads coalesced).
+class TestGpuMemoryPromotionPass
+ : public OperationPass<TestGpuMemoryPromotionPass, gpu::GPUFuncOp> {
+ void runOnOperation() override {
+ gpu::GPUFuncOp op = getOperation();
+ for (unsigned i = 0, e = op.getNumArguments(); i < e; ++i) {
+ if (op.getArgAttrOfType<UnitAttr>(i, "gpu.test_promote_workgroup"))
+ promoteToWorkgroupMemory(op, i);
+ }
+ }
+};
+} // end namespace
+
+static PassRegistration<TestGpuMemoryPromotionPass> registration(
+ "test-gpu-memory-promotion",
+ "Promotes the annotated arguments of gpu.func to workgroup memory.");