 mlir/include/mlir/Dialect/GPU/GPUDialect.h          |   5
 mlir/include/mlir/Dialect/GPU/GPUOps.td             |   4
 mlir/include/mlir/Dialect/GPU/MemoryPromotion.h     |  29
 mlir/include/mlir/IR/Block.h                        |   5
 mlir/lib/Dialect/GPU/CMakeLists.txt                 |  20
 mlir/lib/Dialect/GPU/IR/GPUDialect.cpp              |  18
 mlir/lib/Dialect/GPU/Transforms/MemoryPromotion.cpp | 173
 mlir/lib/IR/Block.cpp                               |  14
 mlir/test/Dialect/GPU/promotion.mlir                | 119
 mlir/test/lib/Transforms/CMakeLists.txt             |   3
 mlir/test/lib/Transforms/TestGpuMemoryPromotion.cpp |  40
 11 files changed, 428 insertions(+), 2 deletions(-)
diff --git a/mlir/include/mlir/Dialect/GPU/GPUDialect.h b/mlir/include/mlir/Dialect/GPU/GPUDialect.h
index 1776ff71980..a21b5148772 100644
--- a/mlir/include/mlir/Dialect/GPU/GPUDialect.h
+++ b/mlir/include/mlir/Dialect/GPU/GPUDialect.h
@@ -53,6 +53,11 @@ public:
/// 'gpu.kernel' attribute.
static bool isKernel(Operation *op);
+ /// Returns the number of workgroup (thread, block) dimensions supported in
+ /// the GPU dialect.
+ // TODO(zinenko,herhut): consider generalizing this.
+ static unsigned getNumWorkgroupDimensions() { return 3; }
+
/// Returns the numeric value used to identify the workgroup memory address
/// space.
static unsigned getWorkgroupAddressSpace() { return 3; }
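A minimal sketch (not part of the patch) of how the two static helpers above can be combined to build a buffer type living in workgroup memory. The helper name, element type, and uniform size are hypothetical; the MemRefType::get overload is the same one used later in this commit.

static MemRefType makeWorkgroupBufferType(Builder &builder, int64_t size) {
  // One dimension per GPU workgroup dimension, all with the same illustrative size.
  SmallVector<int64_t, 3> shape(gpu::GPUDialect::getNumWorkgroupDimensions(), size);
  // Place the buffer in the GPU workgroup (shared) address space.
  return MemRefType::get(shape, builder.getF32Type(),
                         /*affineMapComposition=*/{},
                         gpu::GPUDialect::getWorkgroupAddressSpace());
}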
diff --git a/mlir/include/mlir/Dialect/GPU/GPUOps.td b/mlir/include/mlir/Dialect/GPU/GPUOps.td
index b5b93e9b553..766ddbf202c 100644
--- a/mlir/include/mlir/Dialect/GPU/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/GPUOps.td
@@ -117,6 +117,10 @@ def GPU_GPUFuncOp : GPU_Op<"func", [FunctionLike, IsolatedFromAbove, Symbol]> {
];
let extraClassDeclaration = [{
+ /// Adds a workgroup attribution of the MemRef type with the given shape and
+ /// element type.
+ Value addWorkgroupAttribution(ArrayRef<int64_t> shape, Type elementType);
+
/// Returns `true` if the GPU function defined by this Op is a kernel, i.e.
/// it is intended to be launched from host.
bool isKernel() {
diff --git a/mlir/include/mlir/Dialect/GPU/MemoryPromotion.h b/mlir/include/mlir/Dialect/GPU/MemoryPromotion.h
new file mode 100644
index 00000000000..09c1371708f
--- /dev/null
+++ b/mlir/include/mlir/Dialect/GPU/MemoryPromotion.h
@@ -0,0 +1,29 @@
+//===- MemoryPromotion.h - Utilities for moving data across GPU -*- C++ -*-===//
+//
+// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This header file declares the utility functions that generate IR copying
+// the data between different levels of memory hierarchy.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_DIALECT_GPU_MEMORYPROMOTION_H
+#define MLIR_DIALECT_GPU_MEMORYPROMOTION_H
+
+namespace mlir {
+
+namespace gpu {
+class GPUFuncOp;
+}
+
+/// Promotes a function argument to workgroup memory in the given function. The
+/// copies will be inserted at the beginning and at the end of the function.
+void promoteToWorkgroupMemory(gpu::GPUFuncOp op, unsigned arg);
+
+} // end namespace mlir
+
+#endif // MLIR_DIALECT_GPU_MEMORYPROMOTION_H
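A hedged sketch (not part of the patch) of the intended call pattern: promote every statically shaped memref argument of a gpu.func. The helper name is hypothetical; the static-shape check mirrors the precondition asserted in the implementation further down in this commit.

static void promoteAllStaticMemRefArgs(gpu::GPUFuncOp func) {
  for (unsigned i = 0, e = func.getNumArguments(); i < e; ++i) {
    // promoteToWorkgroupMemory only supports statically shaped memrefs.
    auto memrefType = func.getArgument(i).getType().dyn_cast<MemRefType>();
    if (memrefType && memrefType.hasStaticShape())
      promoteToWorkgroupMemory(func, i);
  }
}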
diff --git a/mlir/include/mlir/IR/Block.h b/mlir/include/mlir/IR/Block.h
index c868148f95e..2d3eb18d729 100644
--- a/mlir/include/mlir/IR/Block.h
+++ b/mlir/include/mlir/IR/Block.h
@@ -79,6 +79,11 @@ public:
/// Add one value to the argument list.
BlockArgument addArgument(Type type);
+ /// Insert one value at the position in the argument list indicated by the
+ /// given iterator. The existing arguments are shifted. The block is expected
+ /// not to have predecessors.
+ BlockArgument insertArgument(args_iterator it, Type type);
+
/// Add one argument to the argument list for each type specified in the list.
iterator_range<args_iterator> addArguments(ArrayRef<Type> types);
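A small sketch (not part of the patch) of the new Block API, assuming a block with no predecessors: insert an index-typed argument in front of the existing ones. The wrapper function is hypothetical.

// Prepend an argument of index type; existing arguments shift right.
static BlockArgument prependIndexArgument(Block &block, Builder &builder) {
  return block.insertArgument(block.args_begin(), builder.getIndexType());
}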
diff --git a/mlir/lib/Dialect/GPU/CMakeLists.txt b/mlir/lib/Dialect/GPU/CMakeLists.txt
index 6fe45ba49ef..dbf05ac6ace 100644
--- a/mlir/lib/Dialect/GPU/CMakeLists.txt
+++ b/mlir/lib/Dialect/GPU/CMakeLists.txt
@@ -2,9 +2,25 @@ add_llvm_library(MLIRGPU
IR/GPUDialect.cpp
IR/DialectRegistration.cpp
Transforms/KernelOutlining.cpp
+ Transforms/MemoryPromotion.cpp
ADDITIONAL_HEADER_DIRS
${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/GPU
)
-add_dependencies(MLIRGPU MLIRGPUOpsIncGen MLIRIR MLIRLLVMIR LLVMSupport)
-target_link_libraries(MLIRGPU MLIRIR MLIRLLVMIR MLIRStandardOps LLVMSupport)
+add_dependencies(MLIRGPU
+ MLIRGPUOpsIncGen
+ MLIREDSC
+ MLIRIR
+ MLIRLLVMIR
+ MLIRLoopOps
+ MLIRSupport
+ MLIRTransformUtils
+ LLVMSupport)
+target_link_libraries(MLIRGPU
+ MLIREDSC
+ MLIRIR
+ MLIRLLVMIR
+ MLIRLoopOps
+ MLIRSupport
+ MLIRTransformUtils
+ LLVMSupport)
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
index bda8032fc21..32d7fae65d9 100644
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -593,6 +593,24 @@ LogicalResult verify(LaunchFuncOp op) {
// GPUFuncOp
//===----------------------------------------------------------------------===//
+/// Adds a workgroup attribution to "op" of the MemRef type with the given shape
+/// and element type.
+Value GPUFuncOp::addWorkgroupAttribution(ArrayRef<int64_t> shape,
+ Type elementType) {
+ unsigned pos = getNumFuncArguments() + getNumWorkgroupAttributions();
+ Block &bodyBlock = body().front();
+ Value attribution = bodyBlock.insertArgument(
+ std::next(bodyBlock.args_begin(), pos),
+ MemRefType::get(shape, elementType, /*affineMapComposition=*/{},
+ GPUDialect::getWorkgroupAddressSpace()));
+ auto numWorkgroupBuffersAttr =
+ getAttrOfType<IntegerAttr>(getNumWorkgroupAttributionsAttrName());
+ setAttr(getNumWorkgroupAttributionsAttrName(),
+ IntegerAttr::get(numWorkgroupBuffersAttr.getType(),
+ numWorkgroupBuffersAttr.getValue() + 1));
+ return attribution;
+}
+
void GPUFuncOp::build(Builder *builder, OperationState &result, StringRef name,
FunctionType type, ArrayRef<Type> workgroupAttributions,
ArrayRef<Type> privateAttributions,
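A brief sketch (not part of the patch) of how a transform could use the new addWorkgroupAttribution method; the tile shape and helper name are hypothetical. This is the same pattern promoteToWorkgroupMemory follows in the file below.

static Value addTileBuffer(gpu::GPUFuncOp funcOp, Type elementType) {
  // Request a fixed 32x32 workgroup-memory buffer; real sizes are target-dependent.
  return funcOp.addWorkgroupAttribution({32, 32}, elementType);
}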
diff --git a/mlir/lib/Dialect/GPU/Transforms/MemoryPromotion.cpp b/mlir/lib/Dialect/GPU/Transforms/MemoryPromotion.cpp
new file mode 100644
index 00000000000..f01a430a216
--- /dev/null
+++ b/mlir/lib/Dialect/GPU/Transforms/MemoryPromotion.cpp
@@ -0,0 +1,173 @@
+//===- MemoryPromotion.cpp - Utilities for moving data across GPU memories ===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements utilities that allow one to create IR moving the data
+// across different levels of the GPU memory hierarchy.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/GPU/MemoryPromotion.h"
+#include "mlir/Dialect/GPU/GPUDialect.h"
+#include "mlir/Dialect/LoopOps/LoopOps.h"
+#include "mlir/EDSC/Builders.h"
+#include "mlir/EDSC/Helpers.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Support/Functional.h"
+#include "mlir/Transforms/LoopUtils.h"
+
+using namespace mlir;
+using namespace mlir::gpu;
+
+/// Returns the textual name of a GPU dimension.
+static StringRef getDimName(unsigned dim) {
+ if (dim == 0)
+ return "x";
+ if (dim == 1)
+ return "y";
+ if (dim == 2)
+ return "z";
+
+ llvm_unreachable("dimension ID overflow");
+}
+
+/// Emits the (imperfect) loop nest performing the copy between "from" and "to"
+/// values using the bounds derived from the "from" value. Emits at least
+/// GPUDialect::getNumWorkgroupDimensions() loops, completing the nest with
+/// single-iteration loops. Maps the innermost loops to thread dimensions, in
+/// reverse order to enable access coalescing in the innermost loop.
+static void insertCopyLoops(OpBuilder &builder, Location loc,
+ edsc::MemRefView &bounds, Value from, Value to) {
+ // Create EDSC handles for bounds.
+ unsigned rank = bounds.rank();
+ SmallVector<edsc::ValueHandle, 4> lbs, ubs, steps;
+
+ // Make sure we have enough loops to use all thread dimensions; these trivial
+ // loops should be outermost and therefore inserted first.
+ if (rank < GPUDialect::getNumWorkgroupDimensions()) {
+ unsigned extraLoops = GPUDialect::getNumWorkgroupDimensions() - rank;
+ edsc::ValueHandle zero = edsc::intrinsics::constant_index(0);
+ edsc::ValueHandle one = edsc::intrinsics::constant_index(1);
+ lbs.resize(extraLoops, zero);
+ ubs.resize(extraLoops, one);
+ steps.resize(extraLoops, one);
+ }
+
+ // Add existing bounds.
+ lbs.append(bounds.getLbs().begin(), bounds.getLbs().end());
+ ubs.append(bounds.getUbs().begin(), bounds.getUbs().end());
+
+ // Emit constant operations for steps.
+ steps.reserve(lbs.size());
+ llvm::transform(
+ bounds.getSteps(), std::back_inserter(steps),
+ [](int64_t step) { return edsc::intrinsics::constant_index(step); });
+
+ // Obtain thread identifiers and block sizes, necessary to map to them.
+ auto indexType = builder.getIndexType();
+ SmallVector<Value, 3> threadIds, blockDims;
+ for (unsigned i = 0; i < 3; ++i) {
+ auto dimName = builder.getStringAttr(getDimName(i));
+ threadIds.push_back(
+ builder.create<gpu::ThreadIdOp>(loc, indexType, dimName));
+ blockDims.push_back(
+ builder.create<gpu::BlockDimOp>(loc, indexType, dimName));
+ }
+
+ // Produce the loop nest with copies.
+ auto ivs = edsc::makeIndexHandles(lbs.size());
+ auto ivPtrs =
+ edsc::makeHandlePointers(MutableArrayRef<edsc::IndexHandle>(ivs));
+ edsc::LoopNestBuilder(ivPtrs, lbs, ubs, steps)([&]() {
+ auto activeIvs = llvm::makeArrayRef(ivs).take_back(rank);
+ edsc::StdIndexedValue fromHandle(from), toHandle(to);
+ toHandle(activeIvs) = fromHandle(activeIvs);
+ });
+
+ // Map the innermost loops to threads in reverse order.
+ for (auto en :
+ llvm::enumerate(llvm::reverse(llvm::makeArrayRef(ivs).take_back(
+ GPUDialect::getNumWorkgroupDimensions())))) {
+ auto loop = cast<loop::ForOp>(
+ en.value().getValue().getParentRegion()->getParentOp());
+ mapLoopToProcessorIds(loop, {threadIds[en.index()]},
+ {blockDims[en.index()]});
+ }
+}
+
+/// Emits the loop nests performing the copy to the designated location at the
+/// beginning of the region, and from the designated location immediately before
+/// the terminator of the first block of the region. The region is expected to
+/// have one block. This boils down to the following structure
+///
+/// ^bb(...):
+/// <loop-bound-computation>
+/// for %arg0 = ... to ... step ... {
+/// ...
+/// for %argN = <thread-id-x> to ... step <block-dim-x> {
+/// %0 = load %from[%arg0, ..., %argN]
+/// store %0, %to[%arg0, ..., %argN]
+/// }
+/// ...
+/// }
+/// gpu.barrier
+/// <... original body ...>
+/// gpu.barrier
+/// for %arg0 = ... to ... step ... {
+/// ...
+/// for %argN = <thread-id-x> to ... step <block-dim-x> {
+/// %1 = load %to[%arg0, ..., %argN]
+/// store %1, %from[%arg0, ..., %argN]
+/// }
+/// ...
+/// }
+///
+/// Inserts the barriers unconditionally since different threads may be copying
+/// values and reading them. An analysis would be required to eliminate barriers
+/// in the case where a value is only used by the thread that copies it. Both
+/// copies are inserted unconditionally; an analysis would be required to only copy
+/// live-in and live-out values when necessary. This copies the entire memref
+/// pointed to by "from". In case a smaller block would be sufficient, the
+/// caller can create a subview of the memref and promote it instead.
+static void insertCopies(Region &region, Location loc, Value from, Value to) {
+ auto fromType = from.getType().cast<MemRefType>();
+ auto toType = to.getType().cast<MemRefType>();
+ (void)fromType;
+ (void)toType;
+ assert(fromType.getShape() == toType.getShape());
+ assert(fromType.getRank() != 0);
+ assert(has_single_element(region) &&
+ "unstructured control flow not supported");
+
+ OpBuilder builder(region.getContext());
+ builder.setInsertionPointToStart(&region.front());
+
+ edsc::ScopedContext edscContext(builder, loc);
+ edsc::MemRefView fromView(from);
+ insertCopyLoops(builder, loc, fromView, from, to);
+ builder.create<gpu::BarrierOp>(loc);
+
+ builder.setInsertionPoint(&region.front().back());
+ builder.create<gpu::BarrierOp>(loc);
+ insertCopyLoops(builder, loc, fromView, to, from);
+}
+
+/// Promotes a function argument to workgroup memory in the given function. The
+/// copies will be inserted at the beginning and at the end of the function.
+void mlir::promoteToWorkgroupMemory(GPUFuncOp op, unsigned arg) {
+ Value value = op.getArgument(arg);
+ auto type = value.getType().dyn_cast<MemRefType>();
+ assert(type && type.hasStaticShape() && "can only promote memrefs");
+
+ Value attribution =
+ op.addWorkgroupAttribution(type.getShape(), type.getElementType());
+
+ // Replace the uses first since only the original uses are currently present.
+ // Then insert the copies.
+ value.replaceAllUsesWith(attribution);
+ insertCopies(op.getBody(), op.getLoc(), value, attribution);
+}
diff --git a/mlir/lib/IR/Block.cpp b/mlir/lib/IR/Block.cpp
index b0ada9981a8..2757c505555 100644
--- a/mlir/lib/IR/Block.cpp
+++ b/mlir/lib/IR/Block.cpp
@@ -179,6 +179,20 @@ void Block::eraseArgument(unsigned index, bool updatePredTerms) {
}
}
+/// Insert one value at the given position in the argument list. The existing
+/// arguments are shifted. The block is expected not to have predecessors.
+BlockArgument Block::insertArgument(args_iterator it, Type type) {
+ assert(llvm::empty(getPredecessors()) &&
+ "cannot insert arguments to blocks with predecessors");
+
+ // Use the args_iterator (on the BlockArgListType) to compute the insertion
+ // iterator in the underlying argument storage.
+ size_t distance = std::distance(args_begin(), it);
+ auto arg = BlockArgument::create(type, this);
+ arguments.insert(std::next(arguments.begin(), distance), arg);
+ return arg;
+}
+
//===----------------------------------------------------------------------===//
// Terminator management
//===----------------------------------------------------------------------===//
diff --git a/mlir/test/Dialect/GPU/promotion.mlir b/mlir/test/Dialect/GPU/promotion.mlir
new file mode 100644
index 00000000000..c06174e0fcd
--- /dev/null
+++ b/mlir/test/Dialect/GPU/promotion.mlir
@@ -0,0 +1,119 @@
+// RUN: mlir-opt -test-gpu-memory-promotion -split-input-file %s | FileCheck %s
+
+module @foo attributes {gpu.kernel_module} {
+ // Verify that the attribution was indeed introduced
+ // CHECK-LABEL: @memref3d
+ // CHECK-SAME: (%[[arg:.*]]: memref<5x4xf32>
+ // CHECK-SAME: workgroup(%[[promoted:.*]] : memref<5x4xf32, 3>)
+ gpu.func @memref3d(%arg0: memref<5x4xf32> {gpu.test_promote_workgroup}) kernel {
+ // Verify that loop bounds are emitted, the order does not matter.
+ // CHECK-DAG: %[[c1:.*]] = constant 1
+ // CHECK-DAG: %[[c4:.*]] = constant 4
+ // CHECK-DAG: %[[c5:.*]] = constant 5
+ // CHECK-DAG: %[[tx:.*]] = "gpu.thread_id"() {dimension = "x"}
+ // CHECK-DAG: %[[ty:.*]] = "gpu.thread_id"() {dimension = "y"}
+ // CHECK-DAG: %[[tz:.*]] = "gpu.thread_id"() {dimension = "z"}
+ // CHECK-DAG: %[[bdx:.*]] = "gpu.block_dim"() {dimension = "x"}
+ // CHECK-DAG: %[[bdy:.*]] = "gpu.block_dim"() {dimension = "y"}
+ // CHECK-DAG: %[[bdz:.*]] = "gpu.block_dim"() {dimension = "z"}
+
+ // Verify that loops for the copy are emitted. We only check the number of
+ // loops here since their bounds are produced by mapLoopToProcessorIds,
+ // tested separately.
+ // CHECK: loop.for %[[i0:.*]] =
+ // CHECK: loop.for %[[i1:.*]] =
+ // CHECK: loop.for %[[i2:.*]] =
+
+ // Verify that the copy is emitted and uses only the last two loops.
+ // CHECK: %[[v:.*]] = load %[[arg]][%[[i1]], %[[i2]]]
+ // CHECK: store %[[v]], %[[promoted]][%[[i1]], %[[i2]]]
+
+ // Verify that the use has been rewritten.
+ // CHECK: "use"(%[[promoted]]) : (memref<5x4xf32, 3>)
+ "use"(%arg0) : (memref<5x4xf32>) -> ()
+
+
+ // Verify that loops for the copy are emitted. We only check the number of
+ // loops here since their bounds are produced by mapLoopToProcessorIds,
+ // tested separately.
+ // CHECK: loop.for %[[i0:.*]] =
+ // CHECK: loop.for %[[i1:.*]] =
+ // CHECK: loop.for %[[i2:.*]] =
+
+ // Verify that the copy is emitted and uses only the last two loops.
+ // CHECK: %[[v:.*]] = load %[[promoted]][%[[i1]], %[[i2]]]
+ // CHECK: store %[[v]], %[[arg]][%[[i1]], %[[i2]]]
+ gpu.return
+ }
+}
+
+// -----
+
+module @foo attributes {gpu.kernel_module} {
+ // Verify that the attribution was indeed introduced
+ // CHECK-LABEL: @memref5d
+ // CHECK-SAME: (%[[arg:.*]]: memref<8x7x6x5x4xf32>
+ // CHECK-SAME: workgroup(%[[promoted:.*]] : memref<8x7x6x5x4xf32, 3>)
+ gpu.func @memref5d(%arg0: memref<8x7x6x5x4xf32> {gpu.test_promote_workgroup}) kernel {
+ // Verify that loop bounds are emitted, the order does not matter.
+ // CHECK-DAG: %[[c0:.*]] = constant 0
+ // CHECK-DAG: %[[c1:.*]] = constant 1
+ // CHECK-DAG: %[[c4:.*]] = constant 4
+ // CHECK-DAG: %[[c5:.*]] = constant 5
+ // CHECK-DAG: %[[c6:.*]] = constant 6
+ // CHECK-DAG: %[[c7:.*]] = constant 7
+ // CHECK-DAG: %[[c8:.*]] = constant 8
+ // CHECK-DAG: %[[tx:.*]] = "gpu.thread_id"() {dimension = "x"}
+ // CHECK-DAG: %[[ty:.*]] = "gpu.thread_id"() {dimension = "y"}
+ // CHECK-DAG: %[[tz:.*]] = "gpu.thread_id"() {dimension = "z"}
+ // CHECK-DAG: %[[bdx:.*]] = "gpu.block_dim"() {dimension = "x"}
+ // CHECK-DAG: %[[bdy:.*]] = "gpu.block_dim"() {dimension = "y"}
+ // CHECK-DAG: %[[bdz:.*]] = "gpu.block_dim"() {dimension = "z"}
+
+ // Verify that loops for the copy are emitted.
+ // CHECK: loop.for %[[i0:.*]] =
+ // CHECK: loop.for %[[i1:.*]] =
+ // CHECK: loop.for %[[i2:.*]] =
+ // CHECK: loop.for %[[i3:.*]] =
+ // CHECK: loop.for %[[i4:.*]] =
+
+ // Verify that the copy is emitted.
+ // CHECK: %[[v:.*]] = load %[[arg]][%[[i0]], %[[i1]], %[[i2]], %[[i3]], %[[i4]]]
+ // CHECK: store %[[v]], %[[promoted]][%[[i0]], %[[i1]], %[[i2]], %[[i3]], %[[i4]]]
+
+ // Verify that the use has been rewritten.
+ // CHECK: "use"(%[[promoted]]) : (memref<8x7x6x5x4xf32, 3>)
+ "use"(%arg0) : (memref<8x7x6x5x4xf32>) -> ()
+
+ // Verify that loops for the copy are emitted.
+ // CHECK: loop.for %[[i0:.*]] =
+ // CHECK: loop.for %[[i1:.*]] =
+ // CHECK: loop.for %[[i2:.*]] =
+ // CHECK: loop.for %[[i3:.*]] =
+ // CHECK: loop.for %[[i4:.*]] =
+
+ // Verify that the copy is emitted.
+ // CHECK: %[[v:.*]] = load %[[promoted]][%[[i0]], %[[i1]], %[[i2]], %[[i3]], %[[i4]]]
+ // CHECK: store %[[v]], %[[arg]][%[[i0]], %[[i1]], %[[i2]], %[[i3]], %[[i4]]]
+ gpu.return
+ }
+}
+
+// -----
+
+module @foo attributes {gpu.kernel_module} {
+ // Check that attribution insertion works fine.
+ // CHECK-LABEL: @insert
+ // CHECK-SAME: (%{{.*}}: memref<4xf32>
+ // CHECK-SAME: workgroup(%{{.*}}: memref<1x1xf64, 3>
+ // CHECK-SAME: %[[wg2:.*]] : memref<4xf32, 3>)
+ // CHECK-SAME: private(%{{.*}}: memref<1x1xi64, 5>)
+ gpu.func @insert(%arg0: memref<4xf32> {gpu.test_promote_workgroup})
+ workgroup(%arg1: memref<1x1xf64, 3>)
+ private(%arg2: memref<1x1xi64, 5>)
+ kernel {
+ // CHECK: "use"(%[[wg2]])
+ "use"(%arg0) : (memref<4xf32>) -> ()
+ gpu.return
+ }
+}
diff --git a/mlir/test/lib/Transforms/CMakeLists.txt b/mlir/test/lib/Transforms/CMakeLists.txt
index b6338e1d167..ac4a4930e5a 100644
--- a/mlir/test/lib/Transforms/CMakeLists.txt
+++ b/mlir/test/lib/Transforms/CMakeLists.txt
@@ -2,6 +2,7 @@ add_llvm_library(MLIRTestTransforms
TestCallGraph.cpp
TestConstantFold.cpp
TestLoopFusion.cpp
+ TestGpuMemoryPromotion.cpp
TestInlining.cpp
TestLinalgTransforms.cpp
TestLiveness.cpp
@@ -26,6 +27,8 @@ add_dependencies(MLIRTestTransforms MLIRTestVectorTransformPatternsIncGen)
target_link_libraries(MLIRTestTransforms
MLIRAffineOps
MLIRAnalysis
+ MLIREDSC
+ MLIRGPU
MLIRLoopOps
MLIRPass
MLIRTestDialect
diff --git a/mlir/test/lib/Transforms/TestGpuMemoryPromotion.cpp b/mlir/test/lib/Transforms/TestGpuMemoryPromotion.cpp
new file mode 100644
index 00000000000..ee0291827fa
--- /dev/null
+++ b/mlir/test/lib/Transforms/TestGpuMemoryPromotion.cpp
@@ -0,0 +1,40 @@
+//===- TestGPUMemoryPromotionPass.cpp - Test pass for GPU promotion -------===//
+//
+// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the pass testing the utilities for moving data across
+// different levels of the GPU memory hierarchy.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/GPU/GPUDialect.h"
+#include "mlir/Dialect/GPU/MemoryPromotion.h"
+#include "mlir/IR/Attributes.h"
+#include "mlir/Pass/Pass.h"
+
+using namespace mlir;
+
+namespace {
+/// Simple pass for testing the promotion to workgroup memory in GPU functions.
+/// Promotes all arguments with "gpu.test_promote_workgroup" attribute. This
+/// does not check whether the promotion is legal (e.g., amount of memory used)
+/// or beneficial (e.g., makes previously uncoalesced loads coalesced).
+class TestGpuMemoryPromotionPass
+ : public OperationPass<TestGpuMemoryPromotionPass, gpu::GPUFuncOp> {
+ void runOnOperation() override {
+ gpu::GPUFuncOp op = getOperation();
+ for (unsigned i = 0, e = op.getNumArguments(); i < e; ++i) {
+ if (op.getArgAttrOfType<UnitAttr>(i, "gpu.test_promote_workgroup"))
+ promoteToWorkgroupMemory(op, i);
+ }
+ }
+};
+} // end namespace
+
+static PassRegistration<TestGpuMemoryPromotionPass> registration(
+ "test-gpu-memory-promotion",
+ "Promotes the annotated arguments of gpu.func to workgroup memory.");