diff options
Diffstat (limited to 'mlir/lib/Conversion/LoopsToGPU/LoopsToGPUPass.cpp')
-rw-r--r-- | mlir/lib/Conversion/LoopsToGPU/LoopsToGPUPass.cpp | 77 |
1 file changed, 77 insertions, 0 deletions
diff --git a/mlir/lib/Conversion/LoopsToGPU/LoopsToGPUPass.cpp b/mlir/lib/Conversion/LoopsToGPU/LoopsToGPUPass.cpp
index 6d4cb9d8256..21abc3cf99b 100644
--- a/mlir/lib/Conversion/LoopsToGPU/LoopsToGPUPass.cpp
+++ b/mlir/lib/Conversion/LoopsToGPU/LoopsToGPUPass.cpp
@@ -19,11 +19,14 @@
 #include "mlir/Conversion/LoopsToGPU/LoopsToGPU.h"
 #include "mlir/Dialect/AffineOps/AffineOps.h"
 #include "mlir/Dialect/LoopOps/LoopOps.h"
+#include "mlir/Dialect/StandardOps/Ops.h"
 #include "mlir/Pass/Pass.h"
+#include "llvm/ADT/ArrayRef.h"
 #include "llvm/Support/CommandLine.h"

 #define PASS_NAME "convert-loops-to-gpu"
+#define LOOPOP_TO_GPU_PASS_NAME "convert-loop-op-to-gpu"

 using namespace mlir;
 using namespace mlir::loop;
@@ -38,6 +41,19 @@ static llvm::cl::opt<unsigned> clNumThreadDims(
     llvm::cl::desc("Number of GPU thread dimensions for mapping"),
     llvm::cl::cat(clOptionsCategory), llvm::cl::init(1u));

+static llvm::cl::OptionCategory clLoopOpToGPUCategory(LOOPOP_TO_GPU_PASS_NAME
+                                                      " options");
+static llvm::cl::list<unsigned>
+    clNumWorkGroups("gpu-num-workgroups",
+                    llvm::cl::desc("Num workgroups in the GPU launch"),
+                    llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated,
+                    llvm::cl::cat(clLoopOpToGPUCategory));
+static llvm::cl::list<unsigned>
+    clWorkGroupSize("gpu-workgroup-size",
+                    llvm::cl::desc("Workgroup Size in the GPU launch"),
+                    llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated,
+                    llvm::cl::cat(clLoopOpToGPUCategory));
+
 namespace {
 // A pass that traverses top-level loops in the function and converts them to
 // GPU launch operations. Nested launches are not allowed, so this does not
@@ -64,6 +80,50 @@ struct ForLoopMapper : public FunctionPass<ForLoopMapper> {
   unsigned numBlockDims;
   unsigned numThreadDims;
 };
+
+// A pass that traverses top-level loops in the function and converts them to
+// GPU launch operations. The top-level loops themselves do not have to be
+// perfectly nested.
The only requirement is that there be as many perfectly
+// nested loops as the size of `numWorkGroups`. Within these any loop nest has
+// to be perfectly nested up to a depth equal to the size of `workGroupSize`.
+struct ImperfectlyNestedForLoopMapper
+    : public FunctionPass<ImperfectlyNestedForLoopMapper> {
+  ImperfectlyNestedForLoopMapper(ArrayRef<int64_t> numWorkGroups,
+                                 ArrayRef<int64_t> workGroupSize)
+      : numWorkGroups(numWorkGroups.begin(), numWorkGroups.end()),
+        workGroupSize(workGroupSize.begin(), workGroupSize.end()) {}
+
+  void runOnFunction() override {
+    // Insert the num work groups and workgroup sizes as constant values. This
+    // pass is only used for testing.
+    FuncOp funcOp = getFunction();
+    OpBuilder builder(funcOp.getOperation()->getRegion(0));
+    SmallVector<Value *, 3> numWorkGroupsVal, workGroupSizeVal;
+    for (auto val : numWorkGroups) {
+      auto constOp = builder.create<ConstantOp>(
+          funcOp.getLoc(), builder.getIntegerAttr(builder.getIndexType(), val));
+      numWorkGroupsVal.push_back(constOp);
+    }
+    for (auto val : workGroupSize) {
+      auto constOp = builder.create<ConstantOp>(
+          funcOp.getLoc(), builder.getIntegerAttr(builder.getIndexType(), val));
+      workGroupSizeVal.push_back(constOp);
+    }
+    for (Block &block : getFunction()) {
+      for (Operation &op : llvm::make_early_inc_range(block)) {
+        if (auto forOp = dyn_cast<ForOp>(&op)) {
+          if (failed(convertLoopToGPULaunch(forOp, numWorkGroupsVal,
+                                            workGroupSizeVal))) {
+            return signalPassFailure();
+          }
+        }
+      }
+    }
+  }
+  SmallVector<int64_t, 3> numWorkGroups;
+  SmallVector<int64_t, 3> workGroupSize;
+};
+
 } // namespace

 std::unique_ptr<OpPassBase<FuncOp>>
@@ -72,8 +132,25 @@ mlir::createSimpleLoopsToGPUPass(unsigned numBlockDims,
   return std::make_unique<ForLoopMapper>(numBlockDims, numThreadDims);
 }

+std::unique_ptr<OpPassBase<FuncOp>>
+mlir::createLoopToGPUPass(ArrayRef<int64_t> numWorkGroups,
+                          ArrayRef<int64_t> workGroupSize) {
+  return std::make_unique<ImperfectlyNestedForLoopMapper>(numWorkGroups,
+
workGroupSize);
+}
+
 static PassRegistration<ForLoopMapper>
     registration(PASS_NAME, "Convert top-level loops to GPU kernels", [] {
       return std::make_unique<ForLoopMapper>(clNumBlockDims.getValue(),
                                              clNumThreadDims.getValue());
     });
+
+static PassRegistration<ImperfectlyNestedForLoopMapper> loopOpToGPU(
+    LOOPOP_TO_GPU_PASS_NAME, "Convert top-level loop::ForOp to GPU kernels",
+    [] {
+      SmallVector<int64_t, 3> numWorkGroups, workGroupSize;
+      numWorkGroups.assign(clNumWorkGroups.begin(), clNumWorkGroups.end());
+      workGroupSize.assign(clWorkGroupSize.begin(), clWorkGroupSize.end());
+      return std::make_unique<ImperfectlyNestedForLoopMapper>(numWorkGroups,
+                                                              workGroupSize);
+    });