summaryrefslogtreecommitdiffstats
path: root/mlir/lib/Conversion/LoopsToGPU/LoopsToGPUPass.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'mlir/lib/Conversion/LoopsToGPU/LoopsToGPUPass.cpp')
-rw-r--r--mlir/lib/Conversion/LoopsToGPU/LoopsToGPUPass.cpp77
1 files changed, 77 insertions, 0 deletions
diff --git a/mlir/lib/Conversion/LoopsToGPU/LoopsToGPUPass.cpp b/mlir/lib/Conversion/LoopsToGPU/LoopsToGPUPass.cpp
index 6d4cb9d8256..21abc3cf99b 100644
--- a/mlir/lib/Conversion/LoopsToGPU/LoopsToGPUPass.cpp
+++ b/mlir/lib/Conversion/LoopsToGPU/LoopsToGPUPass.cpp
@@ -19,11 +19,14 @@
#include "mlir/Conversion/LoopsToGPU/LoopsToGPU.h"
#include "mlir/Dialect/AffineOps/AffineOps.h"
#include "mlir/Dialect/LoopOps/LoopOps.h"
+#include "mlir/Dialect/StandardOps/Ops.h"
#include "mlir/Pass/Pass.h"
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/Support/CommandLine.h"
#define PASS_NAME "convert-loops-to-gpu"
+#define LOOPOP_TO_GPU_PASS_NAME "convert-loop-op-to-gpu"
using namespace mlir;
using namespace mlir::loop;
@@ -38,6 +41,19 @@ static llvm::cl::opt<unsigned> clNumThreadDims(
llvm::cl::desc("Number of GPU thread dimensions for mapping"),
llvm::cl::cat(clOptionsCategory), llvm::cl::init(1u));
+// Command-line options for the loop-op-to-GPU pass, grouped in their own
+// cl::OptionCategory so --help lists them under this pass.
+static llvm::cl::OptionCategory clLoopOpToGPUCategory(LOOPOP_TO_GPU_PASS_NAME
+ " options");
+// Comma-separated workgroup (grid) counts, one per mapped loop dimension.
+static llvm::cl::list<unsigned>
+ clNumWorkGroups("gpu-num-workgroups",
+ llvm::cl::desc("Num workgroups in the GPU launch"),
+ llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated,
+ llvm::cl::cat(clLoopOpToGPUCategory));
+// Comma-separated workgroup sizes, one per mapped loop dimension.
+static llvm::cl::list<unsigned>
+ clWorkGroupSize("gpu-workgroup-size",
+ llvm::cl::desc("Workgroup Size in the GPU launch"),
+ llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated,
+ llvm::cl::cat(clLoopOpToGPUCategory));
+
namespace {
// A pass that traverses top-level loops in the function and converts them to
// GPU launch operations. Nested launches are not allowed, so this does not
@@ -64,6 +80,50 @@ struct ForLoopMapper : public FunctionPass<ForLoopMapper> {
unsigned numBlockDims;
unsigned numThreadDims;
};
+
+// A pass that traverses top-level loops in the function and converts them to
+// GPU launch operations. The top-level loops themselves do not have to be
+// perfectly nested. The only requirement is that there be as many perfectly
+// nested loops as the size of `numWorkGroups`. Within these, any loop nest
+// has to be perfectly nested up to a depth equal to the size of
+// `workGroupSize`.
+struct ImperfectlyNestedForLoopMapper
+ : public FunctionPass<ImperfectlyNestedForLoopMapper> {
+ ImperfectlyNestedForLoopMapper(ArrayRef<int64_t> numWorkGroups,
+ ArrayRef<int64_t> workGroupSize)
+ : numWorkGroups(numWorkGroups.begin(), numWorkGroups.end()),
+ workGroupSize(workGroupSize.begin(), workGroupSize.end()) {}
+
+ void runOnFunction() override {
+ // Insert the num work groups and workgroup sizes as constant values. This
+ // pass is only used for testing.
+ FuncOp funcOp = getFunction();
+ OpBuilder builder(funcOp.getOperation()->getRegion(0));
+ SmallVector<Value *, 3> numWorkGroupsVal, workGroupSizeVal;
+ // Materialize each requested workgroup count as an index constant at the
+ // top of the function so it dominates all converted loops.
+ for (auto val : numWorkGroups) {
+ auto constOp = builder.create<ConstantOp>(
+ funcOp.getLoc(), builder.getIntegerAttr(builder.getIndexType(), val));
+ numWorkGroupsVal.push_back(constOp);
+ }
+ // Likewise materialize each workgroup-size value as an index constant.
+ for (auto val : workGroupSize) {
+ auto constOp = builder.create<ConstantOp>(
+ funcOp.getLoc(), builder.getIntegerAttr(builder.getIndexType(), val));
+ workGroupSizeVal.push_back(constOp);
+ }
+ // Walk top-level ops only; early-inc iteration is used because conversion
+ // replaces the visited ForOp. Any single failure aborts the pass.
+ for (Block &block : getFunction()) {
+ for (Operation &op : llvm::make_early_inc_range(block)) {
+ if (auto forOp = dyn_cast<ForOp>(&op)) {
+ if (failed(convertLoopToGPULaunch(forOp, numWorkGroupsVal,
+ workGroupSizeVal))) {
+ return signalPassFailure();
+ }
+ }
+ }
+ }
+ }
+ // Number of workgroups along each mapped dimension, captured at pass
+ // construction time.
+ SmallVector<int64_t, 3> numWorkGroups;
+ // Workgroup size along each mapped dimension.
+ SmallVector<int64_t, 3> workGroupSize;
+};
+
+
} // namespace
std::unique_ptr<OpPassBase<FuncOp>>
@@ -72,8 +132,25 @@ mlir::createSimpleLoopsToGPUPass(unsigned numBlockDims,
return std::make_unique<ForLoopMapper>(numBlockDims, numThreadDims);
}
+// Factory for the imperfectly-nested-loop-to-GPU pass: converts top-level
+// loop::ForOp nests to gpu.launch ops using the given per-dimension
+// workgroup counts and workgroup sizes.
+std::unique_ptr<OpPassBase<FuncOp>>
+mlir::createLoopToGPUPass(ArrayRef<int64_t> numWorkGroups,
+ ArrayRef<int64_t> workGroupSize) {
+ return std::make_unique<ImperfectlyNestedForLoopMapper>(numWorkGroups,
+ workGroupSize);
+}
+
+
// Registers the perfectly-nested loops-to-GPU pass under PASS_NAME; the
// block/thread dimension counts are read from the command-line options above.
static PassRegistration<ForLoopMapper>
registration(PASS_NAME, "Convert top-level loops to GPU kernels", [] {
return std::make_unique<ForLoopMapper>(clNumBlockDims.getValue(),
clNumThreadDims.getValue());
});
+
+// Registers the imperfectly-nested-loop-to-GPU pass; workgroup counts and
+// sizes come from the clNumWorkGroups/clWorkGroupSize command-line lists.
+static PassRegistration<ImperfectlyNestedForLoopMapper> loopOpToGPU(
+ LOOPOP_TO_GPU_PASS_NAME, "Convert top-level loop::ForOp to GPU kernels",
+ [] {
+ SmallVector<int64_t, 3> numWorkGroups, workGroupSize;
+ numWorkGroups.assign(clNumWorkGroups.begin(), clNumWorkGroups.end());
+ workGroupSize.assign(clWorkGroupSize.begin(), clWorkGroupSize.end());
+ return std::make_unique<ImperfectlyNestedForLoopMapper>(numWorkGroups,
+ workGroupSize);
+ });
OpenPOWER on IntegriCloud