diff options
Diffstat (limited to 'mlir/lib/Conversion/LoopsToGPU/LoopsToGPUPass.cpp')
-rw-r--r-- | mlir/lib/Conversion/LoopsToGPU/LoopsToGPUPass.cpp | 77 |
1 file changed, 77 insertions, 0 deletions
diff --git a/mlir/lib/Conversion/LoopsToGPU/LoopsToGPUPass.cpp b/mlir/lib/Conversion/LoopsToGPU/LoopsToGPUPass.cpp
index 6d4cb9d8256..21abc3cf99b 100644
--- a/mlir/lib/Conversion/LoopsToGPU/LoopsToGPUPass.cpp
+++ b/mlir/lib/Conversion/LoopsToGPU/LoopsToGPUPass.cpp
@@ -19,11 +19,14 @@
 #include "mlir/Conversion/LoopsToGPU/LoopsToGPU.h"
 #include "mlir/Dialect/AffineOps/AffineOps.h"
 #include "mlir/Dialect/LoopOps/LoopOps.h"
+#include "mlir/Dialect/StandardOps/Ops.h"
 #include "mlir/Pass/Pass.h"
+#include "llvm/ADT/ArrayRef.h"
 #include "llvm/Support/CommandLine.h"

 #define PASS_NAME "convert-loops-to-gpu"
+#define LOOPOP_TO_GPU_PASS_NAME "convert-loop-op-to-gpu"

 using namespace mlir;
 using namespace mlir::loop;
@@ -38,6 +41,19 @@ static llvm::cl::opt<unsigned> clNumThreadDims(
     llvm::cl::desc("Number of GPU thread dimensions for mapping"),
     llvm::cl::cat(clOptionsCategory), llvm::cl::init(1u));

+static llvm::cl::OptionCategory clLoopOpToGPUCategory(LOOPOP_TO_GPU_PASS_NAME
+                                                      " options");
+static llvm::cl::list<unsigned>
+    clNumWorkGroups("gpu-num-workgroups",
+                    llvm::cl::desc("Num workgroups in the GPU launch"),
+                    llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated,
+                    llvm::cl::cat(clLoopOpToGPUCategory));
+static llvm::cl::list<unsigned>
+    clWorkGroupSize("gpu-workgroup-size",
+                    llvm::cl::desc("Workgroup Size in the GPU launch"),
+                    llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated,
+                    llvm::cl::cat(clLoopOpToGPUCategory));
+
 namespace {
 // A pass that traverses top-level loops in the function and converts them to
 // GPU launch operations. Nested launches are not allowed, so this does not
@@ -64,6 +80,50 @@ struct ForLoopMapper : public FunctionPass<ForLoopMapper> {
   unsigned numBlockDims;
   unsigned numThreadDims;
 };
+
+// A pass that traverses top-level loops in the function and converts them to
+// GPU launch operations. The top-level loops themselves do not have to be
+// perfectly nested.
The only requirement is that there be as many perfectly
+// nested loops as the size of `numWorkGroups`. Within these any loop nest has
+// to be perfectly nested up to a depth equal to the size of `workGroupSize`.
+struct ImperfectlyNestedForLoopMapper
+    : public FunctionPass<ImperfectlyNestedForLoopMapper> {
+  ImperfectlyNestedForLoopMapper(ArrayRef<int64_t> numWorkGroups,
+                                 ArrayRef<int64_t> workGroupSize)
+      : numWorkGroups(numWorkGroups.begin(), numWorkGroups.end()),
+        workGroupSize(workGroupSize.begin(), workGroupSize.end()) {}
+
+  void runOnFunction() override {
+    // Insert the num work groups and workgroup sizes as constant values. This
+    // pass is only used for testing.
+    FuncOp funcOp = getFunction();
+    OpBuilder builder(funcOp.getOperation()->getRegion(0));
+    SmallVector<Value *, 3> numWorkGroupsVal, workGroupSizeVal;
+    for (auto val : numWorkGroups) {
+      auto constOp = builder.create<ConstantOp>(
+          funcOp.getLoc(), builder.getIntegerAttr(builder.getIndexType(), val));
+      numWorkGroupsVal.push_back(constOp);
+    }
+    for (auto val : workGroupSize) {
+      auto constOp = builder.create<ConstantOp>(
+          funcOp.getLoc(), builder.getIntegerAttr(builder.getIndexType(), val));
+      workGroupSizeVal.push_back(constOp);
+    }
+    for (Block &block : getFunction()) {
+      for (Operation &op : llvm::make_early_inc_range(block)) {
+        if (auto forOp = dyn_cast<ForOp>(&op)) {
+          if (failed(convertLoopToGPULaunch(forOp, numWorkGroupsVal,
+                                            workGroupSizeVal))) {
+            return signalPassFailure();
+          }
+        }
+      }
+    }
+  }
+  SmallVector<int64_t, 3> numWorkGroups;
+  SmallVector<int64_t, 3> workGroupSize;
+};
+
 } // namespace

 std::unique_ptr<OpPassBase<FuncOp>>
@@ -72,8 +132,25 @@ mlir::createSimpleLoopsToGPUPass(unsigned numBlockDims,
   return std::make_unique<ForLoopMapper>(numBlockDims, numThreadDims);
 }

+std::unique_ptr<OpPassBase<FuncOp>>
+mlir::createLoopToGPUPass(ArrayRef<int64_t> numWorkGroups,
+                          ArrayRef<int64_t> workGroupSize) {
+  return std::make_unique<ImperfectlyNestedForLoopMapper>(numWorkGroups,
+
workGroupSize);
+}
+
 static PassRegistration<ForLoopMapper>
     registration(PASS_NAME, "Convert top-level loops to GPU kernels", [] {
       return std::make_unique<ForLoopMapper>(clNumBlockDims.getValue(),
                                              clNumThreadDims.getValue());
     });
+
+static PassRegistration<ImperfectlyNestedForLoopMapper> loopOpToGPU(
+    LOOPOP_TO_GPU_PASS_NAME, "Convert top-level loop::ForOp to GPU kernels",
+    [] {
+      SmallVector<int64_t, 3> numWorkGroups, workGroupSize;
+      numWorkGroups.assign(clNumWorkGroups.begin(), clNumWorkGroups.end());
+      workGroupSize.assign(clWorkGroupSize.begin(), clWorkGroupSize.end());
+      return std::make_unique<ImperfectlyNestedForLoopMapper>(numWorkGroups,
+                                                              workGroupSize);
+    });