//===- LoopsToGPUPass.cpp - Convert a loop nest to a GPU kernel -----------===//
//
// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "mlir/Conversion/LoopsToGPU/LoopsToGPUPass.h"
#include "mlir/Conversion/LoopsToGPU/LoopsToGPU.h"
#include "mlir/Dialect/AffineOps/AffineOps.h"
#include "mlir/Dialect/LoopOps/LoopOps.h"
#include "mlir/Dialect/StandardOps/Ops.h"
#include "mlir/Pass/Pass.h"

#include "llvm/ADT/ArrayRef.h"
#include "llvm/Support/CommandLine.h"

#define PASS_NAME "convert-loops-to-gpu"
#define LOOPOP_TO_GPU_PASS_NAME "convert-loop-op-to-gpu"

using namespace mlir;
using namespace mlir::loop;

static llvm::cl::OptionCategory clOptionsCategory(PASS_NAME " options");
static llvm::cl::opt<unsigned>
    clNumBlockDims("gpu-block-dims",
                   llvm::cl::desc("Number of GPU block dimensions for mapping"),
                   llvm::cl::cat(clOptionsCategory), llvm::cl::init(1u));
static llvm::cl::opt<unsigned> clNumThreadDims(
    "gpu-thread-dims",
    llvm::cl::desc("Number of GPU thread dimensions for mapping"),
    llvm::cl::cat(clOptionsCategory), llvm::cl::init(1u));

static llvm::cl::OptionCategory clLoopOpToGPUCategory(LOOPOP_TO_GPU_PASS_NAME
                                                      " options");
static llvm::cl::list<unsigned>
    clNumWorkGroups("gpu-num-workgroups",
                    llvm::cl::desc("Num workgroups in the GPU launch"),
                    llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated,
                    llvm::cl::cat(clLoopOpToGPUCategory));
static llvm::cl::list<unsigned>
    clWorkGroupSize("gpu-workgroup-size",
                    llvm::cl::desc("Workgroup size in the GPU launch"),
                    llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated,
                    llvm::cl::cat(clLoopOpToGPUCategory));

namespace {
// A pass that traverses top-level loops in the function and converts them to
// GPU launch operations. Nested launches are not allowed, so this does not
// walk the function recursively to avoid considering nested loops.
struct ForLoopMapper : public FunctionPass<ForLoopMapper> {
  ForLoopMapper(unsigned numBlockDims, unsigned numThreadDims)
      : numBlockDims(numBlockDims), numThreadDims(numThreadDims) {}

  void runOnFunction() override {
    for (Block &block : getFunction())
      for (Operation &op : llvm::make_early_inc_range(block)) {
        if (auto forOp = dyn_cast<AffineForOp>(&op)) {
          if (failed(convertAffineLoopNestToGPULaunch(forOp, numBlockDims,
                                                      numThreadDims)))
            signalPassFailure();
        } else if (auto forOp = dyn_cast<ForOp>(&op)) {
          if (failed(convertLoopNestToGPULaunch(forOp, numBlockDims,
                                                numThreadDims)))
            signalPassFailure();
        }
      }
  }

  unsigned numBlockDims;
  unsigned numThreadDims;
};
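
// Example invocation of the pass above (a sketch, not part of this file: it
// assumes an mlir-opt binary with this pass linked in and an input whose
// top-level loops form a perfect nest of sufficient depth):
//
//   mlir-opt input.mlir -convert-loops-to-gpu \
//     -gpu-block-dims=1 -gpu-thread-dims=1
//
// Each mapped loop nest is rewritten into a gpu.launch op, with the outer
// numBlockDims loops bound to the grid dimensions and the next numThreadDims
// loops bound to the block dimensions.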
// A pass that traverses top-level loops in the function and converts them to
// GPU launch operations. The top-level loops themselves do not have to be
// perfectly nested. The only requirement is that there be as many perfectly
// nested loops as the size of `numWorkGroups`. Within these, any loop nest
// has to be perfectly nested up to a depth equal to the size of
// `workGroupSize`.
struct ImperfectlyNestedForLoopMapper
    : public FunctionPass<ImperfectlyNestedForLoopMapper> {
  ImperfectlyNestedForLoopMapper(ArrayRef<int64_t> numWorkGroups,
                                 ArrayRef<int64_t> workGroupSize)
      : numWorkGroups(numWorkGroups.begin(), numWorkGroups.end()),
        workGroupSize(workGroupSize.begin(), workGroupSize.end()) {}

  void runOnFunction() override {
    // Insert the num work groups and workgroup sizes as constant values. This
    // pass is only used for testing.
    FuncOp funcOp = getFunction();
    OpBuilder builder(funcOp.getOperation()->getRegion(0));
    SmallVector<Value, 3> numWorkGroupsVal, workGroupSizeVal;
    for (auto val : numWorkGroups) {
      auto constOp = builder.create<ConstantOp>(
          funcOp.getLoc(),
          builder.getIntegerAttr(builder.getIndexType(), val));
      numWorkGroupsVal.push_back(constOp);
    }
    for (auto val : workGroupSize) {
      auto constOp = builder.create<ConstantOp>(
          funcOp.getLoc(),
          builder.getIntegerAttr(builder.getIndexType(), val));
      workGroupSizeVal.push_back(constOp);
    }
    for (Block &block : getFunction()) {
      for (Operation &op : llvm::make_early_inc_range(block)) {
        if (auto forOp = dyn_cast<ForOp>(&op)) {
          if (failed(convertLoopToGPULaunch(forOp, numWorkGroupsVal,
                                            workGroupSizeVal))) {
            return signalPassFailure();
          }
        }
      }
    }
  }
  SmallVector<int64_t, 3> numWorkGroups;
  SmallVector<int64_t, 3> workGroupSize;
};

} // namespace

std::unique_ptr<OpPassBase<FuncOp>>
mlir::createSimpleLoopsToGPUPass(unsigned numBlockDims,
                                 unsigned numThreadDims) {
  return std::make_unique<ForLoopMapper>(numBlockDims, numThreadDims);
}

std::unique_ptr<OpPassBase<FuncOp>>
mlir::createLoopToGPUPass(ArrayRef<int64_t> numWorkGroups,
                          ArrayRef<int64_t> workGroupSize) {
  return std::make_unique<ImperfectlyNestedForLoopMapper>(numWorkGroups,
                                                          workGroupSize);
}

static PassRegistration<ForLoopMapper>
    registration(PASS_NAME, "Convert top-level loops to GPU kernels", [] {
      return std::make_unique<ForLoopMapper>(clNumBlockDims.getValue(),
                                             clNumThreadDims.getValue());
    });

static PassRegistration<ImperfectlyNestedForLoopMapper> loopOpToGPU(
    LOOPOP_TO_GPU_PASS_NAME, "Convert top-level loop::ForOp to GPU kernels",
    [] {
      SmallVector<int64_t, 3> numWorkGroups, workGroupSize;
      numWorkGroups.assign(clNumWorkGroups.begin(), clNumWorkGroups.end());
      workGroupSize.assign(clWorkGroupSize.begin(), clWorkGroupSize.end());
      return std::make_unique<ImperfectlyNestedForLoopMapper>(numWorkGroups,
                                                              workGroupSize);
    });
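
// Example invocation of the loop.for mapping pass registered above (a
// sketch; the two flags take comma-separated lists, matching the
// CommaSeparated cl::list options declared at the top of this file):
//
//   mlir-opt input.mlir -convert-loop-op-to-gpu \
//     -gpu-num-workgroups=2,2 -gpu-workgroup-size=32,4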