diff options
| author | MLIR Team <no-reply@google.com> | 2019-09-19 01:51:00 -0700 |
|---|---|---|
| committer | A. Unique TensorFlower <gardener@tensorflow.org> | 2019-09-19 01:51:28 -0700 |
| commit | 5684a12434f923d03b6870f2aa16226bfb0b38b6 (patch) | |
| tree | 166de9efdea3b7e35228474662ceedde2d385de7 /mlir/lib/Conversion/GPUToCUDA | |
| parent | 25f0f769aa7de1338158becbe614856802d43e1a (diff) | |
| download | bcm5719-llvm-5684a12434f923d03b6870f2aa16226bfb0b38b6.tar.gz bcm5719-llvm-5684a12434f923d03b6870f2aa16226bfb0b38b6.zip | |
Outline GPU kernel function into a nested module.
When outlining GPU kernels, put each kernel function inside a nested module. Then use a nested pipeline to generate the cubins independently per kernel. In a final pass, move the cubins back to the parent module.
PiperOrigin-RevId: 269987720
Diffstat (limited to 'mlir/lib/Conversion/GPUToCUDA')
| -rw-r--r-- | mlir/lib/Conversion/GPUToCUDA/ConvertKernelFuncToCubin.cpp | 47 | ||||
| -rw-r--r-- | mlir/lib/Conversion/GPUToCUDA/GenerateCubinAccessors.cpp | 54 |
2 files changed, 62 insertions, 39 deletions
diff --git a/mlir/lib/Conversion/GPUToCUDA/ConvertKernelFuncToCubin.cpp b/mlir/lib/Conversion/GPUToCUDA/ConvertKernelFuncToCubin.cpp index a69fe81b0d3..aa1711e3f8e 100644 --- a/mlir/lib/Conversion/GPUToCUDA/ConvertKernelFuncToCubin.cpp +++ b/mlir/lib/Conversion/GPUToCUDA/ConvertKernelFuncToCubin.cpp @@ -49,26 +49,37 @@ namespace { // TODO(herhut): Move to shared location. static constexpr const char *kCubinAnnotation = "nvvm.cubin"; -/// A pass converting tagged kernel functions to cubin blobs. +/// A pass converting tagged kernel modules to cubin blobs. +/// +/// If tagged as a kernel module, each contained function is translated to NVVM +/// IR and further to PTX. A user provided CubinGenerator compiles the PTX to +/// GPU binary code, which is then attached as an attribute to the function. The +/// function body is erased. class GpuKernelToCubinPass : public ModulePass<GpuKernelToCubinPass> { public: GpuKernelToCubinPass( CubinGenerator cubinGenerator = compilePtxToCubinForTesting) : cubinGenerator(cubinGenerator) {} - // Run the dialect converter on the module. void runOnModule() override { + if (!getModule().getAttrOfType<UnitAttr>( + gpu::GPUDialect::getKernelModuleAttrName())) + return; + // Make sure the NVPTX target is initialized. 
LLVMInitializeNVPTXTarget(); LLVMInitializeNVPTXTargetInfo(); LLVMInitializeNVPTXTargetMC(); LLVMInitializeNVPTXAsmPrinter(); + auto llvmModule = translateModuleToNVVMIR(getModule()); + if (!llvmModule) + return signalPassFailure(); + for (auto function : getModule().getOps<FuncOp>()) { - if (!gpu::GPUDialect::isKernel(function) || function.isExternal()) { + if (!gpu::GPUDialect::isKernel(function)) continue; - } - if (failed(translateGpuKernelToCubinAnnotation(function))) + if (failed(translateGpuKernelToCubinAnnotation(*llvmModule, function))) signalPassFailure(); } } @@ -79,8 +90,13 @@ private: std::string translateModuleToPtx(llvm::Module &module, llvm::TargetMachine &target_machine); + + /// Converts llvmModule to cubin using the user-provded generator. OwnedCubin convertModuleToCubin(llvm::Module &llvmModule, FuncOp &function); - LogicalResult translateGpuKernelToCubinAnnotation(FuncOp &function); + + /// Translates llvmModule to cubin and assigns it to attribute of function. + LogicalResult translateGpuKernelToCubinAnnotation(llvm::Module &llvmModule, + FuncOp &function); CubinGenerator cubinGenerator; }; @@ -135,22 +151,13 @@ OwnedCubin GpuKernelToCubinPass::convertModuleToCubin(llvm::Module &llvmModule, return cubinGenerator(ptx, function); } -LogicalResult -GpuKernelToCubinPass::translateGpuKernelToCubinAnnotation(FuncOp &function) { - Builder builder(function.getContext()); - - OwningModuleRef module = ModuleOp::create(function.getLoc()); - - // TODO(herhut): Also handle called functions. 
- module->push_back(function.clone()); - - auto llvmModule = translateModuleToNVVMIR(*module); - auto cubin = convertModuleToCubin(*llvmModule, function); - - if (!cubin) { +LogicalResult GpuKernelToCubinPass::translateGpuKernelToCubinAnnotation( + llvm::Module &llvmModule, FuncOp &function) { + auto cubin = convertModuleToCubin(llvmModule, function); + if (!cubin) return function.emitError("translation to CUDA binary failed."); - } + Builder builder(function.getContext()); function.setAttr(kCubinAnnotation, builder.getStringAttr({cubin->data(), cubin->size()})); diff --git a/mlir/lib/Conversion/GPUToCUDA/GenerateCubinAccessors.cpp b/mlir/lib/Conversion/GPUToCUDA/GenerateCubinAccessors.cpp index f8c6f5d15ff..83c3538324b 100644 --- a/mlir/lib/Conversion/GPUToCUDA/GenerateCubinAccessors.cpp +++ b/mlir/lib/Conversion/GPUToCUDA/GenerateCubinAccessors.cpp @@ -43,8 +43,15 @@ constexpr const char *kCubinGetterAnnotation = "nvvm.cubingetter"; constexpr const char *kCubinGetterSuffix = "_cubin"; constexpr const char *kCubinStorageSuffix = "_cubin_cst"; -/// A pass generating global strings and getter functions for all cubin blobs -/// annotated on functions via the nvvm.cubin attribute. +/// A pass which moves cubin from function attributes in nested modules +/// to global strings and generates getter functions. +/// +/// The GpuKernelToCubinPass annotates kernels functions with compiled device +/// code blobs. These functions reside in nested modules generated by +/// GpuKernelOutliningPass. This pass consumes these modules and moves the cubin +/// blobs back to the parent module as global strings and generates accessor +/// functions for them. The external kernel functions (also generated by the +/// outlining pass) are annotated with the symbol of the cubin accessor. 
class GpuGenerateCubinAccessorsPass : public ModulePass<GpuGenerateCubinAccessorsPass> { private: @@ -55,18 +62,25 @@ private: } // Inserts a global constant string containing `blob` into the parent module - // of `orig` and generates the function that returns the address of the first - // character of this string. + // of `kernelFunc` and generates the function that returns the address of the + // first character of this string. // TODO(herhut): consider fusing this pass with launch-func-to-cuda. - void generate(FuncOp orig, StringAttr blob) { - Location loc = orig.getLoc(); - SmallString<128> nameBuffer(orig.getName()); - auto module = orig.getParentOfType<ModuleOp>(); + void generate(FuncOp kernelFunc, StringAttr blob) { + auto stubFunc = getModule().lookupSymbol<FuncOp>(kernelFunc.getName()); + if (!stubFunc) { + kernelFunc.emitError( + "corresponding external function not found in parent module"); + return signalPassFailure(); + } + + Location loc = stubFunc.getLoc(); + SmallString<128> nameBuffer(stubFunc.getName()); + auto module = stubFunc.getParentOfType<ModuleOp>(); assert(module && "function must belong to a module"); // Insert the getter function just after the original function. OpBuilder moduleBuilder(module.getBody(), module.getBody()->begin()); - moduleBuilder.setInsertionPoint(orig.getOperation()->getNextNode()); + moduleBuilder.setInsertionPoint(stubFunc.getOperation()->getNextNode()); auto getterType = moduleBuilder.getFunctionType( llvm::None, LLVM::LLVMType::getInt8PtrTy(llvmDialect)); nameBuffer.append(kCubinGetterSuffix); @@ -75,7 +89,7 @@ private: Block *entryBlock = result.addEntryBlock(); // Drop the getter suffix before appending the storage suffix. 
- nameBuffer.resize(orig.getName().size()); + nameBuffer.resize(stubFunc.getName().size()); nameBuffer.append(kCubinStorageSuffix); // Obtain the address of the first character of the global string containing @@ -86,21 +100,23 @@ private: builder.create<LLVM::ReturnOp>(loc, startPtr); // Store the name of the getter on the function for easier lookup. - orig.setAttr(kCubinGetterAnnotation, builder.getSymbolRefAttr(result)); + stubFunc.setAttr(kCubinGetterAnnotation, builder.getSymbolRefAttr(result)); } public: - // Perform the conversion on the module. This may insert globals, so it - // cannot be done on multiple functions in parallel. void runOnModule() override { - llvmDialect = - getModule().getContext()->getRegisteredDialect<LLVM::LLVMDialect>(); + llvmDialect = getContext().getRegisteredDialect<LLVM::LLVMDialect>(); - for (auto func : getModule().getOps<FuncOp>()) { - StringAttr cubinBlob = func.getAttrOfType<StringAttr>(kCubinAnnotation); - if (!cubinBlob) + auto modules = getModule().getOps<ModuleOp>(); + for (auto module : llvm::make_early_inc_range(modules)) { + if (!module.getAttrOfType<UnitAttr>( + gpu::GPUDialect::getKernelModuleAttrName())) continue; - generate(func, cubinBlob); + for (auto func : module.getOps<FuncOp>()) { + if (StringAttr blob = func.getAttrOfType<StringAttr>(kCubinAnnotation)) + generate(func, blob); + } + module.erase(); } } |

