Outline GPU kernel function into a nested module.

When outlining GPU kernels, put the kernel function inside a nested module. Then use a nested pipeline to generate the cubins, independently per kernel. In a final pass, move the cubins back to the parent module.

PiperOrigin-RevId: 269987720
author: MLIR Team <no-reply@google.com> 2019-09-19 01:51:00 -0700
committer: A. Unique TensorFlower <gardener@tensorflow.org> 2019-09-19 01:51:28 -0700
commit: 5684a12434f923d03b6870f2aa16226bfb0b38b6 (patch)
tree: 166de9efdea3b7e35228474662ceedde2d385de7 /mlir/lib/Conversion/GPUToCUDA
parent: 25f0f769aa7de1338158becbe614856802d43e1a (diff)
download: bcm5719-llvm-5684a12434f923d03b6870f2aa16226bfb0b38b6.tar.gz
bcm5719-llvm-5684a12434f923d03b6870f2aa16226bfb0b38b6.zip
2 files changed, 62 insertions, 39 deletions
diff --git a/mlir/lib/Conversion/GPUToCUDA/ConvertKernelFuncToCubin.cpp b/mlir/lib/Conversion/GPUToCUDA/ConvertKernelFuncToCubin.cpp
index a69fe81b0d3..aa1711e3f8e 100644
--- a/mlir/lib/Conversion/GPUToCUDA/ConvertKernelFuncToCubin.cpp
+++ b/mlir/lib/Conversion/GPUToCUDA/ConvertKernelFuncToCubin.cpp
@@ -49,26 +49,37 @@ namespace {
 // TODO(herhut): Move to shared location.
 static constexpr const char *kCubinAnnotation = "nvvm.cubin";
 
-/// A pass converting tagged kernel functions to cubin blobs.
+/// A pass converting tagged kernel modules to cubin blobs.
+///
+/// If tagged as a kernel module, each contained function is translated to NVVM
+/// IR and further to PTX. A user provided CubinGenerator compiles the PTX to
+/// GPU binary code, which is then attached as an attribute to the function. The
+/// function body is erased.
 class GpuKernelToCubinPass : public ModulePass<GpuKernelToCubinPass> {
 public:
   GpuKernelToCubinPass(
       CubinGenerator cubinGenerator = compilePtxToCubinForTesting)
       : cubinGenerator(cubinGenerator) {}
 
-  // Run the dialect converter on the module.
   void runOnModule() override {
+    if (!getModule().getAttrOfType<UnitAttr>(
+            gpu::GPUDialect::getKernelModuleAttrName()))
+      return;
+
     // Make sure the NVPTX target is initialized.
     LLVMInitializeNVPTXTarget();
     LLVMInitializeNVPTXTargetInfo();
     LLVMInitializeNVPTXTargetMC();
     LLVMInitializeNVPTXAsmPrinter();
 
+    auto llvmModule = translateModuleToNVVMIR(getModule());
+    if (!llvmModule)
+      return signalPassFailure();
+
     for (auto function : getModule().getOps<FuncOp>()) {
-      if (!gpu::GPUDialect::isKernel(function) || function.isExternal()) {
+      if (!gpu::GPUDialect::isKernel(function))
         continue;
-      }
-      if (failed(translateGpuKernelToCubinAnnotation(function)))
+      if (failed(translateGpuKernelToCubinAnnotation(*llvmModule, function)))
         signalPassFailure();
     }
   }
@@ -79,8 +90,13 @@ private:
 
   std::string translateModuleToPtx(llvm::Module &module,
                                    llvm::TargetMachine &target_machine);
+
+  /// Converts llvmModule to cubin using the user-provided generator.
   OwnedCubin convertModuleToCubin(llvm::Module &llvmModule, FuncOp &function);
-  LogicalResult translateGpuKernelToCubinAnnotation(FuncOp &function);
+
+  /// Translates llvmModule to cubin and assigns it to attribute of function.
+  LogicalResult translateGpuKernelToCubinAnnotation(llvm::Module &llvmModule,
+                                                    FuncOp &function);
 
   CubinGenerator cubinGenerator;
 };
@@ -135,22 +151,13 @@ OwnedCubin GpuKernelToCubinPass::convertModuleToCubin(llvm::Module &llvmModule,
   return cubinGenerator(ptx, function);
 }
 
-LogicalResult
-GpuKernelToCubinPass::translateGpuKernelToCubinAnnotation(FuncOp &function) {
-  Builder builder(function.getContext());
-
-  OwningModuleRef module = ModuleOp::create(function.getLoc());
-
-  // TODO(herhut): Also handle called functions.
-  module->push_back(function.clone());
-
-  auto llvmModule = translateModuleToNVVMIR(*module);
-  auto cubin = convertModuleToCubin(*llvmModule, function);
-
-  if (!cubin) {
+LogicalResult GpuKernelToCubinPass::translateGpuKernelToCubinAnnotation(
+    llvm::Module &llvmModule, FuncOp &function) {
+  auto cubin = convertModuleToCubin(llvmModule, function);
+  if (!cubin)
     return function.emitError("translation to CUDA binary failed.");
-  }
 
+  Builder builder(function.getContext());
   function.setAttr(kCubinAnnotation,
                    builder.getStringAttr({cubin->data(), cubin->size()}));
 
diff --git a/mlir/lib/Conversion/GPUToCUDA/GenerateCubinAccessors.cpp b/mlir/lib/Conversion/GPUToCUDA/GenerateCubinAccessors.cpp
index f8c6f5d15ff..83c3538324b 100644
--- a/mlir/lib/Conversion/GPUToCUDA/GenerateCubinAccessors.cpp
+++ b/mlir/lib/Conversion/GPUToCUDA/GenerateCubinAccessors.cpp
@@ -43,8 +43,15 @@ constexpr const char *kCubinGetterAnnotation = "nvvm.cubingetter";
 constexpr const char *kCubinGetterSuffix = "_cubin";
 constexpr const char *kCubinStorageSuffix = "_cubin_cst";
 
-/// A pass generating global strings and getter functions for all cubin blobs
-/// annotated on functions via the nvvm.cubin attribute.
+/// A pass which moves cubin from function attributes in nested modules
+/// to global strings and generates getter functions.
+///
+/// The GpuKernelToCubinPass annotates kernel functions with compiled device
+/// code blobs. These functions reside in nested modules generated by
+/// GpuKernelOutliningPass. This pass consumes these modules and moves the cubin
+/// blobs back to the parent module as global strings and generates accessor
+/// functions for them. The external kernel functions (also generated by the
+/// outlining pass) are annotated with the symbol of the cubin accessor.
 class GpuGenerateCubinAccessorsPass
     : public ModulePass<GpuGenerateCubinAccessorsPass> {
 private:
@@ -55,18 +62,25 @@ private:
   }
 
   // Inserts a global constant string containing `blob` into the parent module
-  // of `orig` and generates the function that returns the address of the first
-  // character of this string.
+  // of `kernelFunc` and generates the function that returns the address of the
+  // first character of this string.
   // TODO(herhut): consider fusing this pass with launch-func-to-cuda.
-  void generate(FuncOp orig, StringAttr blob) {
-    Location loc = orig.getLoc();
-    SmallString<128> nameBuffer(orig.getName());
-    auto module = orig.getParentOfType<ModuleOp>();
+  void generate(FuncOp kernelFunc, StringAttr blob) {
+    auto stubFunc = getModule().lookupSymbol<FuncOp>(kernelFunc.getName());
+    if (!stubFunc) {
+      kernelFunc.emitError(
+          "corresponding external function not found in parent module");
+      return signalPassFailure();
+    }
+
+    Location loc = stubFunc.getLoc();
+    SmallString<128> nameBuffer(stubFunc.getName());
+    auto module = stubFunc.getParentOfType<ModuleOp>();
     assert(module && "function must belong to a module");
 
     // Insert the getter function just after the original function.
     OpBuilder moduleBuilder(module.getBody(), module.getBody()->begin());
-    moduleBuilder.setInsertionPoint(orig.getOperation()->getNextNode());
+    moduleBuilder.setInsertionPoint(stubFunc.getOperation()->getNextNode());
     auto getterType = moduleBuilder.getFunctionType(
         llvm::None, LLVM::LLVMType::getInt8PtrTy(llvmDialect));
     nameBuffer.append(kCubinGetterSuffix);
@@ -75,7 +89,7 @@ private:
     Block *entryBlock = result.addEntryBlock();
 
     // Drop the getter suffix before appending the storage suffix.
-    nameBuffer.resize(orig.getName().size());
+    nameBuffer.resize(stubFunc.getName().size());
     nameBuffer.append(kCubinStorageSuffix);
 
     // Obtain the address of the first character of the global string containing
@@ -86,21 +100,23 @@ private:
     builder.create<LLVM::ReturnOp>(loc, startPtr);
 
     // Store the name of the getter on the function for easier lookup.
-    orig.setAttr(kCubinGetterAnnotation, builder.getSymbolRefAttr(result));
+    stubFunc.setAttr(kCubinGetterAnnotation, builder.getSymbolRefAttr(result));
   }
 
 public:
-  // Perform the conversion on the module.  This may insert globals, so it
-  // cannot be done on multiple functions in parallel.
   void runOnModule() override {
-    llvmDialect =
-        getModule().getContext()->getRegisteredDialect<LLVM::LLVMDialect>();
+    llvmDialect = getContext().getRegisteredDialect<LLVM::LLVMDialect>();
 
-    for (auto func : getModule().getOps<FuncOp>()) {
-      StringAttr cubinBlob = func.getAttrOfType<StringAttr>(kCubinAnnotation);
-      if (!cubinBlob)
+    auto modules = getModule().getOps<ModuleOp>();
+    for (auto module : llvm::make_early_inc_range(modules)) {
+      if (!module.getAttrOfType<UnitAttr>(
+              gpu::GPUDialect::getKernelModuleAttrName()))
         continue;
-      generate(func, cubinBlob);
+      for (auto func : module.getOps<FuncOp>()) {
+        if (StringAttr blob = func.getAttrOfType<StringAttr>(kCubinAnnotation))
+          generate(func, blob);
+      }
+      module.erase();
     }
   }
author	MLIR Team <no-reply@google.com>	2019-09-19 01:51:00 -0700
committer	A. Unique TensorFlower <gardener@tensorflow.org>	2019-09-19 01:51:28 -0700
commit	5684a12434f923d03b6870f2aa16226bfb0b38b6 (patch)
tree	166de9efdea3b7e35228474662ceedde2d385de7 /mlir/lib/Conversion/GPUToCUDA
parent	25f0f769aa7de1338158becbe614856802d43e1a (diff)
download	bcm5719-llvm-5684a12434f923d03b6870f2aa16226bfb0b38b6.tar.gz bcm5719-llvm-5684a12434f923d03b6870f2aa16226bfb0b38b6.zip