summary refs log tree commit diff stats
path: root/mlir/lib/Conversion/GPUToCUDA
diff options
context:
space:
mode:
author	Alex Zinenko <zinenko@google.com>	2019-10-08 05:11:00 -0700
committer	A. Unique TensorFlower <gardener@tensorflow.org>	2019-10-08 05:11:26 -0700
commit	11d12670daef546f55cc76d8fe0b32f137ab3bb6 (patch)
tree	6b0b7a5100f517e966eb35001118df42b790d22f /mlir/lib/Conversion/GPUToCUDA
parent	52e082b6ed964ad408abc637b995bc13ff2fb122 (diff)
download	bcm5719-llvm-11d12670daef546f55cc76d8fe0b32f137ab3bb6.tar.gz
bcm5719-llvm-11d12670daef546f55cc76d8fe0b32f137ab3bb6.zip
GPUToCUDA: attach CUBIN to the nested module rather than to the function
Originally, we were attaching attributes containing CUBIN blobs to the kernel function called by `gpu.launch_func`. This kernel is now contained in a nested module that is used as a compilation unit. Attach compiled CUBIN blobs to the module rather than to the function since we were compiling the module. This also avoids duplication of the attribute on multiple kernels within the same module. PiperOrigin-RevId: 273497303
Diffstat (limited to 'mlir/lib/Conversion/GPUToCUDA')
-rw-r--r--	mlir/lib/Conversion/GPUToCUDA/ConvertKernelFuncToCubin.cpp	66
-rw-r--r--	mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp	18
2 files changed, 38 insertions, 46 deletions
diff --git a/mlir/lib/Conversion/GPUToCUDA/ConvertKernelFuncToCubin.cpp b/mlir/lib/Conversion/GPUToCUDA/ConvertKernelFuncToCubin.cpp
index aa1711e3f8e..c76381f9d0a 100644
--- a/mlir/lib/Conversion/GPUToCUDA/ConvertKernelFuncToCubin.cpp
+++ b/mlir/lib/Conversion/GPUToCUDA/ConvertKernelFuncToCubin.cpp
@@ -62,8 +62,10 @@ public:
: cubinGenerator(cubinGenerator) {}
void runOnModule() override {
- if (!getModule().getAttrOfType<UnitAttr>(
- gpu::GPUDialect::getKernelModuleAttrName()))
+ ModuleOp module = getModule();
+ if (!module.getAttrOfType<UnitAttr>(
+ gpu::GPUDialect::getKernelModuleAttrName()) ||
+ !module.getName())
return;
// Make sure the NVPTX target is initialized.
@@ -72,31 +74,35 @@ public:
LLVMInitializeNVPTXTargetMC();
LLVMInitializeNVPTXAsmPrinter();
- auto llvmModule = translateModuleToNVVMIR(getModule());
+ auto llvmModule = translateModuleToNVVMIR(module);
if (!llvmModule)
return signalPassFailure();
- for (auto function : getModule().getOps<FuncOp>()) {
- if (!gpu::GPUDialect::isKernel(function))
- continue;
- if (failed(translateGpuKernelToCubinAnnotation(*llvmModule, function)))
- signalPassFailure();
- }
+ // Translate the module to CUBIN and attach the result as attribute to the
+ // module.
+ if (auto cubinAttr = translateGpuModuleToCubinAnnotation(
+ *llvmModule, module.getLoc(), *module.getName()))
+ module.setAttr(kCubinAnnotation, cubinAttr);
+ else
+ signalPassFailure();
}
private:
static OwnedCubin compilePtxToCubinForTesting(const std::string &ptx,
- FuncOp &function);
+ Location, StringRef);
std::string translateModuleToPtx(llvm::Module &module,
llvm::TargetMachine &target_machine);
- /// Converts llvmModule to cubin using the user-provded generator.
- OwnedCubin convertModuleToCubin(llvm::Module &llvmModule, FuncOp &function);
+ /// Converts llvmModule to cubin using the user-provded generator. Location is
+ /// used for error reporting and name is forwarded to the CUBIN generator to
+ /// use in its logging mechanisms.
+ OwnedCubin convertModuleToCubin(llvm::Module &llvmModule, Location loc,
+ StringRef name);
- /// Translates llvmModule to cubin and assigns it to attribute of function.
- LogicalResult translateGpuKernelToCubinAnnotation(llvm::Module &llvmModule,
- FuncOp &function);
+ /// Translates llvmModule to cubin and returns the result as attribute.
+ StringAttr translateGpuModuleToCubinAnnotation(llvm::Module &llvmModule,
+ Location loc, StringRef name);
CubinGenerator cubinGenerator;
};
@@ -120,13 +126,14 @@ std::string GpuKernelToCubinPass::translateModuleToPtx(
OwnedCubin
GpuKernelToCubinPass::compilePtxToCubinForTesting(const std::string &ptx,
- FuncOp &function) {
+ Location, StringRef) {
const char data[] = "CUBIN";
return std::make_unique<std::vector<char>>(data, data + sizeof(data) - 1);
}
OwnedCubin GpuKernelToCubinPass::convertModuleToCubin(llvm::Module &llvmModule,
- FuncOp &function) {
+ Location loc,
+ StringRef name) {
std::unique_ptr<llvm::TargetMachine> targetMachine;
{
std::string error;
@@ -136,7 +143,7 @@ OwnedCubin GpuKernelToCubinPass::convertModuleToCubin(llvm::Module &llvmModule,
const llvm::Target *target =
llvm::TargetRegistry::lookupTarget("", triple, error);
if (target == nullptr) {
- function.emitError("cannot initialize target triple");
+ emitError(loc, "cannot initialize target triple");
return {};
}
targetMachine.reset(
@@ -148,26 +155,15 @@ OwnedCubin GpuKernelToCubinPass::convertModuleToCubin(llvm::Module &llvmModule,
auto ptx = translateModuleToPtx(llvmModule, *targetMachine);
- return cubinGenerator(ptx, function);
+ return cubinGenerator(ptx, loc, name);
}
-LogicalResult GpuKernelToCubinPass::translateGpuKernelToCubinAnnotation(
- llvm::Module &llvmModule, FuncOp &function) {
- auto cubin = convertModuleToCubin(llvmModule, function);
+StringAttr GpuKernelToCubinPass::translateGpuModuleToCubinAnnotation(
+ llvm::Module &llvmModule, Location loc, StringRef name) {
+ auto cubin = convertModuleToCubin(llvmModule, loc, name);
if (!cubin)
- return function.emitError("translation to CUDA binary failed.");
-
- Builder builder(function.getContext());
- function.setAttr(kCubinAnnotation,
- builder.getStringAttr({cubin->data(), cubin->size()}));
-
- // Remove the body of the kernel function now that it has been translated.
- // The main reason to do this is so that the resulting module no longer
- // contains the NVVM instructions (typically contained in the kernel bodies)
- // and hence can be compiled into host code by a separate pass.
- function.eraseBody();
-
- return success();
+ return {};
+ return StringAttr::get({cubin->data(), cubin->size()}, loc->getContext());
}
std::unique_ptr<OpPassBase<ModuleOp>>
diff --git a/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp b/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp
index d8e4267c484..450269dbdac 100644
--- a/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp
+++ b/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp
@@ -120,7 +120,7 @@ private:
void declareCudaFunctions(Location loc);
Value *setupParamsArray(gpu::LaunchFuncOp launchOp, OpBuilder &builder);
- Value *generateKernelNameConstant(FuncOp kernelFunction, Location &loc,
+ Value *generateKernelNameConstant(StringRef name, Location &loc,
OpBuilder &builder);
void translateGpuLaunchCalls(mlir::gpu::LaunchFuncOp launchOp);
@@ -304,14 +304,12 @@ GpuLaunchFuncToCudaCallsPass::setupParamsArray(gpu::LaunchFuncOp launchOp,
// %2 = llvm.getelementptr %0[%1, %1] : !llvm<"i8*">
// }
Value *GpuLaunchFuncToCudaCallsPass::generateKernelNameConstant(
- FuncOp kernelFunction, Location &loc, OpBuilder &builder) {
+ StringRef name, Location &loc, OpBuilder &builder) {
// Make sure the trailing zero is included in the constant.
- std::vector<char> kernelName(kernelFunction.getName().begin(),
- kernelFunction.getName().end());
+ std::vector<char> kernelName(name.begin(), name.end());
kernelName.push_back('\0');
- std::string globalName =
- llvm::formatv("{0}_kernel_name", kernelFunction.getName());
+ std::string globalName = llvm::formatv("{0}_kernel_name", name);
return LLVM::createGlobalString(
loc, builder, globalName, StringRef(kernelName.data(), kernelName.size()),
llvmDialect);
@@ -350,12 +348,10 @@ void GpuLaunchFuncToCudaCallsPass::translateGpuLaunchCalls(
auto kernelModule =
getModule().lookupSymbol<ModuleOp>(launchOp.getKernelModuleName());
assert(kernelModule && "expected a kernel module");
- auto kernelFunction = kernelModule.lookupSymbol<FuncOp>(launchOp.kernel());
- assert(kernelFunction && "expected a kernel function");
- auto cubinAttr = kernelFunction.getAttrOfType<StringAttr>(kCubinAnnotation);
+ auto cubinAttr = kernelModule.getAttrOfType<StringAttr>(kCubinAnnotation);
if (!cubinAttr) {
- kernelFunction.emitOpError()
+ kernelModule.emitOpError()
<< "missing " << kCubinAnnotation << " attribute";
return signalPassFailure();
}
@@ -376,7 +372,7 @@ void GpuLaunchFuncToCudaCallsPass::translateGpuLaunchCalls(
// the kernel function.
auto cuOwningModuleRef =
builder.create<LLVM::LoadOp>(loc, getPointerType(), cuModule);
- auto kernelName = generateKernelNameConstant(kernelFunction, loc, builder);
+ auto kernelName = generateKernelNameConstant(launchOp.kernel(), loc, builder);
auto cuFunction = allocatePointer(builder, loc);
FuncOp cuModuleGetFunction =
getModule().lookupSymbol<FuncOp>(cuModuleGetFunctionName);
OpenPOWER on IntegriCloud