summaryrefslogtreecommitdiffstats
path: root/mlir/lib/Conversion/GPUToCUDA
diff options
context:
space:
mode:
authorAlex Zinenko <zinenko@google.com>2019-10-08 04:35:04 -0700
committerA. Unique TensorFlower <gardener@tensorflow.org>2019-10-08 04:35:33 -0700
commit16af5924cb6fd2bea3a4f9acf55aef4e796f5b11 (patch)
tree3f495b08bf72a727feeb1946068371d498a9ad1d /mlir/lib/Conversion/GPUToCUDA
parent90d65d32d69ca46f52a9a744eafdad0d97b4a185 (diff)
downloadbcm5719-llvm-16af5924cb6fd2bea3a4f9acf55aef4e796f5b11.tar.gz
bcm5719-llvm-16af5924cb6fd2bea3a4f9acf55aef4e796f5b11.zip
Fuse GenerateCubinAccessors pass into LaunchFuncToCuda
Now that the accessor function is a trivial getter of the global variable, it makes less sense to have the getter generation as a separate pass. Move the getter generation into the lowering of `gpu.launch_func` to CUDA calls. This change is mostly code motion, but the process can be simplified further by generating the addressof in place instead of using a call. This will be done in a follow-up. PiperOrigin-RevId: 273492517
Diffstat (limited to 'mlir/lib/Conversion/GPUToCUDA')
-rw-r--r--mlir/lib/Conversion/GPUToCUDA/CMakeLists.txt1
-rw-r--r--mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp62
-rw-r--r--mlir/lib/Conversion/GPUToCUDA/GenerateCubinAccessors.cpp136
3 files changed, 58 insertions, 141 deletions
diff --git a/mlir/lib/Conversion/GPUToCUDA/CMakeLists.txt b/mlir/lib/Conversion/GPUToCUDA/CMakeLists.txt
index fbaf36c25c9..4eddb787493 100644
--- a/mlir/lib/Conversion/GPUToCUDA/CMakeLists.txt
+++ b/mlir/lib/Conversion/GPUToCUDA/CMakeLists.txt
@@ -4,7 +4,6 @@ if(MLIR_CUDA_CONVERSIONS_ENABLED)
add_llvm_library(MLIRGPUtoCUDATransforms
ConvertKernelFuncToCubin.cpp
ConvertLaunchFuncToCudaCalls.cpp
- GenerateCubinAccessors.cpp
)
target_link_libraries(MLIRGPUtoCUDATransforms
MLIRGPU
diff --git a/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp b/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp
index c0eb320e146..63da0fcfa46 100644
--- a/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp
+++ b/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp
@@ -51,7 +51,10 @@ static constexpr const char *cuGetStreamHelperName = "mcuGetStreamHelper";
static constexpr const char *cuStreamSynchronizeName = "mcuStreamSynchronize";
static constexpr const char *kMcuMemHostRegisterPtr = "mcuMemHostRegisterPtr";
+static constexpr const char *kCubinAnnotation = "nvvm.cubin";
static constexpr const char *kCubinGetterAnnotation = "nvvm.cubingetter";
+static constexpr const char *kCubinGetterSuffix = "_cubin";
+static constexpr const char *kCubinStorageSuffix = "_cubin_cst";
namespace {
@@ -121,6 +124,7 @@ private:
Value *setupParamsArray(gpu::LaunchFuncOp launchOp, OpBuilder &builder);
Value *generateKernelNameConstant(FuncOp kernelFunction, Location &loc,
OpBuilder &builder);
+ FuncOp generateCubinAccessor(FuncOp kernelFunc, StringAttr blob);
void translateGpuLaunchCalls(mlir::gpu::LaunchFuncOp launchOp);
public:
@@ -131,10 +135,24 @@ public:
// Cache the used LLVM types.
initializeCachedTypes();
- for (auto func : getModule().getOps<FuncOp>()) {
- func.walk(
- [this](mlir::gpu::LaunchFuncOp op) { translateGpuLaunchCalls(op); });
- }
+ getModule().walk([this](mlir::gpu::LaunchFuncOp op) {
+ auto gpuModule =
+ getModule().lookupSymbol<ModuleOp>(op.getKernelModuleName());
+ auto kernelFunc = gpuModule.lookupSymbol<FuncOp>(op.kernel());
+ auto cubinAttr = kernelFunc.getAttrOfType<StringAttr>(kCubinAnnotation);
+ if (!cubinAttr)
+ return signalPassFailure();
+ FuncOp getter = generateCubinAccessor(kernelFunc, cubinAttr);
+
+ // Store the name of the getter on the function for easier lookup and
+ // remove the original CUBIN annotation.
+ kernelFunc.setAttr(
+ kCubinGetterAnnotation,
+ SymbolRefAttr::get(getter.getName(), getter.getContext()));
+ kernelFunc.removeAttr(kCubinAnnotation);
+
+ translateGpuLaunchCalls(op);
+ });
// GPU kernel modules are no longer necessary since we have a global
// constant with the CUBIN data.
@@ -317,6 +335,42 @@ Value *GpuLaunchFuncToCudaCallsPass::generateKernelNameConstant(
llvmDialect);
}
+// Inserts a global constant string containing `blob` into the grand-parent
+// module of `kernelFunc` and generates the function that returns the address of
+// the first character of this string.
+FuncOp GpuLaunchFuncToCudaCallsPass::generateCubinAccessor(FuncOp kernelFunc,
+ StringAttr blob) {
+ Location loc = kernelFunc.getLoc();
+ SmallString<128> nameBuffer(kernelFunc.getName());
+ ModuleOp module = getModule();
+ assert(kernelFunc.getParentOp() &&
+ kernelFunc.getParentOp()->getParentOp() == module &&
+ "expected one level of module nesting");
+
+ // Insert the getter function just after the GPU kernel module containing
+ // `kernelFunc`.
+ OpBuilder moduleBuilder(module.getBody());
+ moduleBuilder.setInsertionPointAfter(kernelFunc.getParentOp());
+ auto getterType = moduleBuilder.getFunctionType(
+ llvm::None, LLVM::LLVMType::getInt8PtrTy(llvmDialect));
+ nameBuffer.append(kCubinGetterSuffix);
+ auto result = moduleBuilder.create<FuncOp>(
+ loc, StringRef(nameBuffer), getterType, ArrayRef<NamedAttribute>());
+ Block *entryBlock = result.addEntryBlock();
+
+ // Drop the getter suffix before appending the storage suffix.
+ nameBuffer.resize(kernelFunc.getName().size());
+ nameBuffer.append(kCubinStorageSuffix);
+
+ // Obtain the address of the first character of the global string containing
+ // the cubin and return from the getter.
+ OpBuilder builder(entryBlock);
+ Value *startPtr = LLVM::createGlobalString(
+ loc, builder, StringRef(nameBuffer), blob.getValue(), llvmDialect);
+ builder.create<LLVM::ReturnOp>(loc, startPtr);
+ return result;
+}
+
// Emits LLVM IR to launch a kernel function. Expects the module that contains
// the compiled kernel function as a cubin in the 'nvvm.cubin' attribute of the
// kernel function in the IR.
diff --git a/mlir/lib/Conversion/GPUToCUDA/GenerateCubinAccessors.cpp b/mlir/lib/Conversion/GPUToCUDA/GenerateCubinAccessors.cpp
deleted file mode 100644
index 4b7a6b1620d..00000000000
--- a/mlir/lib/Conversion/GPUToCUDA/GenerateCubinAccessors.cpp
+++ /dev/null
@@ -1,136 +0,0 @@
-//===- GenerateCubinAccessors.cpp - MLIR GPU lowering passes --------------===//
-//
-// Copyright 2019 The MLIR Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-// =============================================================================
-//
-// This file implements a pass to generate LLVMIR functions that return the
-// data stored in nvvm.cubin char* blob.
-//
-//===----------------------------------------------------------------------===//
-
-#include "mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h"
-#include "mlir/Dialect/GPU/GPUDialect.h"
-#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
-#include "mlir/IR/Attributes.h"
-#include "mlir/IR/Builders.h"
-#include "mlir/IR/Function.h"
-#include "mlir/IR/Identifier.h"
-#include "mlir/IR/Module.h"
-#include "mlir/IR/StandardTypes.h"
-#include "mlir/Pass/Pass.h"
-#include "mlir/Pass/PassRegistry.h"
-
-#include "llvm/ADT/STLExtras.h"
-
-namespace mlir {
-namespace {
-
-// TODO(herhut): Move to shared location.
-constexpr const char *kCubinAnnotation = "nvvm.cubin";
-constexpr const char *kCubinGetterAnnotation = "nvvm.cubingetter";
-constexpr const char *kCubinGetterSuffix = "_cubin";
-constexpr const char *kCubinStorageSuffix = "_cubin_cst";
-
-/// A pass which moves cubin from function attributes in nested modules
-/// to global strings and generates getter functions.
-///
-/// The GpuKernelToCubinPass annotates kernels functions with compiled device
-/// code blobs. These functions reside in nested modules generated by
-/// GpuKernelOutliningPass. This pass consumes these modules and moves the cubin
-/// blobs back to the parent module as global strings and generates accessor
-/// functions for them. The external kernel functions (also generated by the
-/// outlining pass) are annotated with the symbol of the cubin accessor.
-class GpuGenerateCubinAccessorsPass
- : public ModulePass<GpuGenerateCubinAccessorsPass> {
-private:
- LLVM::LLVMType getIndexType() {
- unsigned bits =
- llvmDialect->getLLVMModule().getDataLayout().getPointerSizeInBits();
- return LLVM::LLVMType::getIntNTy(llvmDialect, bits);
- }
-
- // Inserts a global constant string containing `blob` into the grand-parent
- // module of `kernelFunc` and generates the function that returns the address
- // of the first character of this string. Returns the generator function.
- // TODO(herhut): consider fusing this pass with launch-func-to-cuda.
- FuncOp generate(FuncOp kernelFunc, StringAttr blob) {
- Location loc = kernelFunc.getLoc();
- SmallString<128> nameBuffer(kernelFunc.getName());
- ModuleOp module = getModule();
- assert(kernelFunc.getParentOp() &&
- kernelFunc.getParentOp()->getParentOp() == module &&
- "expected one level of module nesting");
-
- // Insert the getter function just after the original function.
- OpBuilder moduleBuilder(module.getBody());
- moduleBuilder.setInsertionPointAfter(kernelFunc.getParentOp());
- auto getterType = moduleBuilder.getFunctionType(
- llvm::None, LLVM::LLVMType::getInt8PtrTy(llvmDialect));
- nameBuffer.append(kCubinGetterSuffix);
- auto result = moduleBuilder.create<FuncOp>(
- loc, StringRef(nameBuffer), getterType, ArrayRef<NamedAttribute>());
- Block *entryBlock = result.addEntryBlock();
-
- // Drop the getter suffix before appending the storage suffix.
- nameBuffer.resize(kernelFunc.getName().size());
- nameBuffer.append(kCubinStorageSuffix);
-
- // Obtain the address of the first character of the global string containing
- // the cubin and return from the getter.
- OpBuilder builder(entryBlock);
- Value *startPtr = LLVM::createGlobalString(
- loc, builder, StringRef(nameBuffer), blob.getValue(), llvmDialect);
- builder.create<LLVM::ReturnOp>(loc, startPtr);
- return result;
- }
-
-public:
- void runOnModule() override {
- llvmDialect = getContext().getRegisteredDialect<LLVM::LLVMDialect>();
-
- for (auto module : getModule().getOps<ModuleOp>()) {
- if (!module.getAttrOfType<UnitAttr>(
- gpu::GPUDialect::getKernelModuleAttrName()))
- continue;
- for (auto func : module.getOps<FuncOp>()) {
- if (StringAttr blob =
- func.getAttrOfType<StringAttr>(kCubinAnnotation)) {
- FuncOp getter = generate(func, blob);
-
- // Store the name of the getter on the function for easier lookup and
- // remove the CUBIN.
- func.setAttr(kCubinGetterAnnotation,
- SymbolRefAttr::get(getter.getName(), func.getContext()));
- func.removeAttr(kCubinAnnotation);
- }
- }
- }
- }
-
-private:
- LLVM::LLVMDialect *llvmDialect;
-};
-
-} // anonymous namespace
-
-std::unique_ptr<OpPassBase<ModuleOp>> createGenerateCubinAccessorPass() {
- return std::make_unique<GpuGenerateCubinAccessorsPass>();
-}
-
-static PassRegistration<GpuGenerateCubinAccessorsPass>
- pass("generate-cubin-accessors",
- "Generate LLVMIR functions that give access to cubin data");
-
-} // namespace mlir
OpenPOWER on IntegriCloud