diff options
| author | Alex Zinenko <zinenko@google.com> | 2019-10-08 04:35:04 -0700 |
|---|---|---|
| committer | A. Unique TensorFlower <gardener@tensorflow.org> | 2019-10-08 04:35:33 -0700 |
| commit | 16af5924cb6fd2bea3a4f9acf55aef4e796f5b11 (patch) | |
| tree | 3f495b08bf72a727feeb1946068371d498a9ad1d /mlir/lib/Conversion/GPUToCUDA | |
| parent | 90d65d32d69ca46f52a9a744eafdad0d97b4a185 (diff) | |
| download | bcm5719-llvm-16af5924cb6fd2bea3a4f9acf55aef4e796f5b11.tar.gz bcm5719-llvm-16af5924cb6fd2bea3a4f9acf55aef4e796f5b11.zip | |
Fuse GenerateCubinAccessors pass into LaunchFuncToCuda
Now that the accessor function is a trivial getter of the global variable, it
makes less sense to have the getter generation as a separate pass. Move the
getter generation into the lowering of `gpu.launch_func` to CUDA calls. This
change is mostly code motion, but the process can be simplified further by
generating the addressof in place instead of using a call. This will be done
in a follow-up.
PiperOrigin-RevId: 273492517
Diffstat (limited to 'mlir/lib/Conversion/GPUToCUDA')
3 files changed, 58 insertions, 141 deletions
diff --git a/mlir/lib/Conversion/GPUToCUDA/CMakeLists.txt b/mlir/lib/Conversion/GPUToCUDA/CMakeLists.txt index fbaf36c25c9..4eddb787493 100644 --- a/mlir/lib/Conversion/GPUToCUDA/CMakeLists.txt +++ b/mlir/lib/Conversion/GPUToCUDA/CMakeLists.txt @@ -4,7 +4,6 @@ if(MLIR_CUDA_CONVERSIONS_ENABLED) add_llvm_library(MLIRGPUtoCUDATransforms ConvertKernelFuncToCubin.cpp ConvertLaunchFuncToCudaCalls.cpp - GenerateCubinAccessors.cpp ) target_link_libraries(MLIRGPUtoCUDATransforms MLIRGPU diff --git a/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp b/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp index c0eb320e146..63da0fcfa46 100644 --- a/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp +++ b/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp @@ -51,7 +51,10 @@ static constexpr const char *cuGetStreamHelperName = "mcuGetStreamHelper"; static constexpr const char *cuStreamSynchronizeName = "mcuStreamSynchronize"; static constexpr const char *kMcuMemHostRegisterPtr = "mcuMemHostRegisterPtr"; +static constexpr const char *kCubinAnnotation = "nvvm.cubin"; static constexpr const char *kCubinGetterAnnotation = "nvvm.cubingetter"; +static constexpr const char *kCubinGetterSuffix = "_cubin"; +static constexpr const char *kCubinStorageSuffix = "_cubin_cst"; namespace { @@ -121,6 +124,7 @@ private: Value *setupParamsArray(gpu::LaunchFuncOp launchOp, OpBuilder &builder); Value *generateKernelNameConstant(FuncOp kernelFunction, Location &loc, OpBuilder &builder); + FuncOp generateCubinAccessor(FuncOp kernelFunc, StringAttr blob); void translateGpuLaunchCalls(mlir::gpu::LaunchFuncOp launchOp); public: @@ -131,10 +135,24 @@ public: // Cache the used LLVM types. 
initializeCachedTypes(); - for (auto func : getModule().getOps<FuncOp>()) { - func.walk( - [this](mlir::gpu::LaunchFuncOp op) { translateGpuLaunchCalls(op); }); - } + getModule().walk([this](mlir::gpu::LaunchFuncOp op) { + auto gpuModule = + getModule().lookupSymbol<ModuleOp>(op.getKernelModuleName()); + auto kernelFunc = gpuModule.lookupSymbol<FuncOp>(op.kernel()); + auto cubinAttr = kernelFunc.getAttrOfType<StringAttr>(kCubinAnnotation); + if (!cubinAttr) + return signalPassFailure(); + FuncOp getter = generateCubinAccessor(kernelFunc, cubinAttr); + + // Store the name of the getter on the function for easier lookup and + // remove the original CUBIN annotation. + kernelFunc.setAttr( + kCubinGetterAnnotation, + SymbolRefAttr::get(getter.getName(), getter.getContext())); + kernelFunc.removeAttr(kCubinAnnotation); + + translateGpuLaunchCalls(op); + }); // GPU kernel modules are no longer necessary since we have a global // constant with the CUBIN data. @@ -317,6 +335,42 @@ Value *GpuLaunchFuncToCudaCallsPass::generateKernelNameConstant( llvmDialect); } +// Inserts a global constant string containing `blob` into the grand-parent +// module of `kernelFunc` and generates the function that returns the address of +// the first character of this string. +FuncOp GpuLaunchFuncToCudaCallsPass::generateCubinAccessor(FuncOp kernelFunc, + StringAttr blob) { + Location loc = kernelFunc.getLoc(); + SmallString<128> nameBuffer(kernelFunc.getName()); + ModuleOp module = getModule(); + assert(kernelFunc.getParentOp() && + kernelFunc.getParentOp()->getParentOp() == module && + "expected one level of module nesting"); + + // Insert the getter function just after the GPU kernel module containing + // `kernelFunc`. 
+ OpBuilder moduleBuilder(module.getBody()); + moduleBuilder.setInsertionPointAfter(kernelFunc.getParentOp()); + auto getterType = moduleBuilder.getFunctionType( + llvm::None, LLVM::LLVMType::getInt8PtrTy(llvmDialect)); + nameBuffer.append(kCubinGetterSuffix); + auto result = moduleBuilder.create<FuncOp>( + loc, StringRef(nameBuffer), getterType, ArrayRef<NamedAttribute>()); + Block *entryBlock = result.addEntryBlock(); + + // Drop the getter suffix before appending the storage suffix. + nameBuffer.resize(kernelFunc.getName().size()); + nameBuffer.append(kCubinStorageSuffix); + + // Obtain the address of the first character of the global string containing + // the cubin and return from the getter. + OpBuilder builder(entryBlock); + Value *startPtr = LLVM::createGlobalString( + loc, builder, StringRef(nameBuffer), blob.getValue(), llvmDialect); + builder.create<LLVM::ReturnOp>(loc, startPtr); + return result; +} + // Emits LLVM IR to launch a kernel function. Expects the module that contains // the compiled kernel function as a cubin in the 'nvvm.cubin' attribute of the // kernel function in the IR. diff --git a/mlir/lib/Conversion/GPUToCUDA/GenerateCubinAccessors.cpp b/mlir/lib/Conversion/GPUToCUDA/GenerateCubinAccessors.cpp deleted file mode 100644 index 4b7a6b1620d..00000000000 --- a/mlir/lib/Conversion/GPUToCUDA/GenerateCubinAccessors.cpp +++ /dev/null @@ -1,136 +0,0 @@ -//===- GenerateCubinAccessors.cpp - MLIR GPU lowering passes --------------===// -// -// Copyright 2019 The MLIR Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// ============================================================================= -// -// This file implements a pass to generate LLVMIR functions that return the -// data stored in nvvm.cubin char* blob. -// -//===----------------------------------------------------------------------===// - -#include "mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h" -#include "mlir/Dialect/GPU/GPUDialect.h" -#include "mlir/Dialect/LLVMIR/LLVMDialect.h" -#include "mlir/IR/Attributes.h" -#include "mlir/IR/Builders.h" -#include "mlir/IR/Function.h" -#include "mlir/IR/Identifier.h" -#include "mlir/IR/Module.h" -#include "mlir/IR/StandardTypes.h" -#include "mlir/Pass/Pass.h" -#include "mlir/Pass/PassRegistry.h" - -#include "llvm/ADT/STLExtras.h" - -namespace mlir { -namespace { - -// TODO(herhut): Move to shared location. -constexpr const char *kCubinAnnotation = "nvvm.cubin"; -constexpr const char *kCubinGetterAnnotation = "nvvm.cubingetter"; -constexpr const char *kCubinGetterSuffix = "_cubin"; -constexpr const char *kCubinStorageSuffix = "_cubin_cst"; - -/// A pass which moves cubin from function attributes in nested modules -/// to global strings and generates getter functions. -/// -/// The GpuKernelToCubinPass annotates kernels functions with compiled device -/// code blobs. These functions reside in nested modules generated by -/// GpuKernelOutliningPass. This pass consumes these modules and moves the cubin -/// blobs back to the parent module as global strings and generates accessor -/// functions for them. The external kernel functions (also generated by the -/// outlining pass) are annotated with the symbol of the cubin accessor. 
-class GpuGenerateCubinAccessorsPass - : public ModulePass<GpuGenerateCubinAccessorsPass> { -private: - LLVM::LLVMType getIndexType() { - unsigned bits = - llvmDialect->getLLVMModule().getDataLayout().getPointerSizeInBits(); - return LLVM::LLVMType::getIntNTy(llvmDialect, bits); - } - - // Inserts a global constant string containing `blob` into the grand-parent - // module of `kernelFunc` and generates the function that returns the address - // of the first character of this string. Returns the generator function. - // TODO(herhut): consider fusing this pass with launch-func-to-cuda. - FuncOp generate(FuncOp kernelFunc, StringAttr blob) { - Location loc = kernelFunc.getLoc(); - SmallString<128> nameBuffer(kernelFunc.getName()); - ModuleOp module = getModule(); - assert(kernelFunc.getParentOp() && - kernelFunc.getParentOp()->getParentOp() == module && - "expected one level of module nesting"); - - // Insert the getter function just after the original function. - OpBuilder moduleBuilder(module.getBody()); - moduleBuilder.setInsertionPointAfter(kernelFunc.getParentOp()); - auto getterType = moduleBuilder.getFunctionType( - llvm::None, LLVM::LLVMType::getInt8PtrTy(llvmDialect)); - nameBuffer.append(kCubinGetterSuffix); - auto result = moduleBuilder.create<FuncOp>( - loc, StringRef(nameBuffer), getterType, ArrayRef<NamedAttribute>()); - Block *entryBlock = result.addEntryBlock(); - - // Drop the getter suffix before appending the storage suffix. - nameBuffer.resize(kernelFunc.getName().size()); - nameBuffer.append(kCubinStorageSuffix); - - // Obtain the address of the first character of the global string containing - // the cubin and return from the getter. 
- OpBuilder builder(entryBlock); - Value *startPtr = LLVM::createGlobalString( - loc, builder, StringRef(nameBuffer), blob.getValue(), llvmDialect); - builder.create<LLVM::ReturnOp>(loc, startPtr); - return result; - } - -public: - void runOnModule() override { - llvmDialect = getContext().getRegisteredDialect<LLVM::LLVMDialect>(); - - for (auto module : getModule().getOps<ModuleOp>()) { - if (!module.getAttrOfType<UnitAttr>( - gpu::GPUDialect::getKernelModuleAttrName())) - continue; - for (auto func : module.getOps<FuncOp>()) { - if (StringAttr blob = - func.getAttrOfType<StringAttr>(kCubinAnnotation)) { - FuncOp getter = generate(func, blob); - - // Store the name of the getter on the function for easier lookup and - // remove the CUBIN. - func.setAttr(kCubinGetterAnnotation, - SymbolRefAttr::get(getter.getName(), func.getContext())); - func.removeAttr(kCubinAnnotation); - } - } - } - } - -private: - LLVM::LLVMDialect *llvmDialect; -}; - -} // anonymous namespace - -std::unique_ptr<OpPassBase<ModuleOp>> createGenerateCubinAccessorPass() { - return std::make_unique<GpuGenerateCubinAccessorsPass>(); -} - -static PassRegistration<GpuGenerateCubinAccessorsPass> - pass("generate-cubin-accessors", - "Generate LLVMIR functions that give access to cubin data"); - -} // namespace mlir |

