summaryrefslogtreecommitdiffstats
path: root/mlir/lib/Conversion/GPUToCUDA
diff options
context:
space:
mode:
authorMLIR Team <no-reply@google.com>2019-09-19 01:51:00 -0700
committerA. Unique TensorFlower <gardener@tensorflow.org>2019-09-19 01:51:28 -0700
commit5684a12434f923d03b6870f2aa16226bfb0b38b6 (patch)
tree166de9efdea3b7e35228474662ceedde2d385de7 /mlir/lib/Conversion/GPUToCUDA
parent25f0f769aa7de1338158becbe614856802d43e1a (diff)
downloadbcm5719-llvm-5684a12434f923d03b6870f2aa16226bfb0b38b6.tar.gz
bcm5719-llvm-5684a12434f923d03b6870f2aa16226bfb0b38b6.zip
Outline GPU kernel function into a nested module.
When outlining GPU kernels, put the kernel function inside a nested module. Then use a nested pipeline to generate the cubins, independently per kernel. In a final pass, move the cubins back to the parent module. PiperOrigin-RevId: 269987720
Diffstat (limited to 'mlir/lib/Conversion/GPUToCUDA')
-rw-r--r--mlir/lib/Conversion/GPUToCUDA/ConvertKernelFuncToCubin.cpp47
-rw-r--r--mlir/lib/Conversion/GPUToCUDA/GenerateCubinAccessors.cpp54
2 files changed, 62 insertions, 39 deletions
diff --git a/mlir/lib/Conversion/GPUToCUDA/ConvertKernelFuncToCubin.cpp b/mlir/lib/Conversion/GPUToCUDA/ConvertKernelFuncToCubin.cpp
index a69fe81b0d3..aa1711e3f8e 100644
--- a/mlir/lib/Conversion/GPUToCUDA/ConvertKernelFuncToCubin.cpp
+++ b/mlir/lib/Conversion/GPUToCUDA/ConvertKernelFuncToCubin.cpp
@@ -49,26 +49,37 @@ namespace {
// TODO(herhut): Move to shared location.
static constexpr const char *kCubinAnnotation = "nvvm.cubin";
-/// A pass converting tagged kernel functions to cubin blobs.
+/// A pass converting tagged kernel modules to cubin blobs.
+///
+/// If tagged as a kernel module, each contained function is translated to NVVM
+/// IR and further to PTX. A user provided CubinGenerator compiles the PTX to
+/// GPU binary code, which is then attached as an attribute to the function. The
+/// function body is erased.
class GpuKernelToCubinPass : public ModulePass<GpuKernelToCubinPass> {
public:
GpuKernelToCubinPass(
CubinGenerator cubinGenerator = compilePtxToCubinForTesting)
: cubinGenerator(cubinGenerator) {}
- // Run the dialect converter on the module.
void runOnModule() override {
+ if (!getModule().getAttrOfType<UnitAttr>(
+ gpu::GPUDialect::getKernelModuleAttrName()))
+ return;
+
// Make sure the NVPTX target is initialized.
LLVMInitializeNVPTXTarget();
LLVMInitializeNVPTXTargetInfo();
LLVMInitializeNVPTXTargetMC();
LLVMInitializeNVPTXAsmPrinter();
+ auto llvmModule = translateModuleToNVVMIR(getModule());
+ if (!llvmModule)
+ return signalPassFailure();
+
for (auto function : getModule().getOps<FuncOp>()) {
- if (!gpu::GPUDialect::isKernel(function) || function.isExternal()) {
+ if (!gpu::GPUDialect::isKernel(function))
continue;
- }
- if (failed(translateGpuKernelToCubinAnnotation(function)))
+ if (failed(translateGpuKernelToCubinAnnotation(*llvmModule, function)))
signalPassFailure();
}
}
@@ -79,8 +90,13 @@ private:
std::string translateModuleToPtx(llvm::Module &module,
llvm::TargetMachine &target_machine);
+
+ /// Converts llvmModule to cubin using the user-provided generator.
OwnedCubin convertModuleToCubin(llvm::Module &llvmModule, FuncOp &function);
- LogicalResult translateGpuKernelToCubinAnnotation(FuncOp &function);
+
+ /// Translates llvmModule to cubin and assigns it to attribute of function.
+ LogicalResult translateGpuKernelToCubinAnnotation(llvm::Module &llvmModule,
+ FuncOp &function);
CubinGenerator cubinGenerator;
};
@@ -135,22 +151,13 @@ OwnedCubin GpuKernelToCubinPass::convertModuleToCubin(llvm::Module &llvmModule,
return cubinGenerator(ptx, function);
}
-LogicalResult
-GpuKernelToCubinPass::translateGpuKernelToCubinAnnotation(FuncOp &function) {
- Builder builder(function.getContext());
-
- OwningModuleRef module = ModuleOp::create(function.getLoc());
-
- // TODO(herhut): Also handle called functions.
- module->push_back(function.clone());
-
- auto llvmModule = translateModuleToNVVMIR(*module);
- auto cubin = convertModuleToCubin(*llvmModule, function);
-
- if (!cubin) {
+LogicalResult GpuKernelToCubinPass::translateGpuKernelToCubinAnnotation(
+ llvm::Module &llvmModule, FuncOp &function) {
+ auto cubin = convertModuleToCubin(llvmModule, function);
+ if (!cubin)
return function.emitError("translation to CUDA binary failed.");
- }
+ Builder builder(function.getContext());
function.setAttr(kCubinAnnotation,
builder.getStringAttr({cubin->data(), cubin->size()}));
diff --git a/mlir/lib/Conversion/GPUToCUDA/GenerateCubinAccessors.cpp b/mlir/lib/Conversion/GPUToCUDA/GenerateCubinAccessors.cpp
index f8c6f5d15ff..83c3538324b 100644
--- a/mlir/lib/Conversion/GPUToCUDA/GenerateCubinAccessors.cpp
+++ b/mlir/lib/Conversion/GPUToCUDA/GenerateCubinAccessors.cpp
@@ -43,8 +43,15 @@ constexpr const char *kCubinGetterAnnotation = "nvvm.cubingetter";
constexpr const char *kCubinGetterSuffix = "_cubin";
constexpr const char *kCubinStorageSuffix = "_cubin_cst";
-/// A pass generating global strings and getter functions for all cubin blobs
-/// annotated on functions via the nvvm.cubin attribute.
+/// A pass which moves cubin from function attributes in nested modules
+/// to global strings and generates getter functions.
+///
+/// The GpuKernelToCubinPass annotates kernel functions with compiled device
+/// code blobs. These functions reside in nested modules generated by
+/// GpuKernelOutliningPass. This pass consumes these modules and moves the cubin
+/// blobs back to the parent module as global strings and generates accessor
+/// functions for them. The external kernel functions (also generated by the
+/// outlining pass) are annotated with the symbol of the cubin accessor.
class GpuGenerateCubinAccessorsPass
: public ModulePass<GpuGenerateCubinAccessorsPass> {
private:
@@ -55,18 +62,25 @@ private:
}
// Inserts a global constant string containing `blob` into the parent module
- // of `orig` and generates the function that returns the address of the first
- // character of this string.
+ // of `kernelFunc` and generates the function that returns the address of the
+ // first character of this string.
// TODO(herhut): consider fusing this pass with launch-func-to-cuda.
- void generate(FuncOp orig, StringAttr blob) {
- Location loc = orig.getLoc();
- SmallString<128> nameBuffer(orig.getName());
- auto module = orig.getParentOfType<ModuleOp>();
+ void generate(FuncOp kernelFunc, StringAttr blob) {
+ auto stubFunc = getModule().lookupSymbol<FuncOp>(kernelFunc.getName());
+ if (!stubFunc) {
+ kernelFunc.emitError(
+ "corresponding external function not found in parent module");
+ return signalPassFailure();
+ }
+
+ Location loc = stubFunc.getLoc();
+ SmallString<128> nameBuffer(stubFunc.getName());
+ auto module = stubFunc.getParentOfType<ModuleOp>();
assert(module && "function must belong to a module");
// Insert the getter function just after the original function.
OpBuilder moduleBuilder(module.getBody(), module.getBody()->begin());
- moduleBuilder.setInsertionPoint(orig.getOperation()->getNextNode());
+ moduleBuilder.setInsertionPoint(stubFunc.getOperation()->getNextNode());
auto getterType = moduleBuilder.getFunctionType(
llvm::None, LLVM::LLVMType::getInt8PtrTy(llvmDialect));
nameBuffer.append(kCubinGetterSuffix);
@@ -75,7 +89,7 @@ private:
Block *entryBlock = result.addEntryBlock();
// Drop the getter suffix before appending the storage suffix.
- nameBuffer.resize(orig.getName().size());
+ nameBuffer.resize(stubFunc.getName().size());
nameBuffer.append(kCubinStorageSuffix);
// Obtain the address of the first character of the global string containing
@@ -86,21 +100,23 @@ private:
builder.create<LLVM::ReturnOp>(loc, startPtr);
// Store the name of the getter on the function for easier lookup.
- orig.setAttr(kCubinGetterAnnotation, builder.getSymbolRefAttr(result));
+ stubFunc.setAttr(kCubinGetterAnnotation, builder.getSymbolRefAttr(result));
}
public:
- // Perform the conversion on the module. This may insert globals, so it
- // cannot be done on multiple functions in parallel.
void runOnModule() override {
- llvmDialect =
- getModule().getContext()->getRegisteredDialect<LLVM::LLVMDialect>();
+ llvmDialect = getContext().getRegisteredDialect<LLVM::LLVMDialect>();
- for (auto func : getModule().getOps<FuncOp>()) {
- StringAttr cubinBlob = func.getAttrOfType<StringAttr>(kCubinAnnotation);
- if (!cubinBlob)
+ auto modules = getModule().getOps<ModuleOp>();
+ for (auto module : llvm::make_early_inc_range(modules)) {
+ if (!module.getAttrOfType<UnitAttr>(
+ gpu::GPUDialect::getKernelModuleAttrName()))
continue;
- generate(func, cubinBlob);
+ for (auto func : module.getOps<FuncOp>()) {
+ if (StringAttr blob = func.getAttrOfType<StringAttr>(kCubinAnnotation))
+ generate(func, blob);
+ }
+ module.erase();
}
}
OpenPOWER on IntegriCloud