summaryrefslogtreecommitdiffstats
path: root/mlir/lib/Conversion/GPUToCUDA
diff options
context:
space:
mode:
authorNicolas Vasilache <ntv@google.com>2019-09-27 09:55:38 -0700
committerA. Unique TensorFlower <gardener@tensorflow.org>2019-09-27 09:57:36 -0700
commitddf737c5da728f25c5e0413bc737d04b2d92df96 (patch)
tree07af0e3ba0f3755d11d329ccde15e8479ca04cd7 /mlir/lib/Conversion/GPUToCUDA
parent6543e99fe51b9077d8185ba9741770adc5f7cde5 (diff)
downloadbcm5719-llvm-ddf737c5da728f25c5e0413bc737d04b2d92df96.tar.gz
bcm5719-llvm-ddf737c5da728f25c5e0413bc737d04b2d92df96.zip
Promote MemRefDescriptor to a pointer to struct when passing function boundaries in LLVMLowering.
The strided MemRef RFC discusses a normalized descriptor and interaction with library calls (https://groups.google.com/a/tensorflow.org/forum/#!topic/mlir/MaL8m2nXuio). Lowering of nested LLVM structs as value types does not play nicely with externally compiled C/C++ functions due to ABI issues. Solving the ABI problem generally is a very complex problem and most likely involves taking a dependence on clang that we do not want atm. A simple workaround is to pass pointers to memref descriptors at function boundaries, which this CL implements. PiperOrigin-RevId: 271591708
Diffstat (limited to 'mlir/lib/Conversion/GPUToCUDA')
-rw-r--r--mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp41
1 files changed, 37 insertions, 4 deletions
diff --git a/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp b/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp
index c53a52d4f20..961727d31c1 100644
--- a/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp
+++ b/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp
@@ -49,6 +49,7 @@ static constexpr const char *cuModuleGetFunctionName = "mcuModuleGetFunction";
static constexpr const char *cuLaunchKernelName = "mcuLaunchKernel";
static constexpr const char *cuGetStreamHelperName = "mcuGetStreamHelper";
static constexpr const char *cuStreamSynchronizeName = "mcuStreamSynchronize";
+static constexpr const char *kMcuMemHostRegisterPtr = "mcuMemHostRegisterPtr";
static constexpr const char *kCubinGetterAnnotation = "nvvm.cubingetter";
@@ -216,6 +217,15 @@ void GpuLaunchFuncToCudaCallsPass::declareCudaFunctions(Location loc) {
},
getCUResultType())));
}
+ if (!module.lookupSymbol<FuncOp>(kMcuMemHostRegisterPtr)) {
+ module.push_back(FuncOp::create(loc, kMcuMemHostRegisterPtr,
+ builder.getFunctionType(
+ {
+ getPointerType(), /* void *ptr */
+ getInt32Type() /* int32 flags*/
+ },
+ {})));
+ }
}
// Generates a parameters array to be used with a CUDA kernel launch call. The
@@ -229,22 +239,45 @@ void GpuLaunchFuncToCudaCallsPass::declareCudaFunctions(Location loc) {
Value *
GpuLaunchFuncToCudaCallsPass::setupParamsArray(gpu::LaunchFuncOp launchOp,
OpBuilder &builder) {
+ auto numKernelOperands = launchOp.getNumKernelOperands();
Location loc = launchOp.getLoc();
auto one = builder.create<LLVM::ConstantOp>(loc, getInt32Type(),
builder.getI32IntegerAttr(1));
+ // Provision twice as much for the `array` to allow up to one level of
+ // indirection for each argument.
auto arraySize = builder.create<LLVM::ConstantOp>(
- loc, getInt32Type(),
- builder.getI32IntegerAttr(launchOp.getNumKernelOperands()));
+ loc, getInt32Type(), builder.getI32IntegerAttr(numKernelOperands));
auto array = builder.create<LLVM::AllocaOp>(loc, getPointerPointerType(),
arraySize, /*alignment=*/0);
- for (int idx = 0, e = launchOp.getNumKernelOperands(); idx < e; ++idx) {
+ for (unsigned idx = 0; idx < numKernelOperands; ++idx) {
auto operand = launchOp.getKernelOperand(idx);
auto llvmType = operand->getType().cast<LLVM::LLVMType>();
- auto memLocation = builder.create<LLVM::AllocaOp>(
+ Value *memLocation = builder.create<LLVM::AllocaOp>(
loc, llvmType.getPointerTo(), one, /*alignment=*/1);
builder.create<LLVM::StoreOp>(loc, operand, memLocation);
auto casted =
builder.create<LLVM::BitcastOp>(loc, getPointerType(), memLocation);
+
+  // Assume all struct arguments come from MemRef. If this assumption does not
+  // hold anymore then we need `launchOp` to lower from MemRefType and not
+  // after LLVMConversion has taken place and the MemRef information is lost.
+ // Extra level of indirection in the `array`:
+ // the descriptor pointer is registered via @mcuMemHostRegisterPtr
+ if (llvmType.isStructTy()) {
+ auto registerFunc =
+ getModule().lookupSymbol<FuncOp>(kMcuMemHostRegisterPtr);
+ auto zero = builder.create<LLVM::ConstantOp>(
+ loc, getInt32Type(), builder.getI32IntegerAttr(0));
+ builder.create<LLVM::CallOp>(loc, ArrayRef<Type>{},
+ builder.getSymbolRefAttr(registerFunc),
+ ArrayRef<Value *>{casted, zero});
+ Value *memLocation = builder.create<LLVM::AllocaOp>(
+ loc, getPointerPointerType(), one, /*alignment=*/1);
+ builder.create<LLVM::StoreOp>(loc, casted, memLocation);
+ casted =
+ builder.create<LLVM::BitcastOp>(loc, getPointerType(), memLocation);
+ }
+
auto index = builder.create<LLVM::ConstantOp>(
loc, getInt32Type(), builder.getI32IntegerAttr(idx));
auto gep = builder.create<LLVM::GEPOp>(loc, getPointerPointerType(), array,
OpenPOWER on IntegriCloud