diff options
| author | Nicolas Vasilache <ntv@google.com> | 2019-09-27 09:55:38 -0700 |
|---|---|---|
| committer | A. Unique TensorFlower <gardener@tensorflow.org> | 2019-09-27 09:57:36 -0700 |
| commit | ddf737c5da728f25c5e0413bc737d04b2d92df96 (patch) | |
| tree | 07af0e3ba0f3755d11d329ccde15e8479ca04cd7 /mlir/lib/Conversion/GPUToCUDA | |
| parent | 6543e99fe51b9077d8185ba9741770adc5f7cde5 (diff) | |
| download | bcm5719-llvm-ddf737c5da728f25c5e0413bc737d04b2d92df96.tar.gz bcm5719-llvm-ddf737c5da728f25c5e0413bc737d04b2d92df96.zip | |
Promote MemRefDescriptor to a pointer to struct when passing function boundaries in LLVMLowering.
The strided MemRef RFC discusses a normalized descriptor and interaction with library calls (https://groups.google.com/a/tensorflow.org/forum/#!topic/mlir/MaL8m2nXuio).
Lowering of nested LLVM structs as value types does not play nicely with externally compiled C/C++ functions due to ABI issues.
Solving the ABI problem generally is a very complex problem and most likely involves taking
a dependency on clang that we do not want at the moment.
A simple workaround is to pass pointers to memref descriptors at function boundaries, which this CL implements.
PiperOrigin-RevId: 271591708
Diffstat (limited to 'mlir/lib/Conversion/GPUToCUDA')
| -rw-r--r-- | mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp | 41 |
1 files changed, 37 insertions, 4 deletions
diff --git a/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp b/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp index c53a52d4f20..961727d31c1 100644 --- a/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp +++ b/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp @@ -49,6 +49,7 @@ static constexpr const char *cuModuleGetFunctionName = "mcuModuleGetFunction"; static constexpr const char *cuLaunchKernelName = "mcuLaunchKernel"; static constexpr const char *cuGetStreamHelperName = "mcuGetStreamHelper"; static constexpr const char *cuStreamSynchronizeName = "mcuStreamSynchronize"; +static constexpr const char *kMcuMemHostRegisterPtr = "mcuMemHostRegisterPtr"; static constexpr const char *kCubinGetterAnnotation = "nvvm.cubingetter"; @@ -216,6 +217,15 @@ void GpuLaunchFuncToCudaCallsPass::declareCudaFunctions(Location loc) { }, getCUResultType()))); } + if (!module.lookupSymbol<FuncOp>(kMcuMemHostRegisterPtr)) { + module.push_back(FuncOp::create(loc, kMcuMemHostRegisterPtr, + builder.getFunctionType( + { + getPointerType(), /* void *ptr */ + getInt32Type() /* int32 flags*/ + }, + {}))); + } } // Generates a parameters array to be used with a CUDA kernel launch call. The @@ -229,22 +239,45 @@ void GpuLaunchFuncToCudaCallsPass::declareCudaFunctions(Location loc) { Value * GpuLaunchFuncToCudaCallsPass::setupParamsArray(gpu::LaunchFuncOp launchOp, OpBuilder &builder) { + auto numKernelOperands = launchOp.getNumKernelOperands(); Location loc = launchOp.getLoc(); auto one = builder.create<LLVM::ConstantOp>(loc, getInt32Type(), builder.getI32IntegerAttr(1)); + // Provision twice as much for the `array` to allow up to one level of + // indirection for each argument. 
auto arraySize = builder.create<LLVM::ConstantOp>( - loc, getInt32Type(), - builder.getI32IntegerAttr(launchOp.getNumKernelOperands())); + loc, getInt32Type(), builder.getI32IntegerAttr(numKernelOperands)); auto array = builder.create<LLVM::AllocaOp>(loc, getPointerPointerType(), arraySize, /*alignment=*/0); - for (int idx = 0, e = launchOp.getNumKernelOperands(); idx < e; ++idx) { + for (unsigned idx = 0; idx < numKernelOperands; ++idx) { auto operand = launchOp.getKernelOperand(idx); auto llvmType = operand->getType().cast<LLVM::LLVMType>(); - auto memLocation = builder.create<LLVM::AllocaOp>( + Value *memLocation = builder.create<LLVM::AllocaOp>( loc, llvmType.getPointerTo(), one, /*alignment=*/1); builder.create<LLVM::StoreOp>(loc, operand, memLocation); auto casted = builder.create<LLVM::BitcastOp>(loc, getPointerType(), memLocation); + + // Assume all struct arguments come from MemRef. If this assumption does not + // hold anymore then we `launchOp` to lower from MemRefType and not after + // LLVMConversion has taken place and the MemRef information is lost. + // Extra level of indirection in the `array`: + // the descriptor pointer is registered via @mcuMemHostRegisterPtr + if (llvmType.isStructTy()) { + auto registerFunc = + getModule().lookupSymbol<FuncOp>(kMcuMemHostRegisterPtr); + auto zero = builder.create<LLVM::ConstantOp>( + loc, getInt32Type(), builder.getI32IntegerAttr(0)); + builder.create<LLVM::CallOp>(loc, ArrayRef<Type>{}, + builder.getSymbolRefAttr(registerFunc), + ArrayRef<Value *>{casted, zero}); + Value *memLocation = builder.create<LLVM::AllocaOp>( + loc, getPointerPointerType(), one, /*alignment=*/1); + builder.create<LLVM::StoreOp>(loc, casted, memLocation); + casted = + builder.create<LLVM::BitcastOp>(loc, getPointerType(), memLocation); + } + auto index = builder.create<LLVM::ConstantOp>( loc, getInt32Type(), builder.getI32IntegerAttr(idx)); auto gep = builder.create<LLVM::GEPOp>(loc, getPointerPointerType(), array, |

