diff options
| author | Nicolas Vasilache <ntv@google.com> | 2019-09-27 09:55:38 -0700 |
|---|---|---|
| committer | A. Unique TensorFlower <gardener@tensorflow.org> | 2019-09-27 09:57:36 -0700 |
| commit | ddf737c5da728f25c5e0413bc737d04b2d92df96 (patch) | |
| tree | 07af0e3ba0f3755d11d329ccde15e8479ca04cd7 /mlir/lib/Conversion/GPUToCUDA | |
| parent | 6543e99fe51b9077d8185ba9741770adc5f7cde5 (diff) | |
| download | bcm5719-llvm-ddf737c5da728f25c5e0413bc737d04b2d92df96.tar.gz bcm5719-llvm-ddf737c5da728f25c5e0413bc737d04b2d92df96.zip | |
Promote MemRefDescriptor to a pointer to struct when passing function boundaries in LLVMLowering.
The strided MemRef RFC discusses a normalized descriptor and interaction with library calls (https://groups.google.com/a/tensorflow.org/forum/#!topic/mlir/MaL8m2nXuio).
Lowering of nested LLVM structs as value types does not play nicely with externally compiled C/C++ functions due to ABI issues.
Solving the ABI problem generally is a very complex problem and most likely involves taking
a dependency on clang that we do not want at the moment.
A simple workaround is to pass pointers to memref descriptors at function boundaries, which this CL implements.
PiperOrigin-RevId: 271591708
Diffstat (limited to 'mlir/lib/Conversion/GPUToCUDA')
| -rw-r--r-- | mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp | 41 |
1 files changed, 37 insertions, 4 deletions
diff --git a/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp b/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp index c53a52d4f20..961727d31c1 100644 --- a/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp +++ b/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp @@ -49,6 +49,7 @@ static constexpr const char *cuModuleGetFunctionName = "mcuModuleGetFunction"; static constexpr const char *cuLaunchKernelName = "mcuLaunchKernel"; static constexpr const char *cuGetStreamHelperName = "mcuGetStreamHelper"; static constexpr const char *cuStreamSynchronizeName = "mcuStreamSynchronize"; +static constexpr const char *kMcuMemHostRegisterPtr = "mcuMemHostRegisterPtr"; static constexpr const char *kCubinGetterAnnotation = "nvvm.cubingetter"; @@ -216,6 +217,15 @@ void GpuLaunchFuncToCudaCallsPass::declareCudaFunctions(Location loc) { }, getCUResultType()))); } + if (!module.lookupSymbol<FuncOp>(kMcuMemHostRegisterPtr)) { + module.push_back(FuncOp::create(loc, kMcuMemHostRegisterPtr, + builder.getFunctionType( + { + getPointerType(), /* void *ptr */ + getInt32Type() /* int32 flags*/ + }, + {}))); + } } // Generates a parameters array to be used with a CUDA kernel launch call. The @@ -229,22 +239,45 @@ void GpuLaunchFuncToCudaCallsPass::declareCudaFunctions(Location loc) { Value * GpuLaunchFuncToCudaCallsPass::setupParamsArray(gpu::LaunchFuncOp launchOp, OpBuilder &builder) { + auto numKernelOperands = launchOp.getNumKernelOperands(); Location loc = launchOp.getLoc(); auto one = builder.create<LLVM::ConstantOp>(loc, getInt32Type(), builder.getI32IntegerAttr(1)); + // Provision twice as much for the `array` to allow up to one level of + // indirection for each argument. 
auto arraySize = builder.create<LLVM::ConstantOp>( - loc, getInt32Type(), - builder.getI32IntegerAttr(launchOp.getNumKernelOperands())); + loc, getInt32Type(), builder.getI32IntegerAttr(numKernelOperands)); auto array = builder.create<LLVM::AllocaOp>(loc, getPointerPointerType(), arraySize, /*alignment=*/0); - for (int idx = 0, e = launchOp.getNumKernelOperands(); idx < e; ++idx) { + for (unsigned idx = 0; idx < numKernelOperands; ++idx) { auto operand = launchOp.getKernelOperand(idx); auto llvmType = operand->getType().cast<LLVM::LLVMType>(); - auto memLocation = builder.create<LLVM::AllocaOp>( + Value *memLocation = builder.create<LLVM::AllocaOp>( loc, llvmType.getPointerTo(), one, /*alignment=*/1); builder.create<LLVM::StoreOp>(loc, operand, memLocation); auto casted = builder.create<LLVM::BitcastOp>(loc, getPointerType(), memLocation); + + // Assume all struct arguments come from MemRef. If this assumption does not + // hold anymore then we `launchOp` to lower from MemRefType and not after + // LLVMConversion has taken place and the MemRef information is lost. + // Extra level of indirection in the `array`: + // the descriptor pointer is registered via @mcuMemHostRegisterPtr + if (llvmType.isStructTy()) { + auto registerFunc = + getModule().lookupSymbol<FuncOp>(kMcuMemHostRegisterPtr); + auto zero = builder.create<LLVM::ConstantOp>( + loc, getInt32Type(), builder.getI32IntegerAttr(0)); + builder.create<LLVM::CallOp>(loc, ArrayRef<Type>{}, + builder.getSymbolRefAttr(registerFunc), + ArrayRef<Value *>{casted, zero}); + Value *memLocation = builder.create<LLVM::AllocaOp>( + loc, getPointerPointerType(), one, /*alignment=*/1); + builder.create<LLVM::StoreOp>(loc, casted, memLocation); + casted = + builder.create<LLVM::BitcastOp>(loc, getPointerType(), memLocation); + } + auto index = builder.create<LLVM::ConstantOp>( loc, getInt32Type(), builder.getI32IntegerAttr(idx)); auto gep = builder.create<LLVM::GEPOp>(loc, getPointerPointerType(), array, |

