Make type and rank explicit in mcuMemHostRegister function.

Fix registered size of indirect MemRefType kernel arguments.

PiperOrigin-RevId: 281362940
author: Christian Sigg <csigg@google.com> 2019-11-19 13:12:19 -0800
committer: A. Unique TensorFlower <gardener@tensorflow.org> 2019-11-19 13:13:02 -0800
commit: f868adafee91a8c3ebee1e052d5fdfff7be0afd0 (patch)
tree: 34ef3b730148fa01a2979dac18c0dbead9ad9a86
parent: ee95f6f2594e9089990024208d01634fd81d2da2 (diff)
download: bcm5719-llvm-f868adafee91a8c3ebee1e052d5fdfff7be0afd0.tar.gz bcm5719-llvm-f868adafee91a8c3ebee1e052d5fdfff7be0afd0.zip
5 files changed, 40 insertions, 35 deletions
diff --git a/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp b/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp
index d9332428425..9d8c8942051 100644
--- a/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp
+++ b/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp
@@ -49,7 +49,7 @@ static constexpr const char *cuModuleGetFunctionName = "mcuModuleGetFunction";
 static constexpr const char *cuLaunchKernelName = "mcuLaunchKernel";
 static constexpr const char *cuGetStreamHelperName = "mcuGetStreamHelper";
 static constexpr const char *cuStreamSynchronizeName = "mcuStreamSynchronize";
-static constexpr const char *kMcuMemHostRegisterPtr = "mcuMemHostRegisterPtr";
+static constexpr const char *kMcuMemHostRegister = "mcuMemHostRegister";
 
 static constexpr const char *kCubinAnnotation = "nvvm.cubin";
 static constexpr const char *kCubinStorageSuffix = "_cubin_cst";
@@ -228,13 +228,13 @@ void GpuLaunchFuncToCudaCallsPass::declareCudaFunctions(Location loc) {
                                       getPointerType() /* CUstream stream */,
                                       /*isVarArg=*/false));
   }
-  if (!module.lookupSymbol(kMcuMemHostRegisterPtr)) {
+  if (!module.lookupSymbol(kMcuMemHostRegister)) {
     builder.create<LLVM::LLVMFuncOp>(
-        loc, kMcuMemHostRegisterPtr,
+        loc, kMcuMemHostRegister,
         LLVM::LLVMType::getFunctionTy(getVoidType(),
                                       {
                                           getPointerType(), /* void *ptr */
-                                          getInt32Type()    /* int32 flags*/
+                                          getInt64Type()    /* int64 sizeBytes*/
                                       },
                                       /*isVarArg=*/false));
   }
@@ -277,12 +277,14 @@ GpuLaunchFuncToCudaCallsPass::setupParamsArray(gpu::LaunchFuncOp launchOp,
     //   the descriptor pointer is registered via @mcuMemHostRegisterPtr
     if (llvmType.isStructTy()) {
       auto registerFunc =
-          getModule().lookupSymbol<LLVM::LLVMFuncOp>(kMcuMemHostRegisterPtr);
-      auto zero = builder.create<LLVM::ConstantOp>(
-          loc, getInt32Type(), builder.getI32IntegerAttr(0));
+          getModule().lookupSymbol<LLVM::LLVMFuncOp>(kMcuMemHostRegister);
+      auto nullPtr = builder.create<LLVM::NullOp>(loc, llvmType.getPointerTo());
+      auto gep = builder.create<LLVM::GEPOp>(loc, llvmType.getPointerTo(),
+                                             ArrayRef<Value *>{nullPtr, one});
+      auto size = builder.create<LLVM::PtrToIntOp>(loc, getInt64Type(), gep);
       builder.create<LLVM::CallOp>(loc, ArrayRef<Type>{},
                                    builder.getSymbolRefAttr(registerFunc),
-                                   ArrayRef<Value *>{casted, zero});
+                                   ArrayRef<Value *>{casted, size});
       Value *memLocation = builder.create<LLVM::AllocaOp>(
           loc, getPointerPointerType(), one, /*alignment=*/1);
       builder.create<LLVM::StoreOp>(loc, casted, memLocation);
diff --git a/mlir/test/mlir-cuda-runner/all-reduce-op.mlir b/mlir/test/mlir-cuda-runner/all-reduce-op.mlir
index 6356db0791a..c863623a360 100644
--- a/mlir/test/mlir-cuda-runner/all-reduce-op.mlir
+++ b/mlir/test/mlir-cuda-runner/all-reduce-op.mlir
@@ -4,12 +4,11 @@
 func @main() {
   %arg = alloc() : memref<13x4x2xf32>
   %dst = memref_cast %arg : memref<13x4x2xf32> to memref<?x?x?xf32>
-  %zero = constant 0 : i32
   %one = constant 1 : index
   %sx = dim %dst, 0 : memref<?x?x?xf32>
   %sy = dim %dst, 1 : memref<?x?x?xf32>
   %sz = dim %dst, 2 : memref<?x?x?xf32>
-  call @mcuMemHostRegister(%dst, %zero) : (memref<?x?x?xf32>, i32) -> ()
+  call @mcuMemHostRegisterMemRef3dFloat(%dst) : (memref<?x?x?xf32>) -> ()
   gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %one, %grid_y = %one, %grid_z = %one)
              threads(%tx, %ty, %tz) in (%block_x = %sx, %block_y = %sy, %block_z = %sz)
              args(%kernel_dst = %dst) : memref<?x?x?xf32> {
@@ -27,5 +26,5 @@ func @main() {
   return
 }
 
-func @mcuMemHostRegister(%ptr : memref<?x?x?xf32>, %flags : i32)
+func @mcuMemHostRegisterMemRef3dFloat(%ptr : memref<?x?x?xf32>)
 func @mcuPrintFloat(%ptr : memref<?x?x?xf32>)
diff --git a/mlir/test/mlir-cuda-runner/all-reduce-region.mlir b/mlir/test/mlir-cuda-runner/all-reduce-region.mlir
index 1a3a4595d13..5cb803cfb7d 100644
--- a/mlir/test/mlir-cuda-runner/all-reduce-region.mlir
+++ b/mlir/test/mlir-cuda-runner/all-reduce-region.mlir
@@ -4,10 +4,9 @@
 func @main() {
   %arg = alloc() : memref<35xf32>
   %dst = memref_cast %arg : memref<35xf32> to memref<?xf32>
-  %zero = constant 0 : i32
   %one = constant 1 : index
   %sx = dim %dst, 0 : memref<?xf32>
-  call @mcuMemHostRegister(%dst, %zero) : (memref<?xf32>, i32) -> ()
+  call @mcuMemHostRegisterMemRef1dFloat(%dst) : (memref<?xf32>) -> ()
   gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %one, %grid_y = %one, %grid_z = %one)
              threads(%tx, %ty, %tz) in (%block_x = %sx, %block_y = %one, %block_z = %one)
              args(%kernel_dst = %dst) : memref<?xf32> {
@@ -25,5 +24,5 @@ func @main() {
   return
 }
 
-func @mcuMemHostRegister(%ptr : memref<?xf32>, %flags : i32)
+func @mcuMemHostRegisterMemRef1dFloat(%ptr : memref<?xf32>)
 func @mcuPrintFloat(%ptr : memref<?xf32>)
diff --git a/mlir/test/mlir-cuda-runner/gpu-to-cubin.mlir b/mlir/test/mlir-cuda-runner/gpu-to-cubin.mlir
index 871327ff819..98e0832d5ba 100644
--- a/mlir/test/mlir-cuda-runner/gpu-to-cubin.mlir
+++ b/mlir/test/mlir-cuda-runner/gpu-to-cubin.mlir
@@ -15,10 +15,9 @@ func @other_func(%arg0 : f32, %arg1 : memref<?xf32>) {
 // CHECK: [1.000000e+00, 1.000000e+00, 1.000000e+00, 1.000000e+00, 1.000000e+00]
 func @main() {
   %arg0 = alloc() : memref<5xf32>
-  %20 = constant 0 : i32
   %21 = constant 5 : i32
   %22 = memref_cast %arg0 : memref<5xf32> to memref<?xf32>
-  call @mcuMemHostRegister(%22, %20) : (memref<?xf32>, i32) -> ()
+  call @mcuMemHostRegisterMemRef1dFloat(%22) : (memref<?xf32>) -> ()
   call @mcuPrintFloat(%22) : (memref<?xf32>) -> ()
   %24 = constant 1.0 : f32
   call @other_func(%24, %22) : (f32, memref<?xf32>) -> ()
@@ -26,5 +25,5 @@ func @main() {
   return
 }
 
-func @mcuMemHostRegister(%ptr : memref<?xf32>, %flags : i32)
+func @mcuMemHostRegisterMemRef1dFloat(%ptr : memref<?xf32>)
 func @mcuPrintFloat(%ptr : memref<?xf32>)
diff --git a/mlir/tools/mlir-cuda-runner/cuda-runtime-wrappers.cpp b/mlir/tools/mlir-cuda-runner/cuda-runtime-wrappers.cpp
index 31b6f6f6209..ac772589282 100644
--- a/mlir/tools/mlir-cuda-runner/cuda-runtime-wrappers.cpp
+++ b/mlir/tools/mlir-cuda-runner/cuda-runtime-wrappers.cpp
@@ -21,8 +21,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include <assert.h>
-#include <memory.h>
+#include <cassert>
+#include <numeric>
 
 #include "llvm/Support/raw_ostream.h"
 
@@ -80,6 +80,13 @@ extern "C" int32_t mcuStreamSynchronize(void *stream) {
 
 /// Helper functions for writing mlir example code
 
+// Allows to register byte array with the CUDA runtime. Helpful until we have
+// transfer functions implemented.
+extern "C" void mcuMemHostRegister(void *ptr, uint64_t sizeBytes) {
+  reportErrorIfAny(cuMemHostRegister(ptr, sizeBytes, /*flags=*/0),
+                   "MemHostRegister");
+}
+
 // A struct that corresponds to how MLIR represents memrefs.
 template <typename T, int N> struct MemRefType {
   T *basePtr;
@@ -89,23 +96,22 @@ template <typename T, int N> struct MemRefType {
   int64_t strides[N];
 };
 
-// Allows to register a pointer with the CUDA runtime. Helpful until
-// we have transfer functions implemented.
-extern "C" void mcuMemHostRegister(const MemRefType<float, 1> *arg,
-                                   int32_t flags) {
-  reportErrorIfAny(
-      cuMemHostRegister(arg->data, arg->sizes[0] * sizeof(float), flags),
-      "MemHostRegister");
-  for (int pos = 0; pos < arg->sizes[0]; pos++) {
-    arg->data[pos] = 1.23f;
-  }
+// Allows to register a MemRef with the CUDA runtime. Initializes array with
+// value. Helpful until we have transfer functions implemented.
+template <typename T, int N>
+void mcuMemHostRegisterMemRef(const MemRefType<T, N> *arg, T value) {
+  auto count = std::accumulate(arg->sizes, arg->sizes + N, 1,
+                               std::multiplies<int64_t>());
+  std::fill_n(arg->data, count, value);
+  mcuMemHostRegister(arg->data, count * sizeof(T));
 }
-
-// Allows to register a pointer with the CUDA runtime. Helpful until
-// we have transfer functions implemented.
-extern "C" void mcuMemHostRegisterPtr(void *ptr, int32_t flags) {
-  reportErrorIfAny(cuMemHostRegister(ptr, sizeof(void *), flags),
-                   "MemHostRegister");
+extern "C" void
+mcuMemHostRegisterMemRef1dFloat(const MemRefType<float, 1> *arg) {
+  mcuMemHostRegisterMemRef(arg, 1.23f);
+}
+extern "C" void
+mcuMemHostRegisterMemRef3dFloat(const MemRefType<float, 3> *arg) {
+  mcuMemHostRegisterMemRef(arg, 1.23f);
 }
 
 /// Prints the given float array to stderr.
author	Christian Sigg <csigg@google.com>	2019-11-19 13:12:19 -0800
committer	A. Unique TensorFlower <gardener@tensorflow.org>	2019-11-19 13:13:02 -0800
commit	f868adafee91a8c3ebee1e052d5fdfff7be0afd0 (patch)
tree	34ef3b730148fa01a2979dac18c0dbead9ad9a86
parent	ee95f6f2594e9089990024208d01634fd81d2da2 (diff)
download	bcm5719-llvm-f868adafee91a8c3ebee1e052d5fdfff7be0afd0.tar.gz bcm5719-llvm-f868adafee91a8c3ebee1e052d5fdfff7be0afd0.zip