summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Christian Sigg <csigg@google.com>, 2019-11-19 13:12:19 -0800
committer: A. Unique TensorFlower <gardener@tensorflow.org>, 2019-11-19 13:13:02 -0800
commit: f868adafee91a8c3ebee1e052d5fdfff7be0afd0 (patch)
tree: 34ef3b730148fa01a2979dac18c0dbead9ad9a86
parent: ee95f6f2594e9089990024208d01634fd81d2da2 (diff)
download: bcm5719-llvm-f868adafee91a8c3ebee1e052d5fdfff7be0afd0.tar.gz
bcm5719-llvm-f868adafee91a8c3ebee1e052d5fdfff7be0afd0.zip
Make type and rank explicit in mcuMemHostRegister function.
Fix registered size of indirect MemRefType kernel arguments.
PiperOrigin-RevId: 281362940
-rw-r--r-- mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp | 18
-rw-r--r-- mlir/test/mlir-cuda-runner/all-reduce-op.mlir | 5
-rw-r--r-- mlir/test/mlir-cuda-runner/all-reduce-region.mlir | 5
-rw-r--r-- mlir/test/mlir-cuda-runner/gpu-to-cubin.mlir | 5
-rw-r--r-- mlir/tools/mlir-cuda-runner/cuda-runtime-wrappers.cpp | 42
5 files changed, 40 insertions, 35 deletions
diff --git a/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp b/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp
index d9332428425..9d8c8942051 100644
--- a/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp
+++ b/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp
@@ -49,7 +49,7 @@ static constexpr const char *cuModuleGetFunctionName = "mcuModuleGetFunction";
static constexpr const char *cuLaunchKernelName = "mcuLaunchKernel";
static constexpr const char *cuGetStreamHelperName = "mcuGetStreamHelper";
static constexpr const char *cuStreamSynchronizeName = "mcuStreamSynchronize";
-static constexpr const char *kMcuMemHostRegisterPtr = "mcuMemHostRegisterPtr";
+static constexpr const char *kMcuMemHostRegister = "mcuMemHostRegister";
static constexpr const char *kCubinAnnotation = "nvvm.cubin";
static constexpr const char *kCubinStorageSuffix = "_cubin_cst";
@@ -228,13 +228,13 @@ void GpuLaunchFuncToCudaCallsPass::declareCudaFunctions(Location loc) {
getPointerType() /* CUstream stream */,
/*isVarArg=*/false));
}
- if (!module.lookupSymbol(kMcuMemHostRegisterPtr)) {
+ if (!module.lookupSymbol(kMcuMemHostRegister)) {
builder.create<LLVM::LLVMFuncOp>(
- loc, kMcuMemHostRegisterPtr,
+ loc, kMcuMemHostRegister,
LLVM::LLVMType::getFunctionTy(getVoidType(),
{
getPointerType(), /* void *ptr */
- getInt32Type() /* int32 flags*/
+ getInt64Type() /* int64 sizeBytes*/
},
/*isVarArg=*/false));
}
@@ -277,12 +277,14 @@ GpuLaunchFuncToCudaCallsPass::setupParamsArray(gpu::LaunchFuncOp launchOp,
// the descriptor pointer is registered via @mcuMemHostRegisterPtr
if (llvmType.isStructTy()) {
auto registerFunc =
- getModule().lookupSymbol<LLVM::LLVMFuncOp>(kMcuMemHostRegisterPtr);
- auto zero = builder.create<LLVM::ConstantOp>(
- loc, getInt32Type(), builder.getI32IntegerAttr(0));
+ getModule().lookupSymbol<LLVM::LLVMFuncOp>(kMcuMemHostRegister);
+ auto nullPtr = builder.create<LLVM::NullOp>(loc, llvmType.getPointerTo());
+ auto gep = builder.create<LLVM::GEPOp>(loc, llvmType.getPointerTo(),
+ ArrayRef<Value *>{nullPtr, one});
+ auto size = builder.create<LLVM::PtrToIntOp>(loc, getInt64Type(), gep);
builder.create<LLVM::CallOp>(loc, ArrayRef<Type>{},
builder.getSymbolRefAttr(registerFunc),
- ArrayRef<Value *>{casted, zero});
+ ArrayRef<Value *>{casted, size});
Value *memLocation = builder.create<LLVM::AllocaOp>(
loc, getPointerPointerType(), one, /*alignment=*/1);
builder.create<LLVM::StoreOp>(loc, casted, memLocation);
diff --git a/mlir/test/mlir-cuda-runner/all-reduce-op.mlir b/mlir/test/mlir-cuda-runner/all-reduce-op.mlir
index 6356db0791a..c863623a360 100644
--- a/mlir/test/mlir-cuda-runner/all-reduce-op.mlir
+++ b/mlir/test/mlir-cuda-runner/all-reduce-op.mlir
@@ -4,12 +4,11 @@
func @main() {
%arg = alloc() : memref<13x4x2xf32>
%dst = memref_cast %arg : memref<13x4x2xf32> to memref<?x?x?xf32>
- %zero = constant 0 : i32
%one = constant 1 : index
%sx = dim %dst, 0 : memref<?x?x?xf32>
%sy = dim %dst, 1 : memref<?x?x?xf32>
%sz = dim %dst, 2 : memref<?x?x?xf32>
- call @mcuMemHostRegister(%dst, %zero) : (memref<?x?x?xf32>, i32) -> ()
+ call @mcuMemHostRegisterMemRef3dFloat(%dst) : (memref<?x?x?xf32>) -> ()
gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %one, %grid_y = %one, %grid_z = %one)
threads(%tx, %ty, %tz) in (%block_x = %sx, %block_y = %sy, %block_z = %sz)
args(%kernel_dst = %dst) : memref<?x?x?xf32> {
@@ -27,5 +26,5 @@ func @main() {
return
}
-func @mcuMemHostRegister(%ptr : memref<?x?x?xf32>, %flags : i32)
+func @mcuMemHostRegisterMemRef3dFloat(%ptr : memref<?x?x?xf32>)
func @mcuPrintFloat(%ptr : memref<?x?x?xf32>)
diff --git a/mlir/test/mlir-cuda-runner/all-reduce-region.mlir b/mlir/test/mlir-cuda-runner/all-reduce-region.mlir
index 1a3a4595d13..5cb803cfb7d 100644
--- a/mlir/test/mlir-cuda-runner/all-reduce-region.mlir
+++ b/mlir/test/mlir-cuda-runner/all-reduce-region.mlir
@@ -4,10 +4,9 @@
func @main() {
%arg = alloc() : memref<35xf32>
%dst = memref_cast %arg : memref<35xf32> to memref<?xf32>
- %zero = constant 0 : i32
%one = constant 1 : index
%sx = dim %dst, 0 : memref<?xf32>
- call @mcuMemHostRegister(%dst, %zero) : (memref<?xf32>, i32) -> ()
+ call @mcuMemHostRegisterMemRef1dFloat(%dst) : (memref<?xf32>) -> ()
gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %one, %grid_y = %one, %grid_z = %one)
threads(%tx, %ty, %tz) in (%block_x = %sx, %block_y = %one, %block_z = %one)
args(%kernel_dst = %dst) : memref<?xf32> {
@@ -25,5 +24,5 @@ func @main() {
return
}
-func @mcuMemHostRegister(%ptr : memref<?xf32>, %flags : i32)
+func @mcuMemHostRegisterMemRef1dFloat(%ptr : memref<?xf32>)
func @mcuPrintFloat(%ptr : memref<?xf32>)
diff --git a/mlir/test/mlir-cuda-runner/gpu-to-cubin.mlir b/mlir/test/mlir-cuda-runner/gpu-to-cubin.mlir
index 871327ff819..98e0832d5ba 100644
--- a/mlir/test/mlir-cuda-runner/gpu-to-cubin.mlir
+++ b/mlir/test/mlir-cuda-runner/gpu-to-cubin.mlir
@@ -15,10 +15,9 @@ func @other_func(%arg0 : f32, %arg1 : memref<?xf32>) {
// CHECK: [1.000000e+00, 1.000000e+00, 1.000000e+00, 1.000000e+00, 1.000000e+00]
func @main() {
%arg0 = alloc() : memref<5xf32>
- %20 = constant 0 : i32
%21 = constant 5 : i32
%22 = memref_cast %arg0 : memref<5xf32> to memref<?xf32>
- call @mcuMemHostRegister(%22, %20) : (memref<?xf32>, i32) -> ()
+ call @mcuMemHostRegisterMemRef1dFloat(%22) : (memref<?xf32>) -> ()
call @mcuPrintFloat(%22) : (memref<?xf32>) -> ()
%24 = constant 1.0 : f32
call @other_func(%24, %22) : (f32, memref<?xf32>) -> ()
@@ -26,5 +25,5 @@ func @main() {
return
}
-func @mcuMemHostRegister(%ptr : memref<?xf32>, %flags : i32)
+func @mcuMemHostRegisterMemRef1dFloat(%ptr : memref<?xf32>)
func @mcuPrintFloat(%ptr : memref<?xf32>)
diff --git a/mlir/tools/mlir-cuda-runner/cuda-runtime-wrappers.cpp b/mlir/tools/mlir-cuda-runner/cuda-runtime-wrappers.cpp
index 31b6f6f6209..ac772589282 100644
--- a/mlir/tools/mlir-cuda-runner/cuda-runtime-wrappers.cpp
+++ b/mlir/tools/mlir-cuda-runner/cuda-runtime-wrappers.cpp
@@ -21,8 +21,8 @@
//
//===----------------------------------------------------------------------===//
-#include <assert.h>
-#include <memory.h>
+#include <cassert>
+#include <numeric>
#include "llvm/Support/raw_ostream.h"
@@ -80,6 +80,13 @@ extern "C" int32_t mcuStreamSynchronize(void *stream) {
/// Helper functions for writing mlir example code
+// Allows to register byte array with the CUDA runtime. Helpful until we have
+// transfer functions implemented.
+extern "C" void mcuMemHostRegister(void *ptr, uint64_t sizeBytes) {
+ reportErrorIfAny(cuMemHostRegister(ptr, sizeBytes, /*flags=*/0),
+ "MemHostRegister");
+}
+
// A struct that corresponds to how MLIR represents memrefs.
template <typename T, int N> struct MemRefType {
T *basePtr;
@@ -89,23 +96,22 @@ template <typename T, int N> struct MemRefType {
int64_t strides[N];
};
-// Allows to register a pointer with the CUDA runtime. Helpful until
-// we have transfer functions implemented.
-extern "C" void mcuMemHostRegister(const MemRefType<float, 1> *arg,
- int32_t flags) {
- reportErrorIfAny(
- cuMemHostRegister(arg->data, arg->sizes[0] * sizeof(float), flags),
- "MemHostRegister");
- for (int pos = 0; pos < arg->sizes[0]; pos++) {
- arg->data[pos] = 1.23f;
- }
+// Allows to register a MemRef with the CUDA runtime. Initializes array with
+// value. Helpful until we have transfer functions implemented.
+template <typename T, int N>
+void mcuMemHostRegisterMemRef(const MemRefType<T, N> *arg, T value) {
+ auto count = std::accumulate(arg->sizes, arg->sizes + N, 1,
+ std::multiplies<int64_t>());
+ std::fill_n(arg->data, count, value);
+ mcuMemHostRegister(arg->data, count * sizeof(T));
}
-
-// Allows to register a pointer with the CUDA runtime. Helpful until
-// we have transfer functions implemented.
-extern "C" void mcuMemHostRegisterPtr(void *ptr, int32_t flags) {
- reportErrorIfAny(cuMemHostRegister(ptr, sizeof(void *), flags),
- "MemHostRegister");
+extern "C" void
+mcuMemHostRegisterMemRef1dFloat(const MemRefType<float, 1> *arg) {
+ mcuMemHostRegisterMemRef(arg, 1.23f);
+}
+extern "C" void
+mcuMemHostRegisterMemRef3dFloat(const MemRefType<float, 3> *arg) {
+ mcuMemHostRegisterMemRef(arg, 1.23f);
}
/// Prints the given float array to stderr.
OpenPOWER on IntegriCloud