|        |        |        |
|--------|--------|--------|
| author | Christian Sigg <csigg@google.com> | 2019-11-19 13:12:19 -0800 |
| committer | A. Unique TensorFlower <gardener@tensorflow.org> | 2019-11-19 13:13:02 -0800 |
| commit | f868adafee91a8c3ebee1e052d5fdfff7be0afd0 | |
| tree | 34ef3b730148fa01a2979dac18c0dbead9ad9a86 | |
| parent | ee95f6f2594e9089990024208d01634fd81d2da2 | |
Make type and rank explicit in mcuMemHostRegister function.
Fix registered size of indirect MemRefType kernel arguments.
PiperOrigin-RevId: 281362940
5 files changed, 40 insertions, 35 deletions
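The size fix concerns indirect (struct-typed) MemRefType kernel arguments: the conversion pass used to register them through mcuMemHostRegisterPtr, which pins only sizeof(void *) bytes, and it now derives the full descriptor size in the IR (LLVM::NullOp, a GEP to element one, LLVM::PtrToIntOp) and passes it to mcuMemHostRegister. As a rough standalone illustration of the byte counts involved, not MLIR code: MemRefDescriptor below is a hypothetical stand-in that mirrors the MemRefType struct from the runtime wrappers.

```cpp
// Standalone sketch only: shows the byte count the pass now registers for a
// struct-typed kernel argument versus the pointer-sized registration it did
// before. MemRefDescriptor is a hypothetical stand-in for the MemRefType
// struct in cuda-runtime-wrappers.cpp; the null-GEP/ptrtoint sequence emitted
// by the pass computes the same value that sizeof yields here.
#include <cstdint>
#include <cstdio>

template <typename T, int N> struct MemRefDescriptor {
  T *basePtr;
  T *data;
  int64_t offset;
  int64_t sizes[N];
  int64_t strides[N];
};

int main() {
  std::printf("rank-3 float descriptor: %zu bytes, previously registered: %zu bytes\n",
              sizeof(MemRefDescriptor<float, 3>), sizeof(void *));
  return 0;
}
```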
```diff
diff --git a/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp b/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp
index d9332428425..9d8c8942051 100644
--- a/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp
+++ b/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp
@@ -49,7 +49,7 @@ static constexpr const char *cuModuleGetFunctionName = "mcuModuleGetFunction";
 static constexpr const char *cuLaunchKernelName = "mcuLaunchKernel";
 static constexpr const char *cuGetStreamHelperName = "mcuGetStreamHelper";
 static constexpr const char *cuStreamSynchronizeName = "mcuStreamSynchronize";
-static constexpr const char *kMcuMemHostRegisterPtr = "mcuMemHostRegisterPtr";
+static constexpr const char *kMcuMemHostRegister = "mcuMemHostRegister";
 
 static constexpr const char *kCubinAnnotation = "nvvm.cubin";
 static constexpr const char *kCubinStorageSuffix = "_cubin_cst";
@@ -228,13 +228,13 @@ void GpuLaunchFuncToCudaCallsPass::declareCudaFunctions(Location loc) {
                                       getPointerType() /* CUstream stream */,
                                       /*isVarArg=*/false));
   }
-  if (!module.lookupSymbol(kMcuMemHostRegisterPtr)) {
+  if (!module.lookupSymbol(kMcuMemHostRegister)) {
     builder.create<LLVM::LLVMFuncOp>(
-        loc, kMcuMemHostRegisterPtr,
+        loc, kMcuMemHostRegister,
         LLVM::LLVMType::getFunctionTy(getVoidType(),
                                       {
                                           getPointerType(), /* void *ptr */
-                                          getInt32Type() /* int32 flags*/
+                                          getInt64Type() /* int64 sizeBytes*/
                                       },
                                       /*isVarArg=*/false));
   }
@@ -277,12 +277,14 @@ GpuLaunchFuncToCudaCallsPass::setupParamsArray(gpu::LaunchFuncOp launchOp,
     // the descriptor pointer is registered via @mcuMemHostRegisterPtr
     if (llvmType.isStructTy()) {
       auto registerFunc =
-          getModule().lookupSymbol<LLVM::LLVMFuncOp>(kMcuMemHostRegisterPtr);
-      auto zero = builder.create<LLVM::ConstantOp>(
-          loc, getInt32Type(), builder.getI32IntegerAttr(0));
+          getModule().lookupSymbol<LLVM::LLVMFuncOp>(kMcuMemHostRegister);
+      auto nullPtr = builder.create<LLVM::NullOp>(loc, llvmType.getPointerTo());
+      auto gep = builder.create<LLVM::GEPOp>(loc, llvmType.getPointerTo(),
+                                             ArrayRef<Value *>{nullPtr, one});
+      auto size = builder.create<LLVM::PtrToIntOp>(loc, getInt64Type(), gep);
       builder.create<LLVM::CallOp>(loc, ArrayRef<Type>{},
                                    builder.getSymbolRefAttr(registerFunc),
-                                   ArrayRef<Value *>{casted, zero});
+                                   ArrayRef<Value *>{casted, size});
       Value *memLocation = builder.create<LLVM::AllocaOp>(
           loc, getPointerPointerType(), one, /*alignment=*/1);
       builder.create<LLVM::StoreOp>(loc, casted, memLocation);
diff --git a/mlir/test/mlir-cuda-runner/all-reduce-op.mlir b/mlir/test/mlir-cuda-runner/all-reduce-op.mlir
index 6356db0791a..c863623a360 100644
--- a/mlir/test/mlir-cuda-runner/all-reduce-op.mlir
+++ b/mlir/test/mlir-cuda-runner/all-reduce-op.mlir
@@ -4,12 +4,11 @@
 func @main() {
   %arg = alloc() : memref<13x4x2xf32>
   %dst = memref_cast %arg : memref<13x4x2xf32> to memref<?x?x?xf32>
-  %zero = constant 0 : i32
   %one = constant 1 : index
   %sx = dim %dst, 0 : memref<?x?x?xf32>
   %sy = dim %dst, 1 : memref<?x?x?xf32>
   %sz = dim %dst, 2 : memref<?x?x?xf32>
-  call @mcuMemHostRegister(%dst, %zero) : (memref<?x?x?xf32>, i32) -> ()
+  call @mcuMemHostRegisterMemRef3dFloat(%dst) : (memref<?x?x?xf32>) -> ()
   gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %one, %grid_y = %one, %grid_z = %one)
              threads(%tx, %ty, %tz) in (%block_x = %sx, %block_y = %sy, %block_z = %sz)
              args(%kernel_dst = %dst) : memref<?x?x?xf32> {
@@ -27,5 +26,5 @@ func @main() {
   return
 }
 
-func @mcuMemHostRegister(%ptr : memref<?x?x?xf32>, %flags : i32)
+func @mcuMemHostRegisterMemRef3dFloat(%ptr : memref<?x?x?xf32>)
 func @mcuPrintFloat(%ptr : memref<?x?x?xf32>)
diff --git a/mlir/test/mlir-cuda-runner/all-reduce-region.mlir b/mlir/test/mlir-cuda-runner/all-reduce-region.mlir
index 1a3a4595d13..5cb803cfb7d 100644
--- a/mlir/test/mlir-cuda-runner/all-reduce-region.mlir
+++ b/mlir/test/mlir-cuda-runner/all-reduce-region.mlir
@@ -4,10 +4,9 @@
 func @main() {
   %arg = alloc() : memref<35xf32>
   %dst = memref_cast %arg : memref<35xf32> to memref<?xf32>
-  %zero = constant 0 : i32
   %one = constant 1 : index
   %sx = dim %dst, 0 : memref<?xf32>
-  call @mcuMemHostRegister(%dst, %zero) : (memref<?xf32>, i32) -> ()
+  call @mcuMemHostRegisterMemRef1dFloat(%dst) : (memref<?xf32>) -> ()
   gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %one, %grid_y = %one, %grid_z = %one)
              threads(%tx, %ty, %tz) in (%block_x = %sx, %block_y = %one, %block_z = %one)
              args(%kernel_dst = %dst) : memref<?xf32> {
@@ -25,5 +24,5 @@
   return
 }
 
-func @mcuMemHostRegister(%ptr : memref<?xf32>, %flags : i32)
+func @mcuMemHostRegisterMemRef1dFloat(%ptr : memref<?xf32>)
 func @mcuPrintFloat(%ptr : memref<?xf32>)
diff --git a/mlir/test/mlir-cuda-runner/gpu-to-cubin.mlir b/mlir/test/mlir-cuda-runner/gpu-to-cubin.mlir
index 871327ff819..98e0832d5ba 100644
--- a/mlir/test/mlir-cuda-runner/gpu-to-cubin.mlir
+++ b/mlir/test/mlir-cuda-runner/gpu-to-cubin.mlir
@@ -15,10 +15,9 @@ func @other_func(%arg0 : f32, %arg1 : memref<?xf32>) {
 // CHECK: [1.000000e+00, 1.000000e+00, 1.000000e+00, 1.000000e+00, 1.000000e+00]
 func @main() {
   %arg0 = alloc() : memref<5xf32>
-  %20 = constant 0 : i32
   %21 = constant 5 : i32
   %22 = memref_cast %arg0 : memref<5xf32> to memref<?xf32>
-  call @mcuMemHostRegister(%22, %20) : (memref<?xf32>, i32) -> ()
+  call @mcuMemHostRegisterMemRef1dFloat(%22) : (memref<?xf32>) -> ()
   call @mcuPrintFloat(%22) : (memref<?xf32>) -> ()
   %24 = constant 1.0 : f32
   call @other_func(%24, %22) : (f32, memref<?xf32>) -> ()
@@ -26,5 +25,5 @@
   return
 }
 
-func @mcuMemHostRegister(%ptr : memref<?xf32>, %flags : i32)
+func @mcuMemHostRegisterMemRef1dFloat(%ptr : memref<?xf32>)
 func @mcuPrintFloat(%ptr : memref<?xf32>)
diff --git a/mlir/tools/mlir-cuda-runner/cuda-runtime-wrappers.cpp b/mlir/tools/mlir-cuda-runner/cuda-runtime-wrappers.cpp
index 31b6f6f6209..ac772589282 100644
--- a/mlir/tools/mlir-cuda-runner/cuda-runtime-wrappers.cpp
+++ b/mlir/tools/mlir-cuda-runner/cuda-runtime-wrappers.cpp
@@ -21,8 +21,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include <assert.h>
-#include <memory.h>
+#include <cassert>
+#include <numeric>
 
 #include "llvm/Support/raw_ostream.h"
 
@@ -80,6 +80,13 @@ extern "C" int32_t mcuStreamSynchronize(void *stream) {
 
 /// Helper functions for writing mlir example code
 
+// Allows to register byte array with the CUDA runtime. Helpful until we have
+// transfer functions implemented.
+extern "C" void mcuMemHostRegister(void *ptr, uint64_t sizeBytes) {
+  reportErrorIfAny(cuMemHostRegister(ptr, sizeBytes, /*flags=*/0),
+                   "MemHostRegister");
+}
+
 // A struct that corresponds to how MLIR represents memrefs.
 template <typename T, int N> struct MemRefType {
   T *basePtr;
@@ -89,23 +96,22 @@ template <typename T, int N> struct MemRefType {
   int64_t strides[N];
 };
 
-// Allows to register a pointer with the CUDA runtime. Helpful until
-// we have transfer functions implemented.
-extern "C" void mcuMemHostRegister(const MemRefType<float, 1> *arg,
-                                   int32_t flags) {
-  reportErrorIfAny(
-      cuMemHostRegister(arg->data, arg->sizes[0] * sizeof(float), flags),
-      "MemHostRegister");
-  for (int pos = 0; pos < arg->sizes[0]; pos++) {
-    arg->data[pos] = 1.23f;
-  }
+// Allows to register a MemRef with the CUDA runtime. Initializes array with
+// value. Helpful until we have transfer functions implemented.
+template <typename T, int N>
+void mcuMemHostRegisterMemRef(const MemRefType<T, N> *arg, T value) {
+  auto count = std::accumulate(arg->sizes, arg->sizes + N, 1,
+                               std::multiplies<int64_t>());
+  std::fill_n(arg->data, count, value);
+  mcuMemHostRegister(arg->data, count * sizeof(T));
 }
-
-// Allows to register a pointer with the CUDA runtime. Helpful until
-// we have transfer functions implemented.
-extern "C" void mcuMemHostRegisterPtr(void *ptr, int32_t flags) {
-  reportErrorIfAny(cuMemHostRegister(ptr, sizeof(void *), flags),
-                   "MemHostRegister");
+extern "C" void
+mcuMemHostRegisterMemRef1dFloat(const MemRefType<float, 1> *arg) {
+  mcuMemHostRegisterMemRef(arg, 1.23f);
+}
+extern "C" void
+mcuMemHostRegisterMemRef3dFloat(const MemRefType<float, 3> *arg) {
+  mcuMemHostRegisterMemRef(arg, 1.23f);
 }
 
 /// Prints the given float array to stderr.
```

