summary | refs | log | tree | commit | diff | stats
path: root/parallel-libs/streamexecutor/lib/platforms
diff options
context:
space:
mode:
Diffstat (limited to 'parallel-libs/streamexecutor/lib/platforms')
-rw-r--r-- parallel-libs/streamexecutor/lib/platforms/CMakeLists.txt | 3
-rw-r--r-- parallel-libs/streamexecutor/lib/platforms/cuda/CMakeLists.txt | 5
-rw-r--r-- parallel-libs/streamexecutor/lib/platforms/cuda/CUDAPlatform.cpp | 65
-rw-r--r-- parallel-libs/streamexecutor/lib/platforms/cuda/CUDAPlatformDevice.cpp | 307
4 files changed, 0 insertions, 380 deletions
diff --git a/parallel-libs/streamexecutor/lib/platforms/CMakeLists.txt b/parallel-libs/streamexecutor/lib/platforms/CMakeLists.txt
deleted file mode 100644
index 0802c059add..00000000000
--- a/parallel-libs/streamexecutor/lib/platforms/CMakeLists.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-if(STREAM_EXECUTOR_ENABLE_CUDA_PLATFORM)
- add_subdirectory(cuda)
-endif()
diff --git a/parallel-libs/streamexecutor/lib/platforms/cuda/CMakeLists.txt b/parallel-libs/streamexecutor/lib/platforms/cuda/CMakeLists.txt
deleted file mode 100644
index 5be76d1c75a..00000000000
--- a/parallel-libs/streamexecutor/lib/platforms/cuda/CMakeLists.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-add_library(
- streamexecutor_cuda_platform
- OBJECT
- CUDAPlatform.cpp
- CUDAPlatformDevice.cpp)
diff --git a/parallel-libs/streamexecutor/lib/platforms/cuda/CUDAPlatform.cpp b/parallel-libs/streamexecutor/lib/platforms/cuda/CUDAPlatform.cpp
deleted file mode 100644
index 9f9e4388647..00000000000
--- a/parallel-libs/streamexecutor/lib/platforms/cuda/CUDAPlatform.cpp
+++ /dev/null
@@ -1,65 +0,0 @@
-//===-- CUDAPlatform.cpp - CUDA platform implementation -------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// Implementation of CUDA platform internals.
-///
-//===----------------------------------------------------------------------===//
-
-#include "streamexecutor/platforms/cuda/CUDAPlatform.h"
-#include "streamexecutor/Device.h"
-#include "streamexecutor/Platform.h"
-#include "streamexecutor/platforms/cuda/CUDAPlatformDevice.h"
-
-#include "llvm/Support/Mutex.h"
-
-#include "cuda.h"
-
-#include <map>
-
-namespace streamexecutor {
-namespace cuda {
-
-static CUresult ensureCUDAInitialized() {
- static CUresult InitResult = []() { return cuInit(0); }();
- return InitResult;
-}
-
-size_t CUDAPlatform::getDeviceCount() const {
- if (ensureCUDAInitialized())
- // TODO(jhen): Log an error.
- return 0;
-
- int DeviceCount = 0;
- CUresult Result = cuDeviceGetCount(&DeviceCount);
- (void)Result;
- // TODO(jhen): Log an error.
-
- return DeviceCount;
-}
-
-Expected<Device> CUDAPlatform::getDevice(size_t DeviceIndex) {
- if (CUresult InitResult = ensureCUDAInitialized())
- return CUresultToError(InitResult, "cached cuInit return value");
-
- llvm::sys::ScopedLock Lock(Mutex);
- auto Iterator = PlatformDevices.find(DeviceIndex);
- if (Iterator == PlatformDevices.end()) {
- if (auto MaybePDevice = CUDAPlatformDevice::create(DeviceIndex)) {
- Iterator =
- PlatformDevices.emplace(DeviceIndex, std::move(*MaybePDevice)).first;
- } else {
- return MaybePDevice.takeError();
- }
- }
- return Device(&Iterator->second);
-}
-
-} // namespace cuda
-} // namespace streamexecutor
diff --git a/parallel-libs/streamexecutor/lib/platforms/cuda/CUDAPlatformDevice.cpp b/parallel-libs/streamexecutor/lib/platforms/cuda/CUDAPlatformDevice.cpp
deleted file mode 100644
index 5284a9a0a35..00000000000
--- a/parallel-libs/streamexecutor/lib/platforms/cuda/CUDAPlatformDevice.cpp
+++ /dev/null
@@ -1,307 +0,0 @@
-//===-- CUDAPlatformDevice.cpp - CUDAPlatformDevice implementation --------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// Implementation of CUDAPlatformDevice.
-///
-//===----------------------------------------------------------------------===//
-
-#include "streamexecutor/platforms/cuda/CUDAPlatformDevice.h"
-#include "streamexecutor/PlatformDevice.h"
-
-#include "cuda.h"
-
-namespace streamexecutor {
-namespace cuda {
-
-static void *offset(const void *Base, size_t Offset) {
- return const_cast<char *>(static_cast<const char *>(Base) + Offset);
-}
-
-Error CUresultToError(int CUResult, const llvm::Twine &Message) {
- CUresult Result = static_cast<CUresult>(CUResult);
- if (Result) {
- const char *ErrorName;
- if (cuGetErrorName(Result, &ErrorName))
- ErrorName = "UNKNOWN ERROR NAME";
- const char *ErrorString;
- if (cuGetErrorString(Result, &ErrorString))
- ErrorString = "UNKNOWN ERROR DESCRIPTION";
- return make_error("CUDA driver error: '" + Message + "', error code = " +
- llvm::Twine(static_cast<int>(Result)) + ", name = " +
- ErrorName + ", description = '" + ErrorString + "'");
- } else
- return Error::success();
-}
-
-std::string CUDAPlatformDevice::getName() const {
- static std::string CachedName = [](int DeviceIndex) {
- static constexpr size_t MAX_DRIVER_NAME_BYTES = 1024;
- std::string Name = "CUDA device " + std::to_string(DeviceIndex);
- char NameFromDriver[MAX_DRIVER_NAME_BYTES];
- if (!cuDeviceGetName(NameFromDriver, MAX_DRIVER_NAME_BYTES - 1,
- DeviceIndex)) {
- NameFromDriver[MAX_DRIVER_NAME_BYTES - 1] = '\0';
- Name.append(": ").append(NameFromDriver);
- }
- return Name;
- }(DeviceIndex);
- return CachedName;
-}
-
-Expected<CUDAPlatformDevice> CUDAPlatformDevice::create(size_t DeviceIndex) {
- CUdevice DeviceHandle;
- if (CUresult Result = cuDeviceGet(&DeviceHandle, DeviceIndex))
- return CUresultToError(Result, "cuDeviceGet");
-
- CUcontext ContextHandle;
- if (CUresult Result = cuDevicePrimaryCtxRetain(&ContextHandle, DeviceHandle))
- return CUresultToError(Result, "cuDevicePrimaryCtxRetain");
-
- if (CUresult Result = cuCtxSetCurrent(ContextHandle))
- return CUresultToError(Result, "cuCtxSetCurrent");
-
- return CUDAPlatformDevice(DeviceIndex);
-}
-
-CUDAPlatformDevice::CUDAPlatformDevice(CUDAPlatformDevice &&Other) noexcept
- : DeviceIndex(Other.DeviceIndex) {
- Other.DeviceIndex = -1;
-}
-
-CUDAPlatformDevice &CUDAPlatformDevice::
-operator=(CUDAPlatformDevice &&Other) noexcept {
- DeviceIndex = Other.DeviceIndex;
- Other.DeviceIndex = -1;
- return *this;
-}
-
-CUDAPlatformDevice::~CUDAPlatformDevice() {
- CUresult Result = cuDevicePrimaryCtxRelease(DeviceIndex);
- (void)Result;
- // TODO(jhen): Log error.
-}
-
-Expected<const void *>
-CUDAPlatformDevice::createKernel(const MultiKernelLoaderSpec &Spec) {
- // TODO(jhen): Maybe first check loaded modules?
- if (!Spec.hasCUDAPTXInMemory())
- return make_error("no CUDA code available to create kernel");
-
- CUdevice Device = static_cast<int>(DeviceIndex);
- int ComputeCapabilityMajor = 0;
- int ComputeCapabilityMinor = 0;
- if (CUresult Result = cuDeviceGetAttribute(
- &ComputeCapabilityMajor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
- Device))
- return CUresultToError(
- Result,
- "cuDeviceGetAttribute CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR");
- if (CUresult Result = cuDeviceGetAttribute(
- &ComputeCapabilityMinor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
- Device))
- return CUresultToError(
- Result,
- "cuDeviceGetAttribute CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR");
- const char *Code = Spec.getCUDAPTXInMemory().getCode(ComputeCapabilityMajor,
- ComputeCapabilityMinor);
-
- if (!Code)
- return make_error("no suitable CUDA source found for compute capability " +
- llvm::Twine(ComputeCapabilityMajor) + "." +
- llvm::Twine(ComputeCapabilityMinor));
-
- CUmodule Module;
- if (CUresult Result = cuModuleLoadData(&Module, Code))
- return CUresultToError(Result, "cuModuleLoadData");
-
- CUfunction Function;
- if (CUresult Result =
- cuModuleGetFunction(&Function, Module, Spec.getKernelName().c_str()))
- return CUresultToError(Result, "cuModuleGetFunction");
-
- // TODO(jhen): Should I save this function pointer in case someone asks for
- // it again?
-
- // TODO(jhen): Should I save the module pointer so I can unload it when I
- // destroy this device?
-
- return static_cast<const void *>(Function);
-}
-
-Error CUDAPlatformDevice::destroyKernel(const void *Handle) {
- // TODO(jhen): Maybe keep track of kernels for each module and unload the
- // module after they are all destroyed.
- return Error::success();
-}
-
-Expected<const void *> CUDAPlatformDevice::createStream() {
- CUstream Stream;
- if (CUresult Result = cuStreamCreate(&Stream, CU_STREAM_DEFAULT))
- return CUresultToError(Result, "cuStreamCreate");
- return Stream;
-}
-
-Error CUDAPlatformDevice::destroyStream(const void *Handle) {
- return CUresultToError(
- cuStreamDestroy(static_cast<CUstream>(const_cast<void *>(Handle))),
- "cuStreamDestroy");
-}
-
-Error CUDAPlatformDevice::launch(
- const void *PlatformStreamHandle, BlockDimensions BlockSize,
- GridDimensions GridSize, const void *PKernelHandle,
- const PackedKernelArgumentArrayBase &ArgumentArray) {
- CUfunction Function =
- reinterpret_cast<CUfunction>(const_cast<void *>(PKernelHandle));
- CUstream Stream =
- reinterpret_cast<CUstream>(const_cast<void *>(PlatformStreamHandle));
-
- auto Launch = [Function, Stream, BlockSize,
- GridSize](size_t SharedMemoryBytes, void **ArgumentAddresses) {
- return CUresultToError(
- cuLaunchKernel(Function, //
- GridSize.X, GridSize.Y, GridSize.Z, //
- BlockSize.X, BlockSize.Y, BlockSize.Z, //
- SharedMemoryBytes, Stream, ArgumentAddresses, nullptr),
- "cuLaunchKernel");
- };
-
- void **ArgumentAddresses = const_cast<void **>(ArgumentArray.getAddresses());
- size_t SharedArgumentCount = ArgumentArray.getSharedCount();
- if (SharedArgumentCount) {
- // The argument handling in this case is not very efficient. We may need to
- // come back and optimize it later.
- //
- // Perhaps introduce another branch for the case where there is exactly one
- // shared memory argument and it is the first one. This is the only case
- // that will be used for compiler-generated CUDA kernels, and OpenCL users
- // can choose to take advantage of it by combining their dynamic shared
- // memory arguments and putting them first in the kernel signature.
- unsigned SharedMemoryBytes = 0;
- size_t ArgumentCount = ArgumentArray.getArgumentCount();
- llvm::SmallVector<void *, 16> NonSharedArgumentAddresses(
- ArgumentCount - SharedArgumentCount);
- size_t NonSharedIndex = 0;
- for (size_t I = 0; I < ArgumentCount; ++I)
- if (ArgumentArray.getType(I) == KernelArgumentType::SHARED_DEVICE_MEMORY)
- SharedMemoryBytes += ArgumentArray.getSize(I);
- else
- NonSharedArgumentAddresses[NonSharedIndex++] = ArgumentAddresses[I];
- return Launch(SharedMemoryBytes, NonSharedArgumentAddresses.data());
- }
- return Launch(0, ArgumentAddresses);
-}
-
-Error CUDAPlatformDevice::copyD2H(const void *PlatformStreamHandle,
- const void *DeviceSrcHandle,
- size_t SrcByteOffset, void *HostDst,
- size_t DstByteOffset, size_t ByteCount) {
- return CUresultToError(
- cuMemcpyDtoHAsync(
- offset(HostDst, DstByteOffset),
- reinterpret_cast<CUdeviceptr>(offset(DeviceSrcHandle, SrcByteOffset)),
- ByteCount,
- static_cast<CUstream>(const_cast<void *>(PlatformStreamHandle))),
- "cuMemcpyDtoHAsync");
-}
-
-Error CUDAPlatformDevice::copyH2D(const void *PlatformStreamHandle,
- const void *HostSrc, size_t SrcByteOffset,
- const void *DeviceDstHandle,
- size_t DstByteOffset, size_t ByteCount) {
- return CUresultToError(
- cuMemcpyHtoDAsync(
- reinterpret_cast<CUdeviceptr>(offset(DeviceDstHandle, DstByteOffset)),
- offset(HostSrc, SrcByteOffset), ByteCount,
- static_cast<CUstream>(const_cast<void *>(PlatformStreamHandle))),
- "cuMemcpyHtoDAsync");
-}
-
-Error CUDAPlatformDevice::copyD2D(const void *PlatformStreamHandle,
- const void *DeviceSrcHandle,
- size_t SrcByteOffset,
- const void *DeviceDstHandle,
- size_t DstByteOffset, size_t ByteCount) {
- return CUresultToError(
- cuMemcpyDtoDAsync(
- reinterpret_cast<CUdeviceptr>(offset(DeviceDstHandle, DstByteOffset)),
- reinterpret_cast<CUdeviceptr>(offset(DeviceSrcHandle, SrcByteOffset)),
- ByteCount,
- static_cast<CUstream>(const_cast<void *>(PlatformStreamHandle))),
- "cuMemcpyDtoDAsync");
-}
-
-Error CUDAPlatformDevice::blockHostUntilDone(const void *PlatformStreamHandle) {
- return CUresultToError(cuStreamSynchronize(static_cast<CUstream>(
- const_cast<void *>(PlatformStreamHandle))),
- "cuStreamSynchronize");
-}
-
-Expected<void *> CUDAPlatformDevice::allocateDeviceMemory(size_t ByteCount) {
- CUdeviceptr Pointer;
- if (CUresult Result = cuMemAlloc(&Pointer, ByteCount))
- return CUresultToError(Result, "cuMemAlloc");
- return reinterpret_cast<void *>(Pointer);
-}
-
-Error CUDAPlatformDevice::freeDeviceMemory(const void *Handle) {
- return CUresultToError(cuMemFree(reinterpret_cast<CUdeviceptr>(Handle)),
- "cuMemFree");
-}
-
-Error CUDAPlatformDevice::registerHostMemory(void *Memory, size_t ByteCount) {
- return CUresultToError(cuMemHostRegister(Memory, ByteCount, 0u),
- "cuMemHostRegister");
-}
-
-Error CUDAPlatformDevice::unregisterHostMemory(const void *Memory) {
- return CUresultToError(cuMemHostUnregister(const_cast<void *>(Memory)),
- "cuMemHostUnregister");
-}
-
-Error CUDAPlatformDevice::synchronousCopyD2H(const void *DeviceSrcHandle,
- size_t SrcByteOffset,
- void *HostDst,
- size_t DstByteOffset,
- size_t ByteCount) {
- return CUresultToError(cuMemcpyDtoH(offset(HostDst, DstByteOffset),
- reinterpret_cast<CUdeviceptr>(offset(
- DeviceSrcHandle, SrcByteOffset)),
- ByteCount),
- "cuMemcpyDtoH");
-}
-
-Error CUDAPlatformDevice::synchronousCopyH2D(const void *HostSrc,
- size_t SrcByteOffset,
- const void *DeviceDstHandle,
- size_t DstByteOffset,
- size_t ByteCount) {
- return CUresultToError(
- cuMemcpyHtoD(
- reinterpret_cast<CUdeviceptr>(offset(DeviceDstHandle, DstByteOffset)),
- offset(HostSrc, SrcByteOffset), ByteCount),
- "cuMemcpyHtoD");
-}
-
-Error CUDAPlatformDevice::synchronousCopyD2D(const void *DeviceDstHandle,
- size_t DstByteOffset,
- const void *DeviceSrcHandle,
- size_t SrcByteOffset,
- size_t ByteCount) {
- return CUresultToError(
- cuMemcpyDtoD(
- reinterpret_cast<CUdeviceptr>(offset(DeviceDstHandle, DstByteOffset)),
- reinterpret_cast<CUdeviceptr>(offset(DeviceSrcHandle, SrcByteOffset)),
- ByteCount),
- "cuMemcpyDtoD");
-}
-
-} // namespace cuda
-} // namespace streamexecutor
OpenPOWER on IntegriCloud