summary | refs | log | tree | commit | diff | stats
path: root/parallel-libs/streamexecutor/lib/platforms
diff options
context:
space:
mode:
Diffstat (limited to 'parallel-libs/streamexecutor/lib/platforms')
-rw-r--r-- parallel-libs/streamexecutor/lib/platforms/CMakeLists.txt | 3
-rw-r--r-- parallel-libs/streamexecutor/lib/platforms/cuda/CMakeLists.txt | 5
-rw-r--r-- parallel-libs/streamexecutor/lib/platforms/cuda/CUDAPlatform.cpp | 65
-rw-r--r-- parallel-libs/streamexecutor/lib/platforms/cuda/CUDAPlatformDevice.cpp | 307
4 files changed, 0 insertions, 380 deletions
diff --git a/parallel-libs/streamexecutor/lib/platforms/CMakeLists.txt b/parallel-libs/streamexecutor/lib/platforms/CMakeLists.txt
deleted file mode 100644
index 0802c059add..00000000000
--- a/parallel-libs/streamexecutor/lib/platforms/CMakeLists.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-if(STREAM_EXECUTOR_ENABLE_CUDA_PLATFORM)
- add_subdirectory(cuda)
-endif()
diff --git a/parallel-libs/streamexecutor/lib/platforms/cuda/CMakeLists.txt b/parallel-libs/streamexecutor/lib/platforms/cuda/CMakeLists.txt
deleted file mode 100644
index 5be76d1c75a..00000000000
--- a/parallel-libs/streamexecutor/lib/platforms/cuda/CMakeLists.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-add_library(
- streamexecutor_cuda_platform
- OBJECT
- CUDAPlatform.cpp
- CUDAPlatformDevice.cpp)
diff --git a/parallel-libs/streamexecutor/lib/platforms/cuda/CUDAPlatform.cpp b/parallel-libs/streamexecutor/lib/platforms/cuda/CUDAPlatform.cpp
deleted file mode 100644
index 9f9e4388647..00000000000
--- a/parallel-libs/streamexecutor/lib/platforms/cuda/CUDAPlatform.cpp
+++ /dev/null
@@ -1,65 +0,0 @@
-//===-- CUDAPlatform.cpp - CUDA platform implementation -------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// Implementation of CUDA platform internals.
-///
-//===----------------------------------------------------------------------===//
-
-#include "streamexecutor/platforms/cuda/CUDAPlatform.h"
-#include "streamexecutor/Device.h"
-#include "streamexecutor/Platform.h"
-#include "streamexecutor/platforms/cuda/CUDAPlatformDevice.h"
-
-#include "llvm/Support/Mutex.h"
-
-#include "cuda.h"
-
-#include <map>
-
-namespace streamexecutor {
-namespace cuda {
-
-static CUresult ensureCUDAInitialized() {
- static CUresult InitResult = []() { return cuInit(0); }();
- return InitResult;
-}
-
-size_t CUDAPlatform::getDeviceCount() const {
- if (ensureCUDAInitialized())
- // TODO(jhen): Log an error.
- return 0;
-
- int DeviceCount = 0;
- CUresult Result = cuDeviceGetCount(&DeviceCount);
- (void)Result;
- // TODO(jhen): Log an error.
-
- return DeviceCount;
-}
-
-Expected<Device> CUDAPlatform::getDevice(size_t DeviceIndex) {
- if (CUresult InitResult = ensureCUDAInitialized())
- return CUresultToError(InitResult, "cached cuInit return value");
-
- llvm::sys::ScopedLock Lock(Mutex);
- auto Iterator = PlatformDevices.find(DeviceIndex);
- if (Iterator == PlatformDevices.end()) {
- if (auto MaybePDevice = CUDAPlatformDevice::create(DeviceIndex)) {
- Iterator =
- PlatformDevices.emplace(DeviceIndex, std::move(*MaybePDevice)).first;
- } else {
- return MaybePDevice.takeError();
- }
- }
- return Device(&Iterator->second);
-}
-
-} // namespace cuda
-} // namespace streamexecutor
diff --git a/parallel-libs/streamexecutor/lib/platforms/cuda/CUDAPlatformDevice.cpp b/parallel-libs/streamexecutor/lib/platforms/cuda/CUDAPlatformDevice.cpp
deleted file mode 100644
index 5284a9a0a35..00000000000
--- a/parallel-libs/streamexecutor/lib/platforms/cuda/CUDAPlatformDevice.cpp
+++ /dev/null
@@ -1,307 +0,0 @@
-//===-- CUDAPlatformDevice.cpp - CUDAPlatformDevice implementation --------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// Implementation of CUDAPlatformDevice.
-///
-//===----------------------------------------------------------------------===//
-
-#include "streamexecutor/platforms/cuda/CUDAPlatformDevice.h"
-#include "streamexecutor/PlatformDevice.h"
-
-#include "cuda.h"
-
-namespace streamexecutor {
-namespace cuda {
-
-static void *offset(const void *Base, size_t Offset) {
- return const_cast<char *>(static_cast<const char *>(Base) + Offset);
-}
-
-Error CUresultToError(int CUResult, const llvm::Twine &Message) {
- CUresult Result = static_cast<CUresult>(CUResult);
- if (Result) {
- const char *ErrorName;
- if (cuGetErrorName(Result, &ErrorName))
- ErrorName = "UNKNOWN ERROR NAME";
- const char *ErrorString;
- if (cuGetErrorString(Result, &ErrorString))
- ErrorString = "UNKNOWN ERROR DESCRIPTION";
- return make_error("CUDA driver error: '" + Message + "', error code = " +
- llvm::Twine(static_cast<int>(Result)) + ", name = " +
- ErrorName + ", description = '" + ErrorString + "'");
- } else
- return Error::success();
-}
-
-std::string CUDAPlatformDevice::getName() const {
- static std::string CachedName = [](int DeviceIndex) {
- static constexpr size_t MAX_DRIVER_NAME_BYTES = 1024;
- std::string Name = "CUDA device " + std::to_string(DeviceIndex);
- char NameFromDriver[MAX_DRIVER_NAME_BYTES];
- if (!cuDeviceGetName(NameFromDriver, MAX_DRIVER_NAME_BYTES - 1,
- DeviceIndex)) {
- NameFromDriver[MAX_DRIVER_NAME_BYTES - 1] = '\0';
- Name.append(": ").append(NameFromDriver);
- }
- return Name;
- }(DeviceIndex);
- return CachedName;
-}
-
-Expected<CUDAPlatformDevice> CUDAPlatformDevice::create(size_t DeviceIndex) {
- CUdevice DeviceHandle;
- if (CUresult Result = cuDeviceGet(&DeviceHandle, DeviceIndex))
- return CUresultToError(Result, "cuDeviceGet");
-
- CUcontext ContextHandle;
- if (CUresult Result = cuDevicePrimaryCtxRetain(&ContextHandle, DeviceHandle))
- return CUresultToError(Result, "cuDevicePrimaryCtxRetain");
-
- if (CUresult Result = cuCtxSetCurrent(ContextHandle))
- return CUresultToError(Result, "cuCtxSetCurrent");
-
- return CUDAPlatformDevice(DeviceIndex);
-}
-
-CUDAPlatformDevice::CUDAPlatformDevice(CUDAPlatformDevice &&Other) noexcept
- : DeviceIndex(Other.DeviceIndex) {
- Other.DeviceIndex = -1;
-}
-
-CUDAPlatformDevice &CUDAPlatformDevice::
-operator=(CUDAPlatformDevice &&Other) noexcept {
- DeviceIndex = Other.DeviceIndex;
- Other.DeviceIndex = -1;
- return *this;
-}
-
-CUDAPlatformDevice::~CUDAPlatformDevice() {
- CUresult Result = cuDevicePrimaryCtxRelease(DeviceIndex);
- (void)Result;
- // TODO(jhen): Log error.
-}
-
-Expected<const void *>
-CUDAPlatformDevice::createKernel(const MultiKernelLoaderSpec &Spec) {
- // TODO(jhen): Maybe first check loaded modules?
- if (!Spec.hasCUDAPTXInMemory())
- return make_error("no CUDA code available to create kernel");
-
- CUdevice Device = static_cast<int>(DeviceIndex);
- int ComputeCapabilityMajor = 0;
- int ComputeCapabilityMinor = 0;
- if (CUresult Result = cuDeviceGetAttribute(
- &ComputeCapabilityMajor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
- Device))
- return CUresultToError(
- Result,
- "cuDeviceGetAttribute CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR");
- if (CUresult Result = cuDeviceGetAttribute(
- &ComputeCapabilityMinor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
- Device))
- return CUresultToError(
- Result,
- "cuDeviceGetAttribute CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR");
- const char *Code = Spec.getCUDAPTXInMemory().getCode(ComputeCapabilityMajor,
- ComputeCapabilityMinor);
-
- if (!Code)
- return make_error("no suitable CUDA source found for compute capability " +
- llvm::Twine(ComputeCapabilityMajor) + "." +
- llvm::Twine(ComputeCapabilityMinor));
-
- CUmodule Module;
- if (CUresult Result = cuModuleLoadData(&Module, Code))
- return CUresultToError(Result, "cuModuleLoadData");
-
- CUfunction Function;
- if (CUresult Result =
- cuModuleGetFunction(&Function, Module, Spec.getKernelName().c_str()))
- return CUresultToError(Result, "cuModuleGetFunction");
-
- // TODO(jhen): Should I save this function pointer in case someone asks for
- // it again?
-
- // TODO(jhen): Should I save the module pointer so I can unload it when I
- // destroy this device?
-
- return static_cast<const void *>(Function);
-}
-
-Error CUDAPlatformDevice::destroyKernel(const void *Handle) {
- // TODO(jhen): Maybe keep track of kernels for each module and unload the
- // module after they are all destroyed.
- return Error::success();
-}
-
-Expected<const void *> CUDAPlatformDevice::createStream() {
- CUstream Stream;
- if (CUresult Result = cuStreamCreate(&Stream, CU_STREAM_DEFAULT))
- return CUresultToError(Result, "cuStreamCreate");
- return Stream;
-}
-
-Error CUDAPlatformDevice::destroyStream(const void *Handle) {
- return CUresultToError(
- cuStreamDestroy(static_cast<CUstream>(const_cast<void *>(Handle))),
- "cuStreamDestroy");
-}
-
-Error CUDAPlatformDevice::launch(
- const void *PlatformStreamHandle, BlockDimensions BlockSize,
- GridDimensions GridSize, const void *PKernelHandle,
- const PackedKernelArgumentArrayBase &ArgumentArray) {
- CUfunction Function =
- reinterpret_cast<CUfunction>(const_cast<void *>(PKernelHandle));
- CUstream Stream =
- reinterpret_cast<CUstream>(const_cast<void *>(PlatformStreamHandle));
-
- auto Launch = [Function, Stream, BlockSize,
- GridSize](size_t SharedMemoryBytes, void **ArgumentAddresses) {
- return CUresultToError(
- cuLaunchKernel(Function, //
- GridSize.X, GridSize.Y, GridSize.Z, //
- BlockSize.X, BlockSize.Y, BlockSize.Z, //
- SharedMemoryBytes, Stream, ArgumentAddresses, nullptr),
- "cuLaunchKernel");
- };
-
- void **ArgumentAddresses = const_cast<void **>(ArgumentArray.getAddresses());
- size_t SharedArgumentCount = ArgumentArray.getSharedCount();
- if (SharedArgumentCount) {
- // The argument handling in this case is not very efficient. We may need to
- // come back and optimize it later.
- //
- // Perhaps introduce another branch for the case where there is exactly one
- // shared memory argument and it is the first one. This is the only case
- // that will be used for compiler-generated CUDA kernels, and OpenCL users
- // can choose to take advantage of it by combining their dynamic shared
- // memory arguments and putting them first in the kernel signature.
- unsigned SharedMemoryBytes = 0;
- size_t ArgumentCount = ArgumentArray.getArgumentCount();
- llvm::SmallVector<void *, 16> NonSharedArgumentAddresses(
- ArgumentCount - SharedArgumentCount);
- size_t NonSharedIndex = 0;
- for (size_t I = 0; I < ArgumentCount; ++I)
- if (ArgumentArray.getType(I) == KernelArgumentType::SHARED_DEVICE_MEMORY)
- SharedMemoryBytes += ArgumentArray.getSize(I);
- else
- NonSharedArgumentAddresses[NonSharedIndex++] = ArgumentAddresses[I];
- return Launch(SharedMemoryBytes, NonSharedArgumentAddresses.data());
- }
- return Launch(0, ArgumentAddresses);
-}
-
-Error CUDAPlatformDevice::copyD2H(const void *PlatformStreamHandle,
- const void *DeviceSrcHandle,
- size_t SrcByteOffset, void *HostDst,
- size_t DstByteOffset, size_t ByteCount) {
- return CUresultToError(
- cuMemcpyDtoHAsync(
- offset(HostDst, DstByteOffset),
- reinterpret_cast<CUdeviceptr>(offset(DeviceSrcHandle, SrcByteOffset)),
- ByteCount,
- static_cast<CUstream>(const_cast<void *>(PlatformStreamHandle))),
- "cuMemcpyDtoHAsync");
-}
-
-Error CUDAPlatformDevice::copyH2D(const void *PlatformStreamHandle,
- const void *HostSrc, size_t SrcByteOffset,
- const void *DeviceDstHandle,
- size_t DstByteOffset, size_t ByteCount) {
- return CUresultToError(
- cuMemcpyHtoDAsync(
- reinterpret_cast<CUdeviceptr>(offset(DeviceDstHandle, DstByteOffset)),
- offset(HostSrc, SrcByteOffset), ByteCount,
- static_cast<CUstream>(const_cast<void *>(PlatformStreamHandle))),
- "cuMemcpyHtoDAsync");
-}
-
-Error CUDAPlatformDevice::copyD2D(const void *PlatformStreamHandle,
- const void *DeviceSrcHandle,
- size_t SrcByteOffset,
- const void *DeviceDstHandle,
- size_t DstByteOffset, size_t ByteCount) {
- return CUresultToError(
- cuMemcpyDtoDAsync(
- reinterpret_cast<CUdeviceptr>(offset(DeviceDstHandle, DstByteOffset)),
- reinterpret_cast<CUdeviceptr>(offset(DeviceSrcHandle, SrcByteOffset)),
- ByteCount,
- static_cast<CUstream>(const_cast<void *>(PlatformStreamHandle))),
- "cuMemcpyDtoDAsync");
-}
-
-Error CUDAPlatformDevice::blockHostUntilDone(const void *PlatformStreamHandle) {
- return CUresultToError(cuStreamSynchronize(static_cast<CUstream>(
- const_cast<void *>(PlatformStreamHandle))),
- "cuStreamSynchronize");
-}
-
-Expected<void *> CUDAPlatformDevice::allocateDeviceMemory(size_t ByteCount) {
- CUdeviceptr Pointer;
- if (CUresult Result = cuMemAlloc(&Pointer, ByteCount))
- return CUresultToError(Result, "cuMemAlloc");
- return reinterpret_cast<void *>(Pointer);
-}
-
-Error CUDAPlatformDevice::freeDeviceMemory(const void *Handle) {
- return CUresultToError(cuMemFree(reinterpret_cast<CUdeviceptr>(Handle)),
- "cuMemFree");
-}
-
-Error CUDAPlatformDevice::registerHostMemory(void *Memory, size_t ByteCount) {
- return CUresultToError(cuMemHostRegister(Memory, ByteCount, 0u),
- "cuMemHostRegister");
-}
-
-Error CUDAPlatformDevice::unregisterHostMemory(const void *Memory) {
- return CUresultToError(cuMemHostUnregister(const_cast<void *>(Memory)),
- "cuMemHostUnregister");
-}
-
-Error CUDAPlatformDevice::synchronousCopyD2H(const void *DeviceSrcHandle,
- size_t SrcByteOffset,
- void *HostDst,
- size_t DstByteOffset,
- size_t ByteCount) {
- return CUresultToError(cuMemcpyDtoH(offset(HostDst, DstByteOffset),
- reinterpret_cast<CUdeviceptr>(offset(
- DeviceSrcHandle, SrcByteOffset)),
- ByteCount),
- "cuMemcpyDtoH");
-}
-
-Error CUDAPlatformDevice::synchronousCopyH2D(const void *HostSrc,
- size_t SrcByteOffset,
- const void *DeviceDstHandle,
- size_t DstByteOffset,
- size_t ByteCount) {
- return CUresultToError(
- cuMemcpyHtoD(
- reinterpret_cast<CUdeviceptr>(offset(DeviceDstHandle, DstByteOffset)),
- offset(HostSrc, SrcByteOffset), ByteCount),
- "cuMemcpyHtoD");
-}
-
-Error CUDAPlatformDevice::synchronousCopyD2D(const void *DeviceDstHandle,
- size_t DstByteOffset,
- const void *DeviceSrcHandle,
- size_t SrcByteOffset,
- size_t ByteCount) {
- return CUresultToError(
- cuMemcpyDtoD(
- reinterpret_cast<CUdeviceptr>(offset(DeviceDstHandle, DstByteOffset)),
- reinterpret_cast<CUdeviceptr>(offset(DeviceSrcHandle, SrcByteOffset)),
- ByteCount),
- "cuMemcpyDtoD");
-}
-
-} // namespace cuda
-} // namespace streamexecutor
OpenPOWER on IntegriCloud