 polly/CMakeLists.txt                             |   10
 polly/include/polly/CodeGen/PPCGCodeGeneration.h |   24
 polly/include/polly/LinkAllPasses.h              |    4
 polly/lib/CodeGen/PPCGCodeGeneration.cpp         |  113
 polly/lib/Support/RegisterPasses.cpp             |   21
 polly/test/GPGPU/cuda-managed-memory-simple.ll   |    4
 polly/test/GPGPU/size-cast.ll                    |    2
 polly/tools/CMakeLists.txt                       |    4
 polly/tools/GPURuntime/GPUJIT.c                  | 1383
 polly/tools/GPURuntime/GPUJIT.h                  |   19
 10 files changed, 1428 insertions(+), 156 deletions(-)
diff --git a/polly/CMakeLists.txt b/polly/CMakeLists.txt
index c3e066024b9..8a2724ee32f 100644
--- a/polly/CMakeLists.txt
+++ b/polly/CMakeLists.txt
@@ -152,9 +152,10 @@ SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
option(POLLY_ENABLE_GPGPU_CODEGEN "Enable GPGPU code generation feature" OFF)
if (POLLY_ENABLE_GPGPU_CODEGEN)
- # Do not require CUDA, as GPU code generation test cases can be run without
- # a cuda library.
+ # Do not require CUDA/OpenCL, as GPU code generation test cases can be run
+ # without a CUDA/OpenCL library.
FIND_PACKAGE(CUDA)
+ FIND_PACKAGE(OpenCL)
set(GPU_CODEGEN TRUE)
else(POLLY_ENABLE_GPGPU_CODEGEN)
set(GPU_CODEGEN FALSE)
@@ -163,8 +164,13 @@ endif(POLLY_ENABLE_GPGPU_CODEGEN)
# Support GPGPU code generation if the library is available.
if (CUDALIB_FOUND)
+ add_definitions(-DHAS_LIBCUDART)
INCLUDE_DIRECTORIES( ${CUDALIB_INCLUDE_DIR} )
endif(CUDALIB_FOUND)
+if (OpenCL_FOUND)
+ add_definitions(-DHAS_LIBOPENCL)
+ INCLUDE_DIRECTORIES( ${OpenCL_INCLUDE_DIR} )
+endif(OpenCL_FOUND)
option(POLLY_BUNDLED_ISL "Use the bundled version of libisl included in Polly" ON)
if (NOT POLLY_BUNDLED_ISL)
diff --git a/polly/include/polly/CodeGen/PPCGCodeGeneration.h b/polly/include/polly/CodeGen/PPCGCodeGeneration.h
new file mode 100644
index 00000000000..b498326dedf
--- /dev/null
+++ b/polly/include/polly/CodeGen/PPCGCodeGeneration.h
@@ -0,0 +1,24 @@
+//===--- polly/PPCGCodeGeneration.h - Polly Accelerator Code Generation. --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Take a scop created by ScopInfo and map it to GPU code using the ppcg
+// GPU mapping strategy.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef POLLY_PPCGCODEGENERATION_H
+#define POLLY_PPCGCODEGENERATION_H
+
+/// The GPU Architecture to target.
+enum GPUArch { NVPTX64 };
+
+/// The GPU Runtime implementation to use.
+enum GPURuntime { CUDA, OpenCL };
+
+#endif // POLLY_PPCGCODEGENERATION_H
diff --git a/polly/include/polly/LinkAllPasses.h b/polly/include/polly/LinkAllPasses.h
index 9d42e4c84ed..8b6e188af9e 100644
--- a/polly/include/polly/LinkAllPasses.h
+++ b/polly/include/polly/LinkAllPasses.h
@@ -15,6 +15,7 @@
#ifndef POLLY_LINKALLPASSES_H
#define POLLY_LINKALLPASSES_H
+#include "polly/CodeGen/PPCGCodeGeneration.h"
#include "polly/Config/config.h"
#include "polly/PruneUnprofitable.h"
#include "polly/Simplify.h"
@@ -48,7 +49,8 @@ llvm::Pass *createScopInfoWrapperPassPass();
llvm::Pass *createIslAstInfoPass();
llvm::Pass *createCodeGenerationPass();
#ifdef GPU_CODEGEN
-llvm::Pass *createPPCGCodeGenerationPass();
+llvm::Pass *createPPCGCodeGenerationPass(GPUArch Arch = GPUArch::NVPTX64,
+ GPURuntime Runtime = GPURuntime::CUDA);
#endif
llvm::Pass *createIslScheduleOptimizerPass();
llvm::Pass *createFlattenSchedulePass();
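
The defaulted arguments keep existing callers of createPPCGCodeGenerationPass source-compatible while allowing an explicit choice of architecture and runtime. A minimal sketch of an explicit call under the legacy pass manager follows; only the pass-creation call and the two enums come from this patch, the helper function itself is hypothetical and requires a build with GPU_CODEGEN enabled.

// Illustrative sketch only: explicitly selecting the OpenCL runtime when
// adding Polly's PPCG code generation to a legacy pass pipeline.
#include "polly/CodeGen/PPCGCodeGeneration.h"
#include "polly/LinkAllPasses.h"
#include "llvm/IR/LegacyPassManager.h"

static void addPPCGForOpenCL(llvm::legacy::PassManagerBase &PM) {
  // Calling without arguments keeps the previous behaviour (NVPTX64 + CUDA).
  PM.add(polly::createPPCGCodeGenerationPass(GPUArch::NVPTX64,
                                             GPURuntime::OpenCL));
}
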
diff --git a/polly/lib/CodeGen/PPCGCodeGeneration.cpp b/polly/lib/CodeGen/PPCGCodeGeneration.cpp
index 7d3d42ab2a8..45e570c90b5 100644
--- a/polly/lib/CodeGen/PPCGCodeGeneration.cpp
+++ b/polly/lib/CodeGen/PPCGCodeGeneration.cpp
@@ -12,6 +12,7 @@
//
//===----------------------------------------------------------------------===//
+#include "polly/CodeGen/PPCGCodeGeneration.h"
#include "polly/CodeGen/IslAst.h"
#include "polly/CodeGen/IslNodeBuilder.h"
#include "polly/CodeGen/Utils.h"
@@ -153,9 +154,9 @@ public:
GPUNodeBuilder(PollyIRBuilder &Builder, ScopAnnotator &Annotator,
const DataLayout &DL, LoopInfo &LI, ScalarEvolution &SE,
DominatorTree &DT, Scop &S, BasicBlock *StartBlock,
- gpu_prog *Prog)
+ gpu_prog *Prog, GPURuntime Runtime, GPUArch Arch)
: IslNodeBuilder(Builder, Annotator, DL, LI, SE, DT, S, StartBlock),
- Prog(Prog) {
+ Prog(Prog), Runtime(Runtime), Arch(Arch) {
getExprBuilder().setIDToSAI(&IDToSAI);
}
@@ -201,6 +202,12 @@ private:
/// The GPU program we generate code for.
gpu_prog *Prog;
+ /// The GPU Runtime implementation to use (OpenCL or CUDA).
+ GPURuntime Runtime;
+
+ /// The GPU Architecture to target.
+ GPUArch Arch;
+
/// Class to free isl_ids.
class IslIdDeleter {
public:
@@ -752,7 +759,17 @@ void GPUNodeBuilder::createCallSynchronizeDevice() {
}
Value *GPUNodeBuilder::createCallInitContext() {
- const char *Name = "polly_initContext";
+ const char *Name;
+
+ switch (Runtime) {
+ case GPURuntime::CUDA:
+ Name = "polly_initContextCUDA";
+ break;
+ case GPURuntime::OpenCL:
+ Name = "polly_initContextCL";
+ break;
+ }
+
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
Function *F = M->getFunction(Name);
@@ -1028,7 +1045,15 @@ void GPUNodeBuilder::createScopStmt(isl_ast_expr *Expr,
void GPUNodeBuilder::createKernelSync() {
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
- auto *Sync = Intrinsic::getDeclaration(M, Intrinsic::nvvm_barrier0);
+
+ Function *Sync;
+
+ switch (Arch) {
+ case GPUArch::NVPTX64:
+ Sync = Intrinsic::getDeclaration(M, Intrinsic::nvvm_barrier0);
+ break;
+ }
+
Builder.CreateCall(Sync, {});
}
@@ -1434,7 +1459,12 @@ GPUNodeBuilder::createKernelFunctionDecl(ppcg_kernel *Kernel,
auto *FT = FunctionType::get(Builder.getVoidTy(), Args, false);
auto *FN = Function::Create(FT, Function::ExternalLinkage, Identifier,
GPUModule.get());
- FN->setCallingConv(CallingConv::PTX_Kernel);
+
+ switch (Arch) {
+ case GPUArch::NVPTX64:
+ FN->setCallingConv(CallingConv::PTX_Kernel);
+ break;
+ }
auto Arg = FN->arg_begin();
for (long i = 0; i < Kernel->n_array; i++) {
@@ -1495,12 +1525,19 @@ GPUNodeBuilder::createKernelFunctionDecl(ppcg_kernel *Kernel,
}
void GPUNodeBuilder::insertKernelIntrinsics(ppcg_kernel *Kernel) {
- Intrinsic::ID IntrinsicsBID[] = {Intrinsic::nvvm_read_ptx_sreg_ctaid_x,
- Intrinsic::nvvm_read_ptx_sreg_ctaid_y};
+ Intrinsic::ID IntrinsicsBID[2];
+ Intrinsic::ID IntrinsicsTID[3];
+
+ switch (Arch) {
+ case GPUArch::NVPTX64:
+ IntrinsicsBID[0] = Intrinsic::nvvm_read_ptx_sreg_ctaid_x;
+ IntrinsicsBID[1] = Intrinsic::nvvm_read_ptx_sreg_ctaid_y;
- Intrinsic::ID IntrinsicsTID[] = {Intrinsic::nvvm_read_ptx_sreg_tid_x,
- Intrinsic::nvvm_read_ptx_sreg_tid_y,
- Intrinsic::nvvm_read_ptx_sreg_tid_z};
+ IntrinsicsTID[0] = Intrinsic::nvvm_read_ptx_sreg_tid_x;
+ IntrinsicsTID[1] = Intrinsic::nvvm_read_ptx_sreg_tid_y;
+ IntrinsicsTID[2] = Intrinsic::nvvm_read_ptx_sreg_tid_z;
+ break;
+ }
auto addId = [this](__isl_take isl_id *Id, Intrinsic::ID Intr) mutable {
std::string Name = isl_id_get_name(Id);
@@ -1649,11 +1686,18 @@ void GPUNodeBuilder::createKernelVariables(ppcg_kernel *Kernel, Function *FN) {
void GPUNodeBuilder::createKernelFunction(ppcg_kernel *Kernel,
SetVector<Value *> &SubtreeValues) {
-
std::string Identifier = "kernel_" + std::to_string(Kernel->id);
GPUModule.reset(new Module(Identifier, Builder.getContext()));
- GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-cuda"));
- GPUModule->setDataLayout(computeNVPTXDataLayout(true /* is64Bit */));
+
+ switch (Arch) {
+ case GPUArch::NVPTX64:
+ if (Runtime == GPURuntime::CUDA)
+ GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-cuda"));
+ else if (Runtime == GPURuntime::OpenCL)
+ GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-nvcl"));
+ GPUModule->setDataLayout(computeNVPTXDataLayout(true /* is64Bit */));
+ break;
+ }
Function *FN = createKernelFunctionDecl(Kernel, SubtreeValues);
@@ -1674,7 +1718,21 @@ void GPUNodeBuilder::createKernelFunction(ppcg_kernel *Kernel,
}
std::string GPUNodeBuilder::createKernelASM() {
- llvm::Triple GPUTriple(Triple::normalize("nvptx64-nvidia-cuda"));
+ llvm::Triple GPUTriple;
+
+ switch (Arch) {
+ case GPUArch::NVPTX64:
+ switch (Runtime) {
+ case GPURuntime::CUDA:
+ GPUTriple = llvm::Triple(Triple::normalize("nvptx64-nvidia-cuda"));
+ break;
+ case GPURuntime::OpenCL:
+ GPUTriple = llvm::Triple(Triple::normalize("nvptx64-nvidia-nvcl"));
+ break;
+ }
+ break;
+ }
+
std::string ErrMsg;
auto GPUTarget = TargetRegistry::lookupTarget(GPUTriple.getTriple(), ErrMsg);
@@ -1685,9 +1743,17 @@ std::string GPUNodeBuilder::createKernelASM() {
TargetOptions Options;
Options.UnsafeFPMath = FastMath;
- std::unique_ptr<TargetMachine> TargetM(
- GPUTarget->createTargetMachine(GPUTriple.getTriple(), CudaVersion, "",
- Options, Optional<Reloc::Model>()));
+
+ std::string subtarget;
+
+ switch (Arch) {
+ case GPUArch::NVPTX64:
+ subtarget = CudaVersion;
+ break;
+ }
+
+ std::unique_ptr<TargetMachine> TargetM(GPUTarget->createTargetMachine(
+ GPUTriple.getTriple(), subtarget, "", Options, Optional<Reloc::Model>()));
SmallString<0> ASMString;
raw_svector_ostream ASMStream(ASMString);
@@ -1739,6 +1805,10 @@ class PPCGCodeGeneration : public ScopPass {
public:
static char ID;
+ GPURuntime Runtime = GPURuntime::CUDA;
+
+ GPUArch Architecture = GPUArch::NVPTX64;
+
/// The scop that is currently processed.
Scop *S;
@@ -2522,7 +2592,7 @@ public:
executeScopConditionally(*S, Builder.getTrue(), *DT, *RI, *LI);
GPUNodeBuilder NodeBuilder(Builder, Annotator, *DL, *LI, *SE, *DT, *S,
- StartBlock, Prog);
+ StartBlock, Prog, Runtime, Architecture);
// TODO: Handle LICM
auto SplitBlock = StartBlock->getSinglePredecessor();
@@ -2610,7 +2680,12 @@ public:
char PPCGCodeGeneration::ID = 1;
-Pass *polly::createPPCGCodeGenerationPass() { return new PPCGCodeGeneration(); }
+Pass *polly::createPPCGCodeGenerationPass(GPUArch Arch, GPURuntime Runtime) {
+ PPCGCodeGeneration *generator = new PPCGCodeGeneration();
+ generator->Runtime = Runtime;
+ generator->Architecture = Arch;
+ return generator;
+}
INITIALIZE_PASS_BEGIN(PPCGCodeGeneration, "polly-codegen-ppcg",
"Polly - Apply PPCG translation to SCOP", false, false)
diff --git a/polly/lib/Support/RegisterPasses.cpp b/polly/lib/Support/RegisterPasses.cpp
index 9c8eac03f71..6188b8cdb60 100644
--- a/polly/lib/Support/RegisterPasses.cpp
+++ b/polly/lib/Support/RegisterPasses.cpp
@@ -23,6 +23,7 @@
#include "polly/Canonicalization.h"
#include "polly/CodeGen/CodeGeneration.h"
#include "polly/CodeGen/CodegenCleanup.h"
+#include "polly/CodeGen/PPCGCodeGeneration.h"
#include "polly/DeLICM.h"
#include "polly/DependenceInfo.h"
#include "polly/FlattenSchedule.h"
@@ -101,6 +102,23 @@ static cl::opt<TargetChoice>
),
cl::init(TARGET_CPU), cl::ZeroOrMore, cl::cat(PollyCategory));
+#ifdef GPU_CODEGEN
+static cl::opt<GPURuntime> GPURuntimeChoice(
+ "polly-gpu-runtime", cl::desc("The GPU Runtime API to target"),
+ cl::values(clEnumValN(GPURuntime::CUDA, "libcudart",
+ "use the CUDA Runtime API"),
+ clEnumValN(GPURuntime::OpenCL, "libopencl",
+ "use the OpenCL Runtime API")),
+ cl::init(GPURuntime::CUDA), cl::ZeroOrMore, cl::cat(PollyCategory));
+
+static cl::opt<GPUArch>
+ GPUArchChoice("polly-gpu-arch", cl::desc("The GPU Architecture to target"),
+ cl::values(clEnumValN(GPUArch::NVPTX64, "nvptx64",
+ "target NVIDIA 64-bit architecture")),
+ cl::init(GPUArch::NVPTX64), cl::ZeroOrMore,
+ cl::cat(PollyCategory));
+#endif
+
VectorizerChoice polly::PollyVectorizerChoice;
static cl::opt<polly::VectorizerChoice, true> Vectorizer(
"polly-vectorizer", cl::desc("Select the vectorization strategy"),
@@ -309,7 +327,8 @@ void registerPollyPasses(llvm::legacy::PassManagerBase &PM) {
if (Target == TARGET_GPU) {
#ifdef GPU_CODEGEN
- PM.add(polly::createPPCGCodeGenerationPass());
+ PM.add(
+ polly::createPPCGCodeGenerationPass(GPUArchChoice, GPURuntimeChoice));
#endif
} else {
switch (CodeGeneration) {
diff --git a/polly/test/GPGPU/cuda-managed-memory-simple.ll b/polly/test/GPGPU/cuda-managed-memory-simple.ll
index 0f5ece14525..4a97ec56ad5 100644
--- a/polly/test/GPGPU/cuda-managed-memory-simple.ll
+++ b/polly/test/GPGPU/cuda-managed-memory-simple.ll
@@ -35,7 +35,7 @@
; CHECK-NOT: polly_freeDeviceMemory
; CHECK-NOT: polly_allocateMemoryForDevice
-; CHECK: %13 = call i8* @polly_initContext()
+; CHECK: %13 = call i8* @polly_initContextCUDA()
; CHECK-NEXT: %14 = bitcast i32* %A to i8*
; CHECK-NEXT: %15 = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 0
; CHECK-NEXT: store i8* %14, i8** %polly_launch_0_param_0
@@ -46,7 +46,7 @@
; CHECK-NEXT: store i8* %17, i8** %polly_launch_0_param_1
; CHECK-NEXT: %19 = bitcast i8** %polly_launch_0_param_1 to i8*
; CHECK-NEXT: store i8* %19, i8** %18
-; CHECK-NEXT: %20 = call i8* @polly_getKernel(i8* getelementptr inbounds ([750 x i8], [750 x i8]* @kernel_0, i32 0, i32 0), i8* getelementptr inbounds ([9 x i8], [9 x i8]* @kernel_0_name, i32 0, i32 0))
+; CHECK-NEXT: %20 = call i8* @polly_getKernel(i8* getelementptr inbounds ([750 x i8], [750 x i8]* @kernel_0, i32 0, i32 0), i8* getelementptr inbounds ([9 x i8], [9 x i8]* @kernel_0_name, i32 0, i32 0))
; CHECK-NEXT: call void @polly_launchKernel(i8* %20, i32 2, i32 1, i32 32, i32 1, i32 1, i8* %polly_launch_0_params_i8ptr)
; CHECK-NEXT: call void @polly_freeKernel(i8* %20)
; CHECK-NEXT: call void @polly_synchronizeDevice()
diff --git a/polly/test/GPGPU/size-cast.ll b/polly/test/GPGPU/size-cast.ll
index 9cb5df46d23..59caf1260ba 100644
--- a/polly/test/GPGPU/size-cast.ll
+++ b/polly/test/GPGPU/size-cast.ll
@@ -29,7 +29,7 @@
; CODE-NEXT: if (arg >= 32 * b0 + t0 + 1048576 * c0 + 1)
; CODE-NEXT: Stmt_bb6(0, 32 * b0 + t0 + 1048576 * c0);
-; IR: call i8* @polly_initContext()
+; IR: call i8* @polly_initContextCUDA()
; IR-NEXT: sext i32 %arg to i64
; IR-NEXT: mul i64
; IR-NEXT: @polly_allocateMemoryForDevice
diff --git a/polly/tools/CMakeLists.txt b/polly/tools/CMakeLists.txt
index 4ce60e1a3e8..8e5ce59cb90 100644
--- a/polly/tools/CMakeLists.txt
+++ b/polly/tools/CMakeLists.txt
@@ -1,5 +1,5 @@
-if (CUDALIB_FOUND)
+if (CUDALIB_FOUND OR OpenCL_FOUND)
add_subdirectory(GPURuntime)
-endif (CUDALIB_FOUND)
+endif (CUDALIB_FOUND OR OpenCL_FOUND)
set(LLVM_COMMON_DEPENDS ${LLVM_COMMON_DEPENDS} PARENT_SCOPE)
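
The GPUJIT.c changes below resolve every CUDA and OpenCL entry point at run time through dlopen/dlsym, so neither runtime library is required at link time. A self-contained sketch of that loading pattern for a single symbol follows, assuming libOpenCL.so is installed and using the standard clGetPlatformIDs signature; it only illustrates the approach of getAPIHandleCL and is not part of the patch.

// Sketch of the lazy-loading pattern used by getAPIHandleCL below: open
// libOpenCL.so at run time and resolve one entry point through a typed
// function pointer. Error handling mirrors the patch; main() is only for
// demonstration.
#include <CL/cl.h>
#include <dlfcn.h>
#include <stdio.h>

typedef cl_int clGetPlatformIDsFcnTy(cl_uint, cl_platform_id *, cl_uint *);

int main() {
  void *Handle = dlopen("libOpenCL.so", RTLD_LAZY);
  if (!Handle) {
    fprintf(stderr, "Cannot open library: %s.\n", dlerror());
    return 1;
  }
  dlerror(); /* Clear any stale error state before dlsym. */
  clGetPlatformIDsFcnTy *GetPlatformIDs =
      (clGetPlatformIDsFcnTy *)dlsym(Handle, "clGetPlatformIDs");
  const char *Err = dlerror();
  if (Err) {
    fprintf(stderr, "Load OpenCL Runtime API failed: %s.\n", Err);
    return 1;
  }
  cl_uint NumPlatforms = 0;
  GetPlatformIDs(0, NULL, &NumPlatforms);
  printf("OpenCL platforms available: %u\n", NumPlatforms);
  dlclose(Handle);
  return 0;
}
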
diff --git a/polly/tools/GPURuntime/GPUJIT.c b/polly/tools/GPURuntime/GPUJIT.c
index 457a7477d62..80f4c430903 100644
--- a/polly/tools/GPURuntime/GPUJIT.c
+++ b/polly/tools/GPURuntime/GPUJIT.c
@@ -12,8 +12,20 @@
/******************************************************************************/
#include "GPUJIT.h"
+
+#ifdef HAS_LIBCUDART
#include <cuda.h>
#include <cuda_runtime.h>
+#endif /* HAS_LIBCUDART */
+
+#ifdef HAS_LIBOPENCL
+#ifdef __APPLE__
+#include <OpenCL/opencl.h>
+#else
+#include <CL/cl.h>
+#endif
+#endif /* HAS_LIBOPENCL */
+
#include <dlfcn.h>
#include <stdarg.h>
#include <stdio.h>
@@ -22,6 +34,8 @@
static int DebugMode;
static int CacheMode;
+static PollyGPURuntime Runtime = RUNTIME_NONE;
+
static void debug_print(const char *format, ...) {
if (!DebugMode)
return;
@@ -33,18 +47,853 @@ static void debug_print(const char *format, ...) {
}
#define dump_function() debug_print("-> %s\n", __func__)
-/* Define Polly's GPGPU data types. */
+#define KERNEL_CACHE_SIZE 10
+
+static void err_runtime() {
+ fprintf(stderr, "Runtime not correctly initialized.\n");
+ exit(-1);
+}
+
struct PollyGPUContextT {
- CUcontext Cuda;
+ void *Context;
};
struct PollyGPUFunctionT {
+ void *Kernel;
+};
+
+struct PollyGPUDevicePtrT {
+ void *DevicePtr;
+};
+
+/******************************************************************************/
+/* OpenCL */
+/******************************************************************************/
+#ifdef HAS_LIBOPENCL
+
+struct OpenCLContextT {
+ cl_context Context;
+ cl_command_queue CommandQueue;
+};
+
+struct OpenCLKernelT {
+ cl_kernel Kernel;
+ cl_program Program;
+ const char *BinaryString;
+};
+
+struct OpenCLDevicePtrT {
+ cl_mem MemObj;
+};
+
+/* Dynamic library handles for the OpenCL runtime library. */
+static void *HandleOpenCL;
+
+/* Type-defines of function pointer to OpenCL Runtime API. */
+typedef cl_int clGetPlatformIDsFcnTy(cl_uint NumEntries,
+ cl_platform_id *Platforms,
+ cl_uint *NumPlatforms);
+static clGetPlatformIDsFcnTy *clGetPlatformIDsFcnPtr;
+
+typedef cl_int clGetDeviceIDsFcnTy(cl_platform_id Platform,
+ cl_device_type DeviceType,
+ cl_uint NumEntries, cl_device_id *Devices,
+ cl_uint *NumDevices);
+static clGetDeviceIDsFcnTy *clGetDeviceIDsFcnPtr;
+
+typedef cl_int clGetDeviceInfoFcnTy(cl_device_id Device,
+ cl_device_info ParamName,
+ size_t ParamValueSize, void *ParamValue,
+ size_t *ParamValueSizeRet);
+static clGetDeviceInfoFcnTy *clGetDeviceInfoFcnPtr;
+
+typedef cl_int clGetKernelInfoFcnTy(cl_kernel Kernel, cl_kernel_info ParamName,
+ size_t ParamValueSize, void *ParamValue,
+ size_t *ParamValueSizeRet);
+static clGetKernelInfoFcnTy *clGetKernelInfoFcnPtr;
+
+typedef cl_context clCreateContextFcnTy(
+ const cl_context_properties *Properties, cl_uint NumDevices,
+ const cl_device_id *Devices,
+ void CL_CALLBACK *pfn_notify(const char *Errinfo, const void *PrivateInfo,
+ size_t CB, void *UserData),
+ void *UserData, cl_int *ErrcodeRet);
+static clCreateContextFcnTy *clCreateContextFcnPtr;
+
+typedef cl_command_queue
+clCreateCommandQueueFcnTy(cl_context Context, cl_device_id Device,
+ cl_command_queue_properties Properties,
+ cl_int *ErrcodeRet);
+static clCreateCommandQueueFcnTy *clCreateCommandQueueFcnPtr;
+
+typedef cl_mem clCreateBufferFcnTy(cl_context Context, cl_mem_flags Flags,
+ size_t Size, void *HostPtr,
+ cl_int *ErrcodeRet);
+static clCreateBufferFcnTy *clCreateBufferFcnPtr;
+
+typedef cl_int
+clEnqueueWriteBufferFcnTy(cl_command_queue CommandQueue, cl_mem Buffer,
+ cl_bool BlockingWrite, size_t Offset, size_t Size,
+ const void *Ptr, cl_uint NumEventsInWaitList,
+ const cl_event *EventWaitList, cl_event *Event);
+static clEnqueueWriteBufferFcnTy *clEnqueueWriteBufferFcnPtr;
+
+typedef cl_program clCreateProgramWithBinaryFcnTy(
+ cl_context Context, cl_uint NumDevices, const cl_device_id *DeviceList,
+ const size_t *Lengths, const unsigned char **Binaries, cl_int *BinaryStatus,
+ cl_int *ErrcodeRet);
+static clCreateProgramWithBinaryFcnTy *clCreateProgramWithBinaryFcnPtr;
+
+typedef cl_int clBuildProgramFcnTy(
+ cl_program Program, cl_uint NumDevices, const cl_device_id *DeviceList,
+ const char *Options,
+ void(CL_CALLBACK *pfn_notify)(cl_program Program, void *UserData),
+ void *UserData);
+static clBuildProgramFcnTy *clBuildProgramFcnPtr;
+
+typedef cl_kernel clCreateKernelFcnTy(cl_program Program,
+ const char *KernelName,
+ cl_int *ErrcodeRet);
+static clCreateKernelFcnTy *clCreateKernelFcnPtr;
+
+typedef cl_int clSetKernelArgFcnTy(cl_kernel Kernel, cl_uint ArgIndex,
+ size_t ArgSize, const void *ArgValue);
+static clSetKernelArgFcnTy *clSetKernelArgFcnPtr;
+
+typedef cl_int clEnqueueNDRangeKernelFcnTy(
+ cl_command_queue CommandQueue, cl_kernel Kernel, cl_uint WorkDim,
+ const size_t *GlobalWorkOffset, const size_t *GlobalWorkSize,
+ const size_t *LocalWorkSize, cl_uint NumEventsInWaitList,
+ const cl_event *EventWaitList, cl_event *Event);
+static clEnqueueNDRangeKernelFcnTy *clEnqueueNDRangeKernelFcnPtr;
+
+typedef cl_int clEnqueueReadBufferFcnTy(cl_command_queue CommandQueue,
+ cl_mem Buffer, cl_bool BlockingRead,
+ size_t Offset, size_t Size, void *Ptr,
+ cl_uint NumEventsInWaitList,
+ const cl_event *EventWaitList,
+ cl_event *Event);
+static clEnqueueReadBufferFcnTy *clEnqueueReadBufferFcnPtr;
+
+typedef cl_int clFlushFcnTy(cl_command_queue CommandQueue);
+static clFlushFcnTy *clFlushFcnPtr;
+
+typedef cl_int clFinishFcnTy(cl_command_queue CommandQueue);
+static clFinishFcnTy *clFinishFcnPtr;
+
+typedef cl_int clReleaseKernelFcnTy(cl_kernel Kernel);
+static clReleaseKernelFcnTy *clReleaseKernelFcnPtr;
+
+typedef cl_int clReleaseProgramFcnTy(cl_program Program);
+static clReleaseProgramFcnTy *clReleaseProgramFcnPtr;
+
+typedef cl_int clReleaseMemObjectFcnTy(cl_mem Memobject);
+static clReleaseMemObjectFcnTy *clReleaseMemObjectFcnPtr;
+
+typedef cl_int clReleaseCommandQueueFcnTy(cl_command_queue CommandQueue);
+static clReleaseCommandQueueFcnTy *clReleaseCommandQueueFcnPtr;
+
+typedef cl_int clReleaseContextFcnTy(cl_context Context);
+static clReleaseContextFcnTy *clReleaseContextFcnPtr;
+
+static void *getAPIHandleCL(void *Handle, const char *FuncName) {
+ char *Err;
+ void *FuncPtr;
+ dlerror();
+ FuncPtr = dlsym(Handle, FuncName);
+ if ((Err = dlerror()) != 0) {
+ fprintf(stderr, "Load OpenCL Runtime API failed: %s. \n", Err);
+ return 0;
+ }
+ return FuncPtr;
+}
+
+static int initialDeviceAPILibrariesCL() {
+ HandleOpenCL = dlopen("libOpenCL.so", RTLD_LAZY);
+ if (!HandleOpenCL) {
+ fprintf(stderr, "Cannot open library: %s. \n", dlerror());
+ return 0;
+ }
+ return 1;
+}
+
+static int initialDeviceAPIsCL() {
+ if (initialDeviceAPILibrariesCL() == 0)
+ return 0;
+
+ /* Get function pointer to OpenCL Runtime API.
+ *
+ * Note that compilers conforming to the ISO C standard are required to
+ * generate a warning if a conversion from a void * pointer to a function
+ * pointer is attempted as in the following statements. The warning
+ * of this kind of cast may not be emitted by clang and new versions of gcc
+ * as it is valid on POSIX 2008.
+ */
+ clGetPlatformIDsFcnPtr =
+ (clGetPlatformIDsFcnTy *)getAPIHandleCL(HandleOpenCL, "clGetPlatformIDs");
+
+ clGetDeviceIDsFcnPtr =
+ (clGetDeviceIDsFcnTy *)getAPIHandleCL(HandleOpenCL, "clGetDeviceIDs");
+
+ clGetDeviceInfoFcnPtr =
+ (clGetDeviceInfoFcnTy *)getAPIHandleCL(HandleOpenCL, "clGetDeviceInfo");
+
+ clGetKernelInfoFcnPtr =
+ (clGetKernelInfoFcnTy *)getAPIHandleCL(HandleOpenCL, "clGetKernelInfo");
+
+ clCreateContextFcnPtr =
+ (clCreateContextFcnTy *)getAPIHandleCL(HandleOpenCL, "clCreateContext");
+
+ clCreateCommandQueueFcnPtr = (clCreateCommandQueueFcnTy *)getAPIHandleCL(
+ HandleOpenCL, "clCreateCommandQueue");
+
+ clCreateBufferFcnPtr =
+ (clCreateBufferFcnTy *)getAPIHandleCL(HandleOpenCL, "clCreateBuffer");
+
+ clEnqueueWriteBufferFcnPtr = (clEnqueueWriteBufferFcnTy *)getAPIHandleCL(
+ HandleOpenCL, "clEnqueueWriteBuffer");
+
+ clCreateProgramWithBinaryFcnPtr =
+ (clCreateProgramWithBinaryFcnTy *)getAPIHandleCL(
+ HandleOpenCL, "clCreateProgramWithBinary");
+
+ clBuildProgramFcnPtr =
+ (clBuildProgramFcnTy *)getAPIHandleCL(HandleOpenCL, "clBuildProgram");
+
+ clCreateKernelFcnPtr =
+ (clCreateKernelFcnTy *)getAPIHandleCL(HandleOpenCL, "clCreateKernel");
+
+ clSetKernelArgFcnPtr =
+ (clSetKernelArgFcnTy *)getAPIHandleCL(HandleOpenCL, "clSetKernelArg");
+
+ clEnqueueNDRangeKernelFcnPtr = (clEnqueueNDRangeKernelFcnTy *)getAPIHandleCL(
+ HandleOpenCL, "clEnqueueNDRangeKernel");
+
+ clEnqueueReadBufferFcnPtr = (clEnqueueReadBufferFcnTy *)getAPIHandleCL(
+ HandleOpenCL, "clEnqueueReadBuffer");
+
+ clFlushFcnPtr = (clFlushFcnTy *)getAPIHandleCL(HandleOpenCL, "clFlush");
+
+ clFinishFcnPtr = (clFinishFcnTy *)getAPIHandleCL(HandleOpenCL, "clFinish");
+
+ clReleaseKernelFcnPtr =
+ (clReleaseKernelFcnTy *)getAPIHandleCL(HandleOpenCL, "clReleaseKernel");
+
+ clReleaseProgramFcnPtr =
+ (clReleaseProgramFcnTy *)getAPIHandleCL(HandleOpenCL, "clReleaseProgram");
+
+ clReleaseMemObjectFcnPtr = (clReleaseMemObjectFcnTy *)getAPIHandleCL(
+ HandleOpenCL, "clReleaseMemObject");
+
+ clReleaseCommandQueueFcnPtr = (clReleaseCommandQueueFcnTy *)getAPIHandleCL(
+ HandleOpenCL, "clReleaseCommandQueue");
+
+ clReleaseContextFcnPtr =
+ (clReleaseContextFcnTy *)getAPIHandleCL(HandleOpenCL, "clReleaseContext");
+
+ return 1;
+}
+
+/* Context and Device. */
+static PollyGPUContext *GlobalContext = NULL;
+static cl_device_id GlobalDeviceID = NULL;
+
+/* Fd-Decl: Print out OpenCL Error codes to human readable strings. */
+static void printOpenCLError(int Error);
+
+static void checkOpenCLError(int Ret, const char *format, ...) {
+ if (Ret == CL_SUCCESS)
+ return;
+
+ printOpenCLError(Ret);
+ va_list args;
+ va_start(args, format);
+ vfprintf(stderr, format, args);
+ va_end(args);
+ exit(-1);
+}
+
+static PollyGPUContext *initContextCL() {
+ dump_function();
+
+ PollyGPUContext *Context;
+
+ cl_platform_id PlatformID = NULL;
+ cl_device_id DeviceID = NULL;
+ cl_uint NumDevicesRet;
+ cl_int Ret;
+
+ char DeviceRevision[256];
+ char DeviceName[256];
+ size_t DeviceRevisionRetSize, DeviceNameRetSize;
+
+ static __thread PollyGPUContext *CurrentContext = NULL;
+
+ if (CurrentContext)
+ return CurrentContext;
+
+ /* Get API handles. */
+ if (initialDeviceAPIsCL() == 0) {
+ fprintf(stderr, "Getting the \"handle\" for the OpenCL Runtime failed.\n");
+ exit(-1);
+ }
+
+ /* Get number of devices that support OpenCL. */
+ static const int NumberOfPlatforms = 1;
+ Ret = clGetPlatformIDsFcnPtr(NumberOfPlatforms, &PlatformID, NULL);
+ checkOpenCLError(Ret, "Failed to get platform IDs.\n");
+ // TODO: Extend to CL_DEVICE_TYPE_ALL?
+ static const int NumberOfDevices = 1;
+ Ret = clGetDeviceIDsFcnPtr(PlatformID, CL_DEVICE_TYPE_GPU, NumberOfDevices,
+ &DeviceID, &NumDevicesRet);
+ checkOpenCLError(Ret, "Failed to get device IDs.\n");
+
+ GlobalDeviceID = DeviceID;
+ if (NumDevicesRet == 0) {
+ fprintf(stderr, "There is no device supporting OpenCL.\n");
+ exit(-1);
+ }
+
+ /* Get device revision. */
+ Ret =
+ clGetDeviceInfoFcnPtr(DeviceID, CL_DEVICE_VERSION, sizeof(DeviceRevision),
+ DeviceRevision, &DeviceRevisionRetSize);
+ checkOpenCLError(Ret, "Failed to fetch device revision.\n");
+
+ /* Get device name. */
+ Ret = clGetDeviceInfoFcnPtr(DeviceID, CL_DEVICE_NAME, sizeof(DeviceName),
+ DeviceName, &DeviceNameRetSize);
+ checkOpenCLError(Ret, "Failed to fetch device name.\n");
+
+ debug_print("> Running on GPU device %d : %s.\n", DeviceID, DeviceName);
+
+ /* Create context on the device. */
+ Context = (PollyGPUContext *)malloc(sizeof(PollyGPUContext));
+ if (Context == 0) {
+ fprintf(stderr, "Allocate memory for Polly GPU context failed.\n");
+ exit(-1);
+ }
+ Context->Context = (OpenCLContext *)malloc(sizeof(OpenCLContext));
+ if (Context->Context == 0) {
+ fprintf(stderr, "Allocate memory for Polly OpenCL context failed.\n");
+ exit(-1);
+ }
+ ((OpenCLContext *)Context->Context)->Context =
+ clCreateContextFcnPtr(NULL, NumDevicesRet, &DeviceID, NULL, NULL, &Ret);
+ checkOpenCLError(Ret, "Failed to create context.\n");
+
+ static const int ExtraProperties = 0;
+ ((OpenCLContext *)Context->Context)->CommandQueue =
+ clCreateCommandQueueFcnPtr(((OpenCLContext *)Context->Context)->Context,
+ DeviceID, ExtraProperties, &Ret);
+ checkOpenCLError(Ret, "Failed to create command queue.\n");
+
+ if (CacheMode)
+ CurrentContext = Context;
+
+ GlobalContext = Context;
+ return Context;
+}
+
+static void freeKernelCL(PollyGPUFunction *Kernel) {
+ dump_function();
+
+ if (CacheMode)
+ return;
+
+ if (!GlobalContext) {
+ fprintf(stderr, "GPGPU-code generation not correctly initialized.\n");
+ exit(-1);
+ }
+
+ cl_int Ret;
+ Ret = clFlushFcnPtr(((OpenCLContext *)GlobalContext->Context)->CommandQueue);
+ checkOpenCLError(Ret, "Failed to flush command queue.\n");
+ Ret = clFinishFcnPtr(((OpenCLContext *)GlobalContext->Context)->CommandQueue);
+ checkOpenCLError(Ret, "Failed to finish command queue.\n");
+
+ if (((OpenCLKernel *)Kernel->Kernel)->Kernel) {
+ cl_int Ret =
+ clReleaseKernelFcnPtr(((OpenCLKernel *)Kernel->Kernel)->Kernel);
+ checkOpenCLError(Ret, "Failed to release kernel.\n");
+ }
+
+ if (((OpenCLKernel *)Kernel->Kernel)->Program) {
+ cl_int Ret =
+ clReleaseProgramFcnPtr(((OpenCLKernel *)Kernel->Kernel)->Program);
+ checkOpenCLError(Ret, "Failed to release program.\n");
+ }
+
+ if (Kernel->Kernel)
+ free((OpenCLKernel *)Kernel->Kernel);
+
+ if (Kernel)
+ free(Kernel);
+}
+
+static PollyGPUFunction *getKernelCL(const char *BinaryBuffer,
+ const char *KernelName) {
+ dump_function();
+
+ if (!GlobalContext) {
+ fprintf(stderr, "GPGPU-code generation not correctly initialized.\n");
+ exit(-1);
+ }
+
+ static __thread PollyGPUFunction *KernelCache[KERNEL_CACHE_SIZE];
+ static __thread int NextCacheItem = 0;
+
+ for (long i = 0; i < KERNEL_CACHE_SIZE; i++) {
+ // We exploit here the property that all Polly-ACC kernels are allocated
+ // as global constants, hence a pointer comparison is sufficient to
+ // determine equality.
+ if (KernelCache[i] &&
+ ((OpenCLKernel *)KernelCache[i]->Kernel)->BinaryString ==
+ BinaryBuffer) {
+ debug_print(" -> using cached kernel\n");
+ return KernelCache[i];
+ }
+ }
+
+ PollyGPUFunction *Function = malloc(sizeof(PollyGPUFunction));
+ if (Function == 0) {
+ fprintf(stderr, "Allocate memory for Polly GPU function failed.\n");
+ exit(-1);
+ }
+ Function->Kernel = (OpenCLKernel *)malloc(sizeof(OpenCLKernel));
+ if (Function->Kernel == 0) {
+ fprintf(stderr, "Allocate memory for Polly OpenCL kernel failed.\n");
+ exit(-1);
+ }
+
+ if (!GlobalDeviceID) {
+ fprintf(stderr, "GPGPU-code generation not initialized correctly.\n");
+ exit(-1);
+ }
+
+ cl_int Ret;
+ size_t BinarySize = strlen(BinaryBuffer);
+ ((OpenCLKernel *)Function->Kernel)->Program = clCreateProgramWithBinaryFcnPtr(
+ ((OpenCLContext *)GlobalContext->Context)->Context, 1, &GlobalDeviceID,
+ (const size_t *)&BinarySize, (const unsigned char **)&BinaryBuffer, NULL,
+ &Ret);
+ checkOpenCLError(Ret, "Failed to create program from binary.\n");
+
+ Ret = clBuildProgramFcnPtr(((OpenCLKernel *)Function->Kernel)->Program, 1,
+ &GlobalDeviceID, NULL, NULL, NULL);
+ checkOpenCLError(Ret, "Failed to build program.\n");
+
+ ((OpenCLKernel *)Function->Kernel)->Kernel = clCreateKernelFcnPtr(
+ ((OpenCLKernel *)Function->Kernel)->Program, KernelName, &Ret);
+ checkOpenCLError(Ret, "Failed to create kernel.\n");
+
+ ((OpenCLKernel *)Function->Kernel)->BinaryString = BinaryBuffer;
+
+ if (CacheMode) {
+ if (KernelCache[NextCacheItem])
+ freeKernelCL(KernelCache[NextCacheItem]);
+
+ KernelCache[NextCacheItem] = Function;
+
+ NextCacheItem = (NextCacheItem + 1) % KERNEL_CACHE_SIZE;
+ }
+
+ return Function;
+}
+
+static void copyFromHostToDeviceCL(void *HostData, PollyGPUDevicePtr *DevData,
+ long MemSize) {
+ dump_function();
+
+ if (!GlobalContext) {
+ fprintf(stderr, "GPGPU-code generation not correctly initialized.\n");
+ exit(-1);
+ }
+
+ cl_int Ret;
+ Ret = clEnqueueWriteBufferFcnPtr(
+ ((OpenCLContext *)GlobalContext->Context)->CommandQueue,
+ ((OpenCLDevicePtr *)DevData->DevicePtr)->MemObj, CL_TRUE, 0, MemSize,
+ HostData, 0, NULL, NULL);
+ checkOpenCLError(Ret, "Copying data from host memory to device failed.\n");
+}
+
+static void copyFromDeviceToHostCL(PollyGPUDevicePtr *DevData, void *HostData,
+ long MemSize) {
+ dump_function();
+
+ if (!GlobalContext) {
+ fprintf(stderr, "GPGPU-code generation not correctly initialized.\n");
+ exit(-1);
+ }
+
+ cl_int Ret;
+ Ret = clEnqueueReadBufferFcnPtr(
+ ((OpenCLContext *)GlobalContext->Context)->CommandQueue,
+ ((OpenCLDevicePtr *)DevData->DevicePtr)->MemObj, CL_TRUE, 0, MemSize,
+ HostData, 0, NULL, NULL);
+ checkOpenCLError(Ret, "Copying results from device to host memory failed.\n");
+}
+
+static void launchKernelCL(PollyGPUFunction *Kernel, unsigned int GridDimX,
+ unsigned int GridDimY, unsigned int BlockDimX,
+ unsigned int BlockDimY, unsigned int BlockDimZ,
+ void **Parameters) {
+ dump_function();
+
+ cl_int Ret;
+ cl_uint NumArgs;
+
+ if (!GlobalContext) {
+ fprintf(stderr, "GPGPU-code generation not correctly initialized.\n");
+ exit(-1);
+ }
+
+ OpenCLKernel *CLKernel = (OpenCLKernel *)Kernel->Kernel;
+ Ret = clGetKernelInfoFcnPtr(CLKernel->Kernel, CL_KERNEL_NUM_ARGS,
+ sizeof(cl_uint), &NumArgs, NULL);
+ checkOpenCLError(Ret, "Failed to get number of kernel arguments.\n");
+
+ // TODO: Pass the size of the kernel arguments in to launchKernelCL, along
+ // with the arguments themselves. This is a dirty workaround that can be
+ // broken.
+ for (cl_uint i = 0; i < NumArgs; i++) {
+ Ret = clSetKernelArgFcnPtr(CLKernel->Kernel, i, 8, (void *)Parameters[i]);
+ if (Ret == CL_INVALID_ARG_SIZE) {
+ Ret = clSetKernelArgFcnPtr(CLKernel->Kernel, i, 4, (void *)Parameters[i]);
+ if (Ret == CL_INVALID_ARG_SIZE) {
+ Ret =
+ clSetKernelArgFcnPtr(CLKernel->Kernel, i, 2, (void *)Parameters[i]);
+ if (Ret == CL_INVALID_ARG_SIZE) {
+ Ret = clSetKernelArgFcnPtr(CLKernel->Kernel, i, 1,
+ (void *)Parameters[i]);
+ checkOpenCLError(Ret, "Failed to set Kernel argument %d.\n", i);
+ }
+ }
+ }
+ if (Ret != CL_SUCCESS && Ret != CL_INVALID_ARG_SIZE) {
+ fprintf(stderr, "Failed to set Kernel argument.\n");
+ printOpenCLError(Ret);
+ exit(-1);
+ }
+ }
+
+ unsigned int GridDimZ = 1;
+ size_t GlobalWorkSize[3] = {BlockDimX * GridDimX, BlockDimY * GridDimY,
+ BlockDimZ * GridDimZ};
+ size_t LocalWorkSize[3] = {BlockDimX, BlockDimY, BlockDimZ};
+
+ static const int WorkDim = 3;
+ OpenCLContext *CLContext = (OpenCLContext *)GlobalContext->Context;
+ Ret = clEnqueueNDRangeKernelFcnPtr(CLContext->CommandQueue, CLKernel->Kernel,
+ WorkDim, NULL, GlobalWorkSize,
+ LocalWorkSize, 0, NULL, NULL);
+ checkOpenCLError(Ret, "Launching OpenCL kernel failed.\n");
+}
+
+static void freeDeviceMemoryCL(PollyGPUDevicePtr *Allocation) {
+ dump_function();
+
+ OpenCLDevicePtr *DevPtr = (OpenCLDevicePtr *)Allocation->DevicePtr;
+ cl_int Ret = clReleaseMemObjectFcnPtr((cl_mem)DevPtr->MemObj);
+ checkOpenCLError(Ret, "Failed to free device memory.\n");
+
+ free(DevPtr);
+ free(Allocation);
+}
+
+static PollyGPUDevicePtr *allocateMemoryForDeviceCL(long MemSize) {
+ dump_function();
+
+ if (!GlobalContext) {
+ fprintf(stderr, "GPGPU-code generation not correctly initialized.\n");
+ exit(-1);
+ }
+
+ PollyGPUDevicePtr *DevData = malloc(sizeof(PollyGPUDevicePtr));
+ if (DevData == 0) {
+ fprintf(stderr, "Allocate memory for GPU device memory pointer failed.\n");
+ exit(-1);
+ }
+ DevData->DevicePtr = (OpenCLDevicePtr *)malloc(sizeof(OpenCLDevicePtr));
+ if (DevData->DevicePtr == 0) {
+ fprintf(stderr, "Allocate memory for GPU device memory pointer failed.\n");
+ exit(-1);
+ }
+
+ cl_int Ret;
+ ((OpenCLDevicePtr *)DevData->DevicePtr)->MemObj =
+ clCreateBufferFcnPtr(((OpenCLContext *)GlobalContext->Context)->Context,
+ CL_MEM_READ_WRITE, MemSize, NULL, &Ret);
+ checkOpenCLError(Ret,
+ "Allocate memory for GPU device memory pointer failed.\n");
+
+ return DevData;
+}
+
+static void *getDevicePtrCL(PollyGPUDevicePtr *Allocation) {
+ dump_function();
+
+ OpenCLDevicePtr *DevPtr = (OpenCLDevicePtr *)Allocation->DevicePtr;
+ return (void *)DevPtr->MemObj;
+}
+
+static void synchronizeDeviceCL() {
+ dump_function();
+
+ if (!GlobalContext) {
+ fprintf(stderr, "GPGPU-code generation not correctly initialized.\n");
+ exit(-1);
+ }
+
+ if (clFinishFcnPtr(((OpenCLContext *)GlobalContext->Context)->CommandQueue) !=
+ CL_SUCCESS) {
+ fprintf(stderr, "Synchronizing device and host memory failed.\n");
+ exit(-1);
+ }
+}
+
+static void freeContextCL(PollyGPUContext *Context) {
+ dump_function();
+
+ cl_int Ret;
+
+ GlobalContext = NULL;
+
+ OpenCLContext *Ctx = (OpenCLContext *)Context->Context;
+ if (Ctx->CommandQueue) {
+ Ret = clReleaseCommandQueueFcnPtr(Ctx->CommandQueue);
+ checkOpenCLError(Ret, "Could not release command queue.\n");
+ }
+
+ if (Ctx->Context) {
+ Ret = clReleaseContextFcnPtr(Ctx->Context);
+ checkOpenCLError(Ret, "Could not release context.\n");
+ }
+
+ free(Ctx);
+ free(Context);
+}
+
+static void printOpenCLError(int Error) {
+
+ switch (Error) {
+ case CL_SUCCESS:
+ // Success, don't print an error.
+ break;
+
+ // JIT/Runtime errors.
+ case CL_DEVICE_NOT_FOUND:
+ fprintf(stderr, "Device not found.\n");
+ break;
+ case CL_DEVICE_NOT_AVAILABLE:
+ fprintf(stderr, "Device not available.\n");
+ break;
+ case CL_COMPILER_NOT_AVAILABLE:
+ fprintf(stderr, "Compiler not available.\n");
+ break;
+ case CL_MEM_OBJECT_ALLOCATION_FAILURE:
+ fprintf(stderr, "Mem object allocation failure.\n");
+ break;
+ case CL_OUT_OF_RESOURCES:
+ fprintf(stderr, "Out of resources.\n");
+ break;
+ case CL_OUT_OF_HOST_MEMORY:
+ fprintf(stderr, "Out of host memory.\n");
+ break;
+ case CL_PROFILING_INFO_NOT_AVAILABLE:
+ fprintf(stderr, "Profiling info not available.\n");
+ break;
+ case CL_MEM_COPY_OVERLAP:
+ fprintf(stderr, "Mem copy overlap.\n");
+ break;
+ case CL_IMAGE_FORMAT_MISMATCH:
+ fprintf(stderr, "Image format mismatch.\n");
+ break;
+ case CL_IMAGE_FORMAT_NOT_SUPPORTED:
+ fprintf(stderr, "Image format not supported.\n");
+ break;
+ case CL_BUILD_PROGRAM_FAILURE:
+ fprintf(stderr, "Build program failure.\n");
+ break;
+ case CL_MAP_FAILURE:
+ fprintf(stderr, "Map failure.\n");
+ break;
+ case CL_MISALIGNED_SUB_BUFFER_OFFSET:
+ fprintf(stderr, "Misaligned sub buffer offset.\n");
+ break;
+ case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST:
+ fprintf(stderr, "Exec status error for events in wait list.\n");
+ break;
+ case CL_COMPILE_PROGRAM_FAILURE:
+ fprintf(stderr, "Compile program failure.\n");
+ break;
+ case CL_LINKER_NOT_AVAILABLE:
+ fprintf(stderr, "Linker not available.\n");
+ break;
+ case CL_LINK_PROGRAM_FAILURE:
+ fprintf(stderr, "Link program failure.\n");
+ break;
+ case CL_DEVICE_PARTITION_FAILED:
+ fprintf(stderr, "Device partition failed.\n");
+ break;
+ case CL_KERNEL_ARG_INFO_NOT_AVAILABLE:
+ fprintf(stderr, "Kernel arg info not available.\n");
+ break;
+
+ // Compiler errors.
+ case CL_INVALID_VALUE:
+ fprintf(stderr, "Invalid value.\n");
+ break;
+ case CL_INVALID_DEVICE_TYPE:
+ fprintf(stderr, "Invalid device type.\n");
+ break;
+ case CL_INVALID_PLATFORM:
+ fprintf(stderr, "Invalid platform.\n");
+ break;
+ case CL_INVALID_DEVICE:
+ fprintf(stderr, "Invalid device.\n");
+ break;
+ case CL_INVALID_CONTEXT:
+ fprintf(stderr, "Invalid context.\n");
+ break;
+ case CL_INVALID_QUEUE_PROPERTIES:
+ fprintf(stderr, "Invalid queue properties.\n");
+ break;
+ case CL_INVALID_COMMAND_QUEUE:
+ fprintf(stderr, "Invalid command queue.\n");
+ break;
+ case CL_INVALID_HOST_PTR:
+ fprintf(stderr, "Invalid host pointer.\n");
+ break;
+ case CL_INVALID_MEM_OBJECT:
+ fprintf(stderr, "Invalid memory object.\n");
+ break;
+ case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR:
+ fprintf(stderr, "Invalid image format descriptor.\n");
+ break;
+ case CL_INVALID_IMAGE_SIZE:
+ fprintf(stderr, "Invalid image size.\n");
+ break;
+ case CL_INVALID_SAMPLER:
+ fprintf(stderr, "Invalid sampler.\n");
+ break;
+ case CL_INVALID_BINARY:
+ fprintf(stderr, "Invalid binary.\n");
+ break;
+ case CL_INVALID_BUILD_OPTIONS:
+ fprintf(stderr, "Invalid build options.\n");
+ break;
+ case CL_INVALID_PROGRAM:
+ fprintf(stderr, "Invalid program.\n");
+ break;
+ case CL_INVALID_PROGRAM_EXECUTABLE:
+ fprintf(stderr, "Invalid program executable.\n");
+ break;
+ case CL_INVALID_KERNEL_NAME:
+ fprintf(stderr, "Invalid kernel name.\n");
+ break;
+ case CL_INVALID_KERNEL_DEFINITION:
+ fprintf(stderr, "Invalid kernel definition.\n");
+ break;
+ case CL_INVALID_KERNEL:
+ fprintf(stderr, "Invalid kernel.\n");
+ break;
+ case CL_INVALID_ARG_INDEX:
+ fprintf(stderr, "Invalid arg index.\n");
+ break;
+ case CL_INVALID_ARG_VALUE:
+ fprintf(stderr, "Invalid arg value.\n");
+ break;
+ case CL_INVALID_ARG_SIZE:
+ fprintf(stderr, "Invalid arg size.\n");
+ break;
+ case CL_INVALID_KERNEL_ARGS:
+ fprintf(stderr, "Invalid kernel args.\n");
+ break;
+ case CL_INVALID_WORK_DIMENSION:
+ fprintf(stderr, "Invalid work dimension.\n");
+ break;
+ case CL_INVALID_WORK_GROUP_SIZE:
+ fprintf(stderr, "Invalid work group size.\n");
+ break;
+ case CL_INVALID_WORK_ITEM_SIZE:
+ fprintf(stderr, "Invalid work item size.\n");
+ break;
+ case CL_INVALID_GLOBAL_OFFSET:
+ fprintf(stderr, "Invalid global offset.\n");
+ break;
+ case CL_INVALID_EVENT_WAIT_LIST:
+ fprintf(stderr, "Invalid event wait list.\n");
+ break;
+ case CL_INVALID_EVENT:
+ fprintf(stderr, "Invalid event.\n");
+ break;
+ case CL_INVALID_OPERATION:
+ fprintf(stderr, "Invalid operation.\n");
+ break;
+ case CL_INVALID_GL_OBJECT:
+ fprintf(stderr, "Invalid GL object.\n");
+ break;
+ case CL_INVALID_BUFFER_SIZE:
+ fprintf(stderr, "Invalid buffer size.\n");
+ break;
+ case CL_INVALID_MIP_LEVEL:
+ fprintf(stderr, "Invalid mip level.\n");
+ break;
+ case CL_INVALID_GLOBAL_WORK_SIZE:
+ fprintf(stderr, "Invalid global work size.\n");
+ break;
+ case CL_INVALID_PROPERTY:
+ fprintf(stderr, "Invalid property.\n");
+ break;
+ case CL_INVALID_IMAGE_DESCRIPTOR:
+ fprintf(stderr, "Invalid image descriptor.\n");
+ break;
+ case CL_INVALID_COMPILER_OPTIONS:
+ fprintf(stderr, "Invalid compiler options.\n");
+ break;
+ case CL_INVALID_LINKER_OPTIONS:
+ fprintf(stderr, "Invalid linker options.\n");
+ break;
+ case CL_INVALID_DEVICE_PARTITION_COUNT:
+ fprintf(stderr, "Invalid device partition count.\n");
+ break;
+ case CL_INVALID_PIPE_SIZE:
+ fprintf(stderr, "Invalid pipe size.\n");
+ break;
+ case CL_INVALID_DEVICE_QUEUE:
+ fprintf(stderr, "Invalid device queue.\n");
+ break;
+
+ // NVIDIA specific error.
+ case -9999:
+ fprintf(stderr, "NVIDIA invalid read or write buffer.\n");
+ break;
+
+ default:
+ fprintf(stderr, "Unknown error code!\n");
+ break;
+ }
+}
+
+#endif /* HAS_LIBOPENCL */
+/******************************************************************************/
+/* CUDA */
+/******************************************************************************/
+#ifdef HAS_LIBCUDART
+
+struct CUDAContextT {
+ CUcontext Cuda;
+};
+
+struct CUDAKernelT {
CUfunction Cuda;
CUmodule CudaModule;
- const char *PTXString;
+ const char *BinaryString;
};
-struct PollyGPUDevicePtrT {
+struct CUDADevicePtrT {
CUdeviceptr Cuda;
};
@@ -57,10 +906,10 @@ typedef CUresult CUDAAPI CuMemAllocFcnTy(CUdeviceptr *, size_t);
static CuMemAllocFcnTy *CuMemAllocFcnPtr;
typedef CUresult CUDAAPI CuLaunchKernelFcnTy(
- CUfunction f, unsigned int gridDimX, unsigned int gridDimY,
- unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY,
- unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream,
- void **kernelParams, void **extra);
+ CUfunction F, unsigned int GridDimX, unsigned int GridDimY,
+ unsigned int gridDimZ, unsigned int blockDimX, unsigned int BlockDimY,
+ unsigned int BlockDimZ, unsigned int SharedMemBytes, CUstream HStream,
+ void **KernelParams, void **Extra);
static CuLaunchKernelFcnTy *CuLaunchKernelFcnPtr;
typedef CUresult CUDAAPI CuMemcpyDtoHFcnTy(void *, CUdeviceptr, size_t);
@@ -95,8 +944,8 @@ typedef CUresult CUDAAPI CuModuleLoadDataExFcnTy(CUmodule *, const void *,
void **);
static CuModuleLoadDataExFcnTy *CuModuleLoadDataExFcnPtr;
-typedef CUresult CUDAAPI CuModuleLoadDataFcnTy(CUmodule *module,
- const void *image);
+typedef CUresult CUDAAPI CuModuleLoadDataFcnTy(CUmodule *Module,
+ const void *Image);
static CuModuleLoadDataFcnTy *CuModuleLoadDataFcnPtr;
typedef CUresult CUDAAPI CuModuleGetFunctionFcnTy(CUfunction *, CUmodule,
@@ -109,25 +958,25 @@ static CuDeviceComputeCapabilityFcnTy *CuDeviceComputeCapabilityFcnPtr;
typedef CUresult CUDAAPI CuDeviceGetNameFcnTy(char *, int, CUdevice);
static CuDeviceGetNameFcnTy *CuDeviceGetNameFcnPtr;
-typedef CUresult CUDAAPI CuLinkAddDataFcnTy(CUlinkState state,
- CUjitInputType type, void *data,
- size_t size, const char *name,
- unsigned int numOptions,
- CUjit_option *options,
- void **optionValues);
+typedef CUresult CUDAAPI CuLinkAddDataFcnTy(CUlinkState State,
+ CUjitInputType Type, void *Data,
+ size_t Size, const char *Name,
+ unsigned int NumOptions,
+ CUjit_option *Options,
+ void **OptionValues);
static CuLinkAddDataFcnTy *CuLinkAddDataFcnPtr;
-typedef CUresult CUDAAPI CuLinkCreateFcnTy(unsigned int numOptions,
- CUjit_option *options,
- void **optionValues,
- CUlinkState *stateOut);
+typedef CUresult CUDAAPI CuLinkCreateFcnTy(unsigned int NumOptions,
+ CUjit_option *Options,
+ void **OptionValues,
+ CUlinkState *StateOut);
static CuLinkCreateFcnTy *CuLinkCreateFcnPtr;
-typedef CUresult CUDAAPI CuLinkCompleteFcnTy(CUlinkState state, void **cubinOut,
- size_t *sizeOut);
+typedef CUresult CUDAAPI CuLinkCompleteFcnTy(CUlinkState State, void **CubinOut,
+ size_t *SizeOut);
static CuLinkCompleteFcnTy *CuLinkCompleteFcnPtr;
-typedef CUresult CUDAAPI CuLinkDestroyFcnTy(CUlinkState state);
+typedef CUresult CUDAAPI CuLinkDestroyFcnTy(CUlinkState State);
static CuLinkDestroyFcnTy *CuLinkDestroyFcnPtr;
typedef CUresult CUDAAPI CuCtxSynchronizeFcnTy();
@@ -137,36 +986,36 @@ static CuCtxSynchronizeFcnTy *CuCtxSynchronizeFcnPtr;
typedef cudaError_t CUDARTAPI CudaThreadSynchronizeFcnTy(void);
static CudaThreadSynchronizeFcnTy *CudaThreadSynchronizeFcnPtr;
-static void *getAPIHandle(void *Handle, const char *FuncName) {
+static void *getAPIHandleCUDA(void *Handle, const char *FuncName) {
char *Err;
void *FuncPtr;
dlerror();
FuncPtr = dlsym(Handle, FuncName);
if ((Err = dlerror()) != 0) {
- fprintf(stdout, "Load CUDA driver API failed: %s. \n", Err);
+ fprintf(stderr, "Load CUDA driver API failed: %s. \n", Err);
return 0;
}
return FuncPtr;
}
-static int initialDeviceAPILibraries() {
+static int initialDeviceAPILibrariesCUDA() {
HandleCuda = dlopen("libcuda.so", RTLD_LAZY);
if (!HandleCuda) {
- printf("Cannot open library: %s. \n", dlerror());
+ fprintf(stderr, "Cannot open library: %s. \n", dlerror());
return 0;
}
HandleCudaRT = dlopen("libcudart.so", RTLD_LAZY);
if (!HandleCudaRT) {
- printf("Cannot open library: %s. \n", dlerror());
+ fprintf(stderr, "Cannot open library: %s. \n", dlerror());
return 0;
}
return 1;
}
-static int initialDeviceAPIs() {
- if (initialDeviceAPILibraries() == 0)
+static int initialDeviceAPIsCUDA() {
+ if (initialDeviceAPILibrariesCUDA() == 0)
return 0;
/* Get function pointer to CUDA Driver APIs.
@@ -178,77 +1027,76 @@ static int initialDeviceAPIs() {
* as it is valid on POSIX 2008.
*/
CuLaunchKernelFcnPtr =
- (CuLaunchKernelFcnTy *)getAPIHandle(HandleCuda, "cuLaunchKernel");
+ (CuLaunchKernelFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLaunchKernel");
CuMemAllocFcnPtr =
- (CuMemAllocFcnTy *)getAPIHandle(HandleCuda, "cuMemAlloc_v2");
+ (CuMemAllocFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemAlloc_v2");
- CuMemFreeFcnPtr = (CuMemFreeFcnTy *)getAPIHandle(HandleCuda, "cuMemFree_v2");
+ CuMemFreeFcnPtr =
+ (CuMemFreeFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemFree_v2");
CuMemcpyDtoHFcnPtr =
- (CuMemcpyDtoHFcnTy *)getAPIHandle(HandleCuda, "cuMemcpyDtoH_v2");
+ (CuMemcpyDtoHFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemcpyDtoH_v2");
CuMemcpyHtoDFcnPtr =
- (CuMemcpyHtoDFcnTy *)getAPIHandle(HandleCuda, "cuMemcpyHtoD_v2");
+ (CuMemcpyHtoDFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemcpyHtoD_v2");
CuModuleUnloadFcnPtr =
- (CuModuleUnloadFcnTy *)getAPIHandle(HandleCuda, "cuModuleUnload");
+ (CuModuleUnloadFcnTy *)getAPIHandleCUDA(HandleCuda, "cuModuleUnload");
CuCtxDestroyFcnPtr =
- (CuCtxDestroyFcnTy *)getAPIHandle(HandleCuda, "cuCtxDestroy");
+ (CuCtxDestroyFcnTy *)getAPIHandleCUDA(HandleCuda, "cuCtxDestroy");
- CuInitFcnPtr = (CuInitFcnTy *)getAPIHandle(HandleCuda, "cuInit");
+ CuInitFcnPtr = (CuInitFcnTy *)getAPIHandleCUDA(HandleCuda, "cuInit");
CuDeviceGetCountFcnPtr =
- (CuDeviceGetCountFcnTy *)getAPIHandle(HandleCuda, "cuDeviceGetCount");
+ (CuDeviceGetCountFcnTy *)getAPIHandleCUDA(HandleCuda, "cuDeviceGetCount");
CuDeviceGetFcnPtr =
- (CuDeviceGetFcnTy *)getAPIHandle(HandleCuda, "cuDeviceGet");
+ (CuDeviceGetFcnTy *)getAPIHandleCUDA(HandleCuda, "cuDeviceGet");
CuCtxCreateFcnPtr =
- (CuCtxCreateFcnTy *)getAPIHandle(HandleCuda, "cuCtxCreate_v2");
+ (CuCtxCreateFcnTy *)getAPIHandleCUDA(HandleCuda, "cuCtxCreate_v2");
- CuModuleLoadDataExFcnPtr =
- (CuModuleLoadDataExFcnTy *)getAPIHandle(HandleCuda, "cuModuleLoadDataEx");
+ CuModuleLoadDataExFcnPtr = (CuModuleLoadDataExFcnTy *)getAPIHandleCUDA(
+ HandleCuda, "cuModuleLoadDataEx");
CuModuleLoadDataFcnPtr =
- (CuModuleLoadDataFcnTy *)getAPIHandle(HandleCuda, "cuModuleLoadData");
+ (CuModuleLoadDataFcnTy *)getAPIHandleCUDA(HandleCuda, "cuModuleLoadData");
- CuModuleGetFunctionFcnPtr = (CuModuleGetFunctionFcnTy *)getAPIHandle(
+ CuModuleGetFunctionFcnPtr = (CuModuleGetFunctionFcnTy *)getAPIHandleCUDA(
HandleCuda, "cuModuleGetFunction");
CuDeviceComputeCapabilityFcnPtr =
- (CuDeviceComputeCapabilityFcnTy *)getAPIHandle(
+ (CuDeviceComputeCapabilityFcnTy *)getAPIHandleCUDA(
HandleCuda, "cuDeviceComputeCapability");
CuDeviceGetNameFcnPtr =
- (CuDeviceGetNameFcnTy *)getAPIHandle(HandleCuda, "cuDeviceGetName");
+ (CuDeviceGetNameFcnTy *)getAPIHandleCUDA(HandleCuda, "cuDeviceGetName");
CuLinkAddDataFcnPtr =
- (CuLinkAddDataFcnTy *)getAPIHandle(HandleCuda, "cuLinkAddData");
+ (CuLinkAddDataFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLinkAddData");
CuLinkCreateFcnPtr =
- (CuLinkCreateFcnTy *)getAPIHandle(HandleCuda, "cuLinkCreate");
+ (CuLinkCreateFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLinkCreate");
CuLinkCompleteFcnPtr =
- (CuLinkCompleteFcnTy *)getAPIHandle(HandleCuda, "cuLinkComplete");
+ (CuLinkCompleteFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLinkComplete");
CuLinkDestroyFcnPtr =
- (CuLinkDestroyFcnTy *)getAPIHandle(HandleCuda, "cuLinkDestroy");
+ (CuLinkDestroyFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLinkDestroy");
CuCtxSynchronizeFcnPtr =
- (CuCtxSynchronizeFcnTy *)getAPIHandle(HandleCuda, "cuCtxSynchronize");
+ (CuCtxSynchronizeFcnTy *)getAPIHandleCUDA(HandleCuda, "cuCtxSynchronize");
/* Get function pointer to CUDA Runtime APIs. */
- CudaThreadSynchronizeFcnPtr = (CudaThreadSynchronizeFcnTy *)getAPIHandle(
+ CudaThreadSynchronizeFcnPtr = (CudaThreadSynchronizeFcnTy *)getAPIHandleCUDA(
HandleCudaRT, "cudaThreadSynchronize");
return 1;
}
-PollyGPUContext *polly_initContext() {
- DebugMode = getenv("POLLY_DEBUG") != 0;
-
+static PollyGPUContext *initContextCUDA() {
dump_function();
PollyGPUContext *Context;
CUdevice Device;
@@ -263,20 +1111,20 @@ PollyGPUContext *polly_initContext() {
return CurrentContext;
/* Get API handles. */
- if (initialDeviceAPIs() == 0) {
- fprintf(stdout, "Getting the \"handle\" for the CUDA driver API failed.\n");
+ if (initialDeviceAPIsCUDA() == 0) {
+ fprintf(stderr, "Getting the \"handle\" for the CUDA driver API failed.\n");
exit(-1);
}
if (CuInitFcnPtr(0) != CUDA_SUCCESS) {
- fprintf(stdout, "Initializing the CUDA driver API failed.\n");
+ fprintf(stderr, "Initializing the CUDA driver API failed.\n");
exit(-1);
}
/* Get number of devices that supports CUDA. */
CuDeviceGetCountFcnPtr(&DeviceCount);
if (DeviceCount == 0) {
- fprintf(stdout, "There is no device supporting CUDA.\n");
+ fprintf(stderr, "There is no device supporting CUDA.\n");
exit(-1);
}
@@ -290,12 +1138,15 @@ PollyGPUContext *polly_initContext() {
/* Create context on the device. */
Context = (PollyGPUContext *)malloc(sizeof(PollyGPUContext));
if (Context == 0) {
- fprintf(stdout, "Allocate memory for Polly GPU context failed.\n");
+ fprintf(stderr, "Allocate memory for Polly GPU context failed.\n");
exit(-1);
}
- CuCtxCreateFcnPtr(&(Context->Cuda), 0, Device);
-
- CacheMode = getenv("POLLY_NOCACHE") == 0;
+ Context->Context = malloc(sizeof(CUDAContext));
+ if (Context->Context == 0) {
+ fprintf(stderr, "Allocate memory for Polly CUDA context failed.\n");
+ exit(-1);
+ }
+ CuCtxCreateFcnPtr(&(((CUDAContext *)Context->Context)->Cuda), 0, Device);
if (CacheMode)
CurrentContext = Context;
@@ -303,18 +1154,24 @@ PollyGPUContext *polly_initContext() {
return Context;
}
-static void freeKernel(PollyGPUFunction *Kernel) {
- if (Kernel->CudaModule)
- CuModuleUnloadFcnPtr(Kernel->CudaModule);
+static void freeKernelCUDA(PollyGPUFunction *Kernel) {
+ dump_function();
+
+ if (CacheMode)
+ return;
+
+ if (((CUDAKernel *)Kernel->Kernel)->CudaModule)
+ CuModuleUnloadFcnPtr(((CUDAKernel *)Kernel->Kernel)->CudaModule);
+
+ if (Kernel->Kernel)
+ free((CUDAKernel *)Kernel->Kernel);
if (Kernel)
free(Kernel);
}
-#define KERNEL_CACHE_SIZE 10
-
-PollyGPUFunction *polly_getKernel(const char *PTXBuffer,
- const char *KernelName) {
+static PollyGPUFunction *getKernelCUDA(const char *BinaryBuffer,
+ const char *KernelName) {
dump_function();
static __thread PollyGPUFunction *KernelCache[KERNEL_CACHE_SIZE];
@@ -324,16 +1181,21 @@ PollyGPUFunction *polly_getKernel(const char *PTXBuffer,
// We exploit here the property that all Polly-ACC kernels are allocated
// as global constants, hence a pointer comparision is sufficient to
// determin equality.
- if (KernelCache[i] && KernelCache[i]->PTXString == PTXBuffer) {
+ if (KernelCache[i] &&
+ ((CUDAKernel *)KernelCache[i]->Kernel)->BinaryString == BinaryBuffer) {
debug_print(" -> using cached kernel\n");
return KernelCache[i];
}
}
PollyGPUFunction *Function = malloc(sizeof(PollyGPUFunction));
-
if (Function == 0) {
- fprintf(stdout, "Allocate memory for Polly GPU function failed.\n");
+ fprintf(stderr, "Allocate memory for Polly GPU function failed.\n");
+ exit(-1);
+ }
+ Function->Kernel = (CUDAKernel *)malloc(sizeof(CUDAKernel));
+ if (Function->Kernel == 0) {
+ fprintf(stderr, "Allocate memory for Polly CUDA function failed.\n");
exit(-1);
}
@@ -370,43 +1232,45 @@ PollyGPUFunction *polly_getKernel(const char *PTXBuffer,
memset(ErrorLog, 0, sizeof(ErrorLog));
CuLinkCreateFcnPtr(6, Options, OptionVals, &LState);
- Res = CuLinkAddDataFcnPtr(LState, CU_JIT_INPUT_PTX, (void *)PTXBuffer,
- strlen(PTXBuffer) + 1, 0, 0, 0, 0);
+ Res = CuLinkAddDataFcnPtr(LState, CU_JIT_INPUT_PTX, (void *)BinaryBuffer,
+ strlen(BinaryBuffer) + 1, 0, 0, 0, 0);
if (Res != CUDA_SUCCESS) {
- fprintf(stdout, "PTX Linker Error:\n%s\n%s", ErrorLog, InfoLog);
+ fprintf(stderr, "PTX Linker Error:\n%s\n%s", ErrorLog, InfoLog);
exit(-1);
}
Res = CuLinkCompleteFcnPtr(LState, &CuOut, &OutSize);
if (Res != CUDA_SUCCESS) {
- fprintf(stdout, "Complete ptx linker step failed.\n");
- fprintf(stdout, "\n%s\n", ErrorLog);
+ fprintf(stderr, "Complete ptx linker step failed.\n");
+ fprintf(stderr, "\n%s\n", ErrorLog);
exit(-1);
}
debug_print("CUDA Link Completed in %fms. Linker Output:\n%s\n", Walltime,
InfoLog);
- Res = CuModuleLoadDataFcnPtr(&(Function->CudaModule), CuOut);
+ Res = CuModuleLoadDataFcnPtr(&(((CUDAKernel *)Function->Kernel)->CudaModule),
+ CuOut);
if (Res != CUDA_SUCCESS) {
- fprintf(stdout, "Loading ptx assembly text failed.\n");
+ fprintf(stderr, "Loading ptx assembly text failed.\n");
exit(-1);
}
- Res = CuModuleGetFunctionFcnPtr(&(Function->Cuda), Function->CudaModule,
+ Res = CuModuleGetFunctionFcnPtr(&(((CUDAKernel *)Function->Kernel)->Cuda),
+ ((CUDAKernel *)Function->Kernel)->CudaModule,
KernelName);
if (Res != CUDA_SUCCESS) {
- fprintf(stdout, "Loading kernel function failed.\n");
+ fprintf(stderr, "Loading kernel function failed.\n");
exit(-1);
}
CuLinkDestroyFcnPtr(LState);
- Function->PTXString = PTXBuffer;
+ ((CUDAKernel *)Function->Kernel)->BinaryString = BinaryBuffer;
if (CacheMode) {
if (KernelCache[NextCacheItem])
- freeKernel(KernelCache[NextCacheItem]);
+ freeKernelCUDA(KernelCache[NextCacheItem]);
KernelCache[NextCacheItem] = Function;
@@ -416,44 +1280,37 @@ PollyGPUFunction *polly_getKernel(const char *PTXBuffer,
return Function;
}
-void polly_freeKernel(PollyGPUFunction *Kernel) {
+static void synchronizeDeviceCUDA() {
dump_function();
-
- if (CacheMode)
- return;
-
- freeKernel(Kernel);
+ if (CuCtxSynchronizeFcnPtr() != CUDA_SUCCESS) {
+ fprintf(stderr, "Synchronizing device and host memory failed.\n");
+ exit(-1);
+ }
}
-void polly_copyFromHostToDevice(void *HostData, PollyGPUDevicePtr *DevData,
- long MemSize) {
+static void copyFromHostToDeviceCUDA(void *HostData, PollyGPUDevicePtr *DevData,
+ long MemSize) {
dump_function();
- CUdeviceptr CuDevData = DevData->Cuda;
+ CUdeviceptr CuDevData = ((CUDADevicePtr *)DevData->DevicePtr)->Cuda;
CuMemcpyHtoDFcnPtr(CuDevData, HostData, MemSize);
}
-void polly_copyFromDeviceToHost(PollyGPUDevicePtr *DevData, void *HostData,
- long MemSize) {
+static void copyFromDeviceToHostCUDA(PollyGPUDevicePtr *DevData, void *HostData,
+ long MemSize) {
dump_function();
- if (CuMemcpyDtoHFcnPtr(HostData, DevData->Cuda, MemSize) != CUDA_SUCCESS) {
- fprintf(stdout, "Copying results from device to host memory failed.\n");
- exit(-1);
- }
-}
-void polly_synchronizeDevice() {
- dump_function();
- if (CuCtxSynchronizeFcnPtr() != CUDA_SUCCESS) {
- fprintf(stdout, "Synchronizing device and host memory failed.\n");
+ if (CuMemcpyDtoHFcnPtr(HostData, ((CUDADevicePtr *)DevData->DevicePtr)->Cuda,
+ MemSize) != CUDA_SUCCESS) {
+ fprintf(stderr, "Copying results from device to host memory failed.\n");
exit(-1);
}
}
-void polly_launchKernel(PollyGPUFunction *Kernel, unsigned int GridDimX,
- unsigned int GridDimY, unsigned int BlockDimX,
- unsigned int BlockDimY, unsigned int BlockDimZ,
- void **Parameters) {
+static void launchKernelCUDA(PollyGPUFunction *Kernel, unsigned int GridDimX,
+ unsigned int GridDimY, unsigned int BlockDimX,
+ unsigned int BlockDimY, unsigned int BlockDimZ,
+ void **Parameters) {
dump_function();
unsigned GridDimZ = 1;
@@ -462,45 +1319,290 @@ void polly_launchKernel(PollyGPUFunction *Kernel, unsigned int GridDimX,
void **Extra = 0;
CUresult Res;
- Res = CuLaunchKernelFcnPtr(Kernel->Cuda, GridDimX, GridDimY, GridDimZ,
- BlockDimX, BlockDimY, BlockDimZ, SharedMemBytes,
- Stream, Parameters, Extra);
+ Res =
+ CuLaunchKernelFcnPtr(((CUDAKernel *)Kernel->Kernel)->Cuda, GridDimX,
+ GridDimY, GridDimZ, BlockDimX, BlockDimY, BlockDimZ,
+ SharedMemBytes, Stream, Parameters, Extra);
if (Res != CUDA_SUCCESS) {
- fprintf(stdout, "Launching CUDA kernel failed.\n");
+ fprintf(stderr, "Launching CUDA kernel failed.\n");
exit(-1);
}
}
-void polly_freeDeviceMemory(PollyGPUDevicePtr *Allocation) {
+static void freeDeviceMemoryCUDA(PollyGPUDevicePtr *Allocation) {
dump_function();
- CuMemFreeFcnPtr((CUdeviceptr)Allocation->Cuda);
+ CUDADevicePtr *DevPtr = (CUDADevicePtr *)Allocation->DevicePtr;
+ CuMemFreeFcnPtr((CUdeviceptr)DevPtr->Cuda);
+ free(DevPtr);
free(Allocation);
}
-PollyGPUDevicePtr *polly_allocateMemoryForDevice(long MemSize) {
+static PollyGPUDevicePtr *allocateMemoryForDeviceCUDA(long MemSize) {
dump_function();
PollyGPUDevicePtr *DevData = malloc(sizeof(PollyGPUDevicePtr));
-
if (DevData == 0) {
- fprintf(stdout, "Allocate memory for GPU device memory pointer failed.\n");
+ fprintf(stderr, "Allocate memory for GPU device memory pointer failed.\n");
+ exit(-1);
+ }
+ DevData->DevicePtr = (CUDADevicePtr *)malloc(sizeof(CUDADevicePtr));
+ if (DevData->DevicePtr == 0) {
+ fprintf(stderr, "Allocate memory for GPU device memory pointer failed.\n");
exit(-1);
}
- CUresult Res = CuMemAllocFcnPtr(&(DevData->Cuda), MemSize);
+ CUresult Res =
+ CuMemAllocFcnPtr(&(((CUDADevicePtr *)DevData->DevicePtr)->Cuda), MemSize);
if (Res != CUDA_SUCCESS) {
- fprintf(stdout, "Allocate memory for GPU device memory pointer failed.\n");
+ fprintf(stderr, "Allocate memory for GPU device memory pointer failed.\n");
exit(-1);
}
return DevData;
}
+static void *getDevicePtrCUDA(PollyGPUDevicePtr *Allocation) {
+ dump_function();
+
+ CUDADevicePtr *DevPtr = (CUDADevicePtr *)Allocation->DevicePtr;
+ return (void *)DevPtr->Cuda;
+}
+
+static void freeContextCUDA(PollyGPUContext *Context) {
+ dump_function();
+
+ CUDAContext *Ctx = (CUDAContext *)Context->Context;
+ if (Ctx->Cuda) {
+ CuCtxDestroyFcnPtr(Ctx->Cuda);
+ free(Ctx);
+ free(Context);
+ }
+
+ dlclose(HandleCuda);
+ dlclose(HandleCudaRT);
+}
+
+#endif /* HAS_LIBCUDART */
+/******************************************************************************/
+/* API */
+/******************************************************************************/
+
+PollyGPUContext *polly_initContext() {
+ DebugMode = getenv("POLLY_DEBUG") != 0;
+ CacheMode = getenv("POLLY_NOCACHE") == 0;
+
+ dump_function();
+
+ PollyGPUContext *Context;
+
+ switch (Runtime) {
+#ifdef HAS_LIBCUDART
+ case RUNTIME_CUDA:
+ Context = initContextCUDA();
+ break;
+#endif /* HAS_LIBCUDART */
+#ifdef HAS_LIBOPENCL
+ case RUNTIME_CL:
+ Context = initContextCL();
+ break;
+#endif /* HAS_LIBOPENCL */
+ default:
+ err_runtime();
+ }
+
+ return Context;
+}
+
+void polly_freeKernel(PollyGPUFunction *Kernel) {
+ dump_function();
+
+ switch (Runtime) {
+#ifdef HAS_LIBCUDART
+ case RUNTIME_CUDA:
+ freeKernelCUDA(Kernel);
+ break;
+#endif /* HAS_LIBCUDART */
+#ifdef HAS_LIBOPENCL
+ case RUNTIME_CL:
+ freeKernelCL(Kernel);
+ break;
+#endif /* HAS_LIBOPENCL */
+ default:
+ err_runtime();
+ }
+}
+
+PollyGPUFunction *polly_getKernel(const char *BinaryBuffer,
+ const char *KernelName) {
+ dump_function();
+
+ PollyGPUFunction *Function;
+
+ switch (Runtime) {
+#ifdef HAS_LIBCUDART
+ case RUNTIME_CUDA:
+ Function = getKernelCUDA(BinaryBuffer, KernelName);
+ break;
+#endif /* HAS_LIBCUDART */
+#ifdef HAS_LIBOPENCL
+ case RUNTIME_CL:
+ Function = getKernelCL(BinaryBuffer, KernelName);
+ break;
+#endif /* HAS_LIBOPENCL */
+ default:
+ err_runtime();
+ }
+
+ return Function;
+}
+
+void polly_copyFromHostToDevice(void *HostData, PollyGPUDevicePtr *DevData,
+ long MemSize) {
+ dump_function();
+
+ switch (Runtime) {
+#ifdef HAS_LIBCUDART
+ case RUNTIME_CUDA:
+ copyFromHostToDeviceCUDA(HostData, DevData, MemSize);
+ break;
+#endif /* HAS_LIBCUDART */
+#ifdef HAS_LIBOPENCL
+ case RUNTIME_CL:
+ copyFromHostToDeviceCL(HostData, DevData, MemSize);
+ break;
+#endif /* HAS_LIBOPENCL */
+ default:
+ err_runtime();
+ }
+}
+
+void polly_copyFromDeviceToHost(PollyGPUDevicePtr *DevData, void *HostData,
+ long MemSize) {
+ dump_function();
+
+ switch (Runtime) {
+#ifdef HAS_LIBCUDART
+ case RUNTIME_CUDA:
+ copyFromDeviceToHostCUDA(DevData, HostData, MemSize);
+ break;
+#endif /* HAS_LIBCUDART */
+#ifdef HAS_LIBOPENCL
+ case RUNTIME_CL:
+ copyFromDeviceToHostCL(DevData, HostData, MemSize);
+ break;
+#endif /* HAS_LIBOPENCL */
+ default:
+ err_runtime();
+ }
+}
+
+void polly_launchKernel(PollyGPUFunction *Kernel, unsigned int GridDimX,
+ unsigned int GridDimY, unsigned int BlockDimX,
+ unsigned int BlockDimY, unsigned int BlockDimZ,
+ void **Parameters) {
+ dump_function();
+
+ switch (Runtime) {
+#ifdef HAS_LIBCUDART
+ case RUNTIME_CUDA:
+ launchKernelCUDA(Kernel, GridDimX, GridDimY, BlockDimX, BlockDimY,
+ BlockDimZ, Parameters);
+ break;
+#endif /* HAS_LIBCUDART */
+#ifdef HAS_LIBOPENCL
+ case RUNTIME_CL:
+ launchKernelCL(Kernel, GridDimX, GridDimY, BlockDimX, BlockDimY, BlockDimZ,
+ Parameters);
+ break;
+#endif /* HAS_LIBOPENCL */
+ default:
+ err_runtime();
+ }
+}
+
+void polly_freeDeviceMemory(PollyGPUDevicePtr *Allocation) {
+ dump_function();
+
+ switch (Runtime) {
+#ifdef HAS_LIBCUDART
+ case RUNTIME_CUDA:
+ freeDeviceMemoryCUDA(Allocation);
+ break;
+#endif /* HAS_LIBCUDART */
+#ifdef HAS_LIBOPENCL
+ case RUNTIME_CL:
+ freeDeviceMemoryCL(Allocation);
+ break;
+#endif /* HAS_LIBOPENCL */
+ default:
+ err_runtime();
+ }
+}
+
+PollyGPUDevicePtr *polly_allocateMemoryForDevice(long MemSize) {
+ dump_function();
+
+ PollyGPUDevicePtr *DevData;
+
+ switch (Runtime) {
+#ifdef HAS_LIBCUDART
+ case RUNTIME_CUDA:
+ DevData = allocateMemoryForDeviceCUDA(MemSize);
+ break;
+#endif /* HAS_LIBCUDART */
+#ifdef HAS_LIBOPENCL
+ case RUNTIME_CL:
+ DevData = allocateMemoryForDeviceCL(MemSize);
+ break;
+#endif /* HAS_LIBOPENCL */
+ default:
+ err_runtime();
+ }
+
+ return DevData;
+}
+
void *polly_getDevicePtr(PollyGPUDevicePtr *Allocation) {
dump_function();
- return (void *)Allocation->Cuda;
+ void *DevPtr;
+
+ switch (Runtime) {
+#ifdef HAS_LIBCUDART
+ case RUNTIME_CUDA:
+ DevPtr = getDevicePtrCUDA(Allocation);
+ break;
+#endif /* HAS_LIBCUDART */
+#ifdef HAS_LIBOPENCL
+ case RUNTIME_CL:
+ DevPtr = getDevicePtrCL(Allocation);
+ break;
+#endif /* HAS_LIBOPENCL */
+ default:
+ err_runtime();
+ }
+
+ return DevPtr;
+}
+
+void polly_synchronizeDevice() {
+ dump_function();
+
+ switch (Runtime) {
+#ifdef HAS_LIBCUDART
+ case RUNTIME_CUDA:
+ synchronizeDeviceCUDA();
+ break;
+#endif /* HAS_LIBCUDART */
+#ifdef HAS_LIBOPENCL
+ case RUNTIME_CL:
+ synchronizeDeviceCL();
+ break;
+#endif /* HAS_LIBOPENCL */
+ default:
+ err_runtime();
+ }
}
void polly_freeContext(PollyGPUContext *Context) {
@@ -509,11 +1611,40 @@ void polly_freeContext(PollyGPUContext *Context) {
if (CacheMode)
return;
- if (Context->Cuda) {
- CuCtxDestroyFcnPtr(Context->Cuda);
- free(Context);
+ switch (Runtime) {
+#ifdef HAS_LIBCUDART
+ case RUNTIME_CUDA:
+ freeContextCUDA(Context);
+ break;
+#endif /* HAS_LIBCUDART */
+#ifdef HAS_LIBOPENCL
+ case RUNTIME_CL:
+ freeContextCL(Context);
+ break;
+#endif /* HAS_LIBOPENCL */
+ default:
+ err_runtime();
}
+}
- dlclose(HandleCuda);
- dlclose(HandleCudaRT);
+/* Initialize GPUJIT with CUDA as runtime library. */
+PollyGPUContext *polly_initContextCUDA() {
+#ifdef HAS_LIBCUDART
+ Runtime = RUNTIME_CUDA;
+ return polly_initContext();
+#else
+ fprintf(stderr, "GPU Runtime was built without CUDA support.\n");
+ exit(-1);
+#endif /* HAS_LIBCUDART */
+}
+
+/* Initialize GPUJIT with OpenCL as runtime library. */
+PollyGPUContext *polly_initContextCL() {
+#ifdef HAS_LIBOPENCL
+ Runtime = RUNTIME_CL;
+ return polly_initContext();
+#else
+ fprintf(stderr, "GPU Runtime was built without OpenCL support.\n");
+ exit(-1);
+#endif /* HAS_LIBOPENCL */
}
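
For orientation, here is a hypothetical host-side sketch (not part of this patch) of how Polly-generated code is expected to drive the runtime-dispatching API introduced in GPUJIT.c above. The kernel image, kernel name, buffer size, and launch dimensions are invented for illustration only; the runtime itself exits on any failure, so no error handling is shown.

    /*
     * Hypothetical usage sketch of the GPUJIT API after this patch.
     * KernelBinary would normally hold PTX (CUDA) or OpenCL-C text,
     * embedded by Polly into the host module.
     */
    #include "GPUJIT.h"

    static const char *KernelBinary = "..."; /* assumed: embedded kernel image */

    void run_example(float *HostA, long N) {
      PollyGPUContext *Context = polly_initContextCUDA(); /* or polly_initContextCL() */
      PollyGPUFunction *Kernel = polly_getKernel(KernelBinary, "kernel_0");
      PollyGPUDevicePtr *DevA = polly_allocateMemoryForDevice(N * sizeof(float));

      polly_copyFromHostToDevice(HostA, DevA, N * sizeof(float));

      /* Kernel arguments are passed as an array of pointers to the values,
         matching the cuLaunchKernel-style parameter convention. */
      void *DevAPtr = polly_getDevicePtr(DevA);
      void *Params[] = {&DevAPtr};
      polly_launchKernel(Kernel, /*GridDimX=*/32, /*GridDimY=*/1,
                         /*BlockDimX=*/32, /*BlockDimY=*/1, /*BlockDimZ=*/1,
                         Params);
      polly_synchronizeDevice();

      polly_copyFromDeviceToHost(DevA, HostA, N * sizeof(float));

      polly_freeDeviceMemory(DevA);
      polly_freeKernel(Kernel);
      polly_freeContext(Context);
    }
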
diff --git a/polly/tools/GPURuntime/GPUJIT.h b/polly/tools/GPURuntime/GPUJIT.h
index 1f886ec9a9f..f6de70b92f4 100644
--- a/polly/tools/GPURuntime/GPUJIT.h
+++ b/polly/tools/GPURuntime/GPUJIT.h
@@ -76,12 +76,27 @@
*
*/
+typedef enum PollyGPURuntimeT {
+ RUNTIME_NONE,
+ RUNTIME_CUDA,
+ RUNTIME_CL
+} PollyGPURuntime;
+
typedef struct PollyGPUContextT PollyGPUContext;
typedef struct PollyGPUFunctionT PollyGPUFunction;
typedef struct PollyGPUDevicePtrT PollyGPUDevicePtr;
-PollyGPUContext *polly_initContext();
-PollyGPUFunction *polly_getKernel(const char *PTXBuffer,
+typedef struct OpenCLContextT OpenCLContext;
+typedef struct OpenCLKernelT OpenCLKernel;
+typedef struct OpenCLDevicePtrT OpenCLDevicePtr;
+
+typedef struct CUDAContextT CUDAContext;
+typedef struct CUDAKernelT CUDAKernel;
+typedef struct CUDADevicePtrT CUDADevicePtr;
+
+PollyGPUContext *polly_initContextCUDA();
+PollyGPUContext *polly_initContextCL();
+PollyGPUFunction *polly_getKernel(const char *BinaryBuffer,
const char *KernelName);
void polly_freeKernel(PollyGPUFunction *Kernel);
void polly_copyFromHostToDevice(void *HostData, PollyGPUDevicePtr *DevData,