summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--polly/include/polly/CodeGen/PPCGCodeGeneration.h2
-rw-r--r--polly/lib/CodeGen/PPCGCodeGeneration.cpp169
-rw-r--r--polly/lib/Support/RegisterPasses.cpp6
-rw-r--r--polly/test/GPGPU/spir-codegen.ll118
-rw-r--r--polly/tools/GPURuntime/GPUJIT.c101
5 files changed, 353 insertions, 43 deletions
diff --git a/polly/include/polly/CodeGen/PPCGCodeGeneration.h b/polly/include/polly/CodeGen/PPCGCodeGeneration.h
index b498326dedf..a1f2fc81572 100644
--- a/polly/include/polly/CodeGen/PPCGCodeGeneration.h
+++ b/polly/include/polly/CodeGen/PPCGCodeGeneration.h
@@ -16,7 +16,7 @@
#define POLLY_PPCGCODEGENERATION_H
/// The GPU Architecture to target.
-enum GPUArch { NVPTX64 };
+enum GPUArch { NVPTX64, SPIR32, SPIR64 };
/// The GPU Runtime implementation to use.
enum GPURuntime { CUDA, OpenCL };
diff --git a/polly/lib/CodeGen/PPCGCodeGeneration.cpp b/polly/lib/CodeGen/PPCGCodeGeneration.cpp
index 8935aa172f3..ec488488179 100644
--- a/polly/lib/CodeGen/PPCGCodeGeneration.cpp
+++ b/polly/lib/CodeGen/PPCGCodeGeneration.cpp
@@ -545,6 +545,11 @@ private:
/// @param The kernel to generate the intrinsic functions for.
void insertKernelIntrinsics(ppcg_kernel *Kernel);
+ /// Insert function calls to retrieve the SPIR group/local ids.
+ ///
+ /// @param Kernel The kernel to generate the function calls for.
+ void insertKernelCallsSPIR(ppcg_kernel *Kernel);
+
/// Setup the creation of functions referenced by the GPU kernel.
///
/// 1. Create new function declarations in GPUModule which are the same as
@@ -1254,10 +1259,24 @@ void GPUNodeBuilder::createScopStmt(isl_ast_expr *Expr,
void GPUNodeBuilder::createKernelSync() {
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
+ const char *SpirName = "__gen_ocl_barrier_global";
Function *Sync;
switch (Arch) {
+ case GPUArch::SPIR64:
+ case GPUArch::SPIR32:
+ Sync = M->getFunction(SpirName);
+
+ // If Sync is not available, declare it.
+ if (!Sync) {
+ GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
+ std::vector<Type *> Args;
+ FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
+ Sync = Function::Create(Ty, Linkage, SpirName, M);
+ Sync->setCallingConv(CallingConv::SPIR_FUNC);
+ }
+ break;
case GPUArch::NVPTX64:
Sync = Intrinsic::getDeclaration(M, Intrinsic::nvvm_barrier0);
break;
@@ -1668,7 +1687,8 @@ void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) {
finalizeKernelArguments(Kernel);
Function *F = Builder.GetInsertBlock()->getParent();
- addCUDAAnnotations(F->getParent(), BlockDimX, BlockDimY, BlockDimZ);
+ if (Arch == GPUArch::NVPTX64)
+ addCUDAAnnotations(F->getParent(), BlockDimX, BlockDimY, BlockDimZ);
clearDominators(F);
clearScalarEvolution(F);
clearLoops(F);
@@ -1725,12 +1745,35 @@ static std::string computeNVPTXDataLayout(bool is64Bit) {
return Ret;
}
+/// Compute the DataLayout string for a SPIR kernel.
+///
+/// @param is64Bit Are we looking for a 64 bit architecture?
+static std::string computeSPIRDataLayout(bool is64Bit) {
+ std::string Ret = "";
+
+ if (!is64Bit) {
+ Ret += "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:"
+ "64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:"
+ "32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:"
+ "256:256-v256:256:256-v512:512:512-v1024:1024:1024";
+ } else {
+ Ret += "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:"
+ "64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:"
+ "32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:"
+ "256:256-v256:256:256-v512:512:512-v1024:1024:1024";
+ }
+
+ return Ret;
+}
+
Function *
GPUNodeBuilder::createKernelFunctionDecl(ppcg_kernel *Kernel,
SetVector<Value *> &SubtreeValues) {
std::vector<Type *> Args;
std::string Identifier = getKernelFuncName(Kernel->id);
+ std::vector<Metadata *> MemoryType;
+
for (long i = 0; i < Prog->n_array; i++) {
if (!ppcg_kernel_requires_array_argument(Kernel, i))
continue;
@@ -1739,16 +1782,23 @@ GPUNodeBuilder::createKernelFunctionDecl(ppcg_kernel *Kernel,
isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set);
const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(Id);
Args.push_back(SAI->getElementType());
+ MemoryType.push_back(
+ ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), 0)));
} else {
static const int UseGlobalMemory = 1;
Args.push_back(Builder.getInt8PtrTy(UseGlobalMemory));
+ MemoryType.push_back(
+ ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), 1)));
}
}
int NumHostIters = isl_space_dim(Kernel->space, isl_dim_set);
- for (long i = 0; i < NumHostIters; i++)
+ for (long i = 0; i < NumHostIters; i++) {
Args.push_back(Builder.getInt64Ty());
+ MemoryType.push_back(
+ ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), 0)));
+ }
int NumVars = isl_space_dim(Kernel->space, isl_dim_param);
@@ -1757,19 +1807,49 @@ GPUNodeBuilder::createKernelFunctionDecl(ppcg_kernel *Kernel,
Value *Val = IDToValue[Id];
isl_id_free(Id);
Args.push_back(Val->getType());
+ MemoryType.push_back(
+ ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), 0)));
}
- for (auto *V : SubtreeValues)
+ for (auto *V : SubtreeValues) {
Args.push_back(V->getType());
+ MemoryType.push_back(
+ ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), 0)));
+ }
auto *FT = FunctionType::get(Builder.getVoidTy(), Args, false);
auto *FN = Function::Create(FT, Function::ExternalLinkage, Identifier,
GPUModule.get());
+ std::vector<Metadata *> EmptyStrings;
+
+ for (unsigned int i = 0; i < MemoryType.size(); i++) {
+ EmptyStrings.push_back(MDString::get(FN->getContext(), ""));
+ }
+
+ if (Arch == GPUArch::SPIR32 || Arch == GPUArch::SPIR64) {
+ FN->setMetadata("kernel_arg_addr_space",
+ MDNode::get(FN->getContext(), MemoryType));
+ FN->setMetadata("kernel_arg_name",
+ MDNode::get(FN->getContext(), EmptyStrings));
+ FN->setMetadata("kernel_arg_access_qual",
+ MDNode::get(FN->getContext(), EmptyStrings));
+ FN->setMetadata("kernel_arg_type",
+ MDNode::get(FN->getContext(), EmptyStrings));
+ FN->setMetadata("kernel_arg_type_qual",
+ MDNode::get(FN->getContext(), EmptyStrings));
+ FN->setMetadata("kernel_arg_base_type",
+ MDNode::get(FN->getContext(), EmptyStrings));
+ }
+
switch (Arch) {
case GPUArch::NVPTX64:
FN->setCallingConv(CallingConv::PTX_Kernel);
break;
+ case GPUArch::SPIR32:
+ case GPUArch::SPIR64:
+ FN->setCallingConv(CallingConv::SPIR_KERNEL);
+ break;
}
auto Arg = FN->arg_begin();
@@ -1835,6 +1915,9 @@ void GPUNodeBuilder::insertKernelIntrinsics(ppcg_kernel *Kernel) {
Intrinsic::ID IntrinsicsTID[3];
switch (Arch) {
+ case GPUArch::SPIR64:
+ case GPUArch::SPIR32:
+ llvm_unreachable("Cannot generate NVVM intrinsics for SPIR");
case GPUArch::NVPTX64:
IntrinsicsBID[0] = Intrinsic::nvvm_read_ptx_sreg_ctaid_x;
IntrinsicsBID[1] = Intrinsic::nvvm_read_ptx_sreg_ctaid_y;
@@ -1866,6 +1949,41 @@ void GPUNodeBuilder::insertKernelIntrinsics(ppcg_kernel *Kernel) {
}
}
+void GPUNodeBuilder::insertKernelCallsSPIR(ppcg_kernel *Kernel) {
+ const char *GroupName[3] = {"__gen_ocl_get_group_id0",
+ "__gen_ocl_get_group_id1",
+ "__gen_ocl_get_group_id2"};
+
+ const char *LocalName[3] = {"__gen_ocl_get_local_id0",
+ "__gen_ocl_get_local_id1",
+ "__gen_ocl_get_local_id2"};
+
+ auto createFunc = [this](const char *Name, __isl_take isl_id *Id) mutable {
+ Module *M = Builder.GetInsertBlock()->getParent()->getParent();
+ Function *FN = M->getFunction(Name);
+
+ // If FN is not available, declare it.
+ if (!FN) {
+ GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
+ std::vector<Type *> Args;
+ FunctionType *Ty = FunctionType::get(Builder.getInt32Ty(), Args, false);
+ FN = Function::Create(Ty, Linkage, Name, M);
+ FN->setCallingConv(CallingConv::SPIR_FUNC);
+ }
+
+ Value *Val = Builder.CreateCall(FN, {});
+ Val = Builder.CreateIntCast(Val, Builder.getInt64Ty(), false, Name);
+ IDToValue[Id] = Val;
+ KernelIDs.insert(std::unique_ptr<isl_id, IslIdDeleter>(Id));
+ };
+
+ for (int i = 0; i < Kernel->n_grid; ++i)
+ createFunc(GroupName[i], isl_id_list_get_id(Kernel->block_ids, i));
+
+ for (int i = 0; i < Kernel->n_block; ++i)
+ createFunc(LocalName[i], isl_id_list_get_id(Kernel->thread_ids, i));
+}
+
void GPUNodeBuilder::prepareKernelArguments(ppcg_kernel *Kernel, Function *FN) {
auto Arg = FN->arg_begin();
for (long i = 0; i < Kernel->n_array; i++) {
@@ -2004,6 +2122,14 @@ void GPUNodeBuilder::createKernelFunction(
GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-nvcl"));
GPUModule->setDataLayout(computeNVPTXDataLayout(true /* is64Bit */));
break;
+ case GPUArch::SPIR32:
+ GPUModule->setTargetTriple(Triple::normalize("spir-unknown-unknown"));
+ GPUModule->setDataLayout(computeSPIRDataLayout(false /* is64Bit */));
+ break;
+ case GPUArch::SPIR64:
+ GPUModule->setTargetTriple(Triple::normalize("spir64-unknown-unknown"));
+ GPUModule->setDataLayout(computeSPIRDataLayout(true /* is64Bit */));
+ break;
}
Function *FN = createKernelFunctionDecl(Kernel, SubtreeValues);
@@ -2021,7 +2147,16 @@ void GPUNodeBuilder::createKernelFunction(
prepareKernelArguments(Kernel, FN);
createKernelVariables(Kernel, FN);
- insertKernelIntrinsics(Kernel);
+
+ switch (Arch) {
+ case GPUArch::NVPTX64:
+ insertKernelIntrinsics(Kernel);
+ break;
+ case GPUArch::SPIR32:
+ case GPUArch::SPIR64:
+ insertKernelCallsSPIR(Kernel);
+ break;
+ }
}
std::string GPUNodeBuilder::createKernelASM() {
@@ -2038,6 +2173,13 @@ std::string GPUNodeBuilder::createKernelASM() {
break;
}
break;
+ case GPUArch::SPIR64:
+ case GPUArch::SPIR32:
+ std::string SPIRAssembly;
+ raw_string_ostream IROstream(SPIRAssembly);
+ IROstream << *GPUModule;
+ IROstream.flush();
+ return SPIRAssembly;
}
std::string ErrMsg;
@@ -2057,6 +2199,9 @@ std::string GPUNodeBuilder::createKernelASM() {
case GPUArch::NVPTX64:
subtarget = CudaVersion;
break;
+ case GPUArch::SPIR32:
+ case GPUArch::SPIR64:
+ llvm_unreachable("No subtarget for SPIR architecture");
}
std::unique_ptr<TargetMachine> TargetM(GPUTarget->createTargetMachine(
@@ -2097,13 +2242,15 @@ std::string GPUNodeBuilder::finalizeKernelFunction() {
if (DumpKernelIR)
outs() << *GPUModule << "\n";
- // Optimize module.
- llvm::legacy::PassManager OptPasses;
- PassManagerBuilder PassBuilder;
- PassBuilder.OptLevel = 3;
- PassBuilder.SizeLevel = 0;
- PassBuilder.populateModulePassManager(OptPasses);
- OptPasses.run(*GPUModule);
+ if (Arch != GPUArch::SPIR32 && Arch != GPUArch::SPIR64) {
+ // Optimize module.
+ llvm::legacy::PassManager OptPasses;
+ PassManagerBuilder PassBuilder;
+ PassBuilder.OptLevel = 3;
+ PassBuilder.SizeLevel = 0;
+ PassBuilder.populateModulePassManager(OptPasses);
+ OptPasses.run(*GPUModule);
+ }
std::string Assembly = createKernelASM();
diff --git a/polly/lib/Support/RegisterPasses.cpp b/polly/lib/Support/RegisterPasses.cpp
index 4e4c93c51d8..bd01a269d24 100644
--- a/polly/lib/Support/RegisterPasses.cpp
+++ b/polly/lib/Support/RegisterPasses.cpp
@@ -117,7 +117,11 @@ static cl::opt<GPURuntime> GPURuntimeChoice(
static cl::opt<GPUArch>
GPUArchChoice("polly-gpu-arch", cl::desc("The GPU Architecture to target"),
cl::values(clEnumValN(GPUArch::NVPTX64, "nvptx64",
- "target NVIDIA 64-bit architecture")),
+ "target NVIDIA 64-bit architecture"),
+ clEnumValN(GPUArch::SPIR32, "spir32",
+ "target SPIR 32-bit architecture"),
+ clEnumValN(GPUArch::SPIR64, "spir64",
+ "target SPIR 64-bit architecture")),
cl::init(GPUArch::NVPTX64), cl::ZeroOrMore,
cl::cat(PollyCategory));
#endif
diff --git a/polly/test/GPGPU/spir-codegen.ll b/polly/test/GPGPU/spir-codegen.ll
new file mode 100644
index 00000000000..aae4adc489f
--- /dev/null
+++ b/polly/test/GPGPU/spir-codegen.ll
@@ -0,0 +1,118 @@
+; RUN: opt -O3 -polly -polly-target=gpu \
+; RUN: -polly-gpu-arch=spir32 \
+; RUN: -polly-acc-dump-kernel-ir -polly-process-unprofitable -disable-output < %s | \
+; RUN: FileCheck %s
+
+; REQUIRES: pollyacc
+
+; CHECK: target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
+; CHECK-NEXT: target triple = "spir-unknown-unknown"
+
+; CHECK-LABEL: define spir_kernel void @FUNC_double_parallel_loop_SCOP_0_KERNEL_0(i8 addrspace(1)* %MemRef0) #0 !kernel_arg_addr_space !0 !kernel_arg_name !1 !kernel_arg_access_qual !1 !kernel_arg_type !1 !kernel_arg_type_qual !1 !kernel_arg_base_type !1 {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: %0 = call i32 @__gen_ocl_get_group_id0()
+; CHECK-NEXT: %__gen_ocl_get_group_id0 = zext i32 %0 to i64
+; CHECK-NEXT: %1 = call i32 @__gen_ocl_get_group_id1()
+; CHECK-NEXT: %__gen_ocl_get_group_id1 = zext i32 %1 to i64
+; CHECK-NEXT: %2 = call i32 @__gen_ocl_get_local_id0()
+; CHECK-NEXT: %__gen_ocl_get_local_id0 = zext i32 %2 to i64
+; CHECK-NEXT: %3 = call i32 @__gen_ocl_get_local_id1()
+; CHECK-NEXT: %__gen_ocl_get_local_id1 = zext i32 %3 to i64
+; CHECK-NEXT: br label %polly.loop_preheader
+
+; CHECK-LABEL: polly.loop_exit: ; preds = %polly.stmt.bb5
+; CHECK-NEXT: ret void
+
+; CHECK-LABEL: polly.loop_header: ; preds = %polly.stmt.bb5, %polly.loop_preheader
+; CHECK-NEXT: %polly.indvar = phi i64 [ 0, %polly.loop_preheader ], [ %polly.indvar_next, %polly.stmt.bb5 ]
+; CHECK-NEXT: %4 = mul nsw i64 32, %__gen_ocl_get_group_id0
+; CHECK-NEXT: %5 = add nsw i64 %4, %__gen_ocl_get_local_id0
+; CHECK-NEXT: %6 = mul nsw i64 32, %__gen_ocl_get_group_id1
+; CHECK-NEXT: %7 = add nsw i64 %6, %__gen_ocl_get_local_id1
+; CHECK-NEXT: %8 = mul nsw i64 16, %polly.indvar
+; CHECK-NEXT: %9 = add nsw i64 %7, %8
+; CHECK-NEXT: br label %polly.stmt.bb5
+
+; CHECK-LABEL: polly.stmt.bb5: ; preds = %polly.loop_header
+; CHECK-NEXT: %10 = mul i64 %5, %9
+; CHECK-NEXT: %p_tmp6 = sitofp i64 %10 to float
+; CHECK-NEXT: %polly.access.cast.MemRef0 = bitcast i8 addrspace(1)* %MemRef0 to float addrspace(1)*
+; CHECK-NEXT: %11 = mul nsw i64 32, %__gen_ocl_get_group_id0
+; CHECK-NEXT: %12 = add nsw i64 %11, %__gen_ocl_get_local_id0
+; CHECK-NEXT: %polly.access.mul.MemRef0 = mul nsw i64 %12, 1024
+; CHECK-NEXT: %13 = mul nsw i64 32, %__gen_ocl_get_group_id1
+; CHECK-NEXT: %14 = add nsw i64 %13, %__gen_ocl_get_local_id1
+; CHECK-NEXT: %15 = mul nsw i64 16, %polly.indvar
+; CHECK-NEXT: %16 = add nsw i64 %14, %15
+; CHECK-NEXT: %polly.access.add.MemRef0 = add nsw i64 %polly.access.mul.MemRef0, %16
+; CHECK-NEXT: %polly.access.MemRef0 = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef0, i64 %polly.access.add.MemRef0
+; CHECK-NEXT: %tmp8_p_scalar_ = load float, float addrspace(1)* %polly.access.MemRef0, align 4
+; CHECK-NEXT: %p_tmp9 = fadd float %tmp8_p_scalar_, %p_tmp6
+; CHECK-NEXT: %polly.access.cast.MemRef01 = bitcast i8 addrspace(1)* %MemRef0 to float addrspace(1)*
+; CHECK-NEXT: %17 = mul nsw i64 32, %__gen_ocl_get_group_id0
+; CHECK-NEXT: %18 = add nsw i64 %17, %__gen_ocl_get_local_id0
+; CHECK-NEXT: %polly.access.mul.MemRef02 = mul nsw i64 %18, 1024
+; CHECK-NEXT: %19 = mul nsw i64 32, %__gen_ocl_get_group_id1
+; CHECK-NEXT: %20 = add nsw i64 %19, %__gen_ocl_get_local_id1
+; CHECK-NEXT: %21 = mul nsw i64 16, %polly.indvar
+; CHECK-NEXT: %22 = add nsw i64 %20, %21
+; CHECK-NEXT: %polly.access.add.MemRef03 = add nsw i64 %polly.access.mul.MemRef02, %22
+; CHECK-NEXT: %polly.access.MemRef04 = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef01, i64 %polly.access.add.MemRef03
+; CHECK-NEXT: store float %p_tmp9, float addrspace(1)* %polly.access.MemRef04, align 4
+; CHECK-NEXT: %polly.indvar_next = add nsw i64 %polly.indvar, 1
+; CHECK-NEXT: %polly.loop_cond = icmp sle i64 %polly.indvar_next, 1
+; CHECK-NEXT: br i1 %polly.loop_cond, label %polly.loop_header, label %polly.loop_exit
+
+; CHECK-LABEL: polly.loop_preheader: ; preds = %entry
+; CHECK-NEXT: br label %polly.loop_header
+
+; CHECK: attributes #0 = { "polly.skip.fn" }
+
+; void double_parallel_loop(float A[][1024]) {
+; for (long i = 0; i < 1024; i++)
+; for (long j = 0; j < 1024; j++)
+; A[i][j] += i * j;
+; }
+;
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @double_parallel_loop([1024 x float]* %A) {
+bb:
+ br label %bb2
+
+bb2: ; preds = %bb13, %bb
+ %i.0 = phi i64 [ 0, %bb ], [ %tmp14, %bb13 ]
+ %exitcond1 = icmp ne i64 %i.0, 1024
+ br i1 %exitcond1, label %bb3, label %bb15
+
+bb3: ; preds = %bb2
+ br label %bb4
+
+bb4: ; preds = %bb10, %bb3
+ %j.0 = phi i64 [ 0, %bb3 ], [ %tmp11, %bb10 ]
+ %exitcond = icmp ne i64 %j.0, 1024
+ br i1 %exitcond, label %bb5, label %bb12
+
+bb5: ; preds = %bb4
+ %tmp = mul nuw nsw i64 %i.0, %j.0
+ %tmp6 = sitofp i64 %tmp to float
+ %tmp7 = getelementptr inbounds [1024 x float], [1024 x float]* %A, i64 %i.0, i64 %j.0
+ %tmp8 = load float, float* %tmp7, align 4
+ %tmp9 = fadd float %tmp8, %tmp6
+ store float %tmp9, float* %tmp7, align 4
+ br label %bb10
+
+bb10: ; preds = %bb5
+ %tmp11 = add nuw nsw i64 %j.0, 1
+ br label %bb4
+
+bb12: ; preds = %bb4
+ br label %bb13
+
+bb13: ; preds = %bb12
+ %tmp14 = add nuw nsw i64 %i.0, 1
+ br label %bb2
+
+bb15: ; preds = %bb2
+ ret void
+}
diff --git a/polly/tools/GPURuntime/GPUJIT.c b/polly/tools/GPURuntime/GPUJIT.c
index 99a726e0214..02a91a7913d 100644
--- a/polly/tools/GPURuntime/GPUJIT.c
+++ b/polly/tools/GPURuntime/GPUJIT.c
@@ -23,13 +23,14 @@
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
-#endif
+#endif /* __APPLE__ */
#endif /* HAS_LIBOPENCL */
#include <dlfcn.h>
#include <stdarg.h>
#include <stdio.h>
#include <string.h>
+#include <unistd.h>
static int DebugMode;
static int CacheMode;
@@ -89,6 +90,7 @@ struct OpenCLDevicePtrT {
/* Dynamic library handles for the OpenCL runtime library. */
static void *HandleOpenCL;
+static void *HandleOpenCLBeignet;
/* Type-defines of function pointer to OpenCL Runtime API. */
typedef cl_int clGetPlatformIDsFcnTy(cl_uint NumEntries,
@@ -139,6 +141,12 @@ clEnqueueWriteBufferFcnTy(cl_command_queue CommandQueue, cl_mem Buffer,
const cl_event *EventWaitList, cl_event *Event);
static clEnqueueWriteBufferFcnTy *clEnqueueWriteBufferFcnPtr;
+typedef cl_program
+clCreateProgramWithLLVMIntelFcnTy(cl_context Context, cl_uint NumDevices,
+ const cl_device_id *DeviceList,
+ const char *Filename, cl_int *ErrcodeRet);
+static clCreateProgramWithLLVMIntelFcnTy *clCreateProgramWithLLVMIntelFcnPtr;
+
typedef cl_program clCreateProgramWithBinaryFcnTy(
cl_context Context, cl_uint NumDevices, const cl_device_id *DeviceList,
const size_t *Lengths, const unsigned char **Binaries, cl_int *BinaryStatus,
@@ -210,6 +218,7 @@ static void *getAPIHandleCL(void *Handle, const char *FuncName) {
}
static int initialDeviceAPILibrariesCL() {
+ HandleOpenCLBeignet = dlopen("/usr/local/lib/beignet/libcl.so", RTLD_LAZY);
HandleOpenCL = dlopen("libOpenCL.so", RTLD_LAZY);
if (!HandleOpenCL) {
fprintf(stderr, "Cannot open library: %s. \n", dlerror());
@@ -237,67 +246,79 @@ static int initialDeviceAPIsCL() {
if (initialDeviceAPILibrariesCL() == 0)
return 0;
+ // FIXME: We are now always selecting the Intel Beignet driver if it is
+ // available on the system, instead of a possible NVIDIA or AMD OpenCL
+ // API. This selection should occurr based on the target architecture
+ // chosen when compiling.
+ void *Handle =
+ (HandleOpenCLBeignet != NULL ? HandleOpenCLBeignet : HandleOpenCL);
+
clGetPlatformIDsFcnPtr =
- (clGetPlatformIDsFcnTy *)getAPIHandleCL(HandleOpenCL, "clGetPlatformIDs");
+ (clGetPlatformIDsFcnTy *)getAPIHandleCL(Handle, "clGetPlatformIDs");
clGetDeviceIDsFcnPtr =
- (clGetDeviceIDsFcnTy *)getAPIHandleCL(HandleOpenCL, "clGetDeviceIDs");
+ (clGetDeviceIDsFcnTy *)getAPIHandleCL(Handle, "clGetDeviceIDs");
clGetDeviceInfoFcnPtr =
- (clGetDeviceInfoFcnTy *)getAPIHandleCL(HandleOpenCL, "clGetDeviceInfo");
+ (clGetDeviceInfoFcnTy *)getAPIHandleCL(Handle, "clGetDeviceInfo");
clGetKernelInfoFcnPtr =
- (clGetKernelInfoFcnTy *)getAPIHandleCL(HandleOpenCL, "clGetKernelInfo");
+ (clGetKernelInfoFcnTy *)getAPIHandleCL(Handle, "clGetKernelInfo");
clCreateContextFcnPtr =
- (clCreateContextFcnTy *)getAPIHandleCL(HandleOpenCL, "clCreateContext");
+ (clCreateContextFcnTy *)getAPIHandleCL(Handle, "clCreateContext");
clCreateCommandQueueFcnPtr = (clCreateCommandQueueFcnTy *)getAPIHandleCL(
- HandleOpenCL, "clCreateCommandQueue");
+ Handle, "clCreateCommandQueue");
clCreateBufferFcnPtr =
- (clCreateBufferFcnTy *)getAPIHandleCL(HandleOpenCL, "clCreateBuffer");
+ (clCreateBufferFcnTy *)getAPIHandleCL(Handle, "clCreateBuffer");
clEnqueueWriteBufferFcnPtr = (clEnqueueWriteBufferFcnTy *)getAPIHandleCL(
- HandleOpenCL, "clEnqueueWriteBuffer");
+ Handle, "clEnqueueWriteBuffer");
+
+ if (HandleOpenCLBeignet)
+ clCreateProgramWithLLVMIntelFcnPtr =
+ (clCreateProgramWithLLVMIntelFcnTy *)getAPIHandleCL(
+ Handle, "clCreateProgramWithLLVMIntel");
clCreateProgramWithBinaryFcnPtr =
(clCreateProgramWithBinaryFcnTy *)getAPIHandleCL(
- HandleOpenCL, "clCreateProgramWithBinary");
+ Handle, "clCreateProgramWithBinary");
clBuildProgramFcnPtr =
- (clBuildProgramFcnTy *)getAPIHandleCL(HandleOpenCL, "clBuildProgram");
+ (clBuildProgramFcnTy *)getAPIHandleCL(Handle, "clBuildProgram");
clCreateKernelFcnPtr =
- (clCreateKernelFcnTy *)getAPIHandleCL(HandleOpenCL, "clCreateKernel");
+ (clCreateKernelFcnTy *)getAPIHandleCL(Handle, "clCreateKernel");
clSetKernelArgFcnPtr =
- (clSetKernelArgFcnTy *)getAPIHandleCL(HandleOpenCL, "clSetKernelArg");
+ (clSetKernelArgFcnTy *)getAPIHandleCL(Handle, "clSetKernelArg");
clEnqueueNDRangeKernelFcnPtr = (clEnqueueNDRangeKernelFcnTy *)getAPIHandleCL(
- HandleOpenCL, "clEnqueueNDRangeKernel");
+ Handle, "clEnqueueNDRangeKernel");
- clEnqueueReadBufferFcnPtr = (clEnqueueReadBufferFcnTy *)getAPIHandleCL(
- HandleOpenCL, "clEnqueueReadBuffer");
+ clEnqueueReadBufferFcnPtr =
+ (clEnqueueReadBufferFcnTy *)getAPIHandleCL(Handle, "clEnqueueReadBuffer");
- clFlushFcnPtr = (clFlushFcnTy *)getAPIHandleCL(HandleOpenCL, "clFlush");
+ clFlushFcnPtr = (clFlushFcnTy *)getAPIHandleCL(Handle, "clFlush");
- clFinishFcnPtr = (clFinishFcnTy *)getAPIHandleCL(HandleOpenCL, "clFinish");
+ clFinishFcnPtr = (clFinishFcnTy *)getAPIHandleCL(Handle, "clFinish");
clReleaseKernelFcnPtr =
- (clReleaseKernelFcnTy *)getAPIHandleCL(HandleOpenCL, "clReleaseKernel");
+ (clReleaseKernelFcnTy *)getAPIHandleCL(Handle, "clReleaseKernel");
clReleaseProgramFcnPtr =
- (clReleaseProgramFcnTy *)getAPIHandleCL(HandleOpenCL, "clReleaseProgram");
+ (clReleaseProgramFcnTy *)getAPIHandleCL(Handle, "clReleaseProgram");
- clReleaseMemObjectFcnPtr = (clReleaseMemObjectFcnTy *)getAPIHandleCL(
- HandleOpenCL, "clReleaseMemObject");
+ clReleaseMemObjectFcnPtr =
+ (clReleaseMemObjectFcnTy *)getAPIHandleCL(Handle, "clReleaseMemObject");
clReleaseCommandQueueFcnPtr = (clReleaseCommandQueueFcnTy *)getAPIHandleCL(
- HandleOpenCL, "clReleaseCommandQueue");
+ Handle, "clReleaseCommandQueue");
clReleaseContextFcnPtr =
- (clReleaseContextFcnTy *)getAPIHandleCL(HandleOpenCL, "clReleaseContext");
+ (clReleaseContextFcnTy *)getAPIHandleCL(Handle, "clReleaseContext");
return 1;
}
@@ -481,12 +502,32 @@ static PollyGPUFunction *getKernelCL(const char *BinaryBuffer,
}
cl_int Ret;
- size_t BinarySize = strlen(BinaryBuffer);
- ((OpenCLKernel *)Function->Kernel)->Program = clCreateProgramWithBinaryFcnPtr(
- ((OpenCLContext *)GlobalContext->Context)->Context, 1, &GlobalDeviceID,
- (const size_t *)&BinarySize, (const unsigned char **)&BinaryBuffer, NULL,
- &Ret);
- checkOpenCLError(Ret, "Failed to create program from binary.\n");
+
+ if (HandleOpenCLBeignet) {
+ // TODO: This is a workaround, since clCreateProgramWithLLVMIntel only
+ // accepts a filename to a valid llvm-ir file as an argument, instead
+ // of accepting the BinaryBuffer directly.
+ FILE *fp = fopen("kernel.ll", "wb");
+ if (fp != NULL) {
+ fputs(BinaryBuffer, fp);
+ fclose(fp);
+ }
+
+ ((OpenCLKernel *)Function->Kernel)->Program =
+ clCreateProgramWithLLVMIntelFcnPtr(
+ ((OpenCLContext *)GlobalContext->Context)->Context, 1,
+ &GlobalDeviceID, "kernel.ll", &Ret);
+ checkOpenCLError(Ret, "Failed to create program from llvm.\n");
+ unlink("kernel.ll");
+ } else {
+ size_t BinarySize = strlen(BinaryBuffer);
+ ((OpenCLKernel *)Function->Kernel)->Program =
+ clCreateProgramWithBinaryFcnPtr(
+ ((OpenCLContext *)GlobalContext->Context)->Context, 1,
+ &GlobalDeviceID, (const size_t *)&BinarySize,
+ (const unsigned char **)&BinaryBuffer, NULL, &Ret);
+ checkOpenCLError(Ret, "Failed to create program from binary.\n");
+ }
Ret = clBuildProgramFcnPtr(((OpenCLKernel *)Function->Kernel)->Program, 1,
&GlobalDeviceID, NULL, NULL, NULL);
OpenPOWER on IntegriCloud