summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorSiddharth Bhat <siddu.druid@gmail.com>2017-05-09 10:45:52 +0000
committerSiddharth Bhat <siddu.druid@gmail.com>2017-05-09 10:45:52 +0000
commita90be207c60cf3bcbcdf452f4443ba0b73d868ae (patch)
treeade3e615fc4891e9b72b19c9bad7eba8fa71e31e
parentcf9daa33a7870c235e0edc176dd40579f376cafc (diff)
downloadbcm5719-llvm-a90be207c60cf3bcbcdf452f4443ba0b73d868ae.tar.gz
bcm5719-llvm-a90be207c60cf3bcbcdf452f4443ba0b73d868ae.zip
[Polly][PPCGCodeGen] OpenCL now gets kernel argument size from PPCG CodeGen
Summary: PPCGCodeGeneration now attaches the sizes of the kernel launch parameters at the end of the parameter list. The existing CUDA runtime ignores them, but the OpenCL runtime knows to check for the kernel-argument sizes at the end of the parameter list. (The resulting parameter list is twice as long. This has been accounted for in the corresponding test cases.) Reviewers: grosser, Meinersbur, bollu Reviewed By: bollu Subscribers: nemanjai, yaxunl, Anastasia, pollydev, llvm-commits Tags: #polly Differential Revision: https://reviews.llvm.org/D32961 llvm-svn: 302515
-rw-r--r--polly/lib/CodeGen/PPCGCodeGeneration.cpp71
-rw-r--r--polly/test/GPGPU/cuda-managed-memory-simple.ll18
-rw-r--r--polly/test/GPGPU/host-control-flow.ll2
-rw-r--r--polly/test/GPGPU/kernel-params-only-some-arrays.ll4
-rw-r--r--polly/test/GPGPU/parametric-loop-bound.ll2
-rw-r--r--polly/tools/GPURuntime/GPUJIT.c26
6 files changed, 76 insertions, 47 deletions
diff --git a/polly/lib/CodeGen/PPCGCodeGeneration.cpp b/polly/lib/CodeGen/PPCGCodeGeneration.cpp
index 45e570c90b5..4b09faabac5 100644
--- a/polly/lib/CodeGen/PPCGCodeGeneration.cpp
+++ b/polly/lib/CodeGen/PPCGCodeGeneration.cpp
@@ -142,6 +142,14 @@ static __isl_give isl_id_to_ast_expr *pollyBuildAstExprForStmt(
return RefToExpr;
}
+/// Given an LLVM Type, compute its size in bytes (bit width / 8, truncated).
+static int computeSizeInBytes(const Type *T) {
+ int bytes = T->getPrimitiveSizeInBits() / 8; // integer division: widths below 8 bits yield 0
+ if (bytes == 0) // primitive size gave 0 bytes: retry with the scalar size
+ bytes = T->getScalarSizeInBits() / 8;
+ return bytes; // NOTE(review): still 0 if both queries yield fewer than 8 bits (pointer types? -- confirm)
+}
+
/// Generate code for a GPU specific isl AST.
///
/// The GPUNodeBuilder augments the general existing IslNodeBuilder, which
@@ -272,6 +280,16 @@ private:
/// @returns A tuple with thread block sizes for X, Y, and Z dimensions.
std::tuple<Value *, Value *, Value *> getBlockSizes(ppcg_kernel *Kernel);
+ /// Store a pointer to a specific kernel launch parameter in the array of
+ /// kernel launch parameters.
+ ///
+ /// @param Parameters The array of launch parameters in which to store.
+ /// @param Param The kernel launch parameter to store (cast to i8*).
+ /// @param Index The index in the parameter array, at which to store the
+ /// parameter.
+ void insertStoreParameter(Instruction *Parameters, Instruction *Param,
+ int Index);
+
/// Create kernel launch parameters.
///
/// @param Kernel The kernel to create parameters for.
@@ -1192,11 +1210,21 @@ GPUNodeBuilder::getBlockSizes(ppcg_kernel *Kernel) {
return std::make_tuple(Sizes[0], Sizes[1], Sizes[2]);
}
+void GPUNodeBuilder::insertStoreParameter(Instruction *Parameters,
+ Instruction *Param, int Index) { // Stores Param, cast to i8*, into slot Index of Parameters.
+ Value *Slot = Builder.CreateGEP( // address of element Index (indices {0, Index}) behind Parameters
+ Parameters, {Builder.getInt64(0), Builder.getInt64(Index)});
+ Value *ParamTyped = Builder.CreatePointerCast(Param, Builder.getInt8PtrTy()); // every slot is written as an i8*
+ Builder.CreateStore(ParamTyped, Slot); // the slot receives the pointer Param, not the value it points to
+}
+
Value *
GPUNodeBuilder::createLaunchParameters(ppcg_kernel *Kernel, Function *F,
SetVector<Value *> SubtreeValues) {
- Type *ArrayTy = ArrayType::get(Builder.getInt8PtrTy(),
- std::distance(F->arg_begin(), F->arg_end()));
+ const int NumArgs = F->arg_size();
+ std::vector<int> ArgSizes(NumArgs);
+
+ Type *ArrayTy = ArrayType::get(Builder.getInt8PtrTy(), 2 * NumArgs);
BasicBlock *EntryBlock =
&Builder.GetInsertBlock()->getParent()->getEntryBlock();
@@ -1213,6 +1241,8 @@ GPUNodeBuilder::createLaunchParameters(ppcg_kernel *Kernel, Function *F,
isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set);
const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(Id);
+ ArgSizes[Index] = SAI->getElemSizeInBytes();
+
Value *DevArray = nullptr;
if (ManagedMemory) {
DevArray = getOrCreateManagedDeviceArray(
@@ -1265,16 +1295,15 @@ GPUNodeBuilder::createLaunchParameters(ppcg_kernel *Kernel, Function *F,
isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_set, i);
Value *Val = IDToValue[Id];
isl_id_free(Id);
+
+ ArgSizes[Index] = computeSizeInBytes(Val->getType());
+
Instruction *Param =
new AllocaInst(Val->getType(), AddressSpace,
Launch + "_param_" + std::to_string(Index),
EntryBlock->getTerminator());
Builder.CreateStore(Val, Param);
- Value *Slot = Builder.CreateGEP(
- Parameters, {Builder.getInt64(0), Builder.getInt64(Index)});
- Value *ParamTyped =
- Builder.CreatePointerCast(Param, Builder.getInt8PtrTy());
- Builder.CreateStore(ParamTyped, Slot);
+ insertStoreParameter(Parameters, Param, Index);
Index++;
}
@@ -1284,30 +1313,38 @@ GPUNodeBuilder::createLaunchParameters(ppcg_kernel *Kernel, Function *F,
isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_param, i);
Value *Val = IDToValue[Id];
isl_id_free(Id);
+
+ ArgSizes[Index] = computeSizeInBytes(Val->getType());
+
Instruction *Param =
new AllocaInst(Val->getType(), AddressSpace,
Launch + "_param_" + std::to_string(Index),
EntryBlock->getTerminator());
Builder.CreateStore(Val, Param);
- Value *Slot = Builder.CreateGEP(
- Parameters, {Builder.getInt64(0), Builder.getInt64(Index)});
- Value *ParamTyped =
- Builder.CreatePointerCast(Param, Builder.getInt8PtrTy());
- Builder.CreateStore(ParamTyped, Slot);
+ insertStoreParameter(Parameters, Param, Index);
Index++;
}
for (auto Val : SubtreeValues) {
+ ArgSizes[Index] = computeSizeInBytes(Val->getType());
+
Instruction *Param =
new AllocaInst(Val->getType(), AddressSpace,
Launch + "_param_" + std::to_string(Index),
EntryBlock->getTerminator());
Builder.CreateStore(Val, Param);
- Value *Slot = Builder.CreateGEP(
- Parameters, {Builder.getInt64(0), Builder.getInt64(Index)});
- Value *ParamTyped =
- Builder.CreatePointerCast(Param, Builder.getInt8PtrTy());
- Builder.CreateStore(ParamTyped, Slot);
+ insertStoreParameter(Parameters, Param, Index);
+ Index++;
+ }
+
+ for (int i = 0; i < NumArgs; i++) {
+ Value *Val = ConstantInt::get(Builder.getInt32Ty(), ArgSizes[i]);
+ Instruction *Param =
+ new AllocaInst(Builder.getInt32Ty(), AddressSpace,
+ Launch + "_param_size_" + std::to_string(i),
+ EntryBlock->getTerminator());
+ Builder.CreateStore(Val, Param);
+ insertStoreParameter(Parameters, Param, Index);
Index++;
}
diff --git a/polly/test/GPGPU/cuda-managed-memory-simple.ll b/polly/test/GPGPU/cuda-managed-memory-simple.ll
index 4a97ec56ad5..a8a1d6ae9d2 100644
--- a/polly/test/GPGPU/cuda-managed-memory-simple.ll
+++ b/polly/test/GPGPU/cuda-managed-memory-simple.ll
@@ -37,18 +37,26 @@
; CHECK: %13 = call i8* @polly_initContextCUDA()
; CHECK-NEXT: %14 = bitcast i32* %A to i8*
-; CHECK-NEXT: %15 = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 0
+; CHECK-NEXT: %15 = getelementptr [4 x i8*], [4 x i8*]* %polly_launch_0_params, i64 0, i64 0
; CHECK-NEXT: store i8* %14, i8** %polly_launch_0_param_0
; CHECK-NEXT: %16 = bitcast i8** %polly_launch_0_param_0 to i8*
; CHECK-NEXT: store i8* %16, i8** %15
; CHECK-NEXT: %17 = bitcast i32* %R to i8*
-; CHECK-NEXT: %18 = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 1
+; CHECK-NEXT: %18 = getelementptr [4 x i8*], [4 x i8*]* %polly_launch_0_params, i64 0, i64 1
; CHECK-NEXT: store i8* %17, i8** %polly_launch_0_param_1
; CHECK-NEXT: %19 = bitcast i8** %polly_launch_0_param_1 to i8*
; CHECK-NEXT: store i8* %19, i8** %18
-; CHECK-NEXT: %20 = call i8* @polly_getKernel(i8* getelementptr inbounds ([750 x i8], [750 x i8]* @kernel_0, i32 0, i32 0), i8* getelementptr inbounds ([9 x i8], [9 x i8]* @kernel_0_name, i32 0, i32 0))
-; CHECK-NEXT: call void @polly_launchKernel(i8* %20, i32 2, i32 1, i32 32, i32 1, i32 1, i8* %polly_launch_0_params_i8ptr)
-; CHECK-NEXT: call void @polly_freeKernel(i8* %20)
+; CHECK-NEXT: store i32 4, i32* %polly_launch_0_param_size_0
+; CHECK-NEXT: %20 = getelementptr [4 x i8*], [4 x i8*]* %polly_launch_0_params, i64 0, i64 2
+; CHECK-NEXT: %21 = bitcast i32* %polly_launch_0_param_size_0 to i8*
+; CHECK-NEXT: store i8* %21, i8** %20
+; CHECK-NEXT: store i32 4, i32* %polly_launch_0_param_size_1
+; CHECK-NEXT: %22 = getelementptr [4 x i8*], [4 x i8*]* %polly_launch_0_params, i64 0, i64 3
+; CHECK-NEXT: %23 = bitcast i32* %polly_launch_0_param_size_1 to i8*
+; CHECK-NEXT: store i8* %23, i8** %22
+; CHECK-NEXT: %24 = call i8* @polly_getKernel(i8* getelementptr inbounds ([750 x i8], [750 x i8]* @kernel_0, i32 0, i32 0), i8* getelementptr inbounds ([9 x i8], [9 x i8]* @kernel_0_name, i32 0, i32 0))
+; CHECK-NEXT: call void @polly_launchKernel(i8* %24, i32 2, i32 1, i32 32, i32 1, i32 1, i8* %polly_launch_0_params_i8ptr)
+; CHECK-NEXT: call void @polly_freeKernel(i8* %24)
; CHECK-NEXT: call void @polly_synchronizeDevice()
; CHECK-NEXT: call void @polly_freeContext(i8* %13)
diff --git a/polly/test/GPGPU/host-control-flow.ll b/polly/test/GPGPU/host-control-flow.ll
index 9e940aade1f..bc66dc0d787 100644
--- a/polly/test/GPGPU/host-control-flow.ll
+++ b/polly/test/GPGPU/host-control-flow.ll
@@ -32,7 +32,7 @@
; IR-NEXT: %polly.indvar = phi i64 [ 0, %polly.loop_preheader ], [ %polly.indvar_next, %polly.loop_header ]
; ...
; IR: store i64 %polly.indvar, i64* %polly_launch_0_param_1
-; IR-NEXT: [[REGA:%.+]] = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 1
+; IR-NEXT: [[REGA:%.+]] = getelementptr [4 x i8*], [4 x i8*]* %polly_launch_0_params, i64 0, i64 1
; IR-NEXT: [[REGB:%.+]] = bitcast i64* %polly_launch_0_param_1 to i8*
; IR-NEXT: store i8* [[REGB]], i8** [[REGA]]
; IR: call i8* @polly_getKernel
diff --git a/polly/test/GPGPU/kernel-params-only-some-arrays.ll b/polly/test/GPGPU/kernel-params-only-some-arrays.ll
index 193de957e5e..b6f3172abc6 100644
--- a/polly/test/GPGPU/kernel-params-only-some-arrays.ll
+++ b/polly/test/GPGPU/kernel-params-only-some-arrays.ll
@@ -48,13 +48,13 @@
; IR: [[DEVPTR:%.*]] = call i8* @polly_getDevicePtr(i8* %p_dev_array_MemRef_A)
-; IR-NEXT: [[SLOT:%.*]] = getelementptr [1 x i8*], [1 x i8*]* %polly_launch_0_params, i64 0, i64 0
+; IR-NEXT: [[SLOT:%.*]] = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 0
; IR-NEXT: store i8* [[DEVPTR]], i8** %polly_launch_0_param_0
; IR-NEXT: [[DATA:%.*]] = bitcast i8** %polly_launch_0_param_0 to i8*
; IR-NEXT: store i8* [[DATA]], i8** [[SLOT]]
; IR: [[DEVPTR:%.*]] = call i8* @polly_getDevicePtr(i8* %p_dev_array_MemRef_B)
-; IR-NEXT: [[SLOT:%.*]] = getelementptr [1 x i8*], [1 x i8*]* %polly_launch_1_params, i64 0, i64 0
+; IR-NEXT: [[SLOT:%.*]] = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_1_params, i64 0, i64 0
; IR-NEXT: store i8* [[DEVPTR]], i8** %polly_launch_1_param_0
; IR-NEXT: [[DATA:%.*]] = bitcast i8** %polly_launch_1_param_0 to i8*
; IR-NEXT: store i8* [[DATA]], i8** [[SLOT]]
diff --git a/polly/test/GPGPU/parametric-loop-bound.ll b/polly/test/GPGPU/parametric-loop-bound.ll
index 687658efcf0..1ca5151181a 100644
--- a/polly/test/GPGPU/parametric-loop-bound.ll
+++ b/polly/test/GPGPU/parametric-loop-bound.ll
@@ -31,7 +31,7 @@
; CODE-NEXT: Stmt_bb2(32 * b0 + t0 + 1048576 * c0);
; IR: store i64 %n, i64* %polly_launch_0_param_1
-; IR-NEXT: [[REGA:%.+]] = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 1
+; IR-NEXT: [[REGA:%.+]] = getelementptr [4 x i8*], [4 x i8*]* %polly_launch_0_params, i64 0, i64 1
; IR-NEXT: [[REGB:%.+]] = bitcast i64* %polly_launch_0_param_1 to i8*
; IR-NEXT: store i8* [[REGB]], i8** [[REGA]]
diff --git a/polly/tools/GPURuntime/GPUJIT.c b/polly/tools/GPURuntime/GPUJIT.c
index 5a0077579dd..02dba033537 100644
--- a/polly/tools/GPURuntime/GPUJIT.c
+++ b/polly/tools/GPURuntime/GPUJIT.c
@@ -554,28 +554,12 @@ static void launchKernelCL(PollyGPUFunction *Kernel, unsigned int GridDimX,
sizeof(cl_uint), &NumArgs, NULL);
checkOpenCLError(Ret, "Failed to get number of kernel arguments.\n");
- // TODO: Pass the size of the kernel arguments in to launchKernelCL, along
- // with the arguments themselves. This is a dirty workaround that can be
- // broken.
+ /* Argument sizes are stored at the end of the Parameters array: entry NumArgs + i points to an int holding the size in bytes of argument i. */
for (cl_uint i = 0; i < NumArgs; i++) {
- Ret = clSetKernelArgFcnPtr(CLKernel->Kernel, i, 8, (void *)Parameters[i]);
- if (Ret == CL_INVALID_ARG_SIZE) {
- Ret = clSetKernelArgFcnPtr(CLKernel->Kernel, i, 4, (void *)Parameters[i]);
- if (Ret == CL_INVALID_ARG_SIZE) {
- Ret =
- clSetKernelArgFcnPtr(CLKernel->Kernel, i, 2, (void *)Parameters[i]);
- if (Ret == CL_INVALID_ARG_SIZE) {
- Ret = clSetKernelArgFcnPtr(CLKernel->Kernel, i, 1,
- (void *)Parameters[i]);
- checkOpenCLError(Ret, "Failed to set Kernel argument %d.\n", i);
- }
- }
- }
- if (Ret != CL_SUCCESS && Ret != CL_INVALID_ARG_SIZE) {
- fprintf(stderr, "Failed to set Kernel argument.\n");
- printOpenCLError(Ret);
- exit(-1);
- }
+ Ret = clSetKernelArgFcnPtr(CLKernel->Kernel, i,
+ *((int *)Parameters[NumArgs + i]),
+ (void *)Parameters[i]);
+ checkOpenCLError(Ret, "Failed to set Kernel argument %d.\n", i);
}
unsigned int GridDimZ = 1;
OpenPOWER on IntegriCloud