diff options
| author | Tobias Grosser <tobias@grosser.es> | 2016-07-27 13:20:16 +0000 |
|---|---|---|
| committer | Tobias Grosser <tobias@grosser.es> | 2016-07-27 13:20:16 +0000 |
| commit | 79a947c2336c9cf12a588844d39422c9aa0a5658 (patch) | |
| tree | 0cb6a29681fa7ac56c6258803047171cdbaa02c4 | |
| parent | 499375ceaaa1f7071dd6a9d90f6a3e6fd237d70e (diff) | |
| download | bcm5719-llvm-79a947c2336c9cf12a588844d39422c9aa0a5658.tar.gz bcm5719-llvm-79a947c2336c9cf12a588844d39422c9aa0a5658.zip | |
GPGPU: Add basic support for kernel launches
llvm-svn: 276863
| -rw-r--r-- | polly/lib/CodeGen/PPCGCodeGeneration.cpp | 171 | ||||
| -rw-r--r-- | polly/test/GPGPU/double-parallel-loop.ll | 6 | ||||
| -rw-r--r-- | polly/test/GPGPU/host-control-flow.ll | 6 | ||||
| -rw-r--r-- | polly/tools/GPURuntime/GPUJIT.c | 67 | ||||
| -rw-r--r-- | polly/tools/GPURuntime/GPUJIT.h | 28 |
5 files changed, 226 insertions, 52 deletions
diff --git a/polly/lib/CodeGen/PPCGCodeGeneration.cpp b/polly/lib/CodeGen/PPCGCodeGeneration.cpp index a5d55ea7ac5..d0ae82aec3c 100644 --- a/polly/lib/CodeGen/PPCGCodeGeneration.cpp +++ b/polly/lib/CodeGen/PPCGCodeGeneration.cpp @@ -204,6 +204,29 @@ private: /// @returns A set of values referenced by the kernel. SetVector<Value *> getReferencesInKernel(ppcg_kernel *Kernel); + /// Compute the sizes of the execution grid for a given kernel. + /// + /// @param Kernel The kernel to compute grid sizes for. + /// + /// @returns A tuple with grid sizes for X and Y dimension + std::tuple<Value *, Value *> getGridSizes(ppcg_kernel *Kernel); + + /// Compute the sizes of the thread blocks for a given kernel. + /// + /// @param Kernel The kernel to compute thread block sizes for. + /// + /// @returns A tuple with thread block sizes for X, Y, and Z dimensions. + std::tuple<Value *, Value *, Value *> getBlockSizes(ppcg_kernel *Kernel); + + /// Create kernel launch parameters. + /// + /// @param Kernel The kernel to create parameters for. + /// @param F The kernel function that has been created. + /// + /// @returns A stack allocated array with pointers to the parameter + /// values that are passed to the kernel. + Value *createLaunchParameters(ppcg_kernel *Kernel, Function *F); + /// Create GPU kernel. /// /// Code generate the kernel described by @p KernelStmt. @@ -296,6 +319,13 @@ private: /// @returns A pointer to the newly initialized context. Value *createCallInitContext(); + /// Create a call to get the device pointer for a kernel allocation. + /// + /// @param Allocation The Polly GPU allocation + /// + /// @returns The device parameter corresponding to this allocation. + Value *createCallGetDevicePtr(Value *Allocation); + /// Create a call to free the GPU context. /// /// @param Context A pointer to an initialized GPU context. @@ -339,6 +369,21 @@ private: /// /// @param GPUKernel THe kernel to free. void createCallFreeKernel(Value *GPUKernel); + + /// Create a call to launch a GPU kernel. + /// + /// @param GPUKernel The kernel to launch. + /// @param GridDimX The size of the first grid dimension. + /// @param GridDimY The size of the second grid dimension. + /// @param GridBlockX The size of the first block dimension. + /// @param GridBlockY The size of the second block dimension. + /// @param GridBlockZ The size of the third block dimension. + /// @param Paramters A pointer to an array that contains itself pointers to + /// the parameter values passed for each kernel argument. + void createCallLaunchKernel(Value *GPUKernel, Value *GridDimX, + Value *GridDimY, Value *BlockDimX, + Value *BlockDimY, Value *BlockDimZ, + Value *Parameters); }; void GPUNodeBuilder::initializeAfterRTH() { @@ -393,6 +438,50 @@ Value *GPUNodeBuilder::createCallGetKernel(Value *Buffer, Value *Entry) { return Builder.CreateCall(F, {Buffer, Entry}); } +Value *GPUNodeBuilder::createCallGetDevicePtr(Value *Allocation) { + const char *Name = "polly_getDevicePtr"; + Module *M = Builder.GetInsertBlock()->getParent()->getParent(); + Function *F = M->getFunction(Name); + + // If F is not available, declare it. + if (!F) { + GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; + std::vector<Type *> Args; + Args.push_back(Builder.getInt8PtrTy()); + FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false); + F = Function::Create(Ty, Linkage, Name, M); + } + + return Builder.CreateCall(F, {Allocation}); +} + +void GPUNodeBuilder::createCallLaunchKernel(Value *GPUKernel, Value *GridDimX, + Value *GridDimY, Value *BlockDimX, + Value *BlockDimY, Value *BlockDimZ, + Value *Parameters) { + const char *Name = "polly_launchKernel"; + Module *M = Builder.GetInsertBlock()->getParent()->getParent(); + Function *F = M->getFunction(Name); + + // If F is not available, declare it. + if (!F) { + GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; + std::vector<Type *> Args; + Args.push_back(Builder.getInt8PtrTy()); + Args.push_back(Builder.getInt32Ty()); + Args.push_back(Builder.getInt32Ty()); + Args.push_back(Builder.getInt32Ty()); + Args.push_back(Builder.getInt32Ty()); + Args.push_back(Builder.getInt32Ty()); + Args.push_back(Builder.getInt8PtrTy()); + FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); + F = Function::Create(Ty, Linkage, Name, M); + } + + Builder.CreateCall(F, {GPUKernel, GridDimX, GridDimY, BlockDimX, BlockDimY, + BlockDimZ, Parameters}); +} + void GPUNodeBuilder::createCallFreeKernel(Value *GPUKernel) { const char *Name = "polly_freeKernel"; Module *M = Builder.GetInsertBlock()->getParent()->getParent(); @@ -755,6 +844,77 @@ void GPUNodeBuilder::clearLoops(Function *F) { } } +std::tuple<Value *, Value *> GPUNodeBuilder::getGridSizes(ppcg_kernel *Kernel) { + std::vector<Value *> Sizes; + isl_ast_build *Context = isl_ast_build_from_context(S.getContext()); + + for (long i = 0; i < Kernel->n_grid; i++) { + isl_pw_aff *Size = isl_multi_pw_aff_get_pw_aff(Kernel->grid_size, i); + isl_ast_expr *GridSize = isl_ast_build_expr_from_pw_aff(Context, Size); + Value *Res = ExprBuilder.create(GridSize); + Res = Builder.CreateTrunc(Res, Builder.getInt32Ty()); + Sizes.push_back(Res); + } + isl_ast_build_free(Context); + + for (long i = Kernel->n_grid; i < 3; i++) + Sizes.push_back(ConstantInt::get(Builder.getInt32Ty(), 1)); + + return std::make_tuple(Sizes[0], Sizes[1]); +} + +std::tuple<Value *, Value *, Value *> +GPUNodeBuilder::getBlockSizes(ppcg_kernel *Kernel) { + std::vector<Value *> Sizes; + + for (long i = 0; i < Kernel->n_block; i++) { + Value *Res = ConstantInt::get(Builder.getInt32Ty(), Kernel->block_dim[i]); + Sizes.push_back(Res); + } + + for (long i = Kernel->n_block; i < 3; i++) + Sizes.push_back(ConstantInt::get(Builder.getInt32Ty(), 1)); + + return std::make_tuple(Sizes[0], Sizes[1], Sizes[2]); +} + +Value *GPUNodeBuilder::createLaunchParameters(ppcg_kernel *Kernel, + Function *F) { + Type *ArrayTy = ArrayType::get(Builder.getInt8PtrTy(), F->getNumOperands()); + + BasicBlock *EntryBlock = + &Builder.GetInsertBlock()->getParent()->getEntryBlock(); + std::string Launch = "polly_launch_" + std::to_string(Kernel->id); + Instruction *Parameters = + new AllocaInst(ArrayTy, Launch + "_params", EntryBlock->getTerminator()); + + int Index = 0; + for (long i = 0; i < Prog->n_array; i++) { + if (!ppcg_kernel_requires_array_argument(Kernel, i)) + continue; + + isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set); + const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(Id); + + Value *DevArray = DeviceAllocations[(ScopArrayInfo *)SAI]; + DevArray = createCallGetDevicePtr(DevArray); + Instruction *Param = new AllocaInst( + Builder.getInt8PtrTy(), Launch + "_param_" + std::to_string(Index), + EntryBlock->getTerminator()); + Builder.CreateStore(DevArray, Param); + Value *Slot = Builder.CreateGEP(Parameters, + {Builder.getInt64(0), Builder.getInt64(i)}); + Value *ParamTyped = + Builder.CreatePointerCast(Param, Builder.getInt8PtrTy()); + Builder.CreateStore(ParamTyped, Slot); + Index++; + } + + auto Location = EntryBlock->getTerminator(); + return new BitCastInst(Parameters, Builder.getInt8PtrTy(), + Launch + "_params_i8ptr", Location); +} + void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) { isl_id *Id = isl_ast_node_get_annotation(KernelStmt); ppcg_kernel *Kernel = (ppcg_kernel *)isl_id_get_user(Id); @@ -805,11 +965,22 @@ void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) { S.invalidateScopArrayInfo(BasePtr, ScopArrayInfo::MK_Array); LocalArrays.clear(); + Value *Parameters = createLaunchParameters(Kernel, F); + std::string ASMString = finalizeKernelFunction(); std::string Name = "kernel_" + std::to_string(Kernel->id); Value *KernelString = Builder.CreateGlobalStringPtr(ASMString, Name); Value *NameString = Builder.CreateGlobalStringPtr(Name, Name + "_name"); Value *GPUKernel = createCallGetKernel(KernelString, NameString); + + Value *GridDimX, *GridDimY; + std::tie(GridDimX, GridDimY) = getGridSizes(Kernel); + + Value *BlockDimX, *BlockDimY, *BlockDimZ; + std::tie(BlockDimX, BlockDimY, BlockDimZ) = getBlockSizes(Kernel); + + createCallLaunchKernel(GPUKernel, GridDimX, GridDimY, BlockDimX, BlockDimY, + BlockDimZ, Parameters); createCallFreeKernel(GPUKernel); } diff --git a/polly/test/GPGPU/double-parallel-loop.ll b/polly/test/GPGPU/double-parallel-loop.ll index 83d6d5e522a..5306620811c 100644 --- a/polly/test/GPGPU/double-parallel-loop.ll +++ b/polly/test/GPGPU/double-parallel-loop.ll @@ -96,7 +96,13 @@ ; IR-NEXT: %p_dev_array_MemRef_A = call i8* @polly_allocateMemoryForDevice(i64 4194304) ; IR-NEXT: [[HostPtr:%.*]] = bitcast [1024 x float]* %A to i8* ; IR-NEXT: call void @polly_copyFromHostToDevice(i8* [[HostPtr]], i8* %p_dev_array_MemRef_A, i64 4194304) +; IR-NEXT: [[DevPtr:%.*]] = call i8* @polly_getDevicePtr(i8* %p_dev_array_MemRef_A) +; IR-NEXT: store i8* [[DevPtr]], i8** %polly_launch_0_param_0 +; IR-NEXT: [[ParamSlot:%.*]] = getelementptr [0 x i8*], [0 x i8*]* %polly_launch_0_params, i64 0, i64 0 +; IR-NEXT: [[ParamTyped:%.*]] = bitcast i8** %polly_launch_0_param_0 to i8* +; IR-NEXT: store i8* [[ParamTyped]], i8** [[ParamSlot]] ; IR-NEXT: call i8* @polly_getKernel +; IR-NEXT: call void @polly_launchKernel(i8* %5, i32 32, i32 32, i32 32, i32 16, i32 1, i8* %polly_launch_0_params_i8ptr) ; IR-NEXT: call void @polly_freeKernel ; IR-NEXT: [[HostPtr2:%.*]] = bitcast [1024 x float]* %A to i8* ; IR-NEXT: call void @polly_copyFromDeviceToHost(i8* %p_dev_array_MemRef_A, i8* [[HostPtr2]], i64 4194304) diff --git a/polly/test/GPGPU/host-control-flow.ll b/polly/test/GPGPU/host-control-flow.ll index 29ef71bdb23..7f41a11bb0a 100644 --- a/polly/test/GPGPU/host-control-flow.ll +++ b/polly/test/GPGPU/host-control-flow.ll @@ -30,8 +30,10 @@ ; IR-LABEL: polly.loop_header: ; preds = %polly.loop_header, %polly.loop_preheader ; IR-NEXT: %polly.indvar = phi i64 [ 0, %polly.loop_preheader ], [ %polly.indvar_next, %polly.loop_header ] -; IR-NEXT: call i8* @polly_getKernel -; IR-NEXT: call void @polly_freeKernel +; ... +; IR: call i8* @polly_getKernel +; ... +; IR: call void @polly_freeKernel ; IR-NEXT: %polly.indvar_next = add nsw i64 %polly.indvar, 1 ; IR-NEXT: %polly.loop_cond = icmp sle i64 %polly.indvar, 98 ; IR-NEXT: br i1 %polly.loop_cond, label %polly.loop_header, label %polly.loop_exit diff --git a/polly/tools/GPURuntime/GPUJIT.c b/polly/tools/GPURuntime/GPUJIT.c index d2b516a9efa..24320f88418 100644 --- a/polly/tools/GPURuntime/GPUJIT.c +++ b/polly/tools/GPURuntime/GPUJIT.c @@ -54,18 +54,12 @@ static void *HandleCudaRT; typedef CUresult CUDAAPI CuMemAllocFcnTy(CUdeviceptr *, size_t); static CuMemAllocFcnTy *CuMemAllocFcnPtr; -typedef CUresult CUDAAPI CuFuncSetBlockShapeFcnTy(CUfunction, int, int, int); -static CuFuncSetBlockShapeFcnTy *CuFuncSetBlockShapeFcnPtr; - -typedef CUresult CUDAAPI CuParamSetvFcnTy(CUfunction, int, void *, - unsigned int); -static CuParamSetvFcnTy *CuParamSetvFcnPtr; - -typedef CUresult CUDAAPI CuParamSetSizeFcnTy(CUfunction, unsigned int); -static CuParamSetSizeFcnTy *CuParamSetSizeFcnPtr; - -typedef CUresult CUDAAPI CuLaunchGridFcnTy(CUfunction, int, int); -static CuLaunchGridFcnTy *CuLaunchGridFcnPtr; +typedef CUresult CUDAAPI CuLaunchKernelFcnTy( + CUfunction f, unsigned int gridDimX, unsigned int gridDimY, + unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, + unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, + void **kernelParams, void **extra); +static CuLaunchKernelFcnTy *CuLaunchKernelFcnPtr; typedef CUresult CUDAAPI CuMemcpyDtoHFcnTy(void *, CUdeviceptr, size_t); static CuMemcpyDtoHFcnTy *CuMemcpyDtoHFcnPtr; @@ -178,17 +172,8 @@ static int initialDeviceAPIs() { * of this kind of cast may not be emitted by clang and new versions of gcc * as it is valid on POSIX 2008. */ - CuFuncSetBlockShapeFcnPtr = (CuFuncSetBlockShapeFcnTy *)getAPIHandle( - HandleCuda, "cuFuncSetBlockShape"); - - CuParamSetvFcnPtr = - (CuParamSetvFcnTy *)getAPIHandle(HandleCuda, "cuParamSetv"); - - CuParamSetSizeFcnPtr = - (CuParamSetSizeFcnTy *)getAPIHandle(HandleCuda, "cuParamSetSize"); - - CuLaunchGridFcnPtr = - (CuLaunchGridFcnTy *)getAPIHandle(HandleCuda, "cuLaunchGrid"); + CuLaunchKernelFcnPtr = + (CuLaunchKernelFcnTy *)getAPIHandle(HandleCuda, "cuLaunchKernel"); CuMemAllocFcnPtr = (CuMemAllocFcnTy *)getAPIHandle(HandleCuda, "cuMemAlloc_v2"); @@ -407,29 +392,25 @@ void polly_copyFromDeviceToHost(PollyGPUDevicePtr *DevData, void *HostData, } } -void polly_setKernelParameters(PollyGPUFunction *Kernel, int BlockWidth, - int BlockHeight, PollyGPUDevicePtr *DevData) { +void polly_launchKernel(PollyGPUFunction *Kernel, unsigned int GridDimX, + unsigned int GridDimY, unsigned int BlockDimX, + unsigned int BlockDimY, unsigned int BlockDimZ, + void **Parameters) { dump_function(); - int ParamOffset = 0; - - CuFuncSetBlockShapeFcnPtr(Kernel->Cuda, BlockWidth, BlockHeight, 1); - CuParamSetvFcnPtr(Kernel->Cuda, ParamOffset, &(DevData->Cuda), - sizeof(DevData->Cuda)); - ParamOffset += sizeof(DevData->Cuda); - CuParamSetSizeFcnPtr(Kernel->Cuda, ParamOffset); -} - -void polly_launchKernel(PollyGPUFunction *Kernel, int GridWidth, - int GridHeight) { - dump_function(); + unsigned GridDimZ = 1; + unsigned int SharedMemBytes = CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE; + CUstream Stream = 0; + void **Extra = 0; - if (CuLaunchGridFcnPtr(Kernel->Cuda, GridWidth, GridHeight) != CUDA_SUCCESS) { + CUresult Res; + Res = CuLaunchKernelFcnPtr(Kernel->Cuda, GridDimX, GridDimY, GridDimZ, + BlockDimX, BlockDimY, BlockDimZ, SharedMemBytes, + Stream, Parameters, Extra); + if (Res != CUDA_SUCCESS) { fprintf(stdout, "Launching CUDA kernel failed.\n"); exit(-1); } - CudaThreadSynchronizeFcnPtr(); - debug_print("CUDA kernel launched.\n"); } void polly_freeDeviceMemory(PollyGPUDevicePtr *Allocation) { @@ -458,6 +439,12 @@ PollyGPUDevicePtr *polly_allocateMemoryForDevice(long MemSize) { return DevData; } +void *polly_getDevicePtr(PollyGPUDevicePtr *Allocation) { + dump_function(); + + return (void *)Allocation->Cuda; +} + void polly_freeContext(PollyGPUContext *Context) { dump_function(); diff --git a/polly/tools/GPURuntime/GPUJIT.h b/polly/tools/GPURuntime/GPUJIT.h index 38a8382b150..155044b2685 100644 --- a/polly/tools/GPURuntime/GPUJIT.h +++ b/polly/tools/GPURuntime/GPUJIT.h @@ -49,17 +49,25 @@ * PollyGPUDevicePtr *DevArray; * int *HostData; * int MemSize; - * int BlockWidth = 16; - * int BlockHeight = 16; - * int GridWidth = 8; - * int GridHeight = 8; + * + * int GridX = 8; + * int GridY = 8; + * + * int BlockX = 16; + * int BlockY = 16; + * int BlockZ = 1; * * MemSize = 256*64*sizeof(int); * Context = polly_initContext(); * DevArray = polly_allocateMemoryForDevice(MemSize); * Kernel = polly_getKernel(KernelString, KernelName); - * polly_setKernelParameters(Kernel, BlockWidth, BlockHeight, DevData); - * polly_launchKernel(Kernel, GridWidth, GridHeight); + * + * void *Params[1]; + * void *DevPtr = polly_getDevicePtr(DevArray) + * Params[0] = &DevPtr; + * + * polly_launchKernel(Kernel, GridX, GridY, BlockX, BlockY, BlockZ, Params); + * * polly_copyFromDeviceToHost(HostData, DevData, MemSize); * polly_freeKernel(Kernel); * polly_freeDeviceMemory(DevArray); @@ -80,10 +88,10 @@ void polly_copyFromHostToDevice(void *HostData, PollyGPUDevicePtr *DevData, long MemSize); void polly_copyFromDeviceToHost(PollyGPUDevicePtr *DevData, void *HostData, long MemSize); -void polly_setKernelParameters(PollyGPUFunction *Kernel, int BlockWidth, - int BlockHeight, PollyGPUDevicePtr *DevData); -void polly_launchKernel(PollyGPUFunction *Kernel, int GridWidth, - int GridHeight); +void polly_launchKernel(PollyGPUFunction *Kernel, unsigned int GridDimX, + unsigned int GridDimY, unsigned int BlockSizeX, + unsigned int BlockSizeY, unsigned int BlockSizeZ, + void **Parameters); void polly_freeDeviceMemory(PollyGPUDevicePtr *Allocation); void polly_freeContext(PollyGPUContext *Context); #endif /* GPUJIT_H_ */ |

