From 2f3073b5cb883cfeffb86cf582f8548734250620 Mon Sep 17 00:00:00 2001
From: Philipp Schaad
Date: Fri, 21 Jul 2017 16:11:06 +0000
Subject: [Polly][GPGPU] Added SPIR Code Generation and Corresponding Runtime Support for Intel

Summary:
Added SPIR Code Generation to the PPCG Code Generator. This can be invoked
using the polly-gpu-arch flag value 'spir32' or 'spir64' for 32-bit and 64-bit
code, respectively.

In addition to that, runtime support has been added to execute said SPIR code
on Intel GPUs where the system is equipped with Intel's open-source driver
Beignet (development version). This requires the cmake flag 'USE_INTEL_OCL' to
be turned on and the polly-gpu-runtime flag value to be 'libopencl'.

The transformation of LLVM IR to SPIR is currently quite a hack, consisting in
part of regex string transformations.

This has been tested (working) with Polybench 3.2 on an Intel i7-5500U
(integrated graphics chip).

Reviewers: bollu, grosser, Meinersbur, singam-sanjay

Reviewed By: grosser, singam-sanjay

Subscribers: pollydev, nemanjai, mgorny, Anastasia, kbarton

Tags: #polly

Differential Revision: https://reviews.llvm.org/D35185

llvm-svn: 308751
---
 polly/lib/CodeGen/PPCGCodeGeneration.cpp | 169 +++++++++++++++++++++++++++++--
 1 file changed, 158 insertions(+), 11 deletions(-)

diff --git a/polly/lib/CodeGen/PPCGCodeGeneration.cpp b/polly/lib/CodeGen/PPCGCodeGeneration.cpp
index 8935aa172f3..ec488488179 100644
--- a/polly/lib/CodeGen/PPCGCodeGeneration.cpp
+++ b/polly/lib/CodeGen/PPCGCodeGeneration.cpp
@@ -545,6 +545,11 @@ private:
   /// @param The kernel to generate the intrinsic functions for.
   void insertKernelIntrinsics(ppcg_kernel *Kernel);

+  /// Insert function calls to retrieve the SPIR group/local ids.
+  ///
+  /// @param The kernel to generate the function calls for.
+  void insertKernelCallsSPIR(ppcg_kernel *Kernel);
+
   /// Setup the creation of functions referenced by the GPU kernel.
   ///
   /// 1. Create new function declarations in GPUModule which are the same as
@@ -1254,10 +1259,24 @@ void GPUNodeBuilder::createScopStmt(isl_ast_expr *Expr,
 void GPUNodeBuilder::createKernelSync() {
   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
+  const char *SpirName = "__gen_ocl_barrier_global";

   Function *Sync;

   switch (Arch) {
+  case GPUArch::SPIR64:
+  case GPUArch::SPIR32:
+    Sync = M->getFunction(SpirName);
+
+    // If Sync is not available, declare it.
+    if (!Sync) {
+      GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
+      std::vector<Type *> Args;
+      FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
+      Sync = Function::Create(Ty, Linkage, SpirName, M);
+      Sync->setCallingConv(CallingConv::SPIR_FUNC);
+    }
+    break;
   case GPUArch::NVPTX64:
     Sync = Intrinsic::getDeclaration(M, Intrinsic::nvvm_barrier0);
     break;
@@ -1668,7 +1687,8 @@ void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) {
   finalizeKernelArguments(Kernel);
   Function *F = Builder.GetInsertBlock()->getParent();
-  addCUDAAnnotations(F->getParent(), BlockDimX, BlockDimY, BlockDimZ);
+  if (Arch == GPUArch::NVPTX64)
+    addCUDAAnnotations(F->getParent(), BlockDimX, BlockDimY, BlockDimZ);
   clearDominators(F);
   clearScalarEvolution(F);
   clearLoops(F);
@@ -1725,12 +1745,35 @@ static std::string computeNVPTXDataLayout(bool is64Bit) {
   return Ret;
 }

+/// Compute the DataLayout string for a SPIR kernel.
+///
+/// @param is64Bit Are we looking for a 64 bit architecture?
+static std::string computeSPIRDataLayout(bool is64Bit) {
+  std::string Ret = "";
+
+  if (!is64Bit) {
+    Ret += "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:"
+           "64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:"
+           "32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:"
+           "256:256-v256:256:256-v512:512:512-v1024:1024:1024";
+  } else {
+    Ret += "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:"
+           "64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:"
+           "32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:"
+           "256:256-v256:256:256-v512:512:512-v1024:1024:1024";
+  }
+
+  return Ret;
+}
+
 Function *
 GPUNodeBuilder::createKernelFunctionDecl(ppcg_kernel *Kernel,
                                          SetVector<Value *> &SubtreeValues) {
   std::vector<Type *> Args;
   std::string Identifier = getKernelFuncName(Kernel->id);

+  std::vector<Metadata *> MemoryType;
+
   for (long i = 0; i < Prog->n_array; i++) {
     if (!ppcg_kernel_requires_array_argument(Kernel, i))
       continue;
@@ -1739,16 +1782,23 @@ GPUNodeBuilder::createKernelFunctionDecl(ppcg_kernel *Kernel,
       isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set);
       const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(Id);
       Args.push_back(SAI->getElementType());
+      MemoryType.push_back(
+          ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), 0)));
     } else {
       static const int UseGlobalMemory = 1;
       Args.push_back(Builder.getInt8PtrTy(UseGlobalMemory));
+      MemoryType.push_back(
+          ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), 1)));
     }
   }

   int NumHostIters = isl_space_dim(Kernel->space, isl_dim_set);

-  for (long i = 0; i < NumHostIters; i++)
+  for (long i = 0; i < NumHostIters; i++) {
     Args.push_back(Builder.getInt64Ty());
+    MemoryType.push_back(
+        ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), 0)));
+  }

   int NumVars = isl_space_dim(Kernel->space, isl_dim_param);

@@ -1757,19 +1807,49 @@ GPUNodeBuilder::createKernelFunctionDecl(ppcg_kernel *Kernel,
     Value *Val = IDToValue[Id];
     isl_id_free(Id);
     Args.push_back(Val->getType());
+    MemoryType.push_back(
+        ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), 0)));
   }

-  for (auto *V : SubtreeValues)
+  for (auto *V : SubtreeValues) {
     Args.push_back(V->getType());
+    MemoryType.push_back(
+        ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), 0)));
+  }

   auto *FT = FunctionType::get(Builder.getVoidTy(), Args, false);
   auto *FN = Function::Create(FT, Function::ExternalLinkage, Identifier,
                               GPUModule.get());

+  std::vector<Metadata *> EmptyStrings;
+
+  for (unsigned int i = 0; i < MemoryType.size(); i++) {
+    EmptyStrings.push_back(MDString::get(FN->getContext(), ""));
+  }
+
+  if (Arch == GPUArch::SPIR32 || Arch == GPUArch::SPIR64) {
+    FN->setMetadata("kernel_arg_addr_space",
+                    MDNode::get(FN->getContext(), MemoryType));
+    FN->setMetadata("kernel_arg_name",
+                    MDNode::get(FN->getContext(), EmptyStrings));
+    FN->setMetadata("kernel_arg_access_qual",
+                    MDNode::get(FN->getContext(), EmptyStrings));
+    FN->setMetadata("kernel_arg_type",
+                    MDNode::get(FN->getContext(), EmptyStrings));
+    FN->setMetadata("kernel_arg_type_qual",
+                    MDNode::get(FN->getContext(), EmptyStrings));
+    FN->setMetadata("kernel_arg_base_type",
+                    MDNode::get(FN->getContext(), EmptyStrings));
+  }
+
   switch (Arch) {
   case GPUArch::NVPTX64:
     FN->setCallingConv(CallingConv::PTX_Kernel);
     break;
+  case GPUArch::SPIR32:
+  case GPUArch::SPIR64:
+    FN->setCallingConv(CallingConv::SPIR_KERNEL);
+    break;
   }

   auto Arg = FN->arg_begin();
@@ -1835,6 +1915,9 @@ void GPUNodeBuilder::insertKernelIntrinsics(ppcg_kernel *Kernel) {
   Intrinsic::ID IntrinsicsTID[3];

   switch (Arch) {
+  case GPUArch::SPIR64:
+  case GPUArch::SPIR32:
+    llvm_unreachable("Cannot generate NVVM intrinsics for SPIR");
   case GPUArch::NVPTX64:
     IntrinsicsBID[0] = Intrinsic::nvvm_read_ptx_sreg_ctaid_x;
     IntrinsicsBID[1] = Intrinsic::nvvm_read_ptx_sreg_ctaid_y;
@@ -1866,6 +1949,41 @@ void GPUNodeBuilder::insertKernelIntrinsics(ppcg_kernel *Kernel) {
   }
 }

+void GPUNodeBuilder::insertKernelCallsSPIR(ppcg_kernel *Kernel) {
+  const char *GroupName[3] = {"__gen_ocl_get_group_id0",
+                              "__gen_ocl_get_group_id1",
+                              "__gen_ocl_get_group_id2"};
+
+  const char *LocalName[3] = {"__gen_ocl_get_local_id0",
+                              "__gen_ocl_get_local_id1",
+                              "__gen_ocl_get_local_id2"};
+
+  auto createFunc = [this](const char *Name, __isl_take isl_id *Id) mutable {
+    Module *M = Builder.GetInsertBlock()->getParent()->getParent();
+    Function *FN = M->getFunction(Name);
+
+    // If FN is not available, declare it.
+    if (!FN) {
+      GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
+      std::vector<Type *> Args;
+      FunctionType *Ty = FunctionType::get(Builder.getInt32Ty(), Args, false);
+      FN = Function::Create(Ty, Linkage, Name, M);
+      FN->setCallingConv(CallingConv::SPIR_FUNC);
+    }
+
+    Value *Val = Builder.CreateCall(FN, {});
+    Val = Builder.CreateIntCast(Val, Builder.getInt64Ty(), false, Name);
+    IDToValue[Id] = Val;
+    KernelIDs.insert(std::unique_ptr<isl_id, IslIdDeleter>(Id));
+  };
+
+  for (int i = 0; i < Kernel->n_grid; ++i)
+    createFunc(GroupName[i], isl_id_list_get_id(Kernel->block_ids, i));
+
+  for (int i = 0; i < Kernel->n_block; ++i)
+    createFunc(LocalName[i], isl_id_list_get_id(Kernel->thread_ids, i));
+}
+
 void GPUNodeBuilder::prepareKernelArguments(ppcg_kernel *Kernel, Function *FN) {
   auto Arg = FN->arg_begin();
   for (long i = 0; i < Kernel->n_array; i++) {
@@ -2004,6 +2122,14 @@ void GPUNodeBuilder::createKernelFunction(
     GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-nvcl"));
     GPUModule->setDataLayout(computeNVPTXDataLayout(true /* is64Bit */));
     break;
+  case GPUArch::SPIR32:
+    GPUModule->setTargetTriple(Triple::normalize("spir-unknown-unknown"));
+    GPUModule->setDataLayout(computeSPIRDataLayout(false /* is64Bit */));
+    break;
+  case GPUArch::SPIR64:
+    GPUModule->setTargetTriple(Triple::normalize("spir64-unknown-unknown"));
+    GPUModule->setDataLayout(computeSPIRDataLayout(true /* is64Bit */));
+    break;
   }

   Function *FN = createKernelFunctionDecl(Kernel, SubtreeValues);
@@ -2021,7 +2147,16 @@ void GPUNodeBuilder::createKernelFunction(
   prepareKernelArguments(Kernel, FN);
   createKernelVariables(Kernel, FN);
-  insertKernelIntrinsics(Kernel);
+
+  switch (Arch) {
+  case GPUArch::NVPTX64:
+    insertKernelIntrinsics(Kernel);
+    break;
+  case GPUArch::SPIR32:
+  case GPUArch::SPIR64:
+    insertKernelCallsSPIR(Kernel);
+    break;
+  }
 }

 std::string GPUNodeBuilder::createKernelASM() {
@@ -2038,6 +2173,13 @@ std::string GPUNodeBuilder::createKernelASM() {
       break;
     }
     break;
+  case GPUArch::SPIR64:
+  case GPUArch::SPIR32:
+    std::string SPIRAssembly;
+    raw_string_ostream IROstream(SPIRAssembly);
+    IROstream << *GPUModule;
+    IROstream.flush();
+    return SPIRAssembly;
   }

   std::string ErrMsg;
@@ -2057,6 +2199,9 @@ std::string GPUNodeBuilder::createKernelASM() {
   case GPUArch::NVPTX64:
     subtarget = CudaVersion;
     break;
+  case GPUArch::SPIR32:
+  case GPUArch::SPIR64:
+    llvm_unreachable("No subtarget for SPIR architecture");
   }

   std::unique_ptr<TargetMachine> TargetM(GPUTarget->createTargetMachine(
@@ -2097,13 +2242,15 @@ std::string GPUNodeBuilder::finalizeKernelFunction() {
   if (DumpKernelIR)
     outs() << *GPUModule << "\n";

-  // Optimize module.
-  llvm::legacy::PassManager OptPasses;
-  PassManagerBuilder PassBuilder;
-  PassBuilder.OptLevel = 3;
-  PassBuilder.SizeLevel = 0;
-  PassBuilder.populateModulePassManager(OptPasses);
-  OptPasses.run(*GPUModule);
+  if (Arch != GPUArch::SPIR32 && Arch != GPUArch::SPIR64) {
+    // Optimize module.
+    llvm::legacy::PassManager OptPasses;
+    PassManagerBuilder PassBuilder;
+    PassBuilder.OptLevel = 3;
+    PassBuilder.SizeLevel = 0;
+    PassBuilder.populateModulePassManager(OptPasses);
+    OptPasses.run(*GPUModule);
+  }

   std::string Assembly = createKernelASM();
--
cgit v1.2.3
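
A minimal usage sketch (not part of the patch above) for exercising the new SPIR
path, assuming clang's -mllvm pass-through and Polly's -polly-target=gpu option.
Only the polly-gpu-arch and polly-gpu-runtime flag values and the USE_INTEL_OCL
cmake flag come from the commit summary; the remaining flags, paths, and file
names are assumptions and may differ between Polly versions.

  # Configure the build with Intel OpenCL (Beignet) runtime support enabled
  # (cmake flag from the summary; the source path is a placeholder).
  cmake -DUSE_INTEL_OCL=ON <path-to-llvm-with-polly>

  # Compile a loop-heavy source file, letting Polly offload SCoPs to the GPU,
  # emit 64-bit SPIR, and use the OpenCL-based GPU runtime.
  clang -O3 -mllvm -polly -mllvm -polly-target=gpu \
        -mllvm -polly-gpu-arch=spir64 -mllvm -polly-gpu-runtime=libopencl \
        gemm.c -o gemm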