Diffstat (limited to 'polly/lib/CodeGen/PPCGCodeGeneration.cpp')
 polly/lib/CodeGen/PPCGCodeGeneration.cpp | 43
 1 file changed, 40 insertions(+), 3 deletions(-)
diff --git a/polly/lib/CodeGen/PPCGCodeGeneration.cpp b/polly/lib/CodeGen/PPCGCodeGeneration.cpp
index 481e20b54b7..d418300d6f7 100644
--- a/polly/lib/CodeGen/PPCGCodeGeneration.cpp
+++ b/polly/lib/CodeGen/PPCGCodeGeneration.cpp
@@ -249,6 +249,21 @@ private:
   /// @param FN The function into which to generate the variables.
   void createKernelVariables(ppcg_kernel *Kernel, Function *FN);
 
+  /// Add CUDA annotations to module.
+  ///
+  /// Add a set of CUDA annotations that declares the maximal block dimensions
+  /// that will be used to execute the CUDA kernel. This allows the NVIDIA
+  /// PTX compiler to bound the number of allocated registers to ensure the
+  /// resulting kernel is known to run with up to as many block dimensions
+  /// as specified here.
+  ///
+  /// @param M The module to add the annotations to.
+  /// @param BlockDimX The size of block dimension X.
+  /// @param BlockDimY The size of block dimension Y.
+  /// @param BlockDimZ The size of block dimension Z.
+  void addCUDAAnnotations(Module *M, Value *BlockDimX, Value *BlockDimY,
+                          Value *BlockDimZ);
+
   /// Create GPU kernel.
   ///
   /// Code generate the kernel described by @p KernelStmt.
@@ -448,6 +463,27 @@ void GPUNodeBuilder::allocateDeviceArrays() {
   isl_ast_build_free(Build);
 }
 
+void GPUNodeBuilder::addCUDAAnnotations(Module *M, Value *BlockDimX,
+                                        Value *BlockDimY, Value *BlockDimZ) {
+  auto AnnotationNode = M->getOrInsertNamedMetadata("nvvm.annotations");
+
+  for (auto &F : *M) {
+    if (F.getCallingConv() != CallingConv::PTX_Kernel)
+      continue;
+
+    Value *V[] = {BlockDimX, BlockDimY, BlockDimZ};
+
+    Metadata *Elements[] = {
+        ValueAsMetadata::get(&F),   MDString::get(M->getContext(), "maxntidx"),
+        ValueAsMetadata::get(V[0]), MDString::get(M->getContext(), "maxntidy"),
+        ValueAsMetadata::get(V[1]), MDString::get(M->getContext(), "maxntidz"),
+        ValueAsMetadata::get(V[2]),
+    };
+    MDNode *Node = MDNode::get(M->getContext(), Elements);
+    AnnotationNode->addOperand(Node);
+  }
+}
+
 void GPUNodeBuilder::freeDeviceArrays() {
   for (auto &Array : DeviceAllocations)
     createCallFreeDeviceMemory(Array.second);
@@ -1021,6 +1057,9 @@ void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) {
   isl_id_free(Id);
   isl_ast_node_free(KernelStmt);
 
+  Value *BlockDimX, *BlockDimY, *BlockDimZ;
+  std::tie(BlockDimX, BlockDimY, BlockDimZ) = getBlockSizes(Kernel);
+
   SetVector<Value *> SubtreeValues = getReferencesInKernel(Kernel);
 
   assert(Kernel->tree && "Device AST of kernel node is empty");
@@ -1048,6 +1087,7 @@ void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) {
   create(isl_ast_node_copy(Kernel->tree));
 
   Function *F = Builder.GetInsertBlock()->getParent();
+  addCUDAAnnotations(F->getParent(), BlockDimX, BlockDimY, BlockDimZ);
   clearDominators(F);
   clearScalarEvolution(F);
   clearLoops(F);
@@ -1076,9 +1116,6 @@ void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) {
   Value *GridDimX, *GridDimY;
   std::tie(GridDimX, GridDimY) = getGridSizes(Kernel);
 
-  Value *BlockDimX, *BlockDimY, *BlockDimZ;
-  std::tie(BlockDimX, BlockDimY, BlockDimZ) = getBlockSizes(Kernel);
-
   createCallLaunchKernel(GPUKernel, GridDimX, GridDimY, BlockDimX, BlockDimY,
                          BlockDimZ, Parameters);
   createCallFreeKernel(GPUKernel);
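For readers unfamiliar with the nvvm.annotations mechanism this patch relies on, the sketch below shows the same metadata pattern as a minimal, self-contained C++ program against the LLVM API, independent of Polly. The module name, the dummy demo_kernel function, and the constant block sizes (32, 8, 1) are illustrative assumptions only; in the patch the dimension Values come from getBlockSizes(Kernel) and the annotated functions are the GPU kernels Polly generates.

#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("block_size_annotations_demo", Ctx);

  // Stand-in for a kernel Polly would emit: a trivial function with the
  // PTX_Kernel calling convention, which is what addCUDAAnnotations keys on.
  FunctionType *FT = FunctionType::get(Type::getVoidTy(Ctx), false);
  Function *Kernel =
      Function::Create(FT, GlobalValue::ExternalLinkage, "demo_kernel", &M);
  Kernel->setCallingConv(CallingConv::PTX_Kernel);
  IRBuilder<> Builder(BasicBlock::Create(Ctx, "entry", Kernel));
  Builder.CreateRetVoid();

  // Illustrative constant block sizes; in the patch these Values are obtained
  // from getBlockSizes(Kernel).
  Value *BlockDimX = Builder.getInt32(32);
  Value *BlockDimY = Builder.getInt32(8);
  Value *BlockDimZ = Builder.getInt32(1);

  // Pair the kernel with "maxntidx"/"maxntidy"/"maxntidz" entries in the
  // module-level nvvm.annotations metadata, mirroring addCUDAAnnotations.
  NamedMDNode *Annotations = M.getOrInsertNamedMetadata("nvvm.annotations");
  Metadata *Elements[] = {
      ValueAsMetadata::get(Kernel),    MDString::get(Ctx, "maxntidx"),
      ValueAsMetadata::get(BlockDimX), MDString::get(Ctx, "maxntidy"),
      ValueAsMetadata::get(BlockDimY), MDString::get(Ctx, "maxntidz"),
      ValueAsMetadata::get(BlockDimZ),
  };
  Annotations->addOperand(MDNode::get(Ctx, Elements));

  // Print the module; the nvvm.annotations named metadata appears at the end.
  M.print(outs(), nullptr);
  return 0;
}

As the added doxygen comment explains, these per-kernel bounds let the NVIDIA PTX compiler limit the number of allocated registers so the resulting kernel is known to run with up to the specified block dimensions.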

