diff options
| author | Tobias Grosser <tobias@grosser.es> | 2016-08-05 06:47:43 +0000 |
|---|---|---|
| committer | Tobias Grosser <tobias@grosser.es> | 2016-08-05 06:47:43 +0000 |
| commit | c1c6a2a61b42680ea92dbdf15c616688874be188 (patch) | |
| tree | c0111b6913af9cc239108b53743fe80e6b88afb9 /polly/lib/CodeGen/PPCGCodeGeneration.cpp | |
| parent | 8de920cf0e511551e2a125444c2b06420ea39975 (diff) | |
| download | bcm5719-llvm-c1c6a2a61b42680ea92dbdf15c616688874be188.tar.gz bcm5719-llvm-c1c6a2a61b42680ea92dbdf15c616688874be188.zip | |
GPGPU: Add cuda annotations to specify maximal number of threads per block
These annotations ensure that the NVIDIA PTX assembler limits the number of
registers used such that we can be certain the resulting kernel can be executed
for the number of threads in a thread block that we are planning to use.
llvm-svn: 277799
Diffstat (limited to 'polly/lib/CodeGen/PPCGCodeGeneration.cpp')
| -rw-r--r-- | polly/lib/CodeGen/PPCGCodeGeneration.cpp | 43 |
1 files changed, 40 insertions, 3 deletions
diff --git a/polly/lib/CodeGen/PPCGCodeGeneration.cpp b/polly/lib/CodeGen/PPCGCodeGeneration.cpp index 481e20b54b7..d418300d6f7 100644 --- a/polly/lib/CodeGen/PPCGCodeGeneration.cpp +++ b/polly/lib/CodeGen/PPCGCodeGeneration.cpp @@ -249,6 +249,21 @@ private: /// @param FN The function into which to generate the variables. void createKernelVariables(ppcg_kernel *Kernel, Function *FN); + /// Add CUDA annotations to module. + /// + /// Add a set of CUDA annotations that declares the maximal block dimensions + /// that will be used to execute the CUDA kernel. This allows the NVIDIA + /// PTX compiler to bound the number of allocated registers to ensure the + /// resulting kernel is known to run with up to as many block dimensions + /// as specified here. + /// + /// @param M The module to add the annotations to. + /// @param BlockDimX The size of block dimension X. + /// @param BlockDimY The size of block dimension Y. + /// @param BlockDimZ The size of block dimension Z. + void addCUDAAnnotations(Module *M, Value *BlockDimX, Value *BlockDimY, + Value *BlockDimZ); + /// Create GPU kernel. /// /// Code generate the kernel described by @p KernelStmt. @@ -448,6 +463,27 @@ void GPUNodeBuilder::allocateDeviceArrays() { isl_ast_build_free(Build); } +void GPUNodeBuilder::addCUDAAnnotations(Module *M, Value *BlockDimX, + Value *BlockDimY, Value *BlockDimZ) { + auto AnnotationNode = M->getOrInsertNamedMetadata("nvvm.annotations"); + + for (auto &F : *M) { + if (F.getCallingConv() != CallingConv::PTX_Kernel) + continue; + + Value *V[] = {BlockDimX, BlockDimY, BlockDimZ}; + + Metadata *Elements[] = { + ValueAsMetadata::get(&F), MDString::get(M->getContext(), "maxntidx"), + ValueAsMetadata::get(V[0]), MDString::get(M->getContext(), "maxntidy"), + ValueAsMetadata::get(V[1]), MDString::get(M->getContext(), "maxntidz"), + ValueAsMetadata::get(V[2]), + }; + MDNode *Node = MDNode::get(M->getContext(), Elements); + AnnotationNode->addOperand(Node); + } +} + void GPUNodeBuilder::freeDeviceArrays() { for (auto &Array : DeviceAllocations) createCallFreeDeviceMemory(Array.second); @@ -1021,6 +1057,9 @@ void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) { isl_id_free(Id); isl_ast_node_free(KernelStmt); + Value *BlockDimX, *BlockDimY, *BlockDimZ; + std::tie(BlockDimX, BlockDimY, BlockDimZ) = getBlockSizes(Kernel); + SetVector<Value *> SubtreeValues = getReferencesInKernel(Kernel); assert(Kernel->tree && "Device AST of kernel node is empty"); @@ -1048,6 +1087,7 @@ void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) { create(isl_ast_node_copy(Kernel->tree)); Function *F = Builder.GetInsertBlock()->getParent(); + addCUDAAnnotations(F->getParent(), BlockDimX, BlockDimY, BlockDimZ); clearDominators(F); clearScalarEvolution(F); clearLoops(F); @@ -1076,9 +1116,6 @@ void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) { Value *GridDimX, *GridDimY; std::tie(GridDimX, GridDimY) = getGridSizes(Kernel); - Value *BlockDimX, *BlockDimY, *BlockDimZ; - std::tie(BlockDimX, BlockDimY, BlockDimZ) = getBlockSizes(Kernel); - createCallLaunchKernel(GPUKernel, GridDimX, GridDimY, BlockDimX, BlockDimY, BlockDimZ, Parameters); createCallFreeKernel(GPUKernel); |

