GPGPU: Add cuda annotations to specify maximal number of threads per block

These annotations ensure that the NVIDIA PTX assembler limits the number of registers it uses, so that the resulting kernel is guaranteed to be executable with the number of threads per thread block that we plan to launch it with. llvm-svn: 277799
author: Tobias Grosser <tobias@grosser.es> 2016-08-05 06:47:43 +0000
committer: Tobias Grosser <tobias@grosser.es> 2016-08-05 06:47:43 +0000
commit: c1c6a2a61b42680ea92dbdf15c616688874be188 (patch)
tree: c0111b6913af9cc239108b53743fe80e6b88afb9 /polly/lib/CodeGen/PPCGCodeGeneration.cpp
parent: 8de920cf0e511551e2a125444c2b06420ea39975 (diff)
download: bcm5719-llvm-c1c6a2a61b42680ea92dbdf15c616688874be188.tar.gz
bcm5719-llvm-c1c6a2a61b42680ea92dbdf15c616688874be188.zip
1 files changed, 40 insertions, 3 deletions
diff --git a/polly/lib/CodeGen/PPCGCodeGeneration.cpp b/polly/lib/CodeGen/PPCGCodeGeneration.cpp
index 481e20b54b7..d418300d6f7 100644
--- a/polly/lib/CodeGen/PPCGCodeGeneration.cpp
+++ b/polly/lib/CodeGen/PPCGCodeGeneration.cpp
@@ -249,6 +249,21 @@ private:
   /// @param FN            The function into which to generate the variables.
   void createKernelVariables(ppcg_kernel *Kernel, Function *FN);
 
+  /// Add CUDA launch-bound annotations to a module.
+  ///
+  /// For every function in @p M that uses the PTX kernel calling convention,
+  /// append an "nvvm.annotations" entry with the keys "maxntidx", "maxntidy"
+  /// and "maxntidz", declaring the maximal block dimensions the kernel will
+  /// be launched with. This allows the NVIDIA PTX compiler to bound register
+  /// allocation so the kernel can run with blocks of up to the given size.
+  ///
+  /// @param M         The module to add the annotations to.
+  /// @param BlockDimX The maximal size of block dimension X.
+  /// @param BlockDimY The maximal size of block dimension Y.
+  /// @param BlockDimZ The maximal size of block dimension Z.
+  void addCUDAAnnotations(Module *M, Value *BlockDimX, Value *BlockDimY,
+                          Value *BlockDimZ);
+
   /// Create GPU kernel.
   ///
   /// Code generate the kernel described by @p KernelStmt.
@@ -448,6 +463,27 @@ void GPUNodeBuilder::allocateDeviceArrays() {
   isl_ast_build_free(Build);
 }
 
+void GPUNodeBuilder::addCUDAAnnotations(Module *M, Value *BlockDimX,
+                                        Value *BlockDimY, Value *BlockDimZ) {
+  auto AnnotationNode = M->getOrInsertNamedMetadata("nvvm.annotations");
+  // Only functions with the PTX kernel calling convention are annotated.
+  for (auto &F : *M) {
+    if (F.getCallingConv() != CallingConv::PTX_Kernel)
+      continue;
+    // The same block sizes are attached to every kernel found in the module.
+    Value *V[] = {BlockDimX, BlockDimY, BlockDimZ};
+    // Node layout: the kernel, followed by (property name, value) pairs.
+    Metadata *Elements[] = {
+        ValueAsMetadata::get(&F),   MDString::get(M->getContext(), "maxntidx"),
+        ValueAsMetadata::get(V[0]), MDString::get(M->getContext(), "maxntidy"),
+        ValueAsMetadata::get(V[1]), MDString::get(M->getContext(), "maxntidz"),
+        ValueAsMetadata::get(V[2]),
+    };
+    MDNode *Node = MDNode::get(M->getContext(), Elements);
+    AnnotationNode->addOperand(Node);
+  }
+}
+
 void GPUNodeBuilder::freeDeviceArrays() {
   for (auto &Array : DeviceAllocations)
     createCallFreeDeviceMemory(Array.second);
@@ -1021,6 +1057,9 @@ void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) {
   isl_id_free(Id);
   isl_ast_node_free(KernelStmt);
 
+  Value *BlockDimX, *BlockDimY, *BlockDimZ;
+  std::tie(BlockDimX, BlockDimY, BlockDimZ) = getBlockSizes(Kernel);
+
   SetVector<Value *> SubtreeValues = getReferencesInKernel(Kernel);
 
   assert(Kernel->tree && "Device AST of kernel node is empty");
@@ -1048,6 +1087,7 @@ void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) {
   create(isl_ast_node_copy(Kernel->tree));
 
   Function *F = Builder.GetInsertBlock()->getParent();
+  addCUDAAnnotations(F->getParent(), BlockDimX, BlockDimY, BlockDimZ);
   clearDominators(F);
   clearScalarEvolution(F);
   clearLoops(F);
@@ -1076,9 +1116,6 @@ void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) {
   Value *GridDimX, *GridDimY;
   std::tie(GridDimX, GridDimY) = getGridSizes(Kernel);
 
-  Value *BlockDimX, *BlockDimY, *BlockDimZ;
-  std::tie(BlockDimX, BlockDimY, BlockDimZ) = getBlockSizes(Kernel);
-
   createCallLaunchKernel(GPUKernel, GridDimX, GridDimY, BlockDimX, BlockDimY,
                          BlockDimZ, Parameters);
   createCallFreeKernel(GPUKernel);
author	Tobias Grosser <tobias@grosser.es>	2016-08-05 06:47:43 +0000
committer	Tobias Grosser <tobias@grosser.es>	2016-08-05 06:47:43 +0000
commit	c1c6a2a61b42680ea92dbdf15c616688874be188 (patch)
tree	c0111b6913af9cc239108b53743fe80e6b88afb9 /polly/lib/CodeGen/PPCGCodeGeneration.cpp
parent	8de920cf0e511551e2a125444c2b06420ea39975 (diff)
download	bcm5719-llvm-c1c6a2a61b42680ea92dbdf15c616688874be188.tar.gz bcm5719-llvm-c1c6a2a61b42680ea92dbdf15c616688874be188.zip