summaryrefslogtreecommitdiffstats
path: root/polly/lib/CodeGen/PPCGCodeGeneration.cpp
diff options
context:
space:
mode:
authorTobias Grosser <tobias@grosser.es>2016-08-05 06:47:43 +0000
committerTobias Grosser <tobias@grosser.es>2016-08-05 06:47:43 +0000
commitc1c6a2a61b42680ea92dbdf15c616688874be188 (patch)
treec0111b6913af9cc239108b53743fe80e6b88afb9 /polly/lib/CodeGen/PPCGCodeGeneration.cpp
parent8de920cf0e511551e2a125444c2b06420ea39975 (diff)
downloadbcm5719-llvm-c1c6a2a61b42680ea92dbdf15c616688874be188.tar.gz
bcm5719-llvm-c1c6a2a61b42680ea92dbdf15c616688874be188.zip
GPGPU: Add cuda annotations to specify maximal number of threads per block
These annotations ensure that the NVIDIA PTX assembler limits the number of registers used such that we can be certain the resulting kernel can be executed for the number of threads in a thread block that we are planning to use.
llvm-svn: 277799
Diffstat (limited to 'polly/lib/CodeGen/PPCGCodeGeneration.cpp')
-rw-r--r--polly/lib/CodeGen/PPCGCodeGeneration.cpp43
1 files changed, 40 insertions, 3 deletions
diff --git a/polly/lib/CodeGen/PPCGCodeGeneration.cpp b/polly/lib/CodeGen/PPCGCodeGeneration.cpp
index 481e20b54b7..d418300d6f7 100644
--- a/polly/lib/CodeGen/PPCGCodeGeneration.cpp
+++ b/polly/lib/CodeGen/PPCGCodeGeneration.cpp
@@ -249,6 +249,21 @@ private:
/// @param FN The function into which to generate the variables.
void createKernelVariables(ppcg_kernel *Kernel, Function *FN);
+ /// Add CUDA annotations to module.
+ ///
+ /// Add a set of CUDA annotations that declares the maximal block dimensions
+ /// that will be used to execute the CUDA kernel. This allows the NVIDIA
+ /// PTX compiler to bound the number of allocated registers, which ensures
+ /// that the resulting kernel can be run with thread blocks of up to the
+ /// sizes specified here.
+ ///
+ /// @param M The module to add the annotations to.
+ /// @param BlockDimX The size of block dimension X.
+ /// @param BlockDimY The size of block dimension Y.
+ /// @param BlockDimZ The size of block dimension Z.
+ void addCUDAAnnotations(Module *M, Value *BlockDimX, Value *BlockDimY,
+ Value *BlockDimZ);
+
/// Create GPU kernel.
///
/// Code generate the kernel described by @p KernelStmt.
@@ -448,6 +463,27 @@ void GPUNodeBuilder::allocateDeviceArrays() {
isl_ast_build_free(Build);
}
+void GPUNodeBuilder::addCUDAAnnotations(Module *M, Value *BlockDimX,
+                                        Value *BlockDimY, Value *BlockDimZ) {
+  // Every annotation is appended to the module-level "nvvm.annotations"
+  // named metadata node, which is created on first use.
+  auto &Ctx = M->getContext();
+  auto *Annotations = M->getOrInsertNamedMetadata("nvvm.annotations");
+
+  for (auto &Fn : *M) {
+    // Only functions with the PTX kernel calling convention are annotated.
+    if (Fn.getCallingConv() == CallingConv::PTX_Kernel) {
+      // One operand per kernel: the function itself, followed by a
+      // (name, value) pair for each of the three block dimensions.
+      Metadata *Operands[] = {
+          ValueAsMetadata::get(&Fn),
+          MDString::get(Ctx, "maxntidx"),
+          ValueAsMetadata::get(BlockDimX),
+          MDString::get(Ctx, "maxntidy"),
+          ValueAsMetadata::get(BlockDimY),
+          MDString::get(Ctx, "maxntidz"),
+          ValueAsMetadata::get(BlockDimZ),
+      };
+      Annotations->addOperand(MDNode::get(Ctx, Operands));
+    }
+  }
+}
+
void GPUNodeBuilder::freeDeviceArrays() {
for (auto &Array : DeviceAllocations)
createCallFreeDeviceMemory(Array.second);
@@ -1021,6 +1057,9 @@ void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) {
isl_id_free(Id);
isl_ast_node_free(KernelStmt);
+ Value *BlockDimX, *BlockDimY, *BlockDimZ;
+ std::tie(BlockDimX, BlockDimY, BlockDimZ) = getBlockSizes(Kernel);
+
SetVector<Value *> SubtreeValues = getReferencesInKernel(Kernel);
assert(Kernel->tree && "Device AST of kernel node is empty");
@@ -1048,6 +1087,7 @@ void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) {
create(isl_ast_node_copy(Kernel->tree));
Function *F = Builder.GetInsertBlock()->getParent();
+ addCUDAAnnotations(F->getParent(), BlockDimX, BlockDimY, BlockDimZ);
clearDominators(F);
clearScalarEvolution(F);
clearLoops(F);
@@ -1076,9 +1116,6 @@ void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) {
Value *GridDimX, *GridDimY;
std::tie(GridDimX, GridDimY) = getGridSizes(Kernel);
- Value *BlockDimX, *BlockDimY, *BlockDimZ;
- std::tie(BlockDimX, BlockDimY, BlockDimZ) = getBlockSizes(Kernel);
-
createCallLaunchKernel(GPUKernel, GridDimX, GridDimY, BlockDimX, BlockDimY,
BlockDimZ, Parameters);
createCallFreeKernel(GPUKernel);
OpenPOWER on IntegriCloud