diff options
Diffstat (limited to 'polly')
| -rw-r--r-- | polly/lib/CodeGen/PPCGCodeGeneration.cpp | 43 | ||||
| -rw-r--r-- | polly/test/GPGPU/cuda-annotations.ll | 35 |
2 files changed, 75 insertions, 3 deletions
diff --git a/polly/lib/CodeGen/PPCGCodeGeneration.cpp b/polly/lib/CodeGen/PPCGCodeGeneration.cpp index 481e20b54b7..d418300d6f7 100644 --- a/polly/lib/CodeGen/PPCGCodeGeneration.cpp +++ b/polly/lib/CodeGen/PPCGCodeGeneration.cpp @@ -249,6 +249,21 @@ private: /// @param FN The function into which to generate the variables. void createKernelVariables(ppcg_kernel *Kernel, Function *FN); + /// Add CUDA annotations to module. + /// + /// Add a set of CUDA annotations that declares the maximal block dimensions + /// that will be used to execute the CUDA kernel. This allows the NVIDIA + /// PTX compiler to bound the number of allocated registers to ensure the + /// resulting kernel is known to run with up to as many block dimensions + /// as specified here. + /// + /// @param M The module to add the annotations to. + /// @param BlockDimX The size of block dimension X. + /// @param BlockDimY The size of block dimension Y. + /// @param BlockDimZ The size of block dimension Z. + void addCUDAAnnotations(Module *M, Value *BlockDimX, Value *BlockDimY, + Value *BlockDimZ); + /// Create GPU kernel. /// /// Code generate the kernel described by @p KernelStmt. @@ -448,6 +463,27 @@ void GPUNodeBuilder::allocateDeviceArrays() { isl_ast_build_free(Build); } +void GPUNodeBuilder::addCUDAAnnotations(Module *M, Value *BlockDimX, + Value *BlockDimY, Value *BlockDimZ) { + auto AnnotationNode = M->getOrInsertNamedMetadata("nvvm.annotations"); + + for (auto &F : *M) { + if (F.getCallingConv() != CallingConv::PTX_Kernel) + continue; + + Value *V[] = {BlockDimX, BlockDimY, BlockDimZ}; + + Metadata *Elements[] = { + ValueAsMetadata::get(&F), MDString::get(M->getContext(), "maxntidx"), + ValueAsMetadata::get(V[0]), MDString::get(M->getContext(), "maxntidy"), + ValueAsMetadata::get(V[1]), MDString::get(M->getContext(), "maxntidz"), + ValueAsMetadata::get(V[2]), + }; + MDNode *Node = MDNode::get(M->getContext(), Elements); + AnnotationNode->addOperand(Node); + } +} + void GPUNodeBuilder::freeDeviceArrays() { for (auto &Array : DeviceAllocations) createCallFreeDeviceMemory(Array.second); @@ -1021,6 +1057,9 @@ void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) { isl_id_free(Id); isl_ast_node_free(KernelStmt); + Value *BlockDimX, *BlockDimY, *BlockDimZ; + std::tie(BlockDimX, BlockDimY, BlockDimZ) = getBlockSizes(Kernel); + SetVector<Value *> SubtreeValues = getReferencesInKernel(Kernel); assert(Kernel->tree && "Device AST of kernel node is empty"); @@ -1048,6 +1087,7 @@ void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) { create(isl_ast_node_copy(Kernel->tree)); Function *F = Builder.GetInsertBlock()->getParent(); + addCUDAAnnotations(F->getParent(), BlockDimX, BlockDimY, BlockDimZ); clearDominators(F); clearScalarEvolution(F); clearLoops(F); @@ -1076,9 +1116,6 @@ void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) { Value *GridDimX, *GridDimY; std::tie(GridDimX, GridDimY) = getGridSizes(Kernel); - Value *BlockDimX, *BlockDimY, *BlockDimZ; - std::tie(BlockDimX, BlockDimY, BlockDimZ) = getBlockSizes(Kernel); - createCallLaunchKernel(GPUKernel, GridDimX, GridDimY, BlockDimX, BlockDimY, BlockDimZ, Parameters); createCallFreeKernel(GPUKernel); diff --git a/polly/test/GPGPU/cuda-annotations.ll b/polly/test/GPGPU/cuda-annotations.ll new file mode 100644 index 00000000000..569a6c41576 --- /dev/null +++ b/polly/test/GPGPU/cuda-annotations.ll @@ -0,0 +1,35 @@ +; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \ +; RUN: -disable-output < %s | \ +; RUN: FileCheck -check-prefix=KERNEL %s + +; KERNEL: define ptx_kernel void @kernel_0(i8* %MemRef_A, i64 %n) #0 { + +; KERNEL: !nvvm.annotations = !{!0} + +; KERNEL: !0 = !{void (i8*, i64)* @kernel_0, !"maxntidx", i32 32, !"maxntidy", i32 1, !"maxntidz", i32 1} + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @foo(i64* %A, i64 %n) { +bb: + br label %bb1 + +bb1: ; preds = %bb6, %bb + %i.0 = phi i64 [ 0, %bb ], [ %tmp7, %bb6 ] + %tmp = icmp slt i64 %i.0, %n + br i1 %tmp, label %bb2, label %bb8 + +bb2: ; preds = %bb1 + %tmp3 = getelementptr inbounds i64, i64* %A, i64 %i.0 + %tmp4 = load i64, i64* %tmp3, align 8 + %tmp5 = add nsw i64 %tmp4, 100 + store i64 %tmp5, i64* %tmp3, align 8 + br label %bb6 + +bb6: ; preds = %bb2 + %tmp7 = add nuw nsw i64 %i.0, 1 + br label %bb1 + +bb8: ; preds = %bb1 + ret void +} |

