summaryrefslogtreecommitdiffstats
path: root/polly
diff options
context:
space:
mode:
Diffstat (limited to 'polly')
-rw-r--r--polly/lib/CodeGen/PPCGCodeGeneration.cpp43
-rw-r--r--polly/test/GPGPU/cuda-annotations.ll35
2 files changed, 75 insertions, 3 deletions
diff --git a/polly/lib/CodeGen/PPCGCodeGeneration.cpp b/polly/lib/CodeGen/PPCGCodeGeneration.cpp
index 481e20b54b7..d418300d6f7 100644
--- a/polly/lib/CodeGen/PPCGCodeGeneration.cpp
+++ b/polly/lib/CodeGen/PPCGCodeGeneration.cpp
@@ -249,6 +249,21 @@ private:
/// @param FN The function into which to generate the variables.
void createKernelVariables(ppcg_kernel *Kernel, Function *FN);
+ /// Add CUDA annotations to module.
+ ///
+ /// Add a set of CUDA annotations that declares the maximal block dimensions
+ /// that will be used to execute the CUDA kernel. This allows the NVIDIA
+ /// PTX compiler to bound the number of allocated registers to ensure the
+ /// resulting kernel is known to run with block dimensions up to the sizes
+ /// specified here.
+ ///
+ /// @param M The module to add the annotations to.
+ /// @param BlockDimX The size of block dimension X.
+ /// @param BlockDimY The size of block dimension Y.
+ /// @param BlockDimZ The size of block dimension Z.
+ void addCUDAAnnotations(Module *M, Value *BlockDimX, Value *BlockDimY,
+ Value *BlockDimZ);
+
/// Create GPU kernel.
///
/// Code generate the kernel described by @p KernelStmt.
@@ -448,6 +463,27 @@ void GPUNodeBuilder::allocateDeviceArrays() {
isl_ast_build_free(Build);
}
+void GPUNodeBuilder::addCUDAAnnotations(Module *M, Value *BlockDimX,
+ Value *BlockDimY, Value *BlockDimZ) {
+ auto AnnotationNode = M->getOrInsertNamedMetadata("nvvm.annotations");
+ // Visit every function of the module; only PTX kernels get an annotation.
+ for (auto &F : *M) {
+ if (F.getCallingConv() != CallingConv::PTX_Kernel)
+ continue;
+
+ Value *V[] = {BlockDimX, BlockDimY, BlockDimZ};
+ // One node per kernel: the function, then a (name, value) pair per dimension.
+ Metadata *Elements[] = {
+ ValueAsMetadata::get(&F), MDString::get(M->getContext(), "maxntidx"),
+ ValueAsMetadata::get(V[0]), MDString::get(M->getContext(), "maxntidy"),
+ ValueAsMetadata::get(V[1]), MDString::get(M->getContext(), "maxntidz"),
+ ValueAsMetadata::get(V[2]),
+ };
+ MDNode *Node = MDNode::get(M->getContext(), Elements);
+ AnnotationNode->addOperand(Node);
+ }
+}
+
void GPUNodeBuilder::freeDeviceArrays() {
for (auto &Array : DeviceAllocations)
createCallFreeDeviceMemory(Array.second);
@@ -1021,6 +1057,9 @@ void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) {
isl_id_free(Id);
isl_ast_node_free(KernelStmt);
+ Value *BlockDimX, *BlockDimY, *BlockDimZ;
+ std::tie(BlockDimX, BlockDimY, BlockDimZ) = getBlockSizes(Kernel);
+
SetVector<Value *> SubtreeValues = getReferencesInKernel(Kernel);
assert(Kernel->tree && "Device AST of kernel node is empty");
@@ -1048,6 +1087,7 @@ void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) {
create(isl_ast_node_copy(Kernel->tree));
Function *F = Builder.GetInsertBlock()->getParent();
+ addCUDAAnnotations(F->getParent(), BlockDimX, BlockDimY, BlockDimZ);
clearDominators(F);
clearScalarEvolution(F);
clearLoops(F);
@@ -1076,9 +1116,6 @@ void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) {
Value *GridDimX, *GridDimY;
std::tie(GridDimX, GridDimY) = getGridSizes(Kernel);
- Value *BlockDimX, *BlockDimY, *BlockDimZ;
- std::tie(BlockDimX, BlockDimY, BlockDimZ) = getBlockSizes(Kernel);
-
createCallLaunchKernel(GPUKernel, GridDimX, GridDimY, BlockDimX, BlockDimY,
BlockDimZ, Parameters);
createCallFreeKernel(GPUKernel);
diff --git a/polly/test/GPGPU/cuda-annotations.ll b/polly/test/GPGPU/cuda-annotations.ll
new file mode 100644
index 00000000000..569a6c41576
--- /dev/null
+++ b/polly/test/GPGPU/cuda-annotations.ll
@@ -0,0 +1,35 @@
+; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \
+; RUN: -disable-output < %s | \
+; RUN: FileCheck -check-prefix=KERNEL %s
+
+; KERNEL: define ptx_kernel void @kernel_0(i8* %MemRef_A, i64 %n) #0 {
+
+; KERNEL: !nvvm.annotations = !{!0}
+
+; KERNEL: !0 = !{void (i8*, i64)* @kernel_0, !"maxntidx", i32 32, !"maxntidy", i32 1, !"maxntidz", i32 1}
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @foo(i64* %A, i64 %n) { ; for (i = 0; i < n; i++) A[i] += 100;
+bb:
+ br label %bb1
+
+bb1: ; preds = %bb6, %bb
+ %i.0 = phi i64 [ 0, %bb ], [ %tmp7, %bb6 ] ; loop counter i, starting at 0
+ %tmp = icmp slt i64 %i.0, %n ; signed check i < n
+ br i1 %tmp, label %bb2, label %bb8
+
+bb2: ; preds = %bb1
+ %tmp3 = getelementptr inbounds i64, i64* %A, i64 %i.0 ; address of A[i]
+ %tmp4 = load i64, i64* %tmp3, align 8
+ %tmp5 = add nsw i64 %tmp4, 100 ; A[i] + 100
+ store i64 %tmp5, i64* %tmp3, align 8 ; written back to A[i]
+ br label %bb6
+
+bb6: ; preds = %bb2
+ %tmp7 = add nuw nsw i64 %i.0, 1 ; i + 1
+ br label %bb1
+
+bb8: ; preds = %bb1
+ ret void
+}
OpenPOWER on IntegriCloud