diff options
| author | Tobias Grosser <tobias@grosser.es> | 2016-09-18 08:31:09 +0000 |
|---|---|---|
| committer | Tobias Grosser <tobias@grosser.es> | 2016-09-18 08:31:09 +0000 |
| commit | bc653f203189e9488751dd74e0c94bf4fb446eed (patch) | |
| tree | 214285b1d15cc0f87f27cd6fa768b128df5c8e60 | |
| parent | 82f2af350819fe7bd63ceb828af85ef1c4fabc75 (diff) | |
| download | bcm5719-llvm-bc653f203189e9488751dd74e0c94bf4fb446eed.tar.gz bcm5719-llvm-bc653f203189e9488751dd74e0c94bf4fb446eed.zip | |
GPGPU: Do not run mostly sequential kernels on the GPU
In case sequential kernels are found deeper in the loop tree than any parallel
kernel, the overall scop is probably mostly sequential. Hence, run it on the
CPU.
llvm-svn: 281849
| -rw-r--r-- | polly/lib/CodeGen/PPCGCodeGeneration.cpp | 19 | ||||
| -rw-r--r-- | polly/test/GPGPU/mostly-sequential.ll | 112 |
2 files changed, 131 insertions, 0 deletions
diff --git a/polly/lib/CodeGen/PPCGCodeGeneration.cpp b/polly/lib/CodeGen/PPCGCodeGeneration.cpp index 1830be96af2..6b872c35351 100644 --- a/polly/lib/CodeGen/PPCGCodeGeneration.cpp +++ b/polly/lib/CodeGen/PPCGCodeGeneration.cpp @@ -163,6 +163,12 @@ public: /// occurred which prevents us from generating valid GPU code. bool BuildSuccessful = true; + /// The maximal number of loops surrounding a sequential kernel. + unsigned DeepestSequential = 0; + + /// The maximal number of loops surrounding a parallel kernel. + unsigned DeepestParallel = 0; + private: /// A vector of array base pointers for which a new ScopArrayInfo was created. /// @@ -1179,6 +1185,13 @@ void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) { isl_id_free(Id); isl_ast_node_free(KernelStmt); + if (Kernel->n_grid > 1) + DeepestParallel = + std::max(DeepestParallel, isl_space_dim(Kernel->space, isl_dim_set)); + else + DeepestSequential = + std::max(DeepestSequential, isl_space_dim(Kernel->space, isl_dim_set)); + Value *BlockDimX, *BlockDimY, *BlockDimZ; std::tie(BlockDimX, BlockDimY, BlockDimZ) = getBlockSizes(Kernel); @@ -2417,6 +2430,12 @@ public: NodeBuilder.create(Root); NodeBuilder.finalize(); + /// In case a sequential kernel has more surrounding loops than any parallel + /// kernel, the SCoP is probably mostly sequential. Hence, there is no + /// point in running it on a GPU. 
+ if (NodeBuilder.DeepestSequential > NodeBuilder.DeepestParallel) + SplitBlock->getTerminator()->setOperand(0, Builder.getFalse()); + if (!NodeBuilder.BuildSuccessful) SplitBlock->getTerminator()->setOperand(0, Builder.getFalse()); } diff --git a/polly/test/GPGPU/mostly-sequential.ll b/polly/test/GPGPU/mostly-sequential.ll new file mode 100644 index 00000000000..dea4942fd53 --- /dev/null +++ b/polly/test/GPGPU/mostly-sequential.ll @@ -0,0 +1,112 @@ +; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \ +; RUN: -disable-output < %s | \ +; RUN: FileCheck -check-prefix=CODE %s + +; RUN: opt %loadPolly -polly-codegen-ppcg -S < %s | \ +; RUN: FileCheck %s -check-prefix=IR +; +; +; void foo(float A[]) { +; for (long i = 0; i < 128; i++) +; A[i] += i; +; +; for (long i = 0; i < 128; i++) +; for (long j = 0; j < 128; j++) +; A[42] += i + j; +; } + +; CODE: Code +; CODE-NEXT: ==== +; CODE-NEXT: # host +; CODE-NEXT: { +; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (128) * sizeof(float), cudaMemcpyHostToDevice)); +; CODE-NEXT: { +; CODE-NEXT: dim3 k0_dimBlock(32); +; CODE-NEXT: dim3 k0_dimGrid(4); +; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A); +; CODE-NEXT: cudaCheckKernel(); +; CODE-NEXT: } + +; CODE: for (int c0 = 0; c0 <= 127; c0 += 1) +; CODE-NEXT: for (int c1 = 0; c1 <= 127; c1 += 1) +; CODE-NEXT: { +; CODE-NEXT: dim3 k1_dimBlock; +; CODE-NEXT: dim3 k1_dimGrid; +; CODE-NEXT: kernel1 <<<k1_dimGrid, k1_dimBlock>>> (dev_MemRef_A, c0, c1); +; CODE-NEXT: cudaCheckKernel(); +; CODE-NEXT: } + +; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (128) * sizeof(float), cudaMemcpyDeviceToHost)); +; CODE-NEXT: } + +; CODE: # kernel0 +; CODE-NEXT: Stmt_bb4(32 * b0 + t0); + +; CODE: # kernel1 +; CODE-NEXT: Stmt_bb14(c0, c1); + +; Verify that we identified this kernel as non-profitable. 
+; IR: br i1 false, label %polly.start, label %bb3 + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @foo(float* %A) { +bb: + br label %bb3 + +bb3: ; preds = %bb8, %bb + %i.0 = phi i64 [ 0, %bb ], [ %tmp9, %bb8 ] + %exitcond2 = icmp ne i64 %i.0, 128 + br i1 %exitcond2, label %bb4, label %bb10 + +bb4: ; preds = %bb3 + %tmp = sitofp i64 %i.0 to float + %tmp5 = getelementptr inbounds float, float* %A, i64 %i.0 + %tmp6 = load float, float* %tmp5, align 4 + %tmp7 = fadd float %tmp6, %tmp + store float %tmp7, float* %tmp5, align 4 + br label %bb8 + +bb8: ; preds = %bb4 + %tmp9 = add nuw nsw i64 %i.0, 1 + br label %bb3 + +bb10: ; preds = %bb3 + br label %bb11 + +bb11: ; preds = %bb23, %bb10 + %i1.0 = phi i64 [ 0, %bb10 ], [ %tmp24, %bb23 ] + %exitcond1 = icmp ne i64 %i1.0, 128 + br i1 %exitcond1, label %bb12, label %bb25 + +bb12: ; preds = %bb11 + br label %bb13 + +bb13: ; preds = %bb20, %bb12 + %j.0 = phi i64 [ 0, %bb12 ], [ %tmp21, %bb20 ] + %exitcond = icmp ne i64 %j.0, 128 + br i1 %exitcond, label %bb14, label %bb22 + +bb14: ; preds = %bb13 + %tmp15 = add nuw nsw i64 %i1.0, %j.0 + %tmp16 = sitofp i64 %tmp15 to float + %tmp17 = getelementptr inbounds float, float* %A, i64 42 + %tmp18 = load float, float* %tmp17, align 4 + %tmp19 = fadd float %tmp18, %tmp16 + store float %tmp19, float* %tmp17, align 4 + br label %bb20 + +bb20: ; preds = %bb14 + %tmp21 = add nuw nsw i64 %j.0, 1 + br label %bb13 + +bb22: ; preds = %bb13 + br label %bb23 + +bb23: ; preds = %bb22 + %tmp24 = add nuw nsw i64 %i1.0, 1 + br label %bb11 + +bb25: ; preds = %bb11 + ret void +} |

