summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorTobias Grosser <tobias@grosser.es>2016-08-09 15:35:06 +0000
committerTobias Grosser <tobias@grosser.es>2016-08-09 15:35:06 +0000
commitb06ff4574e22bf997aa741bc09fc143c265b1816 (patch)
treea6ee4fa01aaf6cbc134455f2c75c87b565091f50
parent750160e2605af4212727e254f863bbbbe0353174 (diff)
downloadbcm5719-llvm-b06ff4574e22bf997aa741bc09fc143c265b1816.tar.gz
bcm5719-llvm-b06ff4574e22bf997aa741bc09fc143c265b1816.zip
[GPGPU] Support PHI nodes used in GPU kernel
Ensure the right scalar allocations are used as the host location of data transfers. For the device code, we clear the allocation cache before device code generation to be able to generate new device-specific allocations, and we make sure to add back the old host allocations as soon as the device code generation is finished. llvm-svn: 278126
-rw-r--r--polly/lib/CodeGen/PPCGCodeGeneration.cpp24
-rw-r--r--polly/test/GPGPU/phi-nodes-in-kernel.ll87
2 files changed, 101 insertions, 10 deletions
diff --git a/polly/lib/CodeGen/PPCGCodeGeneration.cpp b/polly/lib/CodeGen/PPCGCodeGeneration.cpp
index 32858fc213d..2ca25ab1238 100644
--- a/polly/lib/CodeGen/PPCGCodeGeneration.cpp
+++ b/polly/lib/CodeGen/PPCGCodeGeneration.cpp
@@ -722,14 +722,14 @@ void GPUNodeBuilder::createDataTransfer(__isl_take isl_ast_node *TransferStmt,
auto ScopArray = (ScopArrayInfo *)(Array->user);
Value *Size = getArraySize(Array);
- Value *HostPtr = ScopArray->getBasePtr();
-
Value *DevPtr = DeviceAllocations[ScopArray];
- if (gpu_array_is_scalar(Array)) {
- HostPtr = Builder.CreateAlloca(ScopArray->getElementType());
- Builder.CreateStore(ScopArray->getBasePtr(), HostPtr);
- }
+ Value *HostPtr;
+
+ if (gpu_array_is_scalar(Array))
+ HostPtr = BlockGen.getOrCreateAlloca(ScopArray);
+ else
+ HostPtr = ScopArray->getBasePtr();
HostPtr = Builder.CreatePointerCast(HostPtr, Builder.getInt8PtrTy());
@@ -1074,6 +1074,10 @@ void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) {
Instruction &HostInsertPoint = *Builder.GetInsertPoint();
IslExprBuilder::IDToValueTy HostIDs = IDToValue;
ValueMapT HostValueMap = ValueMap;
+ BlockGenerator::ScalarAllocaMapTy HostScalarMap = ScalarMap;
+ BlockGenerator::ScalarAllocaMapTy HostPHIOpMap = PHIOpMap;
+ ScalarMap.clear();
+ PHIOpMap.clear();
SetVector<const Loop *> Loops;
@@ -1102,9 +1106,9 @@ void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) {
Builder.SetInsertPoint(&HostInsertPoint);
IDToValue = HostIDs;
- ValueMap = HostValueMap;
- ScalarMap.clear();
- PHIOpMap.clear();
+ ValueMap = std::move(HostValueMap);
+ ScalarMap = std::move(HostScalarMap);
+ PHIOpMap = std::move(HostPHIOpMap);
EscapeMap.clear();
IDToSAI.clear();
Annotator.resetAlternativeAliasBases();
@@ -1283,7 +1287,7 @@ void GPUNodeBuilder::prepareKernelArguments(ppcg_kernel *Kernel, Function *FN) {
continue;
}
- Value *Alloca = BlockGen.getOrCreateScalarAlloca(SAI->getBasePtr());
+ Value *Alloca = BlockGen.getOrCreateAlloca(SAI);
Value *ArgPtr = &*Arg;
Type *TypePtr = SAI->getElementType()->getPointerTo();
Value *TypedArgPtr = Builder.CreatePointerCast(ArgPtr, TypePtr);
diff --git a/polly/test/GPGPU/phi-nodes-in-kernel.ll b/polly/test/GPGPU/phi-nodes-in-kernel.ll
new file mode 100644
index 00000000000..5befa36c6d5
--- /dev/null
+++ b/polly/test/GPGPU/phi-nodes-in-kernel.ll
@@ -0,0 +1,87 @@
+; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \
+; RUN: -disable-output < %s | \
+; RUN: FileCheck -check-prefix=CODE %s
+
+; RUN: opt %loadPolly -polly-codegen-ppcg -S < %s | \
+; RUN: FileCheck %s -check-prefix=IR
+
+; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \
+; RUN: -disable-output < %s | \
+; RUN: FileCheck %s -check-prefix=KERNEL-IR
+
+; REQUIRES: pollyacc
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; CODE: # host
+; CODE-NEXT: {
+; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_out_l_055__phi, &MemRef_out_l_055__phi, sizeof(i32), cudaMemcpyHostToDevice));
+; CODE-NEXT: {
+; CODE-NEXT: dim3 k0_dimBlock(32);
+; CODE-NEXT: dim3 k0_dimGrid(2);
+; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_out_l_055__phi, dev_MemRef_out_l_055, dev_MemRef_c);
+; CODE-NEXT: cudaCheckKernel();
+; CODE-NEXT: }
+
+; CODE: cudaCheckReturn(cudaMemcpy(&MemRef_out_l_055__phi, dev_MemRef_out_l_055__phi, sizeof(i32), cudaMemcpyDeviceToHost));
+; CODE-NEXT: cudaCheckReturn(cudaMemcpy(&MemRef_out_l_055, dev_MemRef_out_l_055, sizeof(i32), cudaMemcpyDeviceToHost));
+; CODE-NEXT: cudaCheckReturn(cudaMemcpy(MemRef_c, dev_MemRef_c, (50) * sizeof(i32), cudaMemcpyDeviceToHost));
+; CODE-NEXT: }
+
+; CODE: # kernel0
+; CODE-NEXT: if (32 * b0 + t0 <= 48) {
+; CODE-NEXT: if (b0 == 1 && t0 == 16)
+; CODE-NEXT: Stmt_for_cond1_preheader(0);
+; CODE-NEXT: Stmt_for_body17(0, 32 * b0 + t0);
+; CODE-NEXT: if (b0 == 1 && t0 == 16)
+; CODE-NEXT: Stmt_for_cond15_for_cond12_loopexit_crit_edge(0);
+; CODE-NEXT: }
+
+; IR: %1 = bitcast i32* %out_l.055.phiops to i8*
+; IR-NEXT: call void @polly_copyFromHostToDevice(i8* %1, i8* %p_dev_array_MemRef_out_l_055__phi, i64 4)
+
+; IR: %14 = bitcast i32* %out_l.055.phiops to i8*
+; IR-NEXT: call void @polly_copyFromDeviceToHost(i8* %p_dev_array_MemRef_out_l_055__phi, i8* %14, i64 4)
+; IR-NEXT: %15 = bitcast i32* %out_l.055.s2a to i8*
+; IR-NEXT: call void @polly_copyFromDeviceToHost(i8* %p_dev_array_MemRef_out_l_055, i8* %15, i64 4)
+
+; KERNEL-IR: entry:
+; KERNEL-IR-NEXT: %out_l.055.s2a = alloca i32
+; KERNEL-IR-NEXT: %out_l.055.phiops = alloca i32
+; KERNEL-IR-NEXT: %1 = bitcast i8* %MemRef_out_l_055__phi to i32*
+; KERNEL-IR-NEXT: %2 = load i32, i32* %1
+; KERNEL-IR-NEXT: store i32 %2, i32* %out_l.055.phiops
+; KERNEL-IR-NEXT: %3 = bitcast i8* %MemRef_out_l_055 to i32*
+; KERNEL-IR-NEXT: %4 = load i32, i32* %3
+; KERNEL-IR-NEXT: store i32 %4, i32* %out_l.055.s2a
+
+
+define void @kernel_dynprog([50 x i32]* %c) {
+entry:
+ %arrayidx77 = getelementptr inbounds [50 x i32], [50 x i32]* %c, i64 0, i64 49
+ br label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %for.cond15.for.cond12.loopexit_crit_edge, %entry
+ %out_l.055 = phi i32 [ 0, %entry ], [ %add78, %for.cond15.for.cond12.loopexit_crit_edge ]
+ %iter.054 = phi i32 [ 0, %entry ], [ %inc80, %for.cond15.for.cond12.loopexit_crit_edge ]
+ br label %for.body17
+
+for.cond15.for.cond12.loopexit_crit_edge: ; preds = %for.body17
+ %tmp = load i32, i32* %arrayidx77, align 4
+ %add78 = add nsw i32 %tmp, %out_l.055
+ %inc80 = add nuw nsw i32 %iter.054, 1
+ br i1 false, label %for.cond1.preheader, label %for.end81
+
+for.body17: ; preds = %for.body17, %for.cond1.preheader
+ %indvars.iv71 = phi i64 [ 1, %for.cond1.preheader ], [ %indvars.iv.next72, %for.body17 ]
+ %arrayidx69 = getelementptr inbounds [50 x i32], [50 x i32]* %c, i64 0, i64 %indvars.iv71
+ store i32 undef, i32* %arrayidx69, align 4
+ %indvars.iv.next72 = add nuw nsw i64 %indvars.iv71, 1
+ %lftr.wideiv74 = trunc i64 %indvars.iv.next72 to i32
+ %exitcond75 = icmp ne i32 %lftr.wideiv74, 50
+ br i1 %exitcond75, label %for.body17, label %for.cond15.for.cond12.loopexit_crit_edge
+
+for.end81: ; preds = %for.cond15.for.cond12.loopexit_crit_edge
+ ret void
+}
OpenPOWER on IntegriCloud