diff options
-rw-r--r-- | polly/lib/CodeGen/PPCGCodeGeneration.cpp | 24 | ||||
-rw-r--r-- | polly/test/GPGPU/phi-nodes-in-kernel.ll | 87 |
2 files changed, 101 insertions, 10 deletions
diff --git a/polly/lib/CodeGen/PPCGCodeGeneration.cpp b/polly/lib/CodeGen/PPCGCodeGeneration.cpp index 32858fc213d..2ca25ab1238 100644 --- a/polly/lib/CodeGen/PPCGCodeGeneration.cpp +++ b/polly/lib/CodeGen/PPCGCodeGeneration.cpp @@ -722,14 +722,14 @@ void GPUNodeBuilder::createDataTransfer(__isl_take isl_ast_node *TransferStmt, auto ScopArray = (ScopArrayInfo *)(Array->user); Value *Size = getArraySize(Array); - Value *HostPtr = ScopArray->getBasePtr(); - Value *DevPtr = DeviceAllocations[ScopArray]; - if (gpu_array_is_scalar(Array)) { - HostPtr = Builder.CreateAlloca(ScopArray->getElementType()); - Builder.CreateStore(ScopArray->getBasePtr(), HostPtr); - } + Value *HostPtr; + + if (gpu_array_is_scalar(Array)) + HostPtr = BlockGen.getOrCreateAlloca(ScopArray); + else + HostPtr = ScopArray->getBasePtr(); HostPtr = Builder.CreatePointerCast(HostPtr, Builder.getInt8PtrTy()); @@ -1074,6 +1074,10 @@ void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) { Instruction &HostInsertPoint = *Builder.GetInsertPoint(); IslExprBuilder::IDToValueTy HostIDs = IDToValue; ValueMapT HostValueMap = ValueMap; + BlockGenerator::ScalarAllocaMapTy HostScalarMap = ScalarMap; + BlockGenerator::ScalarAllocaMapTy HostPHIOpMap = PHIOpMap; + ScalarMap.clear(); + PHIOpMap.clear(); SetVector<const Loop *> Loops; @@ -1102,9 +1106,9 @@ void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) { Builder.SetInsertPoint(&HostInsertPoint); IDToValue = HostIDs; - ValueMap = HostValueMap; - ScalarMap.clear(); - PHIOpMap.clear(); + ValueMap = std::move(HostValueMap); + ScalarMap = std::move(HostScalarMap); + PHIOpMap = std::move(HostPHIOpMap); EscapeMap.clear(); IDToSAI.clear(); Annotator.resetAlternativeAliasBases(); @@ -1283,7 +1287,7 @@ void GPUNodeBuilder::prepareKernelArguments(ppcg_kernel *Kernel, Function *FN) { continue; } - Value *Alloca = BlockGen.getOrCreateScalarAlloca(SAI->getBasePtr()); + Value *Alloca = BlockGen.getOrCreateAlloca(SAI); Value *ArgPtr = &*Arg; Type *TypePtr = SAI->getElementType()->getPointerTo(); Value *TypedArgPtr = Builder.CreatePointerCast(ArgPtr, TypePtr); diff --git a/polly/test/GPGPU/phi-nodes-in-kernel.ll b/polly/test/GPGPU/phi-nodes-in-kernel.ll new file mode 100644 index 00000000000..5befa36c6d5 --- /dev/null +++ b/polly/test/GPGPU/phi-nodes-in-kernel.ll @@ -0,0 +1,87 @@ +; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \ +; RUN: -disable-output < %s | \ +; RUN: FileCheck -check-prefix=CODE %s + +; RUN: opt %loadPolly -polly-codegen-ppcg -S < %s | \ +; RUN: FileCheck %s -check-prefix=IR + +; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \ +; RUN: -disable-output < %s | \ +; RUN: FileCheck %s -check-prefix=KERNEL-IR + +; REQUIRES: pollyacc + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; CODE: # host +; CODE-NEXT: { +; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_out_l_055__phi, &MemRef_out_l_055__phi, sizeof(i32), cudaMemcpyHostToDevice)); +; CODE-NEXT: { +; CODE-NEXT: dim3 k0_dimBlock(32); +; CODE-NEXT: dim3 k0_dimGrid(2); +; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_out_l_055__phi, dev_MemRef_out_l_055, dev_MemRef_c); +; CODE-NEXT: cudaCheckKernel(); +; CODE-NEXT: } + +; CODE: cudaCheckReturn(cudaMemcpy(&MemRef_out_l_055__phi, dev_MemRef_out_l_055__phi, sizeof(i32), cudaMemcpyDeviceToHost)); +; CODE-NEXT: cudaCheckReturn(cudaMemcpy(&MemRef_out_l_055, dev_MemRef_out_l_055, sizeof(i32), cudaMemcpyDeviceToHost)); +; CODE-NEXT: cudaCheckReturn(cudaMemcpy(MemRef_c, dev_MemRef_c, (50) * sizeof(i32), cudaMemcpyDeviceToHost)); +; CODE-NEXT: } + +; CODE: # kernel0 +; CODE-NEXT: if (32 * b0 + t0 <= 48) { +; CODE-NEXT: if (b0 == 1 && t0 == 16) +; CODE-NEXT: Stmt_for_cond1_preheader(0); +; CODE-NEXT: Stmt_for_body17(0, 32 * b0 + t0); +; CODE-NEXT: if (b0 == 1 && t0 == 16) +; CODE-NEXT: Stmt_for_cond15_for_cond12_loopexit_crit_edge(0); +; CODE-NEXT: } + +; IR: %1 = bitcast i32* %out_l.055.phiops to i8* +; IR-NEXT: call void @polly_copyFromHostToDevice(i8* %1, i8* %p_dev_array_MemRef_out_l_055__phi, i64 4) + +; IR: %14 = bitcast i32* %out_l.055.phiops to i8* +; IR-NEXT: call void @polly_copyFromDeviceToHost(i8* %p_dev_array_MemRef_out_l_055__phi, i8* %14, i64 4) +; IR-NEXT: %15 = bitcast i32* %out_l.055.s2a to i8* +; IR-NEXT: call void @polly_copyFromDeviceToHost(i8* %p_dev_array_MemRef_out_l_055, i8* %15, i64 4) + +; KERNEL-IR: entry: +; KERNEL-IR-NEXT: %out_l.055.s2a = alloca i32 +; KERNEL-IR-NEXT: %out_l.055.phiops = alloca i32 +; KERNEL-IR-NEXT: %1 = bitcast i8* %MemRef_out_l_055__phi to i32* +; KERNEL-IR-NEXT: %2 = load i32, i32* %1 +; KERNEL-IR-NEXT: store i32 %2, i32* %out_l.055.phiops +; KERNEL-IR-NEXT: %3 = bitcast i8* %MemRef_out_l_055 to i32* +; KERNEL-IR-NEXT: %4 = load i32, i32* %3 +; KERNEL-IR-NEXT: store i32 %4, i32* %out_l.055.s2a + + +define void @kernel_dynprog([50 x i32]* %c) { +entry: + %arrayidx77 = getelementptr inbounds [50 x i32], [50 x i32]* %c, i64 0, i64 49 + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %for.cond15.for.cond12.loopexit_crit_edge, %entry + %out_l.055 = phi i32 [ 0, %entry ], [ %add78, %for.cond15.for.cond12.loopexit_crit_edge ] + %iter.054 = phi i32 [ 0, %entry ], [ %inc80, %for.cond15.for.cond12.loopexit_crit_edge ] + br label %for.body17 + +for.cond15.for.cond12.loopexit_crit_edge: ; preds = %for.body17 + %tmp = load i32, i32* %arrayidx77, align 4 + %add78 = add nsw i32 %tmp, %out_l.055 + %inc80 = add nuw nsw i32 %iter.054, 1 + br i1 false, label %for.cond1.preheader, label %for.end81 + +for.body17: ; preds = %for.body17, %for.cond1.preheader + %indvars.iv71 = phi i64 [ 1, %for.cond1.preheader ], [ %indvars.iv.next72, %for.body17 ] + %arrayidx69 = getelementptr inbounds [50 x i32], [50 x i32]* %c, i64 0, i64 %indvars.iv71 + store i32 undef, i32* %arrayidx69, align 4 + %indvars.iv.next72 = add nuw nsw i64 %indvars.iv71, 1 + %lftr.wideiv74 = trunc i64 %indvars.iv.next72 to i32 + %exitcond75 = icmp ne i32 %lftr.wideiv74, 50 + br i1 %exitcond75, label %for.body17, label %for.cond15.for.cond12.loopexit_crit_edge + +for.end81: ; preds = %for.cond15.for.cond12.loopexit_crit_edge + ret void +} |