diff options
-rw-r--r-- | polly/lib/CodeGen/PPCGCodeGeneration.cpp | 82 | ||||
-rw-r--r-- | polly/test/GPGPU/kernel-params-only-some-arrays.ll | 8 | ||||
-rw-r--r-- | polly/test/GPGPU/non-zero-array-offset.ll | 8 |
3 files changed, 11 insertions, 87 deletions
diff --git a/polly/lib/CodeGen/PPCGCodeGeneration.cpp b/polly/lib/CodeGen/PPCGCodeGeneration.cpp index 0499003793a..444b043bc75 100644 --- a/polly/lib/CodeGen/PPCGCodeGeneration.cpp +++ b/polly/lib/CodeGen/PPCGCodeGeneration.cpp @@ -2658,77 +2658,6 @@ public: return Names; } - /// Remove unreferenced parameter dimensions from union_map. - isl::union_map removeUnusedParameters(isl::union_map UMap) { - auto New = isl::union_map::empty(isl::space(UMap.get_ctx(), 0, 0)); - - auto RemoveUnusedDims = [&New](isl::map S) -> isl::stat { - int Removed = 0; - int NumDims = S.dim(isl::dim::param); - for (long i = 0; i < NumDims; i++) { - const int Dim = i - Removed; - if (!S.involves_dims(isl::dim::param, Dim, 1)) { - S = S.remove_dims(isl::dim::param, Dim, 1); - Removed++; - } - } - New = New.unite(S); - return isl::stat::ok; - }; - - UMap.foreach_map(RemoveUnusedDims); - return New; - } - - /// Remove unreferenced parameter dimensions from union_set. - isl::union_set removeUnusedParameters(isl::union_set USet) { - auto New = isl::union_set::empty(isl::space(USet.get_ctx(), 0, 0)); - - auto RemoveUnusedDims = [&New](isl::set S) -> isl::stat { - int Removed = 0; - int NumDims = S.dim(isl::dim::param); - for (long i = 0; i < NumDims; i++) { - const int Dim = i - Removed; - if (!S.involves_dims(isl::dim::param, Dim, 1)) { - S = S.remove_dims(isl::dim::param, Dim, 1); - Removed++; - } - } - New = New.unite(S); - return isl::stat::ok; - }; - - USet.foreach_set(RemoveUnusedDims); - return New; - } - - /// Simplify PPCG scop to improve compile time. - /// - /// We drop unused parameter dimensions to reduce the size of the sets we are - /// working with. Especially the computed dependences tend to accumulate a lot - /// of parameters that are present in the input memory accesses, but often are - /// not necessary to express the actual dependences. As isl represents maps - /// and sets with dense matrices, reducing the dimensionality of isl sets - /// commonly reduces code generation performance. - void simplifyPPCGScop(ppcg_scop *PPCGScop) { - PPCGScop->domain = - removeUnusedParameters(isl::manage(PPCGScop->domain)).release(); - - PPCGScop->dep_forced = - removeUnusedParameters(isl::manage(PPCGScop->dep_forced)).release(); - PPCGScop->dep_false = - removeUnusedParameters(isl::manage(PPCGScop->dep_false)).release(); - PPCGScop->dep_flow = - removeUnusedParameters(isl::manage(PPCGScop->dep_flow)).release(); - PPCGScop->tagged_dep_flow = - removeUnusedParameters(isl::manage(PPCGScop->tagged_dep_flow)) - .release(); - - PPCGScop->tagged_dep_order = - removeUnusedParameters(isl::manage(PPCGScop->tagged_dep_order)) - .release(); - } - /// Create a new PPCG scop from the current scop. /// /// The PPCG scop is initialized with data from the current polly::Scop. From @@ -2786,7 +2715,6 @@ public: compute_tagger(PPCGScop); compute_dependences(PPCGScop); eliminate_dead_code(PPCGScop); - simplifyPPCGScop(PPCGScop); return PPCGScop; } @@ -3228,15 +3156,11 @@ public: isl_schedule *Schedule = get_schedule(PPCGGen); - /// Copy to and from device functions may introduce new parameters, which - /// must be present in the schedule tree root for code generation. Hence, - /// we ensure that all possible parameters are introduced from this point. - if (!PollyManagedMemory) - Schedule = - isl_schedule_align_params(Schedule, S->getFullParamSpace().release()); - int has_permutable = has_any_permutable_node(Schedule); + Schedule = + isl_schedule_align_params(Schedule, S->getFullParamSpace().release()); + if (!has_permutable || has_permutable < 0) { Schedule = isl_schedule_free(Schedule); DEBUG(dbgs() << getUniqueScopName(S) diff --git a/polly/test/GPGPU/kernel-params-only-some-arrays.ll b/polly/test/GPGPU/kernel-params-only-some-arrays.ll index 4692f732275..6f28b7be08e 100644 --- a/polly/test/GPGPU/kernel-params-only-some-arrays.ll +++ b/polly/test/GPGPU/kernel-params-only-some-arrays.ll @@ -21,7 +21,7 @@ ; KERNEL-NEXT: target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" ; KERNEL-NEXT: target triple = "nvptx64-nvidia-cuda" -; KERNEL: define ptx_kernel void @FUNC_kernel_params_only_some_arrays_SCOP_0_KERNEL_0(i8 addrspace(1)* %MemRef_A) +; KERNEL: define ptx_kernel void @FUNC_kernel_params_only_some_arrays_SCOP_0_KERNEL_0(i8 addrspace(1)* %MemRef_B) ; KERNEL-NEXT: entry: ; KERNEL-NEXT: %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() ; KERNEL-NEXT: %b0 = zext i32 %0 to i64 @@ -36,7 +36,7 @@ ; KERNEL-NEXT: target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" ; KERNEL-NEXT: target triple = "nvptx64-nvidia-cuda" -; KERNEL: define ptx_kernel void @FUNC_kernel_params_only_some_arrays_SCOP_0_KERNEL_1(i8 addrspace(1)* %MemRef_B) +; KERNEL: define ptx_kernel void @FUNC_kernel_params_only_some_arrays_SCOP_0_KERNEL_1(i8 addrspace(1)* %MemRef_A) ; KERNEL-NEXT: entry: ; KERNEL-NEXT: %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() ; KERNEL-NEXT: %b0 = zext i32 %0 to i64 @@ -47,13 +47,13 @@ ; KERNEL-NEXT: } -; IR: [[DEVPTR:%.*]] = call i8* @polly_getDevicePtr(i8* %p_dev_array_MemRef_A) +; IR: [[DEVPTR:%.*]] = call i8* @polly_getDevicePtr(i8* %p_dev_array_MemRef_B) ; IR-NEXT: [[SLOT:%.*]] = getelementptr [1 x i8*], [1 x i8*]* %polly_launch_0_params, i64 0, i64 0 ; IR-NEXT: store i8* [[DEVPTR]], i8** %polly_launch_0_param_0 ; IR-NEXT: [[DATA:%.*]] = bitcast i8** %polly_launch_0_param_0 to i8* ; IR-NEXT: store i8* [[DATA]], i8** [[SLOT]] -; IR: [[DEVPTR:%.*]] = call i8* @polly_getDevicePtr(i8* %p_dev_array_MemRef_B) +; IR: [[DEVPTR:%.*]] = call i8* @polly_getDevicePtr(i8* %p_dev_array_MemRef_A) ; IR-NEXT: [[SLOT:%.*]] = getelementptr [1 x i8*], [1 x i8*]* %polly_launch_1_params, i64 0, i64 0 ; IR-NEXT: store i8* [[DEVPTR]], i8** %polly_launch_1_param_0 ; IR-NEXT: [[DATA:%.*]] = bitcast i8** %polly_launch_1_param_0 to i8* diff --git a/polly/test/GPGPU/non-zero-array-offset.ll b/polly/test/GPGPU/non-zero-array-offset.ll index 9ad68ae0ba9..dcab25ea686 100644 --- a/polly/test/GPGPU/non-zero-array-offset.ll +++ b/polly/test/GPGPU/non-zero-array-offset.ll @@ -12,14 +12,14 @@ ; CODE: dim3 k0_dimBlock(8); ; CODE-NEXT: dim3 k0_dimGrid(1); -; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_B); +; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A); ; CODE-NEXT: cudaCheckKernel(); ; CODE-NEXT: } ; CODE: { ; CODE-NEXT: dim3 k1_dimBlock(8); ; CODE-NEXT: dim3 k1_dimGrid(1); -; CODE-NEXT: kernel1 <<<k1_dimGrid, k1_dimBlock>>> (dev_MemRef_A); +; CODE-NEXT: kernel1 <<<k1_dimGrid, k1_dimBlock>>> (dev_MemRef_B); ; CODE-NEXT: cudaCheckKernel(); ; CODE-NEXT: } @@ -27,10 +27,10 @@ ; CODE-NEXT: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (8) * sizeof(float), cudaMemcpyDeviceToHost)); ; CODE: # kernel0 -; CODE-NEXT: Stmt_bb3(t0); +; CODE-NEXT: Stmt_bb11(t0); ; CODE: # kernel1 -; CODE-NEXT: Stmt_bb11(t0); +; CODE-NEXT: Stmt_bb3(t0); ; IR: %p_dev_array_MemRef_B = call i8* @polly_allocateMemoryForDevice(i64 32) ; IR-NEXT: %p_dev_array_MemRef_A = call i8* @polly_allocateMemoryForDevice(i64 32) |