diff options
-rw-r--r-- | polly/lib/Analysis/ScopInfo.cpp | 38 | ||||
-rw-r--r-- | polly/lib/CodeGen/IslNodeBuilder.cpp | 23 | ||||
-rw-r--r-- | polly/test/Isl/CodeGen/invariant_load_complex_condition.ll | 70 | ||||
-rw-r--r-- | polly/test/ScopInfo/invariant_load_complex_condition.ll | 73 |
4 files changed, 111 insertions, 93 deletions
diff --git a/polly/lib/Analysis/ScopInfo.cpp b/polly/lib/Analysis/ScopInfo.cpp index a1643f8b01a..88206f04252 100644 --- a/polly/lib/Analysis/ScopInfo.cpp +++ b/polly/lib/Analysis/ScopInfo.cpp @@ -94,6 +94,12 @@ static int const MaxDisjunctsInDomain = 20; // number of disjunct when adding non-convex sets to the context. static int const MaxDisjunctsInContext = 4; +// The maximal number of dimensions we allow during invariant load construction. +// More complex access ranges will result in very high compile time and are also +// unlikely to result in good code. This value is very high and should only +// trigger for corner cases (e.g., the "dct_luma" function in h264, SPEC2006). +static int const MaxDimensionsInAccessRange = 9; + static cl::opt<int> OptComputeOut("polly-analysis-computeout", cl::desc("Bound the scop analysis by a maximal amount of " @@ -4024,6 +4030,35 @@ void Scop::addInvariantLoads(ScopStmt &Stmt, InvariantAccessesTy &InvMAs) { isl_set_free(DomainCtx); } +/// Check if an access range is too complex. +/// +/// An access range is too complex, if it contains either many disjuncts or +/// very complex expressions. As a simple heuristic, we assume if a set to +/// be too complex if the sum of existentially quantified dimensions and +/// set dimensions is larger than a threshold. This reliably detects both +/// sets with many disjuncts as well as sets with many divisions as they +/// arise in h264. +/// +/// @param AccessRange The range to check for complexity. +/// +/// @returns True if the access range is too complex. +static bool isAccessRangeTooComplex(isl::set AccessRange) { + unsigned NumTotalDims = 0; + + auto CountDimensions = [&NumTotalDims](isl::basic_set BSet) -> isl::stat { + NumTotalDims += BSet.dim(isl::dim::div); + NumTotalDims += BSet.dim(isl::dim::set); + return isl::stat::ok; + }; + + AccessRange.foreach_basic_set(CountDimensions); + + if (NumTotalDims > MaxDimensionsInAccessRange) + return true; + + return false; +} + isl::set Scop::getNonHoistableCtx(MemoryAccess *Access, isl::union_map Writes) { // TODO: Loads that are not loop carried, hence are in a statement with // zero iterators, are by construction invariant, though we @@ -4072,6 +4107,9 @@ isl::set Scop::getNonHoistableCtx(MemoryAccess *Access, isl::union_map Writes) { SafeToLoad = AccessRelation.range(); } + if (isAccessRangeTooComplex(AccessRelation.range())) + return nullptr; + isl::union_map Written = Writes.intersect_range(SafeToLoad); isl::set WrittenCtx = Written.params(); bool IsWritten = !WrittenCtx.is_empty(); diff --git a/polly/lib/CodeGen/IslNodeBuilder.cpp b/polly/lib/CodeGen/IslNodeBuilder.cpp index db23319183a..7783818e17b 100644 --- a/polly/lib/CodeGen/IslNodeBuilder.cpp +++ b/polly/lib/CodeGen/IslNodeBuilder.cpp @@ -53,12 +53,6 @@ using namespace llvm; STATISTIC(VersionedScops, "Number of SCoPs that required versioning."); -// The maximal number of dimensions we allow during invariant load construction. -// More complex access ranges will result in very high compile time and are also -// unlikely to result in good code. This value is very high and should only -// trigger for corner cases (e.g., the "dct_luma" function in h264, SPEC2006). -static int const MaxDimensionsInAccessRange = 9; - static cl::opt<bool> PollyGenerateRTCPrint( "polly-codegen-emit-rtc-print", cl::desc("Emit code that prints the runtime check result dynamically."), @@ -1134,26 +1128,9 @@ bool IslNodeBuilder::materializeFortranArrayOutermostDimension() { return true; } -/// Add the number of dimensions in @p BS to @p U. -static isl_stat countTotalDims(__isl_take isl_basic_set *BS, void *U) { - unsigned *NumTotalDim = static_cast<unsigned *>(U); - *NumTotalDim += isl_basic_set_total_dim(BS); - isl_basic_set_free(BS); - return isl_stat_ok; -} - Value *IslNodeBuilder::preloadUnconditionally(isl_set *AccessRange, isl_ast_build *Build, Instruction *AccInst) { - - // TODO: This check could be performed in the ScopInfo already. - unsigned NumTotalDim = 0; - isl_set_foreach_basic_set(AccessRange, countTotalDims, &NumTotalDim); - if (NumTotalDim > MaxDimensionsInAccessRange) { - isl_set_free(AccessRange); - return nullptr; - } - isl_pw_multi_aff *PWAccRel = isl_pw_multi_aff_from_set(AccessRange); isl_ast_expr *Access = isl_ast_build_access_from_pw_multi_aff(Build, PWAccRel); diff --git a/polly/test/Isl/CodeGen/invariant_load_complex_condition.ll b/polly/test/Isl/CodeGen/invariant_load_complex_condition.ll deleted file mode 100644 index 66ad94f8e8c..00000000000 --- a/polly/test/Isl/CodeGen/invariant_load_complex_condition.ll +++ /dev/null @@ -1,70 +0,0 @@ -; RUN: opt %loadPolly -S -polly-codegen -polly-invariant-load-hoisting=true < %s | FileCheck %s -; -; Extracted from h246 in SPEC 2006. -; -; TODO: We check that we do compile this benchmark in reasonable time. -; To do so we currently bail out due to the complex access range -; (multiple modulos) of the invariant load. -; -; FIXME: We should not bail with a false RTC here. -; -; CHECK-LABEL: polly.preload.begin: -; CHECK-NOT: br i1 -; CHECK-NOT: br label -; CHECK: br i1 false, label %polly.start, label %entry.split -; -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -%struct.IP = type { i32****, i32***, %struct.P, %struct.S, %struct.m } -%struct.P = type { i32 } -%struct.S = type { i32 } -%struct.D = type { i32 } -%struct.B = type { i32 } -%struct.E = type { i32 } -%struct.s = type { i32 } -%struct.M = type { i32 } -%struct.C = type { i32 } -%struct.T = type { i32 } -%struct.R = type { i32 } -%struct.m = type { i32 } -%struct.d = type { i32 } - -@img = external global %struct.IP*, align 8 - -; Function Attrs: nounwind uwtable -define void @dct_luma(i32 %block_x, i32 %block_y) #0 { -entry: - br label %entry.split - -entry.split: ; preds = %entry - %div = sdiv i32 %block_x, 4 - %div1 = sdiv i32 %block_y, 4 - %rem = srem i32 %div1, 2 - %mul4 = shl nsw i32 %rem, 1 - %rem5 = srem i32 %div, 2 - %add6 = add nsw i32 %mul4, %rem5 - %idxprom = sext i32 %add6 to i64 - %0 = load %struct.IP*, %struct.IP** @img, align 8 - %cofAC = getelementptr inbounds %struct.IP, %struct.IP* %0, i32 0, i32 0 - %1 = load i32****, i32***** %cofAC, align 8 - %arrayidx = getelementptr inbounds i32***, i32**** %1, i64 0 - %2 = load i32***, i32**** %arrayidx, align 8 - %arrayidx8 = getelementptr inbounds i32**, i32*** %2, i64 %idxprom - %3 = load i32**, i32*** %arrayidx8, align 8 - %mb_data = getelementptr inbounds %struct.IP, %struct.IP* %0, i64 0, i32 4 - %4 = load %struct.m, %struct.m* %mb_data, align 8 - br i1 false, label %land.rhs, label %land.end - -land.rhs: ; preds = %entry.split - br label %land.end - -land.end: ; preds = %land.rhs, %entry.split - %5 = phi i1 [ false, %entry.split ], [ undef, %land.rhs ] - br i1 %5, label %for.cond104.preheader, label %for.cond34.preheader - -for.cond34.preheader: ; preds = %land.end - ret void - -for.cond104.preheader: ; preds = %land.end - ret void -} diff --git a/polly/test/ScopInfo/invariant_load_complex_condition.ll b/polly/test/ScopInfo/invariant_load_complex_condition.ll new file mode 100644 index 00000000000..f5456564a8b --- /dev/null +++ b/polly/test/ScopInfo/invariant_load_complex_condition.ll @@ -0,0 +1,73 @@ +; RUN: opt %loadPolly -S -polly-scops -analyze \ +; RUN: -polly-invariant-load-hoisting=true < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +%struct.IP = type { i32****, i32***, %struct.P, %struct.S, %struct.m } +%struct.P = type { i32 } +%struct.S = type { i32 } +%struct.D = type { i32 } +%struct.B = type { i32 } +%struct.E = type { i32 } +%struct.s = type { i32 } +%struct.M = type { i32 } +%struct.C = type { i32 } +%struct.T = type { i32 } +%struct.R = type { i32 } +%struct.m = type { i32 } +%struct.d = type { i32 } + + +; Verify that we do not invariant load hoist very complex conditions. + +; CHECK: Statements { +; CHECK-NEXT: Stmt_entry_split +; CHECK-NEXT: Domain := +; CHECK-NEXT: [block_y, block_x] -> { Stmt_entry_split[] }; +; CHECK-NEXT: Schedule := +; CHECK-NEXT: [block_y, block_x] -> { Stmt_entry_split[] -> [] }; +; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] +; CHECK-NEXT: [block_y, block_x] -> { Stmt_entry_split[] -> MemRef4[o0] : (-3 <= block_y < 0 and block_x <= -4 and -8 + block_x - 4o0 <= 8*floor((-1 + block_x)/8) <= -5 + block_x - 4o0) or (-3 <= block_y < 0 and block_x >= 0 and -3 + block_x - 4o0 <= 8*floor((block_x)/8) <= block_x - 4o0) or (block_y <= -4 and block_x <= -4 and -16 + block_x - 4o0 - 8*floor((-1 + block_x)/8) + 8*floor((-1 + block_y)/4) <= 16*floor((-1 + block_y)/8) <= -13 + block_x - 4o0 - 8*floor((-1 + block_x)/8) + 8*floor((-1 + block_y)/4)) or (block_y <= -4 and block_x >= 0 and -11 + block_x - 4o0 - 8*floor((block_x)/8) + 8*floor((-1 + block_y)/4) <= 16*floor((-1 + block_y)/8) <= -8 + block_x - 4o0 - 8*floor((block_x)/8) + 8*floor((-1 + block_y)/4)) or (block_y >= 0 and block_x <= -4 and -8 + block_x - 4o0 - 8*floor((-1 + block_x)/8) + 8*floor((block_y)/4) <= 16*floor((block_y)/8) <= -5 + block_x - 4o0 - 8*floor((-1 + block_x)/8) + 8*floor((block_y)/4)) or (block_y >= 0 and block_x >= 0 and -3 + block_x - 4o0 - 8*floor((block_x)/8) + 8*floor((block_y)/4) <= 16*floor((block_y)/8) <= block_x - 4o0 - 8*floor((block_x)/8) + 8*floor((block_y)/4)) or (4*floor((block_y)/8) = -o0 + 2*floor((block_y)/4) and block_y >= 0 and -3 <= block_x < 0 and 4*floor((block_y)/4) >= -7 + block_y + 2o0 and 4*floor((block_y)/4) <= block_y + 2o0) or (4*floor((-1 + block_y)/8) = -2 - o0 + 2*floor((-1 + block_y)/4) and block_y <= -4 and -3 <= block_x < 0 and 4*floor((-1 + block_y)/4) >= -4 + block_y + 2o0 and 4*floor((-1 + block_y)/4) <= 3 + block_y + 2o0); Stmt_entry_split[] -> MemRef4[0] : -3 <= block_y < 0 and -3 <= block_x < 0 }; +; CHECK-NEXT: MustWriteAccess := [Reduction Type: NONE] [Scalar: 1] +; CHECK-NEXT: [block_y, block_x] -> { Stmt_entry_split[] -> MemRef0[] }; +; CHECK-NEXT: } + +@img = external global %struct.IP*, align 8 + +; Function Attrs: nounwind uwtable +define void @dct_luma(i32 %block_x, i32 %block_y) #0 { +entry: + br label %entry.split + +entry.split: ; preds = %entry + %div = sdiv i32 %block_x, 4 + %div1 = sdiv i32 %block_y, 4 + %rem = srem i32 %div1, 2 + %mul4 = shl nsw i32 %rem, 1 + %rem5 = srem i32 %div, 2 + %add6 = add nsw i32 %mul4, %rem5 + %idxprom = sext i32 %add6 to i64 + %0 = load %struct.IP*, %struct.IP** @img, align 8 + %cofAC = getelementptr inbounds %struct.IP, %struct.IP* %0, i32 0, i32 0 + %1 = load i32****, i32***** %cofAC, align 8 + %arrayidx = getelementptr inbounds i32***, i32**** %1, i64 0 + %2 = load i32***, i32**** %arrayidx, align 8 + %arrayidx8 = getelementptr inbounds i32**, i32*** %2, i64 %idxprom + %3 = load i32**, i32*** %arrayidx8, align 8 + %mb_data = getelementptr inbounds %struct.IP, %struct.IP* %0, i64 0, i32 4 + %4 = load %struct.m, %struct.m* %mb_data, align 8 + br i1 false, label %land.rhs, label %land.end + +land.rhs: ; preds = %entry.split + br label %land.end + +land.end: ; preds = %land.rhs, %entry.split + %5 = phi i1 [ false, %entry.split ], [ undef, %land.rhs ] + br i1 %5, label %for.cond104.preheader, label %for.cond34.preheader + +for.cond34.preheader: ; preds = %land.end + ret void + +for.cond104.preheader: ; preds = %land.end + ret void +} |