3 files changed, 135 insertions, 4 deletions
diff --git a/polly/include/polly/ScopDetection.h b/polly/include/polly/ScopDetection.h
index e30826a9254..30486770eaf 100644
--- a/polly/include/polly/ScopDetection.h
+++ b/polly/include/polly/ScopDetection.h
@@ -287,6 +287,20 @@ private:
   /// @return True if all blocks in R are valid, false otherwise.
   bool allBlocksValid(DetectionContext &Context) const;
 
+  /// @brief Check if a region has sufficient compute instructions
+  ///
+  /// This function checks if a region has a non-trivial number of instructions
+  /// in each loop. This can be used as an indicator if a loop is worth
+  /// optimising.
+  ///
+  /// @param Context  The context of scop detection.
+  /// @param NumLoops The number of loops in the region.
+  ///
+  /// @return True if region is has sufficient compute instructions,
+  ///         false otherwise.
+  bool hasSufficientCompute(DetectionContext &Context,
+                            int NumAffineLoops) const;
+
   /// @brief Check if a region is profitable to optimize.
   ///
   /// Regions that are unlikely to expose interesting optimization opportunities
diff --git a/polly/lib/Analysis/ScopDetection.cpp b/polly/lib/Analysis/ScopDetection.cpp
index b2e5228d2c6..1083507b744 100644
--- a/polly/lib/Analysis/ScopDetection.cpp
+++ b/polly/lib/Analysis/ScopDetection.cpp
@@ -71,6 +71,16 @@ using namespace polly;
 
 #define DEBUG_TYPE "polly-detect"
 
+// This option is set to a very high value, as analyzing such loops increases
+// compile time on several cases. For experiments that enable this option,
+// a value of around 40 has been working to avoid run-time regressions with
+// Polly while still exposing interesting optimization opportunities.
+static cl::opt<int> ProfitabilityMinPerLoopInstructions(
+    "polly-detect-profitability-min-per-loop-insts",
+    cl::desc("The minimal number of per-loop instructions before a single loop "
+             "region is considered profitable"),
+    cl::Hidden, cl::ValueRequired, cl::init(100000000), cl::cat(PollyCategory));
+
 bool polly::PollyProcessUnprofitable;
 static cl::opt<bool, true> XPollyProcessUnprofitable(
     "polly-process-unprofitable",
@@ -1134,6 +1144,19 @@ bool ScopDetection::allBlocksValid(DetectionContext &Context) const {
   return true;
 }
 
+bool ScopDetection::hasSufficientCompute(DetectionContext &Context,
+                                         int NumLoops) const {
+  int InstCount = 0;
+
+  for (auto *BB : Context.CurRegion.blocks())
+    if (Context.CurRegion.contains(LI->getLoopFor(BB)))
+      InstCount += std::distance(BB->begin(), BB->end());
+
+  InstCount = InstCount / NumLoops;
+
+  return InstCount >= ProfitabilityMinPerLoopInstructions;
+}
+
 bool ScopDetection::isProfitableRegion(DetectionContext &Context) const {
   Region &CurRegion = Context.CurRegion;
 
@@ -1145,13 +1168,24 @@ bool ScopDetection::isProfitableRegion(DetectionContext &Context) const {
   if (!Context.hasStores || !Context.hasLoads)
     return invalid<ReportUnprofitable>(Context, /*Assert=*/true, &CurRegion);
 
-  // Check if there are sufficent non-overapproximated loops.
   int NumLoops = countBeneficialLoops(&CurRegion);
   int NumAffineLoops = NumLoops - Context.BoxedLoopsSet.size();
-  if (NumAffineLoops < 2)
-    return invalid<ReportUnprofitable>(Context, /*Assert=*/true, &CurRegion);
 
-  return true;
+  // Scops with at least two loops may allow either loop fusion or tiling and
+  // are consequently interesting to look at.
+  if (NumAffineLoops >= 2)
+    return true;
+
+  // Scops that contain a loop with a non-trivial amount of computation per
+  // loop-iteration are interesting as we may be able to parallelize such
+  // loops. Individual loops that have only a small amount of computation
+  // per-iteration are performance-wise very fragile as any change to the
+  // loop induction variables may affect performance. To not cause spurious
+  // performance regressions, we do not consider such loops.
+  if (NumAffineLoops == 1 && hasSufficientCompute(Context, NumLoops))
+    return true;
+
+  return invalid<ReportUnprofitable>(Context, /*Assert=*/true, &CurRegion);
 }
 
 bool ScopDetection::isValidRegion(DetectionContext &Context) const {
diff --git a/polly/test/ScopDetect/profitability-large-basic-blocks.ll b/polly/test/ScopDetect/profitability-large-basic-blocks.ll
new file mode 100644
index 00000000000..268731e294d
--- /dev/null
+++ b/polly/test/ScopDetect/profitability-large-basic-blocks.ll
@@ -0,0 +1,83 @@
+; RUN: opt %loadPolly -polly-process-unprofitable=false \
+; RUN:                -polly-detect-profitability-min-per-loop-insts=40 \
+; RUN: -polly-detect -analyze < %s | FileCheck %s -check-prefix=PROFITABLE
+
+; RUN: opt %loadPolly -polly-process-unprofitable=true \
+; RUN: -polly-detect -analyze < %s | FileCheck %s -check-prefix=PROFITABLE
+
+; RUN: opt %loadPolly -polly-process-unprofitable=false \
+; RUN: \
+; RUN: -polly-detect -analyze < %s | FileCheck %s -check-prefix=UNPROFITABLE
+
+; UNPROFITABLE-NOT: Valid Region for Scop:
+; PROFITABLE: Valid Region for Scop:
+
+;    void foo(float *A, float *B, long N) {
+;      for (long i = 0; i < 100; i++)
+;          A[i] += .... / * a  lot of compute */
+;    }
+;
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @foo(float* %A, float* %B, i64 %N) {
+entry:
+  br label %header
+
+header:
+  %i.0 = phi i64 [ 0, %entry ], [ %tmp10, %header ]
+  %tmp5 = sitofp i64 %i.0 to float
+  %tmp6 = getelementptr inbounds float, float* %A, i64 %i.0
+  %tmp7 = load float, float* %tmp6, align 4
+  %tmp8 = fadd float %tmp7, %tmp5
+  %val0 = fadd float %tmp7, 1.0
+  %val1 = fadd float %val0, 1.0
+  %val2 = fadd float %val1, 1.0
+  %val3 = fadd float %val2, 1.0
+  %val4 = fadd float %val3, 1.0
+  %val5 = fadd float %val4, 1.0
+  %val6 = fadd float %val5, 1.0
+  %val7 = fadd float %val6, 1.0
+  %val8 = fadd float %val7, 1.0
+  %val9 = fadd float %val8, 1.0
+  %val10 = fadd float %val9, 1.0
+  %val11 = fadd float %val10, 1.0
+  %val12 = fadd float %val11, 1.0
+  %val13 = fadd float %val12, 1.0
+  %val14 = fadd float %val13, 1.0
+  %val15 = fadd float %val14, 1.0
+  %val16 = fadd float %val15, 1.0
+  %val17 = fadd float %val16, 1.0
+  %val18 = fadd float %val17, 1.0
+  %val19 = fadd float %val18, 1.0
+  %val20 = fadd float %val19, 1.0
+  %val21 = fadd float %val20, 1.0
+  %val22 = fadd float %val21, 1.0
+  %val23 = fadd float %val22, 1.0
+  %val24 = fadd float %val23, 1.0
+  %val25 = fadd float %val24, 1.0
+  %val26 = fadd float %val25, 1.0
+  %val27 = fadd float %val26, 1.0
+  %val28 = fadd float %val27, 1.0
+  %val29 = fadd float %val28, 1.0
+  %val30 = fadd float %val29, 1.0
+  %val31 = fadd float %val30, 1.0
+  %val32 = fadd float %val31, 1.0
+  %val33 = fadd float %val32, 1.0
+  %val34 = fadd float %val33, 1.0
+  %val35 = fadd float %val34, 1.0
+  %val36 = fadd float %val35, 1.0
+  %val37 = fadd float %val36, 1.0
+  %val38 = fadd float %val37, 1.0
+  %val39 = fadd float %val38, 1.0
+  %val40 = fadd float %val39, 1.0
+  %val41 = fadd float %val40, 1.0
+  %val42 = fadd float %val41, 1.0
+  %val43 = fadd float %val42, 1.0
+  store float %val34, float* %tmp6, align 4
+  %exitcond = icmp ne i64 %i.0, 100
+  %tmp10 = add nsw i64 %i.0, 1
+  br i1 %exitcond, label %header, label %exit
+
+exit:
+  ret void
+}