Verify profile data confirms large loop trip counts.

Summary:
Loops with inequality comparers, such as:

    // unsigned bound
    for (unsigned i = 1; i < bound; ++i) {...}

cause getSmallConstantMaxTripCount to report a large maximum static trip count - in this case, 0xfffffffe. However, profiling info may show that the trip count is much smaller, and thus counter-recommend vectorization.

This change:
- flips loop-vectorize-with-block-frequency on by default.
- validates that profiled loop frequency data supports vectorization, when static info does not appear to counter-recommend it. Absence of profile data means we rely on static data, just as we've done so far.

Reviewers: twoh, mkuper, davidxl, tejohnson, Ayal
Reviewed By: davidxl
Subscribers: bkramer, llvm-commits
Differential Revision: https://reviews.llvm.org/D42946
llvm-svn: 324543
author: Mircea Trofin <mtrofin@google.com> 2018-02-07 23:29:52 +0000
committer: Mircea Trofin <mtrofin@google.com> 2018-02-07 23:29:52 +0000
commit: 06ac8cfbd103b2a024d6c1c01ae8912ebce523a0 (patch)
tree: 3eb50f8e1067ec2f2c10261508820f598a0ea5ca
parent: 8e6107a0e49940646d08e5d703d2d128000bcdea (diff)
download: bcm5719-llvm-06ac8cfbd103b2a024d6c1c01ae8912ebce523a0.tar.gz
bcm5719-llvm-06ac8cfbd103b2a024d6c1c01ae8912ebce523a0.zip
2 files changed, 141 insertions, 5 deletions
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index f5ec20fda85..973617cc0c7 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -208,7 +208,7 @@ static cl::opt<unsigned> SmallLoopCost(
         "The cost of a loop that is considered 'small' by the interleaver."));
 
 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
-    "loop-vectorize-with-block-frequency", cl::init(false), cl::Hidden,
+    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
     cl::desc("Enable the use of the block frequency analysis to access PGO "
              "heuristics minimizing code growth in cold regions and being more "
              "aggressive in hot regions."));
@@ -8347,9 +8347,21 @@ bool LoopVectorizePass::processLoop(Loop *L) {
 
   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
   // count by optimizing for size, to minimize overheads.
-  unsigned ExpectedTC = SE->getSmallConstantMaxTripCount(L);
-  bool HasExpectedTC = (ExpectedTC > 0);
-
+  // Prefer constant trip counts over profile data, over upper bound estimate.
+  unsigned ExpectedTC = 0;
+  bool HasExpectedTC = false;
+  if (const SCEVConstant *ConstExits =
+      dyn_cast<SCEVConstant>(SE->getBackedgeTakenCount(L))) {
+    const APInt &ExitsCount = ConstExits->getAPInt();
+    // We are interested in small values for ExpectedTC. Skip over those that
+    // can't fit an unsigned.
+    if (ExitsCount.ult(std::numeric_limits<unsigned>::max())) {
+      ExpectedTC = static_cast<unsigned>(ExitsCount.getZExtValue()) + 1;
+      HasExpectedTC = true;
+    }
+  }
+  // ExpectedTC may be large because it's bound by a variable. Check
+  // profiling information to validate we should vectorize.
   if (!HasExpectedTC && LoopVectorizeWithBlockFrequency) {
     auto EstimatedTC = getLoopEstimatedTripCount(L);
     if (EstimatedTC) {
@@ -8357,6 +8369,10 @@ bool LoopVectorizePass::processLoop(Loop *L) {
       HasExpectedTC = true;
     }
   }
+  if (!HasExpectedTC) {
+    ExpectedTC = SE->getSmallConstantMaxTripCount(L);
+    HasExpectedTC = (ExpectedTC > 0);
+  }
 
   if (HasExpectedTC && ExpectedTC < TinyTripCountVectorThreshold) {
     DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
diff --git a/llvm/test/Transforms/LoopVectorize/tripcount.ll b/llvm/test/Transforms/LoopVectorize/tripcount.ll
index 03b3aa171d4..56f8b3e83c7 100644
--- a/llvm/test/Transforms/LoopVectorize/tripcount.ll
+++ b/llvm/test/Transforms/LoopVectorize/tripcount.ll
@@ -57,7 +57,7 @@ for.end:                                          ; preds = %for.body
 }
 
 define i32 @foo_low_trip_count3(i1 %cond, i32 %bound) !prof !0 {
-; The loop has low invocation count compare to the function invocation count, 
+; The loop has low invocation count compare to the function invocation count,
 ; but has a high trip count per invocation. Vectorize it.
 
 ; CHECK-LABEL: @foo_low_trip_count3(
@@ -84,6 +84,126 @@ for.end:                                          ; preds = %for.body
   ret i32 0
 }
 
+define i32 @foo_low_trip_count_icmp_sgt(i32 %bound) {
+; Simple loop with low tripcount and inequality test for exit.
+; Should not be vectorized.
+
+; CHECK-LABEL: @foo_low_trip_count_icmp_sgt(
+; CHECK-NOT: <{{[0-9]+}} x i8>
+
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08
+  %0 = load i8, i8* %arrayidx, align 1
+  %cmp1 = icmp eq i8 %0, 0
+  %. = select i1 %cmp1, i8 2, i8 1
+  store i8 %., i8* %arrayidx, align 1
+  %inc = add nsw i32 %i.08, 1
+  %exitcond = icmp sgt i32 %i.08, %bound
+  br i1 %exitcond, label %for.end, label %for.body, !prof !1
+
+for.end:                                          ; preds = %for.body
+  ret i32 0
+}
+
+define i32 @const_low_trip_count() {
+; Simple loop with constant, small trip count and no profiling info.
+
+; CHECK-LABEL: @const_low_trip_count
+; CHECK-NOT: <{{[0-9]+}} x i8>
+
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08
+  %0 = load i8, i8* %arrayidx, align 1
+  %cmp1 = icmp eq i8 %0, 0
+  %. = select i1 %cmp1, i8 2, i8 1
+  store i8 %., i8* %arrayidx, align 1
+  %inc = add nsw i32 %i.08, 1
+  %exitcond = icmp slt i32 %i.08, 2
+  br i1 %exitcond, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret i32 0
+}
+
+define i32 @const_large_trip_count() {
+; Simple loop with constant large trip count and no profiling info.
+
+; CHECK-LABEL: @const_large_trip_count
+; CHECK: <{{[0-9]+}} x i8>
+
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08
+  %0 = load i8, i8* %arrayidx, align 1
+  %cmp1 = icmp eq i8 %0, 0
+  %. = select i1 %cmp1, i8 2, i8 1
+  store i8 %., i8* %arrayidx, align 1
+  %inc = add nsw i32 %i.08, 1
+  %exitcond = icmp slt i32 %i.08, 1000
+  br i1 %exitcond, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret i32 0
+}
+
+define i32 @const_small_trip_count_step() {
+; Simple loop with static, small trip count and no profiling info.
+
+; CHECK-LABEL: @const_small_trip_count_step
+; CHECK-NOT: <{{[0-9]+}} x i8>
+
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08
+  %0 = load i8, i8* %arrayidx, align 1
+  %cmp1 = icmp eq i8 %0, 0
+  %. = select i1 %cmp1, i8 2, i8 1
+  store i8 %., i8* %arrayidx, align 1
+  %inc = add nsw i32 %i.08, 5
+  %exitcond = icmp slt i32 %i.08, 10
+  br i1 %exitcond, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret i32 0
+}
+
+define i32 @const_trip_over_profile() {
+; constant trip count takes precedence over profile data
+
+; CHECK-LABEL: @const_trip_over_profile
+; CHECK: <{{[0-9]+}} x i8>
+
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08
+  %0 = load i8, i8* %arrayidx, align 1
+  %cmp1 = icmp eq i8 %0, 0
+  %. = select i1 %cmp1, i8 2, i8 1
+  store i8 %., i8* %arrayidx, align 1
+  %inc = add nsw i32 %i.08, 1
+  %exitcond = icmp slt i32 %i.08, 1000
+  br i1 %exitcond, label %for.body, label %for.end, !prof !1
+
+for.end:                                          ; preds = %for.body
+  ret i32 0
+}
 
 !0 = !{!"function_entry_count", i64 100}
 !1 = !{!"branch_weights", i32 100, i32 0}
author	Mircea Trofin <mtrofin@google.com>	2018-02-07 23:29:52 +0000
committer	Mircea Trofin <mtrofin@google.com>	2018-02-07 23:29:52 +0000
commit	06ac8cfbd103b2a024d6c1c01ae8912ebce523a0 (patch)
tree	3eb50f8e1067ec2f2c10261508820f598a0ea5ca
parent	8e6107a0e49940646d08e5d703d2d128000bcdea (diff)
download	bcm5719-llvm-06ac8cfbd103b2a024d6c1c01ae8912ebce523a0.tar.gz bcm5719-llvm-06ac8cfbd103b2a024d6c1c01ae8912ebce523a0.zip