Improve profile-guided heuristics to use estimated trip count.

Summary: Existing heuristic uses the ratio between the function entry frequency and the loop invocation frequency to find cold loops. However, even if the loop executes frequently, if it has a small trip count per each invocation, vectorization is not beneficial. On the other hand, even if the loop invocation frequency is much smaller than the function invocation frequency, if the trip count is high it is still beneficial to vectorize the loop. This patch uses estimated trip count computed from the profile metadata as a primary metric to determine coldness of the loop. If the estimated trip count cannot be computed, it falls back to the original heuristics. Reviewers: Ayal, mssimpso, mkuper, danielcdh, wmi, tejohnson Reviewed By: tejohnson Subscribers: tejohnson, mzolotukhin, llvm-commits Differential Revision: https://reviews.llvm.org/D32451 llvm-svn: 305729
author: Taewook Oh <twoh@fb.com> 2017-06-19 18:48:58 +0000
committer: Taewook Oh <twoh@fb.com> 2017-06-19 18:48:58 +0000
commit: 9083547ae317b7ddf604a5bedac994cb392bb887 (patch)
tree: 80e0fbb6ef216a20c498ed99f511d4033a5585d6 /llvm/test/Transforms
parent: 162b40a85006f76e916225a7433a8194ae13c53b (diff)
download: bcm5719-llvm-9083547ae317b7ddf604a5bedac994cb392bb887.tar.gz
bcm5719-llvm-9083547ae317b7ddf604a5bedac994cb392bb887.zip
2 files changed, 91 insertions, 26 deletions
diff --git a/llvm/test/Transforms/LoopVectorize/X86/small-size.ll b/llvm/test/Transforms/LoopVectorize/X86/small-size.ll
index 47c262b11b4..89d69e232f5 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/small-size.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/small-size.ll
@@ -115,32 +115,6 @@ define void @example3(i32 %n, i32* noalias nocapture %p, i32* noalias nocapture
   ret void
 }
 
-; N is unknown, we need a tail. Can't vectorize because the loop is cold.
-;CHECK-LABEL: @example4(
-;CHECK-NOT: <4 x i32>
-;CHECK: ret void
-define void @example4(i32 %n, i32* noalias nocapture %p, i32* noalias nocapture %q) {
-  %1 = icmp eq i32 %n, 0
-  br i1 %1, label %._crit_edge, label %.lr.ph, !prof !0
-
-.lr.ph:                                           ; preds = %0, %.lr.ph
-  %.05 = phi i32 [ %2, %.lr.ph ], [ %n, %0 ]
-  %.014 = phi i32* [ %5, %.lr.ph ], [ %p, %0 ]
-  %.023 = phi i32* [ %3, %.lr.ph ], [ %q, %0 ]
-  %2 = add nsw i32 %.05, -1
-  %3 = getelementptr inbounds i32, i32* %.023, i64 1
-  %4 = load i32, i32* %.023, align 16
-  %5 = getelementptr inbounds i32, i32* %.014, i64 1
-  store i32 %4, i32* %.014, align 16
-  %6 = icmp eq i32 %2, 0
-  br i1 %6, label %._crit_edge, label %.lr.ph
-
-._crit_edge:                                      ; preds = %.lr.ph, %0
-  ret void
-}
-
-!0 = !{!"branch_weights", i32 64, i32 4}
-
 ; We can't vectorize this one because we need a runtime ptr check.
 ;CHECK-LABEL: @example23(
 ;CHECK-NOT: <4 x i32>
diff --git a/llvm/test/Transforms/LoopVectorize/tripcount.ll b/llvm/test/Transforms/LoopVectorize/tripcount.ll
new file mode 100644
index 00000000000..03b3aa171d4
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/tripcount.ll
@@ -0,0 +1,91 @@
+; This test verifies that the loop vectorizer will not vectorizes low trip count
+; loops that require runtime checks (Trip count is computed with profile info).
+; REQUIRES: asserts
+; RUN: opt < %s -loop-vectorize -loop-vectorize-with-block-frequency -S | FileCheck %s
+
+target datalayout = "E-m:e-p:32:32-i64:32-f64:32:64-a:0:32-n32-S128"
+
+@tab = common global [32 x i8] zeroinitializer, align 1
+
+define i32 @foo_low_trip_count1(i32 %bound) {
+; Simple loop with low tripcount. Should not be vectorized.
+
+; CHECK-LABEL: @foo_low_trip_count1(
+; CHECK-NOT: <{{[0-9]+}} x i8>
+
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08
+  %0 = load i8, i8* %arrayidx, align 1
+  %cmp1 = icmp eq i8 %0, 0
+  %. = select i1 %cmp1, i8 2, i8 1
+  store i8 %., i8* %arrayidx, align 1
+  %inc = add nsw i32 %i.08, 1
+  %exitcond = icmp eq i32 %i.08, %bound
+  br i1 %exitcond, label %for.end, label %for.body, !prof !1
+
+for.end:                                          ; preds = %for.body
+  ret i32 0
+}
+
+define i32 @foo_low_trip_count2(i32 %bound) !prof !0 {
+; The loop has a same invocation count with the function, but has a low
+; trip_count per invocation and not worth to vectorize.
+
+; CHECK-LABEL: @foo_low_trip_count2(
+; CHECK-NOT: <{{[0-9]+}} x i8>
+
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08
+  %0 = load i8, i8* %arrayidx, align 1
+  %cmp1 = icmp eq i8 %0, 0
+  %. = select i1 %cmp1, i8 2, i8 1
+  store i8 %., i8* %arrayidx, align 1
+  %inc = add nsw i32 %i.08, 1
+  %exitcond = icmp eq i32 %i.08, %bound
+  br i1 %exitcond, label %for.end, label %for.body, !prof !1
+
+for.end:                                          ; preds = %for.body
+  ret i32 0
+}
+
+define i32 @foo_low_trip_count3(i1 %cond, i32 %bound) !prof !0 {
+; The loop has low invocation count compare to the function invocation count, 
+; but has a high trip count per invocation. Vectorize it.
+
+; CHECK-LABEL: @foo_low_trip_count3(
+; CHECK: vector.body:
+
+entry:
+  br i1 %cond, label %for.preheader, label %for.end, !prof !2
+
+for.preheader:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.08 = phi i32 [ 0, %for.preheader ], [ %inc, %for.body ]
+  %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08
+  %0 = load i8, i8* %arrayidx, align 1
+  %cmp1 = icmp eq i8 %0, 0
+  %. = select i1 %cmp1, i8 2, i8 1
+  store i8 %., i8* %arrayidx, align 1
+  %inc = add nsw i32 %i.08, 1
+  %exitcond = icmp eq i32 %i.08, %bound
+  br i1 %exitcond, label %for.end, label %for.body, !prof !3
+
+for.end:                                          ; preds = %for.body
+  ret i32 0
+}
+
+
+!0 = !{!"function_entry_count", i64 100}
+!1 = !{!"branch_weights", i32 100, i32 0}
+!2 = !{!"branch_weights", i32 10, i32 90}
+!3 = !{!"branch_weights", i32 10, i32 10000}
author	Taewook Oh <twoh@fb.com>	2017-06-19 18:48:58 +0000
committer	Taewook Oh <twoh@fb.com>	2017-06-19 18:48:58 +0000
commit	9083547ae317b7ddf604a5bedac994cb392bb887 (patch)
tree	80e0fbb6ef216a20c498ed99f511d4033a5585d6 /llvm/test/Transforms
parent	162b40a85006f76e916225a7433a8194ae13c53b (diff)
download	bcm5719-llvm-9083547ae317b7ddf604a5bedac994cb392bb887.tar.gz bcm5719-llvm-9083547ae317b7ddf604a5bedac994cb392bb887.zip