Restore "[ThinLTO] Ensure we always select the same function copy to import"

This reverts commit r337081, therefore restoring r337050 (and fix in r337059), with test fix for bot failure described after the original description below. In order to always import the same copy of a linkonce function, even when encountering it with different thresholds (a higher one then a lower one), keep track of the summary we decided to import. This ensures that the backend only gets a single definition to import for each GUID, so that it doesn't need to choose one. Move the largest threshold the GUID was considered for import into the current module out of the ImportMap (which is part of a larger map maintained across the whole index), and into a new map just maintained for the current module we are computing imports for. This saves some memory since we no longer have the thresholds maintained across the whole index (and throughout the in-process backends when doing a normal non-distributed ThinLTO build), at the cost of some additional information being maintained for each invocation of ComputeImportForModule (the selected summary pointer for each import). There is an additional map lookup for each callee being considered for importing, however, this was able to subsume a map lookup in the Worklist iteration that invokes computeImportForFunction. We also are able to avoid calling selectCallee if we already failed to import at the same or higher threshold. I compared the run time and peak memory for the SPEC2006 471.omnetpp benchmark (running in-process ThinLTO backends), as well as for a large internal benchmark with a distributed ThinLTO build (so just looking at the thin link time/memory). Across a number of runs with and without this change there was no significant change in the time and memory. (I tried a few other variations of the change but they also didn't improve time or peak memory). The new commit removes a test that no longer makes sense (Transforms/FunctionImport/hotness_based_import2.ll), as exposed by the reverse-iteration bot. The test depends on the order of processing the summary call edges, and actually depended on the old problematic behavior of selecting more than one summary for a given GUID when encountered with different thresholds. There was no guarantee even before that we would eventually pick the linkonce copy with the hottest call edges, it just happened to work with the test and the old code, and there was no guarantee that we would end up importing the selected version of the copy that had the hottest call edges (since the backend would effectively import only one of the selected copies). Reviewers: davidxl Subscribers: mehdi_amini, inglorion, llvm-commits Differential Revision: https://reviews.llvm.org/D48670 llvm-svn: 337184
author: Teresa Johnson <tejohnson@google.com> 2018-07-16 15:30:27 +0000
committer: Teresa Johnson <tejohnson@google.com> 2018-07-16 15:30:27 +0000
commit: d68935c5ac4da578d82936e4d1dfcd0574ef8366 (patch)
tree: 5aea4ce9e0b918412f7ab5b7bab11303aad2a926 /llvm/test/Transforms
parent: 484aabc81897b724441afc5a030ebb41c9684a72 (diff)
download: bcm5719-llvm-d68935c5ac4da578d82936e4d1dfcd0574ef8366.tar.gz
bcm5719-llvm-d68935c5ac4da578d82936e4d1dfcd0574ef8366.zip
5 files changed, 92 insertions, 95 deletions
diff --git a/llvm/test/Transforms/FunctionImport/Inputs/funcimport_resolved1.ll b/llvm/test/Transforms/FunctionImport/Inputs/funcimport_resolved1.ll
new file mode 100644
index 00000000000..2b2443c96af
--- /dev/null
+++ b/llvm/test/Transforms/FunctionImport/Inputs/funcimport_resolved1.ll
@@ -0,0 +1,34 @@
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.11.0"
+
+define void @foo() {
+  call void @linkonceodrfunc()
+  call void @linkonceodrfunc2()
+  ret void
+}
+
+define linkonce_odr void @linkonceodrfunc() {
+  call void @f()
+  call void @f()
+  call void @f()
+  call void @f()
+  call void @f()
+  call void @f()
+  call void @f()
+  ret void
+}
+
+define linkonce_odr void @linkonceodrfunc2() {
+  call void @f()
+  call void @f()
+  call void @f()
+  call void @f()
+  call void @f()
+  call void @f()
+  call void @f()
+  ret void
+}
+
+define internal void @f() {
+  ret void
+}
diff --git a/llvm/test/Transforms/FunctionImport/Inputs/funcimport_resolved2.ll b/llvm/test/Transforms/FunctionImport/Inputs/funcimport_resolved2.ll
new file mode 100644
index 00000000000..278a7f4553f
--- /dev/null
+++ b/llvm/test/Transforms/FunctionImport/Inputs/funcimport_resolved2.ll
@@ -0,0 +1,6 @@
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.11.0"
+
+define linkonce_odr void @linkonceodrfunc() {
+  ret void
+}
diff --git a/llvm/test/Transforms/FunctionImport/Inputs/hotness_based_import2.ll b/llvm/test/Transforms/FunctionImport/Inputs/hotness_based_import2.ll
deleted file mode 100644
index 4f17cb2cd9a..00000000000
--- a/llvm/test/Transforms/FunctionImport/Inputs/hotness_based_import2.ll
+++ /dev/null
@@ -1,42 +0,0 @@
-; ModuleID = 'thinlto-function-summary-callgraph-profile-summary2.ll'
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-
-define void @hot() #1 !prof !28  {
-  call void @calledFromHot()
-  ret void
-}
-
-; 9 instructions so it is above decayed cold threshold of 7 and below
-; decayed hot threshold of 10.
-define void @calledFromHot() !prof !28 {
-  %b = alloca i32, align 4
-  store i32 1, i32* %b, align 4
-  store i32 1, i32* %b, align 4
-  store i32 1, i32* %b, align 4
-  store i32 1, i32* %b, align 4
-  store i32 1, i32* %b, align 4
-  store i32 1, i32* %b, align 4
-  store i32 1, i32* %b, align 4
-  ret void
-}
-
-!llvm.module.flags = !{!1}
-
-!1 = !{i32 1, !"ProfileSummary", !2}
-!2 = !{!3, !4, !5, !6, !7, !8, !9, !10}
-!3 = !{!"ProfileFormat", !"InstrProf"}
-!4 = !{!"TotalCount", i64 222}
-!5 = !{!"MaxCount", i64 110}
-!6 = !{!"MaxInternalCount", i64 1}
-!7 = !{!"MaxFunctionCount", i64 110}
-!8 = !{!"NumCounts", i64 4}
-!9 = !{!"NumFunctions", i64 3}
-!10 = !{!"DetailedSummary", !11}
-!11 = !{!12, !13, !14}
-!12 = !{i32 10000, i64 110, i32 2}
-!13 = !{i32 999000, i64 2, i32 4}
-!14 = !{i32 999999, i64 2, i32 4}
-!28 = !{!"function_entry_count", i64 110}
-!29 = !{!"function_entry_count", i64 1}
diff --git a/llvm/test/Transforms/FunctionImport/funcimport_resolved.ll b/llvm/test/Transforms/FunctionImport/funcimport_resolved.ll
new file mode 100644
index 00000000000..b256a613602
--- /dev/null
+++ b/llvm/test/Transforms/FunctionImport/funcimport_resolved.ll
@@ -0,0 +1,52 @@
+; Test to ensure that we always select the same copy of a linkonce function
+; when it is encountered with different thresholds. When we encounter the
+; copy in funcimport_resolved1.ll with a higher threshold via the direct call
+; from main(), it will be selected for importing. When we encounter it with a
+; lower threshold by reaching it from the deeper call chain via foo(), it
+; won't be selected for importing. We don't want to select both the copy from
+; funcimport_resolved1.ll and the smaller one from funcimport_resolved2.ll,
+; leaving it up to the backend to figure out which one to actually import.
+; The linkonce_odr may have different instruction counts in practice due to
+; different inlines in the compile step.
+
+; Require asserts so we can use -debug-only
+; REQUIRES: asserts
+
+; REQUIRES: x86-registered-target
+
+; RUN: opt -module-summary %s -o %t.bc
+; RUN: opt -module-summary %p/Inputs/funcimport_resolved1.ll -o %t2.bc
+; RUN: opt -module-summary %p/Inputs/funcimport_resolved2.ll -o %t3.bc
+
+; First do a sanity check that all callees are imported with the default
+; instruction limit
+; RUN: llvm-lto2 run %t.bc %t2.bc %t3.bc -o %t4 -r=%t.bc,_main,pl -r=%t.bc,_linkonceodrfunc,l -r=%t.bc,_foo,l -r=%t2.bc,_foo,pl -r=%t2.bc,_linkonceodrfunc,pl -r=%t2.bc,_linkonceodrfunc2,pl -r=%t3.bc,_linkonceodrfunc,l -thinlto-threads=1 -debug-only=function-import 2>&1 | FileCheck %s --check-prefix=INSTLIMDEFAULT
+; INSTLIMDEFAULT: Is importing function {{.*}} foo from {{.*}}funcimport_resolved1.ll
+; INSTLIMDEFAULT: Is importing function {{.*}} linkonceodrfunc from {{.*}}funcimport_resolved1.ll
+; INSTLIMDEFAULT: Is importing function {{.*}} linkonceodrfunc2 from {{.*}}funcimport_resolved1.ll
+; INSTLIMDEFAULT: Is importing function {{.*}} f from {{.*}}funcimport_resolved1.ll
+; INSTLIMDEFAULT-NOT: Is importing function {{.*}} linkonceodrfunc from {{.*}}funcimport_resolved2.ll
+
+; Now run with the lower threshold that will only allow linkonceodrfunc to be
+; imported from funcimport_resolved1.ll when encountered via the direct call
+; from main(). Ensure we don't also select the copy in funcimport_resolved2.ll
+; when it is encountered via the deeper call chain.
+; RUN: llvm-lto2 run %t.bc %t2.bc %t3.bc -o %t4 -r=%t.bc,_main,pl -r=%t.bc,_linkonceodrfunc,l -r=%t.bc,_foo,l -r=%t2.bc,_foo,pl -r=%t2.bc,_linkonceodrfunc,pl -r=%t2.bc,_linkonceodrfunc2,pl -r=%t3.bc,_linkonceodrfunc,l -thinlto-threads=1 -debug-only=function-import -import-instr-limit=8 2>&1 | FileCheck %s --check-prefix=INSTLIM8
+; INSTLIM8: Is importing function {{.*}} foo from {{.*}}funcimport_resolved1.ll
+; INSTLIM8: Is importing function {{.*}} linkonceodrfunc from {{.*}}funcimport_resolved1.ll
+; INSTLIM8: Not importing function {{.*}} linkonceodrfunc2 from {{.*}}funcimport_resolved1.ll
+; INSTLIM8: Is importing function {{.*}} f from {{.*}}funcimport_resolved1.ll
+; INSTLIM8-NOT: Is importing function {{.*}} linkonceodrfunc from {{.*}}funcimport_resolved2.ll
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.11.0"
+
+define i32 @main() #0 {
+entry:
+  call void (...) @foo()
+  call void (...) @linkonceodrfunc()
+  ret i32 0
+}
+
+declare void @foo(...) #1
+declare void @linkonceodrfunc(...) #1
diff --git a/llvm/test/Transforms/FunctionImport/hotness_based_import2.ll b/llvm/test/Transforms/FunctionImport/hotness_based_import2.ll
deleted file mode 100644
index 50de33199f6..00000000000
--- a/llvm/test/Transforms/FunctionImport/hotness_based_import2.ll
+++ /dev/null
@@ -1,53 +0,0 @@
-; Test to check that callee reached from cold and then hot path gets
-; hot thresholds.
-; RUN: opt -module-summary %s -o %t.bc
-; RUN: opt -module-summary %p/Inputs/hotness_based_import2.ll -o %t2.bc
-; RUN: llvm-lto -thinlto -o %t3 %t.bc %t2.bc
-
-; Teset with limit set to 10 and multipliers set to 1. Since cold call to
-; hot is first in the other module, we'll first add calledFromHot to worklist
-; with threshold decayed by default 0.7 factor. Test ensures that when we
-; encounter it again from hot path, we re-enqueue with higher non-decayed
-; threshold which will allow it to be imported.
-; RUN: opt -function-import -summary-file %t3.thinlto.bc %t.bc -import-instr-limit=10 -import-hot-multiplier=1.0 -import-cold-multiplier=1.0 -S | FileCheck %s --check-prefix=CHECK
-; CHECK-DAG: define available_externally void @hot()
-; CHECK-DAG: define available_externally void @calledFromHot()
-
-; ModuleID = 'thinlto-function-summary-callgraph.ll'
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-; This function has a high profile count, so entry block is hot.
-define void @hot_function(i1 %a, i1 %a2) !prof !28 {
-entry:
-  call void @hot()
-  ret void
-}
-
-; This function has a low profile count, so entry block is hot.
-define void @cold_function(i1 %a, i1 %a2) !prof !29 {
-entry:
-  call void @hot()
-  ret void
-}
-
-declare void @hot() #1
-
-!llvm.module.flags = !{!1}
-
-!1 = !{i32 1, !"ProfileSummary", !2}
-!2 = !{!3, !4, !5, !6, !7, !8, !9, !10}
-!3 = !{!"ProfileFormat", !"InstrProf"}
-!4 = !{!"TotalCount", i64 222}
-!5 = !{!"MaxCount", i64 110}
-!6 = !{!"MaxInternalCount", i64 1}
-!7 = !{!"MaxFunctionCount", i64 110}
-!8 = !{!"NumCounts", i64 4}
-!9 = !{!"NumFunctions", i64 3}
-!10 = !{!"DetailedSummary", !11}
-!11 = !{!12, !13, !14}
-!12 = !{i32 10000, i64 110, i32 2}
-!13 = !{i32 999000, i64 2, i32 4}
-!14 = !{i32 999999, i64 2, i32 4}
-!28 = !{!"function_entry_count", i64 110}
-!29 = !{!"function_entry_count", i64 1}
author	Teresa Johnson <tejohnson@google.com>	2018-07-16 15:30:27 +0000
committer	Teresa Johnson <tejohnson@google.com>	2018-07-16 15:30:27 +0000
commit	d68935c5ac4da578d82936e4d1dfcd0574ef8366 (patch)
tree	5aea4ce9e0b918412f7ab5b7bab11303aad2a926 /llvm/test/Transforms
parent	484aabc81897b724441afc5a030ebb41c9684a72 (diff)
download	bcm5719-llvm-d68935c5ac4da578d82936e4d1dfcd0574ef8366.tar.gz bcm5719-llvm-d68935c5ac4da578d82936e4d1dfcd0574ef8366.zip