| author | Chandler Carruth <chandlerc@gmail.com> | 2017-08-02 02:09:22 +0000 |
|---|---|---|
| committer | Chandler Carruth <chandlerc@gmail.com> | 2017-08-02 02:09:22 +0000 |
| commit | 95055d8f8b0757cec2c9078e6eca982d7b0f997c (patch) | |
| tree | 993392fde9872877bb051bbefdb5aadd59b4ba98 /llvm/test/Transforms/Inline | |
| parent | 8e8f8f43b043b1839973fcc28694ca8d220a2137 (diff) | |
[PM] Fix a bug where, through CGSCC iteration, we can get infinite inlining
across multiple runs of the inliner, by keeping a tiny history of
internal-to-SCC inlining decisions.
This is still a bit gross, but I don't yet have any fundamentally better
ideas, and numerous people are blocked on this to use the new PM and ThinLTO
together.
The core of the idea is to detect when we are about to do an inline that
has a chance of re-splitting an SCC which we have split before with
a similar inlining step. Such a re-split is a critical component of the
inlining forming a cycle, and so far the check detects all of the various
cyclic patterns I can come up with, as well as the original real-world test
case (which comes from a ThinLTO build of libunwind).
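The inliner and CGSCC changes themselves are not part of the diff shown below (which only adds the test), so here is a minimal standalone C++ sketch of the kind of history check described above. Every type and function name in it is an illustrative placeholder, not the actual LLVM API: the idea is simply to remember which internal-to-SCC inlining decisions have already re-split an SCC and refuse to repeat them.

```cpp
#include <set>
#include <string>
#include <utility>

// Illustrative stand-ins for call-graph entities; these are not the LLVM classes.
struct Function { std::string Name; };
struct SCC { int ID; };

// Tiny history of internal-to-SCC inlining decisions: each entry records that
// inlining starting from the given caller, performed inside the given SCC,
// has already split that SCC once before.
using InlineHistory = std::set<std::pair<const Function *, const SCC *>>;

// Decide whether inlining Callee into Caller should be attempted. Cross-SCC
// inlines are always fine; an inline that stays inside the SCC is refused if
// the history says a similar step has already re-split this SCC, since that
// is the signature of the cyclic pattern the commit guards against.
bool shouldAttemptInline(const Function &Caller, const Function & /*Callee*/,
                         const SCC &CallerSCC, const SCC &CalleeSCC,
                         const InlineHistory &History) {
  if (&CallerSCC != &CalleeSCC)
    return true;
  return History.count({&Caller, &CallerSCC}) == 0;
}

// After an internal inline actually causes the SCC to be re-split, remember
// the decision so later iterations over the re-formed SCC do not repeat it.
void recordInternalInline(const Function &Caller, const SCC &OldSCC,
                          InlineHistory &History) {
  History.insert({&Caller, &OldSCC});
}
```

In the real patch this history lives in the CGSCC pass manager's update state rather than inside the inliner itself, which is the layering concern discussed below.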
I've added some tests that I think really demonstrate what is going on
here. They are essentially state machines that march the inliner through
various steps of a cycle and check that we stop when the cycle is closed
and that we actually did do inlining to form that cycle.
A lot of thanks go to Eric Christopher and Sanjoy Das for their help in
understanding this issue and improving the test cases.
The biggest "yuck" here is the layering issue -- the CGSCC pass manager
is providing somewhat magical state to the inliner for it to use to make
itself converge. This isn't great, but I don't honestly have a lot of
better ideas yet, and the approach at least seems nicely isolated.
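To make the layering concern concrete, here is a small sketch, again with made-up names rather than the real LLVM types, of how a driver-owned update structure could carry the history across repeated visits to an SCC so that the inliner can consult and extend it:

```cpp
#include <set>
#include <utility>

struct Function;
struct SCC;

// Hypothetical update state owned by the CGSCC pass manager driver and passed
// by reference into every CGSCC pass run. Because the driver owns it, the set
// survives across repeated visits to the same (re-formed) SCC -- this is the
// "somewhat magical state" the commit message refers to.
struct CGSCCUpdateState {
  std::set<std::pair<const Function *, const SCC *>> InlinedInternalEdges;
};

// The inliner is the only pass that reads and writes the history; every other
// CGSCC pass simply ignores the extra state it is handed.
void runInlinerOnSCC(SCC &CurrentSCC, CGSCCUpdateState &UR) {
  // ...walk the calls in CurrentSCC, skipping any candidate whose
  // (caller, SCC) pair is already in UR.InlinedInternalEdges, and inserting a
  // new pair whenever an internal inline ends up re-splitting CurrentSCC...
  (void)CurrentSCC;
  (void)UR;
}
```

In this sketch the coupling is explicit: the pass manager has to carry an inliner-specific detail, which is exactly the isolation trade-off acknowledged above.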
I have tested this patch, and it doesn't block *any* inlining on the
entire LLVM test suite and SPEC, so it seems sufficiently narrowly
targeted to the issue at hand.
We have come up with hypothetical issues that this patch doesn't cover,
but so far none of them are practical, and we don't yet have a viable
solution that covers the hypothetical cases, so we are proceeding with this
in the interim. This is definitely an area that we will come back to and
revisit in the future.
Differential Revision: https://reviews.llvm.org/D36188
llvm-svn: 309784
Diffstat (limited to 'llvm/test/Transforms/Inline')
| -rw-r--r-- | llvm/test/Transforms/Inline/cgscc-cycle.ll | 125 |
1 file changed, 125 insertions, 0 deletions
diff --git a/llvm/test/Transforms/Inline/cgscc-cycle.ll b/llvm/test/Transforms/Inline/cgscc-cycle.ll
new file mode 100644
index 00000000000..69874c3ef2f
--- /dev/null
+++ b/llvm/test/Transforms/Inline/cgscc-cycle.ll
@@ -0,0 +1,125 @@
+; This test contains extremely tricky call graph structures for the inliner to
+; handle correctly. They form cycles where the inliner introduces code that is
+; immediately or can eventually be transformed back into the original code. And
+; each step changes the call graph and so will trigger iteration. This requires
+; some out-of-band way to prevent infinitely re-inlining and re-transforming the
+; code.
+;
+; RUN: opt < %s -passes='cgscc(inline,function(sroa,instcombine))' -S | FileCheck %s
+
+
+; The `test1_*` collection of functions form a directly cycling pattern.
+
+define void @test1_a(i8** %ptr) {
+; CHECK-LABEL: define void @test1_a(
+entry:
+  call void @test1_b(i8* bitcast (void (i8*, i1, i32)* @test1_b to i8*), i1 false, i32 0)
+; Inlining and simplifying this call will reliably produce the exact same call,
+; over and over again. However, each inlining increments the count, and so we
+; expect this test case to stop after one round of inlining with a final
+; argument of '1'.
+; CHECK-NOT: call
+; CHECK: call void @test1_b(i8* bitcast (void (i8*, i1, i32)* @test1_b to i8*), i1 false, i32 1)
+; CHECK-NOT: call
+
+  ret void
+}
+
+define void @test1_b(i8* %arg, i1 %flag, i32 %inline_count) {
+; CHECK-LABEL: define void @test1_b(
+entry:
+  %a = alloca i8*
+  store i8* %arg, i8** %a
+; This alloca and store should remain through any optimization.
+; CHECK: %[[A:.*]] = alloca
+; CHECK: store i8* %arg, i8** %[[A]]
+
+  br i1 %flag, label %bb1, label %bb2
+
+bb1:
+  call void @test1_a(i8** %a) noinline
+  br label %bb2
+
+bb2:
+  %cast = bitcast i8** %a to void (i8*, i1, i32)**
+  %p = load void (i8*, i1, i32)*, void (i8*, i1, i32)** %cast
+  %inline_count_inc = add i32 %inline_count, 1
+  call void %p(i8* %arg, i1 %flag, i32 %inline_count_inc)
+; And we should continue to load and call indirectly through optimization.
+; CHECK: %[[CAST:.*]] = bitcast i8** %[[A]] to void (i8*, i1, i32)**
+; CHECK: %[[P:.*]] = load void (i8*, i1, i32)*, void (i8*, i1, i32)** %[[CAST]]
+; CHECK: call void %[[P]](
+
+  ret void
+}
+
+define void @test2_a(i8** %ptr) {
+; CHECK-LABEL: define void @test2_a(
+entry:
+  call void @test2_b(i8* bitcast (void (i8*, i8*, i1, i32)* @test2_b to i8*), i8* bitcast (void (i8*, i8*, i1, i32)* @test2_c to i8*), i1 false, i32 0)
+; Inlining and simplifying this call will reliably produce the exact same call,
+; but only after doing two rounds of inlining, first from @test2_b then
+; @test2_c. We check the exact number of inlining rounds before we cut off to
+; break the cycle by inspecting the last parameter that gets incremented with
+; each inlined function body.
+; CHECK-NOT: call
+; CHECK: call void @test2_b(i8* bitcast (void (i8*, i8*, i1, i32)* @test2_b to i8*), i8* bitcast (void (i8*, i8*, i1, i32)* @test2_c to i8*), i1 false, i32 2)
+; CHECK-NOT: call
+  ret void
+}
+
+define void @test2_b(i8* %arg1, i8* %arg2, i1 %flag, i32 %inline_count) {
+; CHECK-LABEL: define void @test2_b(
+entry:
+  %a = alloca i8*
+  store i8* %arg2, i8** %a
+; This alloca and store should remain through any optimization.
+; CHECK: %[[A:.*]] = alloca
+; CHECK: store i8* %arg2, i8** %[[A]]
+
+  br i1 %flag, label %bb1, label %bb2
+
+bb1:
+  call void @test2_a(i8** %a) noinline
+  br label %bb2
+
+bb2:
+  %p = load i8*, i8** %a
+  %cast = bitcast i8* %p to void (i8*, i8*, i1, i32)*
+  %inline_count_inc = add i32 %inline_count, 1
+  call void %cast(i8* %arg1, i8* %arg2, i1 %flag, i32 %inline_count_inc)
+; And we should continue to load and call indirectly through optimization.
+; CHECK: %[[CAST:.*]] = bitcast i8** %[[A]] to void (i8*, i8*, i1, i32)**
+; CHECK: %[[P:.*]] = load void (i8*, i8*, i1, i32)*, void (i8*, i8*, i1, i32)** %[[CAST]]
+; CHECK: call void %[[P]](
+
+  ret void
+}
+
+define void @test2_c(i8* %arg1, i8* %arg2, i1 %flag, i32 %inline_count) {
+; CHECK-LABEL: define void @test2_c(
+entry:
+  %a = alloca i8*
+  store i8* %arg1, i8** %a
+; This alloca and store should remain through any optimization.
+; CHECK: %[[A:.*]] = alloca
+; CHECK: store i8* %arg1, i8** %[[A]]
+
+  br i1 %flag, label %bb1, label %bb2
+
+bb1:
+  call void @test2_a(i8** %a) noinline
+  br label %bb2
+
+bb2:
+  %p = load i8*, i8** %a
+  %cast = bitcast i8* %p to void (i8*, i8*, i1, i32)*
+  %inline_count_inc = add i32 %inline_count, 1
+  call void %cast(i8* %arg1, i8* %arg2, i1 %flag, i32 %inline_count_inc)
+; And we should continue to load and call indirectly through optimization.
+; CHECK: %[[CAST:.*]] = bitcast i8** %[[A]] to void (i8*, i8*, i1, i32)**
+; CHECK: %[[P:.*]] = load void (i8*, i8*, i1, i32)*, void (i8*, i8*, i1, i32)** %[[CAST]]
+; CHECK: call void %[[P]](
+
+  ret void
+}

