5 files changed, 104 insertions, 71 deletions
diff --git a/llvm/lib/Transforms/IPO/Inliner.cpp b/llvm/lib/Transforms/IPO/Inliner.cpp
index 7049a6f501c..5ea26a86eb8 100644
--- a/llvm/lib/Transforms/IPO/Inliner.cpp
+++ b/llvm/lib/Transforms/IPO/Inliner.cpp
@@ -746,20 +746,52 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
   Module &M = *InitialC.begin()->getFunction().getParent();
   ProfileSummaryInfo *PSI = MAM.getCachedResult<ProfileSummaryAnalysis>(M);
 
-  // We use a worklist of nodes to process so that we can handle if the SCC
-  // structure changes and some nodes are no longer part of the current SCC. We
-  // also need to use an updatable pointer for the SCC as a consequence.
-  SmallVector<LazyCallGraph::Node *, 16> Nodes;
-  for (auto &N : InitialC)
-    Nodes.push_back(&N);
+  // We use a single common worklist for calls across the entire SCC. We
+  // process these in-order and append new calls introduced during inlining to
+  // the end.
+  //
+  // Note that this particular order of processing is actually critical to
+  // avoid very bad behaviors. Consider *highly connected* call graphs where
+  // each function contains a small amonut of code and a couple of calls to
+  // other functions. Because the LLVM inliner is fundamentally a bottom-up
+  // inliner, it can handle gracefully the fact that these all appear to be
+  // reasonable inlining candidates as it will flatten things until they become
+  // too big to inline, and then move on and flatten another batch.
+  //
+  // However, when processing call edges *within* an SCC we cannot rely on this
+  // bottom-up behavior. As a consequence, with heavily connected *SCCs* of
+  // functions we can end up incrementally inlining N calls into each of
+  // N functions because each incremental inlining decision looks good and we
+  // don't have a topological ordering to prevent explosions.
+  //
+  // To compensate for this, we don't process transitive edges made immediate
+  // by inlining until we've done one pass of inlining across the entire SCC.
+  // Large, highly connected SCCs still lead to some amount of code bloat in
+  // this model, but it is uniformly spread across all the functions in the SCC
+  // and eventually they all become too large to inline, rather than
+  // incrementally maknig a single function grow in a super linear fashion.
+  SmallVector<std::pair<CallSite, int>, 16> Calls;
+
+  // Populate the initial list of calls in this SCC.
+  for (auto &N : InitialC) {
+    // We want to generally process call sites top-down in order for
+    // simplifications stemming from replacing the call with the returned value
+    // after inlining to be visible to subsequent inlining decisions.
+    // FIXME: Using instructions sequence is a really bad way to do this.
+    // Instead we should do an actual RPO walk of the function body.
+    for (Instruction &I : instructions(N.getFunction()))
+      if (auto CS = CallSite(&I))
+        if (Function *Callee = CS.getCalledFunction())
+          if (!Callee->isDeclaration())
+            Calls.push_back({CS, -1});
+  }
+  if (Calls.empty())
+    return PreservedAnalyses::all();
+
+  // Capture updatable variables for the current SCC and RefSCC.
   auto *C = &InitialC;
   auto *RC = &C->getOuterRefSCC();
 
-  // We also use a secondary worklist of call sites within a particular node to
-  // allow quickly continuing to inline through newly inlined call sites where
-  // possible.
-  SmallVector<std::pair<CallSite, int>, 16> Calls;
-
   // When inlining a callee produces new call sites, we want to keep track of
   // the fact that they were inlined from the callee.  This allows us to avoid
   // infinite inlining in some obscure cases.  To represent this, we use an
@@ -775,11 +807,17 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
   // defer deleting these to make it easier to handle the call graph updates.
   SmallVector<Function *, 4> DeadFunctions;
 
-  do {
-    auto &N = *Nodes.pop_back_val();
+  // Loop forward over all of the calls. Note that we cannot cache the size as
+  // inlining can introduce new calls that need to be processed.
+  for (int i = 0; i < (int)Calls.size(); ++i) {
+    // We expect the calls to typically be batched with sequences of calls that
+    // have the same caller, so we first set up some shared infrastructure for
+    // this caller. We also do any pruning we can at this layer on the caller
+    // alone.
+    Function &F = *Calls[i].first.getCaller();
+    LazyCallGraph::Node &N = *CG.lookup(F);
     if (CG.lookupSCC(N) != C)
       continue;
-    Function &F = N.getFunction();
     if (F.hasFnAttribute(Attribute::OptimizeNone))
       continue;
 
@@ -813,23 +851,14 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
     // Get the remarks emission analysis for the caller.
     auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
 
-    // We want to generally process call sites top-down in order for
-    // simplifications stemming from replacing the call with the returned value
-    // after inlining to be visible to subsequent inlining decisions. So we
-    // walk the function backwards and then process the back of the vector.
-    // FIXME: Using reverse is a really bad way to do this. Instead we should
-    // do an actual PO walk of the function body.
-    for (Instruction &I : reverse(instructions(F)))
-      if (auto CS = CallSite(&I))
-        if (Function *Callee = CS.getCalledFunction())
-          if (!Callee->isDeclaration())
-            Calls.push_back({CS, -1});
-
+    // Now process as many calls as we have within this caller in the sequnece.
+    // We bail out as soon as the caller has to change so we can update the
+    // call graph and prepare the context of that new caller.
     bool DidInline = false;
-    while (!Calls.empty()) {
+    for (; i < (int)Calls.size() && Calls[i].first.getCaller() == &F; ++i) {
       int InlineHistoryID;
       CallSite CS;
-      std::tie(CS, InlineHistoryID) = Calls.pop_back_val();
+      std::tie(CS, InlineHistoryID) = Calls[i];
       Function &Callee = *CS.getCalledFunction();
 
       if (InlineHistoryID != -1 &&
@@ -886,6 +915,10 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
       }
     }
 
+    // Back the call index up by one to put us in a good position to go around
+    // the outer loop.
+    --i;
+
     if (!DidInline)
       continue;
     Changed = true;
@@ -914,7 +947,7 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
     C = &updateCGAndAnalysisManagerForFunctionPass(CG, *C, N, AM, UR);
     DEBUG(dbgs() << "Updated inlining SCC: " << *C << "\n");
     RC = &C->getOuterRefSCC();
-  } while (!Nodes.empty());
+  }
 
   // Now that we've finished inlining all of the calls across this SCC, delete
   // all of the trivially dead functions, updating the call graph and the CGSCC
diff --git a/llvm/test/Other/new-pm-defaults.ll b/llvm/test/Other/new-pm-defaults.ll
index 5a882138003..7657f184b28 100644
--- a/llvm/test/Other/new-pm-defaults.ll
+++ b/llvm/test/Other/new-pm-defaults.ll
@@ -67,9 +67,8 @@
 ; CHECK-O-NEXT: Starting CGSCC pass manager run.
 ; CHECK-O-NEXT: Running pass: InlinerPass
 ; CHECK-O-NEXT: Running analysis: OuterAnalysisManagerProxy<{{.*}}LazyCallGraph{{.*}}>
-; CHECK-O-NEXT: Running analysis: FunctionAnalysisManagerCGSCCProxy
-; CHECK-O-NEXT: Running analysis: OptimizationRemarkEmitterAnalysis
 ; CHECK-O-NEXT: Running pass: PostOrderFunctionAttrsPass
+; CHECK-O-NEXT: Running analysis: FunctionAnalysisManagerCGSCCProxy
 ; CHECK-O3-NEXT: Running pass: ArgumentPromotionPass
 ; CHECK-O-NEXT: Running pass: CGSCCToFunctionPassAdaptor<{{.*}}PassManager{{.*}}>
 ; CHECK-O-NEXT: Starting llvm::Function pass manager run.
@@ -88,6 +87,7 @@
 ; CHECK-O-NEXT: Running pass: SimplifyCFGPass
 ; CHECK-O-NEXT: Running pass: ReassociatePass
 ; CHECK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}OptimizationRemarkEmitterAnalysis
+; CHECK-O-NEXT: Running analysis: OptimizationRemarkEmitterAnalysis
 ; CHECK-O-NEXT: Running pass: FunctionToLoopPassAdaptor<{{.*}}LoopStandardAnalysisResults{{.*}}>
 ; CHECK-O-NEXT: Running analysis: LoopAnalysis
 ; CHECK-O-NEXT: Running analysis: ScalarEvolutionAnalysis
diff --git a/llvm/test/Other/new-pm-lto-defaults.ll b/llvm/test/Other/new-pm-lto-defaults.ll
index e86bcdddcda..dfd29835327 100644
--- a/llvm/test/Other/new-pm-lto-defaults.ll
+++ b/llvm/test/Other/new-pm-lto-defaults.ll
@@ -47,7 +47,6 @@
 ; CHECK-O2-NEXT: Running pass: DeadArgumentEliminationPass
 ; CHECK-O2-NEXT: Running pass: ModuleToFunctionPassAdaptor<{{.*}}InstCombinePass>
 ; CHECK-O2-NEXT: Running pass: ModuleToPostOrderCGSCCPassAdaptor<{{.*}}InlinerPass>
-; CHECK-O2-NEXT: Running analysis: OptimizationRemarkEmitterAnalysis
 ; CHECK-O2-NEXT: Running pass: GlobalOptPass
 ; CHECK-O2-NEXT: Running pass: GlobalDCEPass
 ; CHECK-O2-NEXT: Running pass: ModuleToFunctionPassAdaptor<{{.*}}PassManager{{.*}}>
@@ -60,6 +59,7 @@
 ; CHECK-O2-NEXT: Running pass: ModuleToPostOrderCGSCCPassAdaptor<{{.*}}PostOrderFunctionAttrsPass>
 ; CHECK-O2-NEXT: Running pass: ModuleToFunctionPassAdaptor<{{.*}}PassManager{{.*}}>
 ; CHECK-O2-NEXT: Running analysis: MemoryDependenceAnalysis
+; CHECK-O2-NEXT: Running analysis: OptimizationRemarkEmitterAnalysis
 ; CHECK-O2-NEXT: Running analysis: TargetIRAnalysis
 ; CHECK-O2-NEXT: Running analysis: DemandedBitsAnalysis
 ; CHECK-O2-NEXT: Running pass: CrossDSOCFIPass
diff --git a/llvm/test/Transforms/Inline/cgscc-incremental-invalidate.ll b/llvm/test/Transforms/Inline/cgscc-incremental-invalidate.ll
index 7c6d0fc04ce..82d321ccf22 100644
--- a/llvm/test/Transforms/Inline/cgscc-incremental-invalidate.ll
+++ b/llvm/test/Transforms/Inline/cgscc-incremental-invalidate.ll
@@ -7,19 +7,19 @@
 ; may stop testing anything.
 ;
 ; CHECK-LABEL: Starting llvm::Module pass manager run.
-; CHECK: Running pass: InlinerPass on (test1_h, test1_g, test1_f)
-; CHECK: Running analysis: FunctionAnalysisManagerCGSCCProxy on (test1_h, test1_g, test1_f)
+; CHECK: Running pass: InlinerPass on (test1_f, test1_g, test1_h)
+; CHECK: Running analysis: FunctionAnalysisManagerCGSCCProxy on (test1_f, test1_g, test1_h)
 ; CHECK: Running analysis: DominatorTreeAnalysis on test1_f
 ; CHECK: Running analysis: DominatorTreeAnalysis on test1_g
-; CHECK: Invalidating all non-preserved analyses for: (test1_h, test1_g, test1_f)
-; CHECK: Invalidating all non-preserved analyses for: test1_h
-; CHECK-NOT: Invalidating anaylsis:
-; CHECK: Invalidating all non-preserved analyses for: test1_g
-; CHECK: Invalidating analysis: DominatorTreeAnalysis on test1_g
+; CHECK: Invalidating all non-preserved analyses for: (test1_f, test1_g, test1_h)
 ; CHECK: Invalidating all non-preserved analyses for: test1_f
 ; CHECK: Invalidating analysis: DominatorTreeAnalysis on test1_f
+; CHECK: Invalidating all non-preserved analyses for: test1_g
+; CHECK: Invalidating analysis: DominatorTreeAnalysis on test1_g
+; CHECK: Invalidating all non-preserved analyses for: test1_h
+; CHECK-NOT: Invalidating anaylsis:
 ; CHECK: Running analysis: DominatorTreeAnalysis on test1_h
-; CHECK: Invalidating all non-preserved analyses for: (test1_h, test1_g)
+; CHECK: Invalidating all non-preserved analyses for: (test1_g, test1_h)
 ; CHECK: Invalidating all non-preserved analyses for: test1_h
 ; CHECK: Invalidating analysis: DominatorTreeAnalysis on test1_h
 
@@ -51,14 +51,14 @@ return:
 ; reducing an SCC in the inliner cannot accidentially leave stale function
 ; analysis results due to failing to invalidate them for all the functions.
 
-; We visit this function first in the inliner, and while we inline callee
-; perturbing the CFG, we don't inline anything else and the SCC structure
-; remains in tact.
-define void @test1_f() {
-; CHECK-LABEL: define void @test1_f()
+; The inliner visits this last function. It can't actually break any cycles
+; here, but because we visit this function we compute fresh analyses for it.
+; These analyses are then invalidated when we inline callee disrupting the
+; CFG, and it is important that they be freed.
+define void @test1_h() {
+; CHECK-LABEL: define void @test1_h()
 entry:
-  ; We force this edge to survive inlining.
-  call void @test1_g() noinline
+  call void @test1_g()
 ; CHECK: call void @test1_g()
 
   ; Pull interesting CFG into this function.
@@ -69,7 +69,7 @@ entry:
 ; CHECK: ret void
 }
 
-; Next we visit this function and here we inline the edge to 'test1_f'
+; We visit this function second and here we inline the edge to 'test1_f'
 ; separating it into its own SCC. The current SCC is now just 'test1_g' and
 ; 'test1_h'.
 define void @test1_g() {
@@ -92,14 +92,14 @@ entry:
 ; CHECK: ret void
 }
 
-; Finally the inliner visits this last function. It can't actually break any
-; cycles here, but because we visit this function we compute fresh analyses for
-; it. These analyses are then invalidated when we inline callee disrupting the
-; CFG, and it is important that they be freed.
-define void @test1_h() {
-; CHECK-LABEL: define void @test1_h()
+; We visit this function first in the inliner, and while we inline callee
+; perturbing the CFG, we don't inline anything else and the SCC structure
+; remains in tact.
+define void @test1_f() {
+; CHECK-LABEL: define void @test1_f()
 entry:
-  call void @test1_g()
+  ; We force this edge to survive inlining.
+  call void @test1_g() noinline
 ; CHECK: call void @test1_g()
 
   ; Pull interesting CFG into this function.
diff --git a/llvm/test/Transforms/Inline/cgscc-invalidate.ll b/llvm/test/Transforms/Inline/cgscc-invalidate.ll
index 60315cda771..69d84f65e25 100644
--- a/llvm/test/Transforms/Inline/cgscc-invalidate.ll
+++ b/llvm/test/Transforms/Inline/cgscc-invalidate.ll
@@ -65,15 +65,15 @@ entry:
 ; The 'test3_' prefixed functions test the scenario of not inlining preserving
 ; dominators after splitting an SCC into two smaller SCCs.
 
-; The first function gets visited first and we end up inlining everything we
-; can into this routine. That splits test3_g into a separate SCC that is enqued
-; for later processing.
-define void @test3_f() {
-; CHECK-LABEL: define void @test3_f()
+; This function ends up split into a separate SCC, which can cause its analyses
+; to become stale if the splitting doesn't properly invalidate things. Also, as
+; a consequence of being split out, test3_f is too large to inline by the time
+; we get here.
+define void @test3_g() {
+; CHECK-LABEL: define void @test3_g()
 entry:
-  ; Create the first edge in the SCC cycle.
-  call void @test3_g()
-; CHECK-NOT: @test3_g()
+  ; Create the second edge in the SCC cycle.
+  call void @test3_f()
 ; CHECK: call void @test3_f()
 
   ; Pull interesting CFG into this function.
@@ -84,15 +84,15 @@ entry:
 ; CHECK: ret void
 }
 
-; This function ends up split into a separate SCC, which can cause its analyses
-; to become stale if the splitting doesn't properly invalidate things. Also, as
-; a consequence of being split out, test3_f is too large to inline by the time
-; we get here.
-define void @test3_g() {
-; CHECK-LABEL: define void @test3_g()
+; The second function gets visited first and we end up inlining everything we
+; can into this routine. That splits test3_g into a separate SCC that is enqued
+; for later processing.
+define void @test3_f() {
+; CHECK-LABEL: define void @test3_f()
 entry:
-  ; Create the second edge in the SCC cycle.
-  call void @test3_f()
+  ; Create the first edge in the SCC cycle.
+  call void @test3_g()
+; CHECK-NOT: @test3_g()
 ; CHECK: call void @test3_f()
 
   ; Pull interesting CFG into this function.