diff options
| -rw-r--r-- | llvm/lib/Transforms/IPO/Inliner.cpp | 91 | ||||
| -rw-r--r-- | llvm/test/Other/new-pm-defaults.ll | 4 | ||||
| -rw-r--r-- | llvm/test/Other/new-pm-lto-defaults.ll | 2 | ||||
| -rw-r--r-- | llvm/test/Transforms/Inline/cgscc-incremental-invalidate.ll | 46 | ||||
| -rw-r--r-- | llvm/test/Transforms/Inline/cgscc-invalidate.ll | 32 |
5 files changed, 104 insertions, 71 deletions
diff --git a/llvm/lib/Transforms/IPO/Inliner.cpp b/llvm/lib/Transforms/IPO/Inliner.cpp index 7049a6f501c..5ea26a86eb8 100644 --- a/llvm/lib/Transforms/IPO/Inliner.cpp +++ b/llvm/lib/Transforms/IPO/Inliner.cpp @@ -746,20 +746,52 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, Module &M = *InitialC.begin()->getFunction().getParent(); ProfileSummaryInfo *PSI = MAM.getCachedResult<ProfileSummaryAnalysis>(M); - // We use a worklist of nodes to process so that we can handle if the SCC - // structure changes and some nodes are no longer part of the current SCC. We - // also need to use an updatable pointer for the SCC as a consequence. - SmallVector<LazyCallGraph::Node *, 16> Nodes; - for (auto &N : InitialC) - Nodes.push_back(&N); + // We use a single common worklist for calls across the entire SCC. We + // process these in-order and append new calls introduced during inlining to + // the end. + // + // Note that this particular order of processing is actually critical to + // avoid very bad behaviors. Consider *highly connected* call graphs where + // each function contains a small amonut of code and a couple of calls to + // other functions. Because the LLVM inliner is fundamentally a bottom-up + // inliner, it can handle gracefully the fact that these all appear to be + // reasonable inlining candidates as it will flatten things until they become + // too big to inline, and then move on and flatten another batch. + // + // However, when processing call edges *within* an SCC we cannot rely on this + // bottom-up behavior. As a consequence, with heavily connected *SCCs* of + // functions we can end up incrementally inlining N calls into each of + // N functions because each incremental inlining decision looks good and we + // don't have a topological ordering to prevent explosions. + // + // To compensate for this, we don't process transitive edges made immediate + // by inlining until we've done one pass of inlining across the entire SCC. + // Large, highly connected SCCs still lead to some amount of code bloat in + // this model, but it is uniformly spread across all the functions in the SCC + // and eventually they all become too large to inline, rather than + // incrementally maknig a single function grow in a super linear fashion. + SmallVector<std::pair<CallSite, int>, 16> Calls; + + // Populate the initial list of calls in this SCC. + for (auto &N : InitialC) { + // We want to generally process call sites top-down in order for + // simplifications stemming from replacing the call with the returned value + // after inlining to be visible to subsequent inlining decisions. + // FIXME: Using instructions sequence is a really bad way to do this. + // Instead we should do an actual RPO walk of the function body. + for (Instruction &I : instructions(N.getFunction())) + if (auto CS = CallSite(&I)) + if (Function *Callee = CS.getCalledFunction()) + if (!Callee->isDeclaration()) + Calls.push_back({CS, -1}); + } + if (Calls.empty()) + return PreservedAnalyses::all(); + + // Capture updatable variables for the current SCC and RefSCC. auto *C = &InitialC; auto *RC = &C->getOuterRefSCC(); - // We also use a secondary worklist of call sites within a particular node to - // allow quickly continuing to inline through newly inlined call sites where - // possible. - SmallVector<std::pair<CallSite, int>, 16> Calls; - // When inlining a callee produces new call sites, we want to keep track of // the fact that they were inlined from the callee. This allows us to avoid // infinite inlining in some obscure cases. To represent this, we use an @@ -775,11 +807,17 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, // defer deleting these to make it easier to handle the call graph updates. SmallVector<Function *, 4> DeadFunctions; - do { - auto &N = *Nodes.pop_back_val(); + // Loop forward over all of the calls. Note that we cannot cache the size as + // inlining can introduce new calls that need to be processed. + for (int i = 0; i < (int)Calls.size(); ++i) { + // We expect the calls to typically be batched with sequences of calls that + // have the same caller, so we first set up some shared infrastructure for + // this caller. We also do any pruning we can at this layer on the caller + // alone. + Function &F = *Calls[i].first.getCaller(); + LazyCallGraph::Node &N = *CG.lookup(F); if (CG.lookupSCC(N) != C) continue; - Function &F = N.getFunction(); if (F.hasFnAttribute(Attribute::OptimizeNone)) continue; @@ -813,23 +851,14 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, // Get the remarks emission analysis for the caller. auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F); - // We want to generally process call sites top-down in order for - // simplifications stemming from replacing the call with the returned value - // after inlining to be visible to subsequent inlining decisions. So we - // walk the function backwards and then process the back of the vector. - // FIXME: Using reverse is a really bad way to do this. Instead we should - // do an actual PO walk of the function body. - for (Instruction &I : reverse(instructions(F))) - if (auto CS = CallSite(&I)) - if (Function *Callee = CS.getCalledFunction()) - if (!Callee->isDeclaration()) - Calls.push_back({CS, -1}); - + // Now process as many calls as we have within this caller in the sequnece. + // We bail out as soon as the caller has to change so we can update the + // call graph and prepare the context of that new caller. bool DidInline = false; - while (!Calls.empty()) { + for (; i < (int)Calls.size() && Calls[i].first.getCaller() == &F; ++i) { int InlineHistoryID; CallSite CS; - std::tie(CS, InlineHistoryID) = Calls.pop_back_val(); + std::tie(CS, InlineHistoryID) = Calls[i]; Function &Callee = *CS.getCalledFunction(); if (InlineHistoryID != -1 && @@ -886,6 +915,10 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, } } + // Back the call index up by one to put us in a good position to go around + // the outer loop. + --i; + if (!DidInline) continue; Changed = true; @@ -914,7 +947,7 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, C = &updateCGAndAnalysisManagerForFunctionPass(CG, *C, N, AM, UR); DEBUG(dbgs() << "Updated inlining SCC: " << *C << "\n"); RC = &C->getOuterRefSCC(); - } while (!Nodes.empty()); + } // Now that we've finished inlining all of the calls across this SCC, delete // all of the trivially dead functions, updating the call graph and the CGSCC diff --git a/llvm/test/Other/new-pm-defaults.ll b/llvm/test/Other/new-pm-defaults.ll index 5a882138003..7657f184b28 100644 --- a/llvm/test/Other/new-pm-defaults.ll +++ b/llvm/test/Other/new-pm-defaults.ll @@ -67,9 +67,8 @@ ; CHECK-O-NEXT: Starting CGSCC pass manager run. ; CHECK-O-NEXT: Running pass: InlinerPass ; CHECK-O-NEXT: Running analysis: OuterAnalysisManagerProxy<{{.*}}LazyCallGraph{{.*}}> -; CHECK-O-NEXT: Running analysis: FunctionAnalysisManagerCGSCCProxy -; CHECK-O-NEXT: Running analysis: OptimizationRemarkEmitterAnalysis ; CHECK-O-NEXT: Running pass: PostOrderFunctionAttrsPass +; CHECK-O-NEXT: Running analysis: FunctionAnalysisManagerCGSCCProxy ; CHECK-O3-NEXT: Running pass: ArgumentPromotionPass ; CHECK-O-NEXT: Running pass: CGSCCToFunctionPassAdaptor<{{.*}}PassManager{{.*}}> ; CHECK-O-NEXT: Starting llvm::Function pass manager run. @@ -88,6 +87,7 @@ ; CHECK-O-NEXT: Running pass: SimplifyCFGPass ; CHECK-O-NEXT: Running pass: ReassociatePass ; CHECK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}OptimizationRemarkEmitterAnalysis +; CHECK-O-NEXT: Running analysis: OptimizationRemarkEmitterAnalysis ; CHECK-O-NEXT: Running pass: FunctionToLoopPassAdaptor<{{.*}}LoopStandardAnalysisResults{{.*}}> ; CHECK-O-NEXT: Running analysis: LoopAnalysis ; CHECK-O-NEXT: Running analysis: ScalarEvolutionAnalysis diff --git a/llvm/test/Other/new-pm-lto-defaults.ll b/llvm/test/Other/new-pm-lto-defaults.ll index e86bcdddcda..dfd29835327 100644 --- a/llvm/test/Other/new-pm-lto-defaults.ll +++ b/llvm/test/Other/new-pm-lto-defaults.ll @@ -47,7 +47,6 @@ ; CHECK-O2-NEXT: Running pass: DeadArgumentEliminationPass ; CHECK-O2-NEXT: Running pass: ModuleToFunctionPassAdaptor<{{.*}}InstCombinePass> ; CHECK-O2-NEXT: Running pass: ModuleToPostOrderCGSCCPassAdaptor<{{.*}}InlinerPass> -; CHECK-O2-NEXT: Running analysis: OptimizationRemarkEmitterAnalysis ; CHECK-O2-NEXT: Running pass: GlobalOptPass ; CHECK-O2-NEXT: Running pass: GlobalDCEPass ; CHECK-O2-NEXT: Running pass: ModuleToFunctionPassAdaptor<{{.*}}PassManager{{.*}}> @@ -60,6 +59,7 @@ ; CHECK-O2-NEXT: Running pass: ModuleToPostOrderCGSCCPassAdaptor<{{.*}}PostOrderFunctionAttrsPass> ; CHECK-O2-NEXT: Running pass: ModuleToFunctionPassAdaptor<{{.*}}PassManager{{.*}}> ; CHECK-O2-NEXT: Running analysis: MemoryDependenceAnalysis +; CHECK-O2-NEXT: Running analysis: OptimizationRemarkEmitterAnalysis ; CHECK-O2-NEXT: Running analysis: TargetIRAnalysis ; CHECK-O2-NEXT: Running analysis: DemandedBitsAnalysis ; CHECK-O2-NEXT: Running pass: CrossDSOCFIPass diff --git a/llvm/test/Transforms/Inline/cgscc-incremental-invalidate.ll b/llvm/test/Transforms/Inline/cgscc-incremental-invalidate.ll index 7c6d0fc04ce..82d321ccf22 100644 --- a/llvm/test/Transforms/Inline/cgscc-incremental-invalidate.ll +++ b/llvm/test/Transforms/Inline/cgscc-incremental-invalidate.ll @@ -7,19 +7,19 @@ ; may stop testing anything. ; ; CHECK-LABEL: Starting llvm::Module pass manager run. -; CHECK: Running pass: InlinerPass on (test1_h, test1_g, test1_f) -; CHECK: Running analysis: FunctionAnalysisManagerCGSCCProxy on (test1_h, test1_g, test1_f) +; CHECK: Running pass: InlinerPass on (test1_f, test1_g, test1_h) +; CHECK: Running analysis: FunctionAnalysisManagerCGSCCProxy on (test1_f, test1_g, test1_h) ; CHECK: Running analysis: DominatorTreeAnalysis on test1_f ; CHECK: Running analysis: DominatorTreeAnalysis on test1_g -; CHECK: Invalidating all non-preserved analyses for: (test1_h, test1_g, test1_f) -; CHECK: Invalidating all non-preserved analyses for: test1_h -; CHECK-NOT: Invalidating anaylsis: -; CHECK: Invalidating all non-preserved analyses for: test1_g -; CHECK: Invalidating analysis: DominatorTreeAnalysis on test1_g +; CHECK: Invalidating all non-preserved analyses for: (test1_f, test1_g, test1_h) ; CHECK: Invalidating all non-preserved analyses for: test1_f ; CHECK: Invalidating analysis: DominatorTreeAnalysis on test1_f +; CHECK: Invalidating all non-preserved analyses for: test1_g +; CHECK: Invalidating analysis: DominatorTreeAnalysis on test1_g +; CHECK: Invalidating all non-preserved analyses for: test1_h +; CHECK-NOT: Invalidating anaylsis: ; CHECK: Running analysis: DominatorTreeAnalysis on test1_h -; CHECK: Invalidating all non-preserved analyses for: (test1_h, test1_g) +; CHECK: Invalidating all non-preserved analyses for: (test1_g, test1_h) ; CHECK: Invalidating all non-preserved analyses for: test1_h ; CHECK: Invalidating analysis: DominatorTreeAnalysis on test1_h @@ -51,14 +51,14 @@ return: ; reducing an SCC in the inliner cannot accidentially leave stale function ; analysis results due to failing to invalidate them for all the functions. -; We visit this function first in the inliner, and while we inline callee -; perturbing the CFG, we don't inline anything else and the SCC structure -; remains in tact. -define void @test1_f() { -; CHECK-LABEL: define void @test1_f() +; The inliner visits this last function. It can't actually break any cycles +; here, but because we visit this function we compute fresh analyses for it. +; These analyses are then invalidated when we inline callee disrupting the +; CFG, and it is important that they be freed. +define void @test1_h() { +; CHECK-LABEL: define void @test1_h() entry: - ; We force this edge to survive inlining. - call void @test1_g() noinline + call void @test1_g() ; CHECK: call void @test1_g() ; Pull interesting CFG into this function. @@ -69,7 +69,7 @@ entry: ; CHECK: ret void } -; Next we visit this function and here we inline the edge to 'test1_f' +; We visit this function second and here we inline the edge to 'test1_f' ; separating it into its own SCC. The current SCC is now just 'test1_g' and ; 'test1_h'. define void @test1_g() { @@ -92,14 +92,14 @@ entry: ; CHECK: ret void } -; Finally the inliner visits this last function. It can't actually break any -; cycles here, but because we visit this function we compute fresh analyses for -; it. These analyses are then invalidated when we inline callee disrupting the -; CFG, and it is important that they be freed. -define void @test1_h() { -; CHECK-LABEL: define void @test1_h() +; We visit this function first in the inliner, and while we inline callee +; perturbing the CFG, we don't inline anything else and the SCC structure +; remains in tact. +define void @test1_f() { +; CHECK-LABEL: define void @test1_f() entry: - call void @test1_g() + ; We force this edge to survive inlining. + call void @test1_g() noinline ; CHECK: call void @test1_g() ; Pull interesting CFG into this function. diff --git a/llvm/test/Transforms/Inline/cgscc-invalidate.ll b/llvm/test/Transforms/Inline/cgscc-invalidate.ll index 60315cda771..69d84f65e25 100644 --- a/llvm/test/Transforms/Inline/cgscc-invalidate.ll +++ b/llvm/test/Transforms/Inline/cgscc-invalidate.ll @@ -65,15 +65,15 @@ entry: ; The 'test3_' prefixed functions test the scenario of not inlining preserving ; dominators after splitting an SCC into two smaller SCCs. -; The first function gets visited first and we end up inlining everything we -; can into this routine. That splits test3_g into a separate SCC that is enqued -; for later processing. -define void @test3_f() { -; CHECK-LABEL: define void @test3_f() +; This function ends up split into a separate SCC, which can cause its analyses +; to become stale if the splitting doesn't properly invalidate things. Also, as +; a consequence of being split out, test3_f is too large to inline by the time +; we get here. +define void @test3_g() { +; CHECK-LABEL: define void @test3_g() entry: - ; Create the first edge in the SCC cycle. - call void @test3_g() -; CHECK-NOT: @test3_g() + ; Create the second edge in the SCC cycle. + call void @test3_f() ; CHECK: call void @test3_f() ; Pull interesting CFG into this function. @@ -84,15 +84,15 @@ entry: ; CHECK: ret void } -; This function ends up split into a separate SCC, which can cause its analyses -; to become stale if the splitting doesn't properly invalidate things. Also, as -; a consequence of being split out, test3_f is too large to inline by the time -; we get here. -define void @test3_g() { -; CHECK-LABEL: define void @test3_g() +; The second function gets visited first and we end up inlining everything we +; can into this routine. That splits test3_g into a separate SCC that is enqued +; for later processing. +define void @test3_f() { +; CHECK-LABEL: define void @test3_f() entry: - ; Create the second edge in the SCC cycle. - call void @test3_f() + ; Create the first edge in the SCC cycle. + call void @test3_g() +; CHECK-NOT: @test3_g() ; CHECK: call void @test3_f() ; Pull interesting CFG into this function. |

