diff options
author | Vedant Kumar <vsk@apple.com> | 2018-10-24 22:15:41 +0000 |
---|---|---|
committer | Vedant Kumar <vsk@apple.com> | 2018-10-24 22:15:41 +0000 |
commit | c29900687983824ca66fbd83491390873523339e (patch) | |
tree | f84721b45033a99a5e70d7186d53471788b318ae /llvm/lib/Transforms | |
parent | 444363e931d427719ddaa91819a9b133b8ac70e8 (diff) | |
download | bcm5719-llvm-c29900687983824ca66fbd83491390873523339e.tar.gz bcm5719-llvm-c29900687983824ca66fbd83491390873523339e.zip |
[HotColdSplitting] Identify larger cold regions using domtree queries
The current splitting algorithm works in three stages:
1) Identify cold blocks, then
2) Use forward/backward propagation to mark hot blocks, then
3) Grow a SESE region of blocks *outside* of the set of hot blocks and
start outlining.
While testing this pass on Apple internal frameworks I noticed that some
kinds of control flow (e.g. loops) are never outlined, even though they
unconditionally lead to / follow cold blocks. I noticed two other issues
related to how cold regions are identified:
- An inconsistency can arise in the internal state of the hotness
propagation stage, as a block may end up in both the ColdBlocks set
and the HotBlocks set. Further inconsistencies can arise as these sets
do not match what's in ProfileSummaryInfo.
- It isn't necessary to limit outlining to single-exit regions.
This patch teaches the splitting algorithm to identify maximal cold
regions and outline them. A maximal cold region is defined as the set of
blocks post-dominated by a cold sink block, or dominated by that sink
block. This approach can successfully outline loops in the cold path. As
a side benefit, it maintains less internal state than the current
approach.
Due to a limitation in CodeExtractor, blocks within the maximal cold
region which aren't dominated by a single entry point (a so-called "max
ancestor") are filtered out.
Results:
- X86 (LNT + -Os + externals): 134KB of TEXT were outlined compared to
47KB pre-patch, or a ~3x improvement. Did not see a performance impact
across two runs.
- AArch64 (LNT + -Os + externals + Apple-internal benchmarks): 149KB
of TEXT were outlined. Ditto re: performance impact.
- Outlining results improve marginally in the internal frameworks I
tested.
Follow-ups:
- Outline more than once per function, outline large single basic
blocks, & try to remove unconditional branches in outlined functions.
Differential Revision: https://reviews.llvm.org/D53627
llvm-svn: 345209
Diffstat (limited to 'llvm/lib/Transforms')
-rw-r--r-- | llvm/lib/Transforms/IPO/HotColdSplitting.cpp | 353 | ||||
-rw-r--r-- | llvm/lib/Transforms/Utils/CodeExtractor.cpp | 40 |
2 files changed, 192 insertions, 201 deletions
diff --git a/llvm/lib/Transforms/IPO/HotColdSplitting.cpp b/llvm/lib/Transforms/IPO/HotColdSplitting.cpp index a63cd842241..4f371a494e9 100644 --- a/llvm/lib/Transforms/IPO/HotColdSplitting.cpp +++ b/llvm/lib/Transforms/IPO/HotColdSplitting.cpp @@ -57,10 +57,8 @@ #define DEBUG_TYPE "hotcoldsplit" -STATISTIC(NumColdSESEFound, - "Number of cold single entry single exit (SESE) regions found."); -STATISTIC(NumColdSESEOutlined, - "Number of cold single entry single exit (SESE) regions outlined."); +STATISTIC(NumColdRegionsFound, "Number of cold regions found."); +STATISTIC(NumColdRegionsOutlined, "Number of cold regions outlined."); using namespace llvm; @@ -74,32 +72,10 @@ struct PostDomTree : PostDomTreeBase<BasicBlock> { PostDomTree(Function &F) { recalculate(F); } }; -typedef DenseSet<const BasicBlock *> DenseSetBB; -typedef DenseMap<const BasicBlock *, uint64_t> DenseMapBBInt; - -// From: https://reviews.llvm.org/D22558 -// Exit is not part of the region. -static bool isSingleEntrySingleExit(BasicBlock *Entry, const BasicBlock *Exit, - DominatorTree *DT, PostDomTree *PDT, - SmallVectorImpl<BasicBlock *> &Region) { - if (!DT->dominates(Entry, Exit)) - return false; - - if (!PDT->dominates(Exit, Entry)) - return false; - - for (auto I = df_begin(Entry), E = df_end(Entry); I != E;) { - if (*I == Exit) { - I.skipChildren(); - continue; - } - if (!DT->dominates(Entry, *I)) - return false; - Region.push_back(*I); - ++I; - } - return true; -} +/// A sequence of basic blocks. +/// +/// A 0-sized SmallVector is slightly cheaper to move than a std::vector. +using BlockSequence = SmallVector<BasicBlock *, 0>; // Same as blockEndsInUnreachable in CodeGen/BranchFolding.cpp. Do not modify // this function unless you modify the MBB version as well. @@ -149,105 +125,155 @@ static bool unlikelyExecuted(const BasicBlock &BB) { return false; } -static bool returnsOrHasSideEffects(const BasicBlock &BB) { - const Instruction *I = BB.getTerminator(); - if (isa<ReturnInst>(I) || isa<IndirectBrInst>(I) || isa<InvokeInst>(I)) - return true; +/// Check whether it's safe to outline \p BB. +static bool mayExtractBlock(const BasicBlock &BB) { + return !BB.hasAddressTaken(); +} - for (const Instruction &I : BB) - if (const CallInst *CI = dyn_cast<CallInst>(&I)) { - if (CI->hasFnAttr(Attribute::NoReturn)) - return true; +/// Identify the maximal region of cold blocks which includes \p SinkBB. +/// +/// Include all blocks post-dominated by \p SinkBB, \p SinkBB itself, and all +/// blocks dominated by \p SinkBB. Exclude all other blocks, and blocks which +/// cannot be outlined. +/// +/// Return an empty sequence if the cold region is too small to outline, or if +/// the cold region has no warm predecessors. +static BlockSequence +findMaximalColdRegion(BasicBlock &SinkBB, DominatorTree &DT, PostDomTree &PDT) { + // The maximal cold region. + BlockSequence ColdRegion = {}; + + // The ancestor farthest-away from SinkBB, and also post-dominated by it. + BasicBlock *MaxAncestor = &SinkBB; + unsigned MaxAncestorHeight = 0; + + // Visit SinkBB's ancestors using inverse DFS. + auto PredIt = ++idf_begin(&SinkBB); + auto PredEnd = idf_end(&SinkBB); + while (PredIt != PredEnd) { + BasicBlock &PredBB = **PredIt; + bool SinkPostDom = PDT.dominates(&SinkBB, &PredBB); + + // If SinkBB does not post-dominate a predecessor, do not mark the + // predecessor (or any of its predecessors) cold. + if (!SinkPostDom || !mayExtractBlock(PredBB)) { + PredIt.skipChildren(); + continue; + } - if (isa<InlineAsm>(CI->getCalledValue())) - return true; + // Keep track of the post-dominated ancestor farthest away from the sink. + unsigned AncestorHeight = PredIt.getPathLength(); + if (AncestorHeight > MaxAncestorHeight) { + MaxAncestor = &PredBB; + MaxAncestorHeight = AncestorHeight; } - return false; -} + ColdRegion.push_back(&PredBB); + ++PredIt; + } -static DenseSetBB getHotBlocks(Function &F) { + // CodeExtractor requires that all blocks to be extracted must be dominated + // by the first block to be extracted. + // + // To avoid spurious or repeated outlining, require that the max ancestor + // has a predecessor. By construction this predecessor is not in the cold + // region, i.e. its existence implies we don't outline the whole function. + // + // TODO: If MaxAncestor has no predecessors, we may be able to outline the + // second largest cold region that has a predecessor. + if (pred_empty(MaxAncestor) || + MaxAncestor->getSinglePredecessor() == MaxAncestor) + return {}; + + // Filter out predecessors not dominated by the max ancestor. + // + // TODO: Blocks not dominated by the max ancestor could be extracted as + // other cold regions. Marking outlined calls as noreturn when appropriate + // and outlining more than once per function could achieve most of the win. + auto EraseIt = remove_if(ColdRegion, [&](BasicBlock *PredBB) { + return PredBB != MaxAncestor && !DT.dominates(MaxAncestor, PredBB); + }); + ColdRegion.erase(EraseIt, ColdRegion.end()); - // Mark all cold basic blocks. - DenseSetBB ColdBlocks; - for (BasicBlock &BB : F) - if (unlikelyExecuted(BB)) { - LLVM_DEBUG(llvm::dbgs() << "\nForward propagation marks cold: " << BB); - ColdBlocks.insert((const BasicBlock *)&BB); - } + // Add SinkBB to the cold region. + ColdRegion.push_back(&SinkBB); - // Forward propagation: basic blocks are hot when they are reachable from the - // beginning of the function through a path that does not contain cold blocks. - SmallVector<const BasicBlock *, 8> WL; - DenseSetBB HotBlocks; - - const BasicBlock *It = &F.front(); - if (!ColdBlocks.count(It)) { - HotBlocks.insert(It); - // Breadth First Search to mark edges reachable from hot. - WL.push_back(It); - while (WL.size() > 0) { - It = WL.pop_back_val(); - - for (const BasicBlock *Succ : successors(It)) { - // Do not visit blocks that are cold. - if (!ColdBlocks.count(Succ) && !HotBlocks.count(Succ)) { - HotBlocks.insert(Succ); - WL.push_back(Succ); - } - } - } + // Ensure that the first extracted block is the max ancestor. + if (ColdRegion[0] != MaxAncestor) { + auto AncestorIt = find(ColdRegion, MaxAncestor); + *AncestorIt = ColdRegion[0]; + ColdRegion[0] = MaxAncestor; } - assert(WL.empty() && "work list should be empty"); - - DenseMapBBInt NumHotSuccessors; - // Back propagation: when all successors of a basic block are cold, the - // basic block is cold as well. - for (BasicBlock &BBRef : F) { - const BasicBlock *BB = &BBRef; - if (HotBlocks.count(BB)) { - // Keep a count of hot successors for every hot block. - NumHotSuccessors[BB] = 0; - for (const BasicBlock *Succ : successors(BB)) - if (!ColdBlocks.count(Succ)) - NumHotSuccessors[BB] += 1; - - // Add to work list the blocks with all successors cold. Those are the - // root nodes in the next loop, where we will move those blocks from - // HotBlocks to ColdBlocks and iterate over their predecessors. - if (NumHotSuccessors[BB] == 0) - WL.push_back(BB); + // Find all successors of SinkBB dominated by SinkBB using DFS. + auto SuccIt = ++df_begin(&SinkBB); + auto SuccEnd = df_end(&SinkBB); + while (SuccIt != SuccEnd) { + BasicBlock &SuccBB = **SuccIt; + bool SinkDom = DT.dominates(&SinkBB, &SuccBB); + + // If SinkBB does not dominate a successor, do not mark the successor (or + // any of its successors) cold. + if (!SinkDom || !mayExtractBlock(SuccBB)) { + SuccIt.skipChildren(); + continue; } + + ColdRegion.push_back(&SuccBB); + ++SuccIt; } - while (WL.size() > 0) { - It = WL.pop_back_val(); - if (ColdBlocks.count(It)) + // TODO: Consider outlining regions with just 1 block, but more than some + // threshold of instructions. + if (ColdRegion.size() == 1) + return {}; + + return ColdRegion; +} + +/// Get the largest cold region in \p F. +static BlockSequence getLargestColdRegion(Function &F, ProfileSummaryInfo &PSI, + BlockFrequencyInfo *BFI, + DominatorTree &DT, PostDomTree &PDT) { + // Keep track of the largest cold region. + BlockSequence LargestColdRegion = {}; + + for (BasicBlock &BB : F) { + // Identify cold blocks. + if (!mayExtractBlock(BB)) + continue; + bool Cold = + PSI.isColdBB(&BB, BFI) || (EnableStaticAnalyis && unlikelyExecuted(BB)); + if (!Cold) continue; - // Do not back-propagate to blocks that return or have side effects. - if (returnsOrHasSideEffects(*It)) + LLVM_DEBUG({ + dbgs() << "Found cold block:\n"; + BB.dump(); + }); + + // Find a maximal cold region we can outline. + BlockSequence ColdRegion = findMaximalColdRegion(BB, DT, PDT); + if (ColdRegion.empty()) { + LLVM_DEBUG(dbgs() << " Skipping (block not profitable to extract)\n"); continue; + } - // Move the block from HotBlocks to ColdBlocks. - LLVM_DEBUG(llvm::dbgs() << "\nBack propagation marks cold: " << *It); - HotBlocks.erase(It); - ColdBlocks.insert(It); + ++NumColdRegionsFound; - // Iterate over the predecessors. - for (const BasicBlock *Pred : predecessors(It)) { - if (HotBlocks.count(Pred)) { - NumHotSuccessors[Pred] -= 1; + LLVM_DEBUG({ + llvm::dbgs() << "Identified cold region with " << ColdRegion.size() + << " blocks:\n"; + for (BasicBlock *BB : ColdRegion) + BB->dump(); + }); - // If Pred has no more hot successors, add it to the work list. - if (NumHotSuccessors[Pred] == 0) - WL.push_back(Pred); - } - } + // TODO: Outline more than one region. + if (ColdRegion.size() > LargestColdRegion.size()) + LargestColdRegion = std::move(ColdRegion); } - return HotBlocks; + return LargestColdRegion; } class HotColdSplitting { @@ -261,23 +287,9 @@ public: private: bool shouldOutlineFrom(const Function &F) const; - const Function *outlineColdBlocks(Function &F, const DenseSetBB &ColdBlock, - DominatorTree *DT, PostDomTree *PDT); - Function *extractColdRegion(const SmallVectorImpl<BasicBlock *> &Region, - DominatorTree *DT, BlockFrequencyInfo *BFI, + Function *extractColdRegion(const BlockSequence &Region, DominatorTree &DT, + BlockFrequencyInfo *BFI, OptimizationRemarkEmitter &ORE, unsigned Count); - bool isOutlineCandidate(const SmallVectorImpl<BasicBlock *> &Region, - const BasicBlock *Exit) const { - if (!Exit) - return false; - - // Regions with landing pads etc. - for (const BasicBlock *BB : Region) { - if (BB->isEHPad() || BB->hasAddressTaken()) - return false; - } - return true; - } SmallPtrSet<const Function *, 2> OutlinedFunctions; ProfileSummaryInfo *PSI; function_ref<BlockFrequencyInfo *(Function &)> GetBFI; @@ -314,6 +326,8 @@ bool HotColdSplitting::shouldOutlineFrom(const Function &F) const { if (F.size() <= 2) return false; + // TODO: Consider only skipping functions marked `optnone` or `cold`. + if (F.hasAddressTaken()) return false; @@ -331,15 +345,17 @@ bool HotColdSplitting::shouldOutlineFrom(const Function &F) const { return true; } -Function *HotColdSplitting::extractColdRegion( - const SmallVectorImpl<BasicBlock *> &Region, DominatorTree *DT, - BlockFrequencyInfo *BFI, OptimizationRemarkEmitter &ORE, unsigned Count) { +Function *HotColdSplitting::extractColdRegion(const BlockSequence &Region, + DominatorTree &DT, + BlockFrequencyInfo *BFI, + OptimizationRemarkEmitter &ORE, + unsigned Count) { assert(!Region.empty()); LLVM_DEBUG(for (auto *BB : Region) llvm::dbgs() << "\nExtracting: " << *BB;); // TODO: Pass BFI and BPI to update profile information. - CodeExtractor CE(Region, DT, /* AggregateArgs */ false, /* BFI */ nullptr, + CodeExtractor CE(Region, &DT, /* AggregateArgs */ false, /* BFI */ nullptr, /* BPI */ nullptr, /* AllowVarArgs */ false, /* AllowAlloca */ false, /* Suffix */ "cold." + std::to_string(Count)); @@ -348,15 +364,18 @@ Function *HotColdSplitting::extractColdRegion( CE.findInputsOutputs(Inputs, Outputs, Sinks); // Do not extract regions that have live exit variables. - if (Outputs.size() > 0) + if (Outputs.size() > 0) { + LLVM_DEBUG(llvm::dbgs() << "Not outlining; live outputs\n"); return nullptr; + } + // TODO: Run MergeBasicBlockIntoOnlyPred on the outlined function. Function *OrigF = Region[0]->getParent(); if (Function *OutF = CE.extractCodeRegion()) { User *U = *OutF->user_begin(); CallInst *CI = cast<CallInst>(U); CallSite CS(CI); - NumColdSESEOutlined++; + NumColdRegionsOutlined++; if (GetTTI(*OutF).useColdCCForColdCall(*OutF)) { OutF->setCallingConv(CallingConv::Cold); CS.setCallingConv(CallingConv::Cold); @@ -388,69 +407,33 @@ Function *HotColdSplitting::extractColdRegion( return nullptr; } -// Return the function created after outlining, nullptr otherwise. -const Function *HotColdSplitting::outlineColdBlocks(Function &F, - const DenseSetBB &HotBlocks, - DominatorTree *DT, - PostDomTree *PDT) { - auto BFI = GetBFI(F); - auto &ORE = (*GetORE)(F); - // Walking the dominator tree allows us to find the largest - // cold region. - BasicBlock *Begin = DT->getRootNode()->getBlock(); - - // Early return if the beginning of the function has been marked cold, - // otherwise all the function gets outlined. - if (PSI->isColdBB(Begin, BFI) || !HotBlocks.count(Begin)) - return nullptr; - - for (auto I = df_begin(Begin), E = df_end(Begin); I != E; ++I) { - BasicBlock *BB = *I; - if (PSI->isColdBB(BB, BFI) || !HotBlocks.count(BB)) { - SmallVector<BasicBlock *, 4> ValidColdRegion, Region; - BasicBlock *Exit = (*PDT)[BB]->getIDom()->getBlock(); - BasicBlock *ExitColdRegion = nullptr; - - // Estimated cold region between a BB and its dom-frontier. - while (Exit && isSingleEntrySingleExit(BB, Exit, DT, PDT, Region) && - isOutlineCandidate(Region, Exit)) { - ExitColdRegion = Exit; - ValidColdRegion = Region; - Region.clear(); - // Update Exit recursively to its dom-frontier. - Exit = (*PDT)[Exit]->getIDom()->getBlock(); - } - if (ExitColdRegion) { - // Do not outline a region with only one block. - if (ValidColdRegion.size() == 1) - continue; - - ++NumColdSESEFound; - ValidColdRegion.push_back(ExitColdRegion); - // Candidate for outlining. FIXME: Continue outlining. - return extractColdRegion(ValidColdRegion, DT, BFI, ORE, /* Count */ 1); - } - } - } - return nullptr; -} - bool HotColdSplitting::run(Module &M) { + bool Changed = false; for (auto &F : M) { - if (!shouldOutlineFrom(F)) + if (!shouldOutlineFrom(F)) { + LLVM_DEBUG(llvm::dbgs() << "Not outlining in " << F.getName() << "\n"); continue; + } + + LLVM_DEBUG(llvm::dbgs() << "Outlining in " << F.getName() << "\n"); DominatorTree DT(F); PostDomTree PDT(F); PDT.recalculate(F); - DenseSetBB HotBlocks; - if (EnableStaticAnalyis) // Static analysis of cold blocks. - HotBlocks = getHotBlocks(F); + BlockFrequencyInfo *BFI = GetBFI(F); - const Function *Outlined = outlineColdBlocks(F, HotBlocks, &DT, &PDT); - if (Outlined) + BlockSequence ColdRegion = getLargestColdRegion(F, *PSI, BFI, DT, PDT); + if (ColdRegion.empty()) + continue; + + OptimizationRemarkEmitter &ORE = (*GetORE)(F); + Function *Outlined = + extractColdRegion(ColdRegion, DT, BFI, ORE, /*Count=*/1); + if (Outlined) { OutlinedFunctions.insert(Outlined); + Changed = true; + } } - return true; + return Changed; } bool HotColdSplittingLegacyPass::runOnModule(Module &M) { diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp index 328fe1fac65..462dc588cd5 100644 --- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp +++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp @@ -1273,24 +1273,32 @@ Function *CodeExtractor::extractCodeRegion() { // Look at all successors of the codeReplacer block. If any of these blocks // had PHI nodes in them, we need to update the "from" block to be the code // replacer, not the original block in the extracted region. - std::vector<BasicBlock *> Succs(succ_begin(codeReplacer), - succ_end(codeReplacer)); - for (unsigned i = 0, e = Succs.size(); i != e; ++i) - for (BasicBlock::iterator I = Succs[i]->begin(); isa<PHINode>(I); ++I) { - PHINode *PN = cast<PHINode>(I); - std::set<BasicBlock*> ProcessedPreds; - for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) - if (Blocks.count(PN->getIncomingBlock(i))) { - if (ProcessedPreds.insert(PN->getIncomingBlock(i)).second) - PN->setIncomingBlock(i, codeReplacer); - else { - // There were multiple entries in the PHI for this block, now there - // is only one, so remove the duplicated entries. - PN->removeIncomingValue(i, false); - --i; --e; - } + for (BasicBlock *SuccBB : successors(codeReplacer)) { + for (PHINode &PN : SuccBB->phis()) { + Value *IncomingCodeReplacerVal = nullptr; + SmallVector<unsigned, 2> IncomingValsToRemove; + for (unsigned I = 0, E = PN.getNumIncomingValues(); I != E; ++I) { + BasicBlock *IncomingBB = PN.getIncomingBlock(I); + + // Ignore incoming values from outside of the extracted region. + if (!Blocks.count(IncomingBB)) + continue; + + // Ensure that there is only one incoming value from codeReplacer. + if (!IncomingCodeReplacerVal) { + PN.setIncomingBlock(I, codeReplacer); + IncomingCodeReplacerVal = PN.getIncomingValue(I); + } else { + assert(IncomingCodeReplacerVal == PN.getIncomingValue(I) && + "PHI has two incompatbile incoming values from codeRepl"); + IncomingValsToRemove.push_back(I); } + } + + for (unsigned I : reverse(IncomingValsToRemove)) + PN.removeIncomingValue(I, /*DeletePHIIfEmpty=*/false); } + } // Erase debug info intrinsics. Variable updates within the new function are // invisible to debuggers. This could be improved by defining a DISubprogram |