diff options
author | Kyle Butt <kyle+llvm@iteratee.net> | 2016-10-11 20:36:43 +0000 |
---|---|---|
committer | Kyle Butt <kyle+llvm@iteratee.net> | 2016-10-11 20:36:43 +0000 |
commit | 0846e56e631387297728eef680c394af8e086a2c (patch) | |
tree | 0ba811be2701a63c0f5e28b281c8f62e8acf0d84 /llvm/lib/CodeGen/MachineBlockPlacement.cpp | |
parent | 5bb08d8f5ab7bfbb40fa101507f2a5e676d643cf (diff) | |
download | bcm5719-llvm-0846e56e631387297728eef680c394af8e086a2c.tar.gz bcm5719-llvm-0846e56e631387297728eef680c394af8e086a2c.zip |
Codegen: Tail-duplicate during placement.
The tail duplication pass uses an assumed layout when making duplication
decisions. This is fine, but passes up duplication opportunities that
may arise when blocks are outlined. Because we want the updated CFG to
affect subsequent placement decisions, this change must occur during
placement.
In order to achieve this goal, TailDuplicationPass is split into a
utility class, TailDuplicator, and the pass itself. The pass delegates
nearly everything to the TailDuplicator object, except for looping over
the blocks in a function. This allows the same code to be used for tail
duplication in both places.
This change, in concert with outlining optional branches, allows
triangle shaped code to perform much better, esepecially when the
taken/untaken branches are correlated, as it creates a second spine when
the tests are small enough.
Issue from previous rollback fixed, and a new test was added for that
case as well. Issue was worklist/scheduling/taildup issue in layout.
Issue from 2nd rollback fixed, with 2 additional tests. Issue was
tail merging/loop info/tail-duplication causing issue with loops that share
a header block.
Issue with early tail-duplication of blocks that branch to a fallthrough
predecessor fixed with test case: tail-dup-branch-to-fallthrough.ll
Differential revision: https://reviews.llvm.org/D18226
llvm-svn: 283934
Diffstat (limited to 'llvm/lib/CodeGen/MachineBlockPlacement.cpp')
-rw-r--r-- | llvm/lib/CodeGen/MachineBlockPlacement.cpp | 305 |
1 files changed, 278 insertions, 27 deletions
diff --git a/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/llvm/lib/CodeGen/MachineBlockPlacement.cpp index 90a576a6621..aa440b2e1ca 100644 --- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp +++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp @@ -40,6 +40,7 @@ #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/TailDuplicator.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -121,6 +122,12 @@ static cl::opt<unsigned> MisfetchCost( static cl::opt<unsigned> JumpInstCost("jump-inst-cost", cl::desc("Cost of jump instructions."), cl::init(1), cl::Hidden); +static cl::opt<bool> +TailDupPlacement("tail-dup-placement", + cl::desc("Perform tail duplication during placement. " + "Creates more fallthrough opportunites in " + "outline branches."), + cl::init(true), cl::Hidden); static cl::opt<bool> BranchFoldPlacement("branch-fold-placement", @@ -128,6 +135,14 @@ BranchFoldPlacement("branch-fold-placement", "Reduces code size."), cl::init(true), cl::Hidden); +// Heuristic for tail duplication. +static cl::opt<unsigned> TailDuplicatePlacementThreshold( + "tail-dup-placement-threshold", + cl::desc("Instruction cutoff for tail duplication during layout. " + "Tail merging during layout is forced to have a threshold " + "that won't conflict."), cl::init(2), + cl::Hidden); + extern cl::opt<unsigned> StaticLikelyProb; extern cl::opt<unsigned> ProfileLikelyProb; @@ -185,6 +200,16 @@ public: /// \brief End of blocks within the chain. iterator end() { return Blocks.end(); } + bool remove(MachineBasicBlock* BB) { + for(iterator i = begin(); i != end(); ++i) { + if (*i == BB) { + Blocks.erase(i); + return true; + } + } + return false; + } + /// \brief Merge a block chain into this one. /// /// This routine merges a block chain into this one. It takes care of forming @@ -266,6 +291,13 @@ class MachineBlockPlacement : public MachineFunctionPass { /// \brief A handle to the post dominator tree. MachineDominatorTree *MDT; + /// \brief Duplicator used to duplicate tails during placement. + /// + /// Placement decisions can open up new tail duplication opportunities, but + /// since tail duplication affects placement decisions of later blocks, it + /// must be done inline. + TailDuplicator TailDup; + /// \brief A set of blocks that are unavoidably execute, i.e. they dominate /// all terminators of the MachineFunction. SmallPtrSet<MachineBasicBlock *, 4> UnavoidableBlocks; @@ -287,8 +319,18 @@ class MachineBlockPlacement : public MachineFunctionPass { /// between basic blocks. DenseMap<MachineBasicBlock *, BlockChain *> BlockToChain; + /// Decrease the UnscheduledPredecessors count for all blocks in chain, and + /// if the count goes to 0, add them to the appropriate work list. void markChainSuccessors(BlockChain &Chain, MachineBasicBlock *LoopHeaderBB, const BlockFilterSet *BlockFilter = nullptr); + + /// Decrease the UnscheduledPredecessors count for a single block, and + /// if the count goes to 0, add them to the appropriate work list. + void markBlockSuccessors( + BlockChain &Chain, MachineBasicBlock *BB, MachineBasicBlock *LoopHeaderBB, + const BlockFilterSet *BlockFilter = nullptr); + + BranchProbability collectViableSuccessors(MachineBasicBlock *BB, BlockChain &Chain, const BlockFilterSet *BlockFilter, @@ -298,6 +340,16 @@ class MachineBlockPlacement : public MachineFunctionPass { const BlockFilterSet *BlockFilter, BranchProbability SuccProb, BranchProbability HotProb); + bool repeatedlyTailDuplicateBlock( + MachineBasicBlock *BB, MachineBasicBlock *&LPred, + MachineBasicBlock *LoopHeaderBB, + BlockChain &Chain, BlockFilterSet *BlockFilter, + MachineFunction::iterator &PrevUnplacedBlockIt); + bool maybeTailDuplicateBlock(MachineBasicBlock *BB, MachineBasicBlock *LPred, + const BlockChain &Chain, + BlockFilterSet *BlockFilter, + MachineFunction::iterator &PrevUnplacedBlockIt, + bool &DuplicatedToPred); bool hasBetterLayoutPredecessor(MachineBasicBlock *BB, MachineBasicBlock *Succ, BlockChain &SuccChain, BranchProbability SuccProb, @@ -323,7 +375,7 @@ class MachineBlockPlacement : public MachineFunctionPass { SmallPtrSetImpl<BlockChain *> &UpdatedPreds, const BlockFilterSet *BlockFilter); void buildChain(MachineBasicBlock *BB, BlockChain &Chain, - const BlockFilterSet *BlockFilter = nullptr); + BlockFilterSet *BlockFilter = nullptr); MachineBasicBlock *findBestLoopTop(MachineLoop &L, const BlockFilterSet &LoopBlockSet); MachineBasicBlock *findBestLoopExit(MachineLoop &L, @@ -388,37 +440,49 @@ static std::string getBlockName(MachineBasicBlock *BB) { /// When a chain is being merged into the "placed" chain, this routine will /// quickly walk the successors of each block in the chain and mark them as /// having one fewer active predecessor. It also adds any successors of this -/// chain which reach the zero-predecessor state to the worklist passed in. +/// chain which reach the zero-predecessor state to the appropriate worklist. void MachineBlockPlacement::markChainSuccessors( BlockChain &Chain, MachineBasicBlock *LoopHeaderBB, const BlockFilterSet *BlockFilter) { // Walk all the blocks in this chain, marking their successors as having // a predecessor placed. for (MachineBasicBlock *MBB : Chain) { - // Add any successors for which this is the only un-placed in-loop - // predecessor to the worklist as a viable candidate for CFG-neutral - // placement. No subsequent placement of this block will violate the CFG - // shape, so we get to use heuristics to choose a favorable placement. - for (MachineBasicBlock *Succ : MBB->successors()) { - if (BlockFilter && !BlockFilter->count(Succ)) - continue; - BlockChain &SuccChain = *BlockToChain[Succ]; - // Disregard edges within a fixed chain, or edges to the loop header. - if (&Chain == &SuccChain || Succ == LoopHeaderBB) - continue; + markBlockSuccessors(Chain, MBB, LoopHeaderBB, BlockFilter); + } +} - // This is a cross-chain edge that is within the loop, so decrement the - // loop predecessor count of the destination chain. - if (SuccChain.UnscheduledPredecessors == 0 || - --SuccChain.UnscheduledPredecessors > 0) - continue; +/// \brief Mark a single block's successors as having one fewer preds. +/// +/// Under normal circumstances, this is only called by markChainSuccessors, +/// but if a block that was to be placed is completely tail-duplicated away, +/// and was duplicated into the chain end, we need to redo markBlockSuccessors +/// for just that block. +void MachineBlockPlacement::markBlockSuccessors( + BlockChain &Chain, MachineBasicBlock *MBB, MachineBasicBlock *LoopHeaderBB, + const BlockFilterSet *BlockFilter) { + // Add any successors for which this is the only un-placed in-loop + // predecessor to the worklist as a viable candidate for CFG-neutral + // placement. No subsequent placement of this block will violate the CFG + // shape, so we get to use heuristics to choose a favorable placement. + for (MachineBasicBlock *Succ : MBB->successors()) { + if (BlockFilter && !BlockFilter->count(Succ)) + continue; + BlockChain &SuccChain = *BlockToChain[Succ]; + // Disregard edges within a fixed chain, or edges to the loop header. + if (&Chain == &SuccChain || Succ == LoopHeaderBB) + continue; - auto *MBB = *SuccChain.begin(); - if (MBB->isEHPad()) - EHPadWorkList.push_back(MBB); - else - BlockWorkList.push_back(MBB); - } + // This is a cross-chain edge that is within the loop, so decrement the + // loop predecessor count of the destination chain. + if (SuccChain.UnscheduledPredecessors == 0 || + --SuccChain.UnscheduledPredecessors > 0) + continue; + + auto *NewBB = *SuccChain.begin(); + if (NewBB->isEHPad()) + EHPadWorkList.push_back(NewBB); + else + BlockWorkList.push_back(NewBB); } } @@ -902,7 +966,7 @@ void MachineBlockPlacement::fillWorkLists( void MachineBlockPlacement::buildChain( MachineBasicBlock *BB, BlockChain &Chain, - const BlockFilterSet *BlockFilter) { + BlockFilterSet *BlockFilter) { assert(BB && "BB must not be null.\n"); assert(BlockToChain[BB] == &Chain && "BlockToChainMap mis-match.\n"); MachineFunction::iterator PrevUnplacedBlockIt = F->begin(); @@ -937,6 +1001,17 @@ void MachineBlockPlacement::buildChain( "layout successor until the CFG reduces\n"); } + // Placement may have changed tail duplication opportunities. + // Check for that now. + if (TailDupPlacement && BestSucc) { + // If the chosen successor was duplicated into all its predecessors, + // don't bother laying it out, just go round the loop again with BB as + // the chain end. + if (repeatedlyTailDuplicateBlock(BestSucc, BB, LoopHeaderBB, Chain, + BlockFilter, PrevUnplacedBlockIt)) + continue; + } + // Place this block, updating the datastructures to reflect its placement. BlockChain &SuccChain = *BlockToChain[BestSucc]; // Zero out UnscheduledPredecessors for the successor we're about to merge in case @@ -1718,6 +1793,174 @@ void MachineBlockPlacement::alignBlocks() { } } +/// Tail duplicate \p BB into (some) predecessors if profitable, repeating if +/// it was duplicated into its chain predecessor and removed. +/// \p BB - Basic block that may be duplicated. +/// +/// \p LPred - Chosen layout predecessor of \p BB. +/// Updated to be the chain end if LPred is removed. +/// \p Chain - Chain to which \p LPred belongs, and \p BB will belong. +/// \p BlockFilter - Set of blocks that belong to the loop being laid out. +/// Used to identify which blocks to update predecessor +/// counts. +/// \p PrevUnplacedBlockIt - Iterator pointing to the last block that was +/// chosen in the given order due to unnatural CFG +/// only needed if \p BB is removed and +/// \p PrevUnplacedBlockIt pointed to \p BB. +/// @return true if \p BB was removed. +bool MachineBlockPlacement::repeatedlyTailDuplicateBlock( + MachineBasicBlock *BB, MachineBasicBlock *&LPred, + MachineBasicBlock *LoopHeaderBB, + BlockChain &Chain, BlockFilterSet *BlockFilter, + MachineFunction::iterator &PrevUnplacedBlockIt) { + bool Removed, DuplicatedToLPred; + bool DuplicatedToOriginalLPred; + Removed = maybeTailDuplicateBlock(BB, LPred, Chain, BlockFilter, + PrevUnplacedBlockIt, + DuplicatedToLPred); + if (!Removed) + return false; + DuplicatedToOriginalLPred = DuplicatedToLPred; + // Iteratively try to duplicate again. It can happen that a block that is + // duplicated into is still small enough to be duplicated again. + // No need to call markBlockSuccessors in this case, as the blocks being + // duplicated from here on are already scheduled. + // Note that DuplicatedToLPred always implies Removed. + while (DuplicatedToLPred) { + assert (Removed && "Block must have been removed to be duplicated into its " + "layout predecessor."); + MachineBasicBlock *DupBB, *DupPred; + // The removal callback causes Chain.end() to be updated when a block is + // removed. On the first pass through the loop, the chain end should be the + // same as it was on function entry. On subsequent passes, because we are + // duplicating the block at the end of the chain, if it is removed the + // chain will have shrunk by one block. + BlockChain::iterator ChainEnd = Chain.end(); + DupBB = *(--ChainEnd); + // Now try to duplicate again. + if (ChainEnd == Chain.begin()) + break; + DupPred = *std::prev(ChainEnd); + Removed = maybeTailDuplicateBlock(DupBB, DupPred, Chain, BlockFilter, + PrevUnplacedBlockIt, + DuplicatedToLPred); + } + // If BB was duplicated into LPred, it is now scheduled. But because it was + // removed, markChainSuccessors won't be called for its chain. Instead we + // call markBlockSuccessors for LPred to achieve the same effect. This must go + // at the end because repeating the tail duplication can increase the number + // of unscheduled predecessors. + LPred = *std::prev(Chain.end()); + if (DuplicatedToOriginalLPred) + markBlockSuccessors(Chain, LPred, LoopHeaderBB, BlockFilter); + return true; +} + +/// Tail duplicate \p BB into (some) predecessors if profitable. +/// \p BB - Basic block that may be duplicated +/// \p LPred - Chosen layout predecessor of \p BB +/// \p Chain - Chain to which \p LPred belongs, and \p BB will belong. +/// \p BlockFilter - Set of blocks that belong to the loop being laid out. +/// Used to identify which blocks to update predecessor +/// counts. +/// \p PrevUnplacedBlockIt - Iterator pointing to the last block that was +/// chosen in the given order due to unnatural CFG +/// only needed if \p BB is removed and +/// \p PrevUnplacedBlockIt pointed to \p BB. +/// \p DuplicatedToLPred - True if the block was duplicated into LPred. Will +/// only be true if the block was removed. +/// \return - True if the block was duplicated into all preds and removed. +bool MachineBlockPlacement::maybeTailDuplicateBlock( + MachineBasicBlock *BB, MachineBasicBlock *LPred, + const BlockChain &Chain, BlockFilterSet *BlockFilter, + MachineFunction::iterator &PrevUnplacedBlockIt, + bool &DuplicatedToLPred) { + + DuplicatedToLPred = false; + DEBUG(dbgs() << "Redoing tail duplication for Succ#" + << BB->getNumber() << "\n"); + bool IsSimple = TailDup.isSimpleBB(BB); + // Blocks with single successors don't create additional fallthrough + // opportunities. Don't duplicate them. TODO: When conditional exits are + // analyzable, allow them to be duplicated. + if (!IsSimple && BB->succ_size() == 1) + return false; + if (!TailDup.shouldTailDuplicate(IsSimple, *BB)) + return false; + // This has to be a callback because none of it can be done after + // BB is deleted. + bool Removed = false; + auto RemovalCallback = + [&](MachineBasicBlock *RemBB) { + // Signal to outer function + Removed = true; + + // Conservative default. + bool InWorkList = true; + // Remove from the Chain and Chain Map + if (BlockToChain.count(RemBB)) { + BlockChain *Chain = BlockToChain[RemBB]; + InWorkList = Chain->UnscheduledPredecessors == 0; + Chain->remove(RemBB); + BlockToChain.erase(RemBB); + } + + // Handle the unplaced block iterator + if (&(*PrevUnplacedBlockIt) == RemBB) { + PrevUnplacedBlockIt++; + } + + // Handle the Work Lists + if (InWorkList) { + SmallVectorImpl<MachineBasicBlock *> &RemoveList = BlockWorkList; + if (RemBB->isEHPad()) + RemoveList = EHPadWorkList; + RemoveList.erase( + remove_if(RemoveList, + [RemBB](MachineBasicBlock *BB) {return BB == RemBB;}), + RemoveList.end()); + } + + // Handle the filter set + if (BlockFilter) { + BlockFilter->erase(RemBB); + } + + // Remove the block from loop info. + MLI->removeBlock(RemBB); + + // TailDuplicator handles removing it from loops. + DEBUG(dbgs() << "TailDuplicator deleted block: " + << getBlockName(RemBB) << "\n"); + }; + auto RemovalCallbackRef = + llvm::function_ref<void(MachineBasicBlock*)>(RemovalCallback); + + SmallVector<MachineBasicBlock *, 8> DuplicatedPreds; + TailDup.tailDuplicateAndUpdate(IsSimple, BB, LPred, + &DuplicatedPreds, &RemovalCallbackRef); + + // Update UnscheduledPredecessors to reflect tail-duplication. + DuplicatedToLPred = false; + for (MachineBasicBlock *Pred : DuplicatedPreds) { + // We're only looking for unscheduled predecessors that match the filter. + BlockChain* PredChain = BlockToChain[Pred]; + if (Pred == LPred) + DuplicatedToLPred = true; + if (Pred == LPred || (BlockFilter && !BlockFilter->count(Pred)) + || PredChain == &Chain) + continue; + for (MachineBasicBlock *NewSucc : Pred->successors()) { + if (BlockFilter && !BlockFilter->count(NewSucc)) + continue; + BlockChain *NewChain = BlockToChain[NewSucc]; + if (NewChain != &Chain && NewChain != PredChain) + NewChain->UnscheduledPredecessors++; + } + } + return Removed; +} + bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(*MF.getFunction())) return false; @@ -1734,6 +1977,13 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) { TII = MF.getSubtarget().getInstrInfo(); TLI = MF.getSubtarget().getTargetLowering(); MDT = &getAnalysis<MachineDominatorTree>(); + if (TailDupPlacement) { + unsigned TailDupSize = TailDuplicatePlacementThreshold; + if (MF.getFunction()->optForSize()) + TailDupSize = 1; + TailDup.initMF(MF, MBPI, /* LayoutMode */ true, TailDupSize); + } + assert(BlockToChain.empty()); buildCFGChains(); @@ -1747,8 +1997,7 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) { BranchFoldPlacement; // No tail merging opportunities if the block number is less than four. if (MF.size() > 3 && EnableTailMerge) { - // Default to the standard tail-merge-size option. - unsigned TailMergeSize = 0; + unsigned TailMergeSize = TailDuplicatePlacementThreshold + 1; BranchFolder BF(/*EnableTailMerge=*/true, /*CommonHoist=*/false, *MBFI, *MBPI, TailMergeSize); @@ -1757,6 +2006,8 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) { /*AfterBlockPlacement=*/true)) { // Redo the layout if tail merging creates/removes/moves blocks. BlockToChain.clear(); + // Must redo the dominator tree if blocks were changed. + MDT->runOnMachineFunction(MF); ChainAllocator.DestroyAll(); buildCFGChains(); } |