diff options
| author | Wenlei He <aktoon@gmail.com> | 2019-08-11 06:05:35 +0000 |
|---|---|---|
| committer | Wenlei He <aktoon@gmail.com> | 2019-08-11 06:05:35 +0000 |
| commit | 7e71aa24bc0788690fea7f0d7eab400c6a784deb (patch) | |
| tree | 4ea81a9f29b5cb2b8f2d69c254bd91e76cd4edbd /llvm/lib/Transforms | |
| parent | d664072dd5ec60a7ef944ad1de90b5b8761f003e (diff) | |
| download | bcm5719-llvm-7e71aa24bc0788690fea7f0d7eab400c6a784deb.tar.gz bcm5719-llvm-7e71aa24bc0788690fea7f0d7eab400c6a784deb.zip | |
[LICM] Make Loop ICM profile aware
Summary:
Hoisting/sinking instruction out of a loop isn't always beneficial. Hoisting an instruction from a cold block inside a loop body out of the loop could hurt performance. This change makes Loop ICM profile aware - it now checks block frequency to make sure hoisting/sinking anly moves instruction to colder block.
Test Plan:
ninja check
Reviewers: asbirlea, sanjoy, reames, nikic, hfinkel, vsk
Reviewed By: asbirlea
Subscribers: fhahn, vsk, davidxl, xbolva00, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D65060
llvm-svn: 368526
Diffstat (limited to 'llvm/lib/Transforms')
| -rw-r--r-- | llvm/lib/Transforms/Scalar/LICM.cpp | 90 |
1 files changed, 73 insertions, 17 deletions
diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp index 572b69fb890..546c563069b 100644 --- a/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/llvm/lib/Transforms/Scalar/LICM.cpp @@ -95,6 +95,11 @@ static cl::opt<bool> ControlFlowHoisting( "licm-control-flow-hoisting", cl::Hidden, cl::init(false), cl::desc("Enable control flow (and PHI) hoisting in LICM")); +static cl::opt<unsigned> HoistSinkColdnessThreshold( + "licm-coldness-threshold", cl::Hidden, cl::init(4), + cl::desc("Relative coldness Threshold of hoisting/sinking destination " + "block for LICM to be considered beneficial")); + static cl::opt<uint32_t> MaxNumUsesTraversed( "licm-max-num-uses-traversed", cl::Hidden, cl::init(8), cl::desc("Max num uses visited for identifying load " @@ -139,8 +144,9 @@ static void hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop, BasicBlock *Dest, ICFLoopSafetyInfo *SafetyInfo, MemorySSAUpdater *MSSAU, OptimizationRemarkEmitter *ORE); static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT, - const Loop *CurLoop, ICFLoopSafetyInfo *SafetyInfo, - MemorySSAUpdater *MSSAU, OptimizationRemarkEmitter *ORE); + BlockFrequencyInfo *BFI, const Loop *CurLoop, + ICFLoopSafetyInfo *SafetyInfo, MemorySSAUpdater *MSSAU, + OptimizationRemarkEmitter *ORE); static bool isSafeToExecuteUnconditionally(Instruction &Inst, const DominatorTree *DT, const Loop *CurLoop, @@ -168,8 +174,8 @@ namespace { struct LoopInvariantCodeMotion { using ASTrackerMapTy = DenseMap<Loop *, std::unique_ptr<AliasSetTracker>>; bool runOnLoop(Loop *L, AliasAnalysis *AA, LoopInfo *LI, DominatorTree *DT, - TargetLibraryInfo *TLI, TargetTransformInfo *TTI, - ScalarEvolution *SE, MemorySSA *MSSA, + BlockFrequencyInfo *BFI, TargetLibraryInfo *TLI, + TargetTransformInfo *TTI, ScalarEvolution *SE, MemorySSA *MSSA, OptimizationRemarkEmitter *ORE, bool DeleteAST); ASTrackerMapTy &getLoopToAliasSetMap() { return LoopToAliasSetMap; } @@ -220,6 +226,7 @@ struct LegacyLICMPass : public LoopPass { &getAnalysis<AAResultsWrapperPass>().getAAResults(), &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(), &getAnalysis<DominatorTreeWrapperPass>().getDomTree(), + &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(), &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(), &getAnalysis<TargetTransformInfoWrapperPass>().getTTI( *L->getHeader()->getParent()), @@ -230,6 +237,7 @@ struct LegacyLICMPass : public LoopPass { /// loop preheaders be inserted into the CFG... /// void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<BlockFrequencyInfoWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); AU.addPreserved<LoopInfoWrapperPass>(); AU.addRequired<TargetLibraryInfoWrapperPass>(); @@ -286,7 +294,8 @@ PreservedAnalyses LICMPass::run(Loop &L, LoopAnalysisManager &AM, "cached at a higher level"); LoopInvariantCodeMotion LICM(LicmMssaOptCap, LicmMssaNoAccForPromotionCap); - if (!LICM.runOnLoop(&L, &AR.AA, &AR.LI, &AR.DT, &AR.TLI, &AR.TTI, &AR.SE, + auto BFI = FAM.getCachedResult<BlockFrequencyAnalysis>(*F); + if (!LICM.runOnLoop(&L, &AR.AA, &AR.LI, &AR.DT, BFI, &AR.TLI, &AR.TTI, &AR.SE, AR.MSSA, ORE, true)) return PreservedAnalyses::all(); @@ -324,8 +333,9 @@ Pass *llvm::createLICMPass(unsigned LicmMssaOptCap, /// bool LoopInvariantCodeMotion::runOnLoop( Loop *L, AliasAnalysis *AA, LoopInfo *LI, DominatorTree *DT, - TargetLibraryInfo *TLI, TargetTransformInfo *TTI, ScalarEvolution *SE, - MemorySSA *MSSA, OptimizationRemarkEmitter *ORE, bool DeleteAST) { + BlockFrequencyInfo *BFI, TargetLibraryInfo *TLI, TargetTransformInfo *TTI, + ScalarEvolution *SE, MemorySSA *MSSA, OptimizationRemarkEmitter *ORE, + bool DeleteAST) { bool Changed = false; assert(L->isLCSSAForm(*DT) && "Loop is not in LCSSA form."); @@ -385,11 +395,11 @@ bool LoopInvariantCodeMotion::runOnLoop( LicmMssaOptCap, LicmMssaNoAccForPromotionCap, /*IsSink=*/true}; if (L->hasDedicatedExits()) - Changed |= sinkRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, TTI, L, + Changed |= sinkRegion(DT->getNode(L->getHeader()), AA, LI, DT, BFI, TLI, TTI, L, CurAST.get(), MSSAU.get(), &SafetyInfo, Flags, ORE); Flags.IsSink = false; if (Preheader) - Changed |= hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, L, + Changed |= hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, BFI, TLI, L, CurAST.get(), MSSAU.get(), &SafetyInfo, Flags, ORE); // Now that all loop invariants have been removed from the loop, promote any @@ -491,9 +501,10 @@ bool LoopInvariantCodeMotion::runOnLoop( /// definitions, allowing us to sink a loop body in one pass without iteration. /// bool llvm::sinkRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, - DominatorTree *DT, TargetLibraryInfo *TLI, - TargetTransformInfo *TTI, Loop *CurLoop, - AliasSetTracker *CurAST, MemorySSAUpdater *MSSAU, + DominatorTree *DT, BlockFrequencyInfo *BFI, + TargetLibraryInfo *TLI, TargetTransformInfo *TTI, + Loop *CurLoop, AliasSetTracker *CurAST, + MemorySSAUpdater *MSSAU, ICFLoopSafetyInfo *SafetyInfo, SinkAndHoistLICMFlags &Flags, OptimizationRemarkEmitter *ORE) { @@ -542,7 +553,7 @@ bool llvm::sinkRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, canSinkOrHoistInst(I, AA, DT, CurLoop, CurAST, MSSAU, true, &Flags, ORE) && !I.mayHaveSideEffects()) { - if (sink(I, LI, DT, CurLoop, SafetyInfo, MSSAU, ORE)) { + if (sink(I, LI, DT, BFI, CurLoop, SafetyInfo, MSSAU, ORE)) { if (!FreeInLoop) { ++II; eraseInstruction(I, *SafetyInfo, CurAST, MSSAU); @@ -786,13 +797,43 @@ public: }; } // namespace +// Hoisting/sinking instruction out of a loop isn't always beneficial. It's only +// only worthwhile if the destination block is actually colder than current +// block. +static bool worthSinkOrHoistInst(Instruction &I, BasicBlock *DstBlock, + OptimizationRemarkEmitter *ORE, + BlockFrequencyInfo *BFI) { + // Check block frequency only when runtime profile is available. + // to avoid pathological cases. With static profile, lean towards + // hosting because it helps canonicalize the loop for vectorizer. + if (!DstBlock->getParent()->hasProfileData()) + return true; + + if (!HoistSinkColdnessThreshold || !BFI) + return true; + + BasicBlock *SrcBlock = I.getParent(); + if (BFI->getBlockFreq(DstBlock).getFrequency() / HoistSinkColdnessThreshold > + BFI->getBlockFreq(SrcBlock).getFrequency()) { + ORE->emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "SinkHoistInst", &I) + << "failed to sink or hoist instruction because containing block " + "has lower frequency than destination block"; + }); + return false; + } + + return true; +} + /// Walk the specified region of the CFG (defined by all blocks dominated by /// the specified block, and that are in the current loop) in depth first /// order w.r.t the DominatorTree. This allows us to visit definitions before /// uses, allowing us to hoist a loop body in one pass without iteration. /// bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, - DominatorTree *DT, TargetLibraryInfo *TLI, Loop *CurLoop, + DominatorTree *DT, BlockFrequencyInfo *BFI, + TargetLibraryInfo *TLI, Loop *CurLoop, AliasSetTracker *CurAST, MemorySSAUpdater *MSSAU, ICFLoopSafetyInfo *SafetyInfo, SinkAndHoistLICMFlags &Flags, @@ -843,13 +884,15 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, // Try hoisting the instruction out to the preheader. We can only do // this if all of the operands of the instruction are loop invariant and - // if it is safe to hoist the instruction. + // if it is safe to hoist the instruction. We also check block frequency + // to make sure instruction only gets hoisted into colder blocks. // TODO: It may be safe to hoist if we are hoisting to a conditional block // and we have accurately duplicated the control flow from the loop header // to that block. if (CurLoop->hasLoopInvariantOperands(&I) && canSinkOrHoistInst(I, AA, DT, CurLoop, CurAST, MSSAU, true, &Flags, ORE) && + worthSinkOrHoistInst(I, CurLoop->getLoopPreheader(), ORE, BFI) && isSafeToExecuteUnconditionally( I, DT, CurLoop, SafetyInfo, ORE, CurLoop->getLoopPreheader()->getTerminator())) { @@ -1550,8 +1593,9 @@ static void splitPredecessorsOfLoopExit(PHINode *PN, DominatorTree *DT, /// position, and may either delete it or move it to outside of the loop. /// static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT, - const Loop *CurLoop, ICFLoopSafetyInfo *SafetyInfo, - MemorySSAUpdater *MSSAU, OptimizationRemarkEmitter *ORE) { + BlockFrequencyInfo *BFI, const Loop *CurLoop, + ICFLoopSafetyInfo *SafetyInfo, MemorySSAUpdater *MSSAU, + OptimizationRemarkEmitter *ORE) { LLVM_DEBUG(dbgs() << "LICM sinking instruction: " << I << "\n"); ORE->emit([&]() { return OptimizationRemark(DEBUG_TYPE, "InstSunk", &I) @@ -1627,7 +1671,10 @@ static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT, // If this instruction is only used outside of the loop, then all users are // PHI nodes in exit blocks due to LCSSA form. Just RAUW them with clones of // the instruction. + // First check if I is worth sinking for all uses. Sink only when it is worth + // across all uses. SmallSetVector<User*, 8> Users(I.user_begin(), I.user_end()); + SmallVector<PHINode*, 8> ExitPNs; for (auto *UI : Users) { auto *User = cast<Instruction>(UI); @@ -1637,6 +1684,15 @@ static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT, PHINode *PN = cast<PHINode>(User); assert(ExitBlockSet.count(PN->getParent()) && "The LCSSA PHI is not in an exit block!"); + + if (!worthSinkOrHoistInst(I, PN->getParent(), ORE, BFI)) { + return Changed; + } + + ExitPNs.push_back(PN); + } + + for (auto *PN: ExitPNs) { // The PHI must be trivially replaceable. Instruction *New = sinkThroughTriviallyReplaceablePHI( PN, &I, LI, SunkCopies, SafetyInfo, CurLoop, MSSAU); |

