diff options
author | Mark Searles <m.c.searles@gmail.com> | 2018-05-30 15:47:45 +0000 |
---|---|---|
committer | Mark Searles <m.c.searles@gmail.com> | 2018-05-30 15:47:45 +0000 |
commit | 1054541490dbb538fe67408598e93b5494ba7b25 (patch) | |
tree | 126e538fc9e508e32da900be77280448c039b2b9 /llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | |
parent | 70d8d5107378d84d0d2a2130750024f47bc0d400 (diff) | |
download | bcm5719-llvm-1054541490dbb538fe67408598e93b5494ba7b25.tar.gz bcm5719-llvm-1054541490dbb538fe67408598e93b5494ba7b25.zip |
[AMDGPU][Waitcnt] Fix handling of loops with many bottom blocks
When inserting waitcnts, the waitcnt pass forces convergence for a loop if
necessary. Previously, that kicked in after more than 2 passes over a loop, which
doesn't account for loops with many bottom blocks. So, increase the threshold to
(n+1), where n is the number of bottom blocks. This gives the pass an
opportunity to consider the contribution of each bottom block, to the overall
loop, before the forced convergence potentially kicks in.
Differential Revision: https://reviews.llvm.org/D47488
llvm-svn: 333556
Diffstat (limited to 'llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp')
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 42 |
1 files changed, 23 insertions, 19 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 0e1c2bc3172..37fecd6ab96 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -345,7 +345,7 @@ public: void incIterCnt() { IterCnt++; } void resetIterCnt() { IterCnt = 0; } - int32_t getIterCnt() { return IterCnt; } + unsigned getIterCnt() { return IterCnt; } void setWaitcnt(MachineInstr *WaitcntIn) { LfWaitcnt = WaitcntIn; } MachineInstr *getWaitcnt() const { return LfWaitcnt; } @@ -1205,7 +1205,7 @@ void SIInsertWaitcnts::generateWaitcntInstBefore( } ScoreBracket->setRevisitLoop(true); LLVM_DEBUG(dbgs() - << "set-revisit: Block" + << "set-revisit2: Block" << ContainingLoop->getHeader()->getNumber() << '\n';); } } @@ -1639,10 +1639,9 @@ void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) { } } -/// Return true if the given basic block is a "bottom" block of a loop. This -/// differs from MachineLoop::getBottomBlock in that it works even if the loop -/// is discontiguous. This also handles multiple back-edges for the same -/// "header" block of a loop. +/// Return true if the given basic block is a "bottom" block of a loop. +/// This works even if the loop is discontiguous. This also handles +/// multiple back-edges for the same "header" block of a loop. bool SIInsertWaitcnts::isLoopBottom(const MachineLoop *Loop, const MachineBasicBlock *Block) { for (MachineBasicBlock *MBB : Loop->blocks()) { @@ -1776,11 +1775,12 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, LLVM_DEBUG(dbgs() << '\n';); // The iterative waitcnt insertion algorithm aims for optimal waitcnt - // placement and doesn't always guarantee convergence for a loop. Each - // loop should take at most 2 iterations for it to converge naturally. - // When this max is reached and result doesn't converge, we force - // convergence by inserting a s_waitcnt at the end of loop footer. 
- if (WaitcntData->getIterCnt() > 2) { + // placement, but doesn't guarantee convergence for a loop. Each + // loop should take at most (n+1) iterations for it to converge naturally, + // where n is the number of bottom blocks. If this threshold is reached and + // the result hasn't converged, then we force convergence by inserting + // a s_waitcnt at the end of loop footer. + if (WaitcntData->getIterCnt() > (countNumBottomBlocks(ContainingLoop) + 1)) { // To ensure convergence, need to make wait events at loop footer be no // more than those from the previous iteration. // As a simplification, instead of tracking individual scores and @@ -1792,16 +1792,16 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) { ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T)); HasPending = true; + break; } } if (HasPending) { if (!SWaitInst) { - SWaitInst = Block.getParent()->CreateMachineInstr( - TII->get(AMDGPU::S_WAITCNT), DebugLoc()); + SWaitInst = BuildMI(Block, Block.getFirstNonPHI(), + DebugLoc(), TII->get(AMDGPU::S_WAITCNT)) + .addImm(0); TrackedWaitcntSet.insert(SWaitInst); - const MachineOperand &Op = MachineOperand::CreateImm(0); - SWaitInst->addOperand(MF, Op); #if 0 // TODO: Format the debug output OutputTransformBanner("insertWaitcntInBlock",0,"Create:",context); OutputTransformAdd(SWaitInst, context); @@ -1898,7 +1898,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { if ((std::count(BlockWaitcntProcessedSet.begin(), BlockWaitcntProcessedSet.end(), &MBB) < Count)) { BlockWaitcntBracketsMap[&MBB]->setRevisitLoop(true); - LLVM_DEBUG(dbgs() << "set-revisit: Block" + LLVM_DEBUG(dbgs() << "set-revisit1: Block" << ContainingLoop->getHeader()->getNumber() << '\n';); } } @@ -1906,7 +1906,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { // Walk over the instructions. 
insertWaitcntInBlock(MF, MBB); - // Flag that waitcnts have been processed at least once. + // Record that waitcnts have been processed at least once for this block. BlockWaitcntProcessedSet.push_back(&MBB); // See if we want to revisit the loop. If a loop has multiple back-edges, @@ -2004,8 +2004,12 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { // TODO: Could insert earlier and schedule more liberally with operations // that only use caller preserved registers. MachineBasicBlock &EntryBB = MF.front(); - BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WAITCNT)) - .addImm(0); + auto SWaitInst = BuildMI(EntryBB, EntryBB.getFirstNonPHI(), + DebugLoc(), TII->get(AMDGPU::S_WAITCNT)) + .addImm(0); + + LLVM_DEBUG(dbgs() << "insertWaitcntInBlock\n" + << "New Instr: " << *SWaitInst << '\n'); Modified = true; } |