summaryrefslogtreecommitdiffstats
path: root/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
diff options
context:
space:
mode:
authorMark Searles <m.c.searles@gmail.com>2018-05-30 15:47:45 +0000
committerMark Searles <m.c.searles@gmail.com>2018-05-30 15:47:45 +0000
commit1054541490dbb538fe67408598e93b5494ba7b25 (patch)
tree126e538fc9e508e32da900be77280448c039b2b9 /llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
parent70d8d5107378d84d0d2a2130750024f47bc0d400 (diff)
downloadbcm5719-llvm-1054541490dbb538fe67408598e93b5494ba7b25.tar.gz
bcm5719-llvm-1054541490dbb538fe67408598e93b5494ba7b25.zip
[AMDGPU][Waitcnt] Fix handling of loops with many bottom blocks
When inserting waitcnts, the waitcnt pass forces convergence for a loop if necessary. Previously, the forced convergence kicked in after more than 2 passes over a loop, which doesn't account for loops with many bottom blocks. So, increase the threshold to (n+1), where n is the number of bottom blocks. This gives the pass an opportunity to consider the contribution of each bottom block to the overall loop before the forced convergence potentially kicks in. Differential Revision: https://reviews.llvm.org/D47488 llvm-svn: 333556
Diffstat (limited to 'llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp')
-rw-r--r--llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp42
1 files changed, 23 insertions, 19 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 0e1c2bc3172..37fecd6ab96 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -345,7 +345,7 @@ public:
void incIterCnt() { IterCnt++; }
void resetIterCnt() { IterCnt = 0; }
- int32_t getIterCnt() { return IterCnt; }
+ unsigned getIterCnt() { return IterCnt; }
void setWaitcnt(MachineInstr *WaitcntIn) { LfWaitcnt = WaitcntIn; }
MachineInstr *getWaitcnt() const { return LfWaitcnt; }
@@ -1205,7 +1205,7 @@ void SIInsertWaitcnts::generateWaitcntInstBefore(
}
ScoreBracket->setRevisitLoop(true);
LLVM_DEBUG(dbgs()
- << "set-revisit: Block"
+ << "set-revisit2: Block"
<< ContainingLoop->getHeader()->getNumber() << '\n';);
}
}
@@ -1639,10 +1639,9 @@ void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
}
}
-/// Return true if the given basic block is a "bottom" block of a loop. This
-/// differs from MachineLoop::getBottomBlock in that it works even if the loop
-/// is discontiguous. This also handles multiple back-edges for the same
-/// "header" block of a loop.
+/// Return true if the given basic block is a "bottom" block of a loop.
+/// This works even if the loop is discontiguous. This also handles
+/// multiple back-edges for the same "header" block of a loop.
bool SIInsertWaitcnts::isLoopBottom(const MachineLoop *Loop,
const MachineBasicBlock *Block) {
for (MachineBasicBlock *MBB : Loop->blocks()) {
@@ -1776,11 +1775,12 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
LLVM_DEBUG(dbgs() << '\n';);
// The iterative waitcnt insertion algorithm aims for optimal waitcnt
- // placement and doesn't always guarantee convergence for a loop. Each
- // loop should take at most 2 iterations for it to converge naturally.
- // When this max is reached and result doesn't converge, we force
- // convergence by inserting a s_waitcnt at the end of loop footer.
- if (WaitcntData->getIterCnt() > 2) {
+ // placement, but doesn't guarantee convergence for a loop. Each
+ // loop should take at most (n+1) iterations for it to converge naturally,
+ // where n is the number of bottom blocks. If this threshold is reached and
+ // the result hasn't converged, then we force convergence by inserting
+ // a s_waitcnt at the end of loop footer.
+ if (WaitcntData->getIterCnt() > (countNumBottomBlocks(ContainingLoop) + 1)) {
// To ensure convergence, need to make wait events at loop footer be no
// more than those from the previous iteration.
// As a simplification, instead of tracking individual scores and
@@ -1792,16 +1792,16 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) {
ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
HasPending = true;
+ break;
}
}
if (HasPending) {
if (!SWaitInst) {
- SWaitInst = Block.getParent()->CreateMachineInstr(
- TII->get(AMDGPU::S_WAITCNT), DebugLoc());
+ SWaitInst = BuildMI(Block, Block.getFirstNonPHI(),
+ DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
+ .addImm(0);
TrackedWaitcntSet.insert(SWaitInst);
- const MachineOperand &Op = MachineOperand::CreateImm(0);
- SWaitInst->addOperand(MF, Op);
#if 0 // TODO: Format the debug output
OutputTransformBanner("insertWaitcntInBlock",0,"Create:",context);
OutputTransformAdd(SWaitInst, context);
@@ -1898,7 +1898,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
if ((std::count(BlockWaitcntProcessedSet.begin(),
BlockWaitcntProcessedSet.end(), &MBB) < Count)) {
BlockWaitcntBracketsMap[&MBB]->setRevisitLoop(true);
- LLVM_DEBUG(dbgs() << "set-revisit: Block"
+ LLVM_DEBUG(dbgs() << "set-revisit1: Block"
<< ContainingLoop->getHeader()->getNumber() << '\n';);
}
}
@@ -1906,7 +1906,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
// Walk over the instructions.
insertWaitcntInBlock(MF, MBB);
- // Flag that waitcnts have been processed at least once.
+ // Record that waitcnts have been processed at least once for this block.
BlockWaitcntProcessedSet.push_back(&MBB);
// See if we want to revisit the loop. If a loop has multiple back-edges,
@@ -2004,8 +2004,12 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
// TODO: Could insert earlier and schedule more liberally with operations
// that only use caller preserved registers.
MachineBasicBlock &EntryBB = MF.front();
- BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
- .addImm(0);
+ auto SWaitInst = BuildMI(EntryBB, EntryBB.getFirstNonPHI(),
+ DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
+ .addImm(0);
+
+ LLVM_DEBUG(dbgs() << "insertWaitcntInBlock\n"
+ << "New Instr: " << *SWaitInst << '\n');
Modified = true;
}
OpenPOWER on IntegriCloud