[AMDGPU][Waitcnt] Fix handling of loops with many bottom blocks

When necessary, the waitcnt insertion pass forces convergence for a loop. Previously, forced convergence kicked in after more than 2 passes over a loop, which doesn't account for loops with many bottom blocks. So, increase the threshold to (n+1), where n is the number of bottom blocks. This gives the pass an opportunity to consider the contribution of each bottom block to the overall loop before the forced convergence potentially kicks in. Differential Revision: https://reviews.llvm.org/D47488 llvm-svn: 333556
author: Mark Searles <m.c.searles@gmail.com> 2018-05-30 15:47:45 +0000
committer: Mark Searles <m.c.searles@gmail.com> 2018-05-30 15:47:45 +0000
commit: 1054541490dbb538fe67408598e93b5494ba7b25 (patch)
tree: 126e538fc9e508e32da900be77280448c039b2b9 /llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
parent: 70d8d5107378d84d0d2a2130750024f47bc0d400 (diff)
download: bcm5719-llvm-1054541490dbb538fe67408598e93b5494ba7b25.tar.gz
bcm5719-llvm-1054541490dbb538fe67408598e93b5494ba7b25.zip
1 files changed, 23 insertions, 19 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 0e1c2bc3172..37fecd6ab96 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -345,7 +345,7 @@ public:
 
   void incIterCnt() { IterCnt++; }
   void resetIterCnt() { IterCnt = 0; }
-  int32_t getIterCnt() { return IterCnt; }
+  unsigned getIterCnt() { return IterCnt; }
 
   void setWaitcnt(MachineInstr *WaitcntIn) { LfWaitcnt = WaitcntIn; }
   MachineInstr *getWaitcnt() const { return LfWaitcnt; }
@@ -1205,7 +1205,7 @@ void SIInsertWaitcnts::generateWaitcntInstBefore(
           }
           ScoreBracket->setRevisitLoop(true);
           LLVM_DEBUG(dbgs()
-                         << "set-revisit: Block"
+                         << "set-revisit2: Block"
                          << ContainingLoop->getHeader()->getNumber() << '\n';);
         }
       }
@@ -1639,10 +1639,9 @@ void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
   }
 }
 
-/// Return true if the given basic block is a "bottom" block of a loop. This
-/// differs from MachineLoop::getBottomBlock in that it works even if the loop
-/// is discontiguous. This also handles multiple back-edges for the same
-/// "header" block of a loop.
+/// Return true if the given basic block is a "bottom" block of a loop.
+/// This works even if the loop is discontiguous. This also handles
+/// multiple back-edges for the same "header" block of a loop.
 bool SIInsertWaitcnts::isLoopBottom(const MachineLoop *Loop,
                                     const MachineBasicBlock *Block) {
   for (MachineBasicBlock *MBB : Loop->blocks()) {
@@ -1776,11 +1775,12 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
     LLVM_DEBUG(dbgs() << '\n';);
 
     // The iterative waitcnt insertion algorithm aims for optimal waitcnt
-    // placement and doesn't always guarantee convergence for a loop. Each
-    // loop should take at most 2 iterations for it to converge naturally.
-    // When this max is reached and result doesn't converge, we force
-    // convergence by inserting a s_waitcnt at the end of loop footer.
-    if (WaitcntData->getIterCnt() > 2) {
+    // placement, but doesn't guarantee convergence for a loop. Each
+    // loop should take at most (n+1) iterations for it to converge naturally,
+    // where n is the number of bottom blocks. If this threshold is reached and
+    // the result hasn't converged, then we force convergence by inserting
+    // a s_waitcnt at the end of loop footer.
+    if (WaitcntData->getIterCnt() > (countNumBottomBlocks(ContainingLoop) + 1)) {
       // To ensure convergence, need to make wait events at loop footer be no
       // more than those from the previous iteration.
       // As a simplification, instead of tracking individual scores and
@@ -1792,16 +1792,16 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
         if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) {
           ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
           HasPending = true;
+          break;
         }
       }
 
       if (HasPending) {
         if (!SWaitInst) {
-          SWaitInst = Block.getParent()->CreateMachineInstr(
-              TII->get(AMDGPU::S_WAITCNT), DebugLoc());
+          SWaitInst = BuildMI(Block, Block.getFirstNonPHI(),
+                              DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
+                              .addImm(0);
           TrackedWaitcntSet.insert(SWaitInst);
-          const MachineOperand &Op = MachineOperand::CreateImm(0);
-          SWaitInst->addOperand(MF, Op);
 #if 0 // TODO: Format the debug output
           OutputTransformBanner("insertWaitcntInBlock",0,"Create:",context);
           OutputTransformAdd(SWaitInst, context);
@@ -1898,7 +1898,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
       if ((std::count(BlockWaitcntProcessedSet.begin(),
                       BlockWaitcntProcessedSet.end(), &MBB) < Count)) {
         BlockWaitcntBracketsMap[&MBB]->setRevisitLoop(true);
-        LLVM_DEBUG(dbgs() << "set-revisit: Block"
+        LLVM_DEBUG(dbgs() << "set-revisit1: Block"
                           << ContainingLoop->getHeader()->getNumber() << '\n';);
       }
     }
@@ -1906,7 +1906,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
     // Walk over the instructions.
     insertWaitcntInBlock(MF, MBB);
 
-    // Flag that waitcnts have been processed at least once.
+    // Record that waitcnts have been processed at least once for this block.
     BlockWaitcntProcessedSet.push_back(&MBB);
 
     // See if we want to revisit the loop. If a loop has multiple back-edges,
@@ -2004,8 +2004,12 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
     // TODO: Could insert earlier and schedule more liberally with operations
     // that only use caller preserved registers.
     MachineBasicBlock &EntryBB = MF.front();
-    BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
-      .addImm(0);
+    auto SWaitInst = BuildMI(EntryBB, EntryBB.getFirstNonPHI(),
+                             DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
+                             .addImm(0);
+
+    LLVM_DEBUG(dbgs() << "insertWaitcntInBlock\n"
+               << "New Instr: " << *SWaitInst << '\n');
 
     Modified = true;
   }
author	Mark Searles <m.c.searles@gmail.com>	2018-05-30 15:47:45 +0000
committer	Mark Searles <m.c.searles@gmail.com>	2018-05-30 15:47:45 +0000
commit	1054541490dbb538fe67408598e93b5494ba7b25 (patch)
tree	126e538fc9e508e32da900be77280448c039b2b9 /llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
parent	70d8d5107378d84d0d2a2130750024f47bc0d400 (diff)
download	bcm5719-llvm-1054541490dbb538fe67408598e93b5494ba7b25.tar.gz bcm5719-llvm-1054541490dbb538fe67408598e93b5494ba7b25.zip