summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMark Searles <m.c.searles@gmail.com>2018-04-19 15:42:30 +0000
committerMark Searles <m.c.searles@gmail.com>2018-04-19 15:42:30 +0000
commit1bc6e71f32ebadefc319c65b3f650e08e39d6aa7 (patch)
treebbb954896e398ff8029b87d948ef765e7c5a9eba
parent9b20c245ca36998cb2ac6e81543f87b781a9d06b (diff)
downloadbcm5719-llvm-1bc6e71f32ebadefc319c65b3f650e08e39d6aa7.tar.gz
bcm5719-llvm-1bc6e71f32ebadefc319c65b3f650e08e39d6aa7.zip
[AMDGPU] Do not only rely on BB number when finding bottom loop
We should also check that the "bottom" basic block of a loopis a successor of the "header" basic block, otherwise we don't propagate the information correctly when the CFG is complex. This fixes an important rendering problem with Wolfsentein 2, because of one vector-memory wait was missing. Differential Revision: https://reviews.llvm.org/D43831 llvm-svn: 330337
-rw-r--r--llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp65
-rw-r--r--llvm/test/CodeGen/AMDGPU/waitcnt-back-edge-loop.mir59
2 files changed, 104 insertions, 20 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 5eae1197ba2..bed4a705b0c 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -367,7 +367,7 @@ private:
DenseMap<MachineBasicBlock *, std::unique_ptr<BlockWaitcntBrackets>>
BlockWaitcntBracketsMap;
- DenseSet<MachineBasicBlock *> BlockWaitcntProcessedSet;
+ std::vector<MachineBasicBlock *> BlockWaitcntProcessedSet;
DenseMap<MachineLoop *, std::unique_ptr<LoopWaitcntData>> LoopWaitcntDataMap;
@@ -403,7 +403,8 @@ public:
void updateEventWaitCntAfter(MachineInstr &Inst,
BlockWaitcntBrackets *ScoreBrackets);
void mergeInputScoreBrackets(MachineBasicBlock &Block);
- MachineBasicBlock *loopBottom(const MachineLoop *Loop);
+ bool isLoopBottom(const MachineLoop *Loop, const MachineBasicBlock *Block);
+ unsigned countNumBottomBlocks(const MachineLoop *Loop);
void insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block);
void insertWaitcntBeforeCF(MachineBasicBlock &Block, MachineInstr *Inst);
bool isWaitcntStronger(unsigned LHS, unsigned RHS);
@@ -1568,15 +1569,29 @@ void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
}
}
-/// Return the "bottom" block of a loop. This differs from
-/// MachineLoop::getBottomBlock in that it works even if the loop is
-/// discontiguous.
-MachineBasicBlock *SIInsertWaitcnts::loopBottom(const MachineLoop *Loop) {
- MachineBasicBlock *Bottom = Loop->getHeader();
- for (MachineBasicBlock *MBB : Loop->blocks())
- if (MBB->getNumber() > Bottom->getNumber())
- Bottom = MBB;
- return Bottom;
+/// Return true if the given basic block is a "bottom" block of a loop. This
+/// differs from MachineLoop::getBottomBlock in that it works even if the loop
+/// is discontiguous. This also handles multiple back-edges for the same
+/// "header" block of a loop.
+bool SIInsertWaitcnts::isLoopBottom(const MachineLoop *Loop,
+ const MachineBasicBlock *Block) {
+ for (MachineBasicBlock *MBB : Loop->blocks()) {
+ if (MBB == Block && MBB->isSuccessor(Loop->getHeader())) {
+ return true;
+ }
+ }
+ return false;
+}
+
+/// Count the number of "bottom" basic blocks of a loop.
+unsigned SIInsertWaitcnts::countNumBottomBlocks(const MachineLoop *Loop) {
+ unsigned Count = 0;
+ for (MachineBasicBlock *MBB : Loop->blocks()) {
+ if (MBB->isSuccessor(Loop->getHeader())) {
+ Count++;
+ }
+ }
+ return Count;
}
// Generate s_waitcnt instructions where needed.
@@ -1685,7 +1700,7 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
// Check if we need to force convergence at loop footer.
MachineLoop *ContainingLoop = MLI->getLoopFor(&Block);
- if (ContainingLoop && loopBottom(ContainingLoop) == &Block) {
+ if (ContainingLoop && isLoopBottom(ContainingLoop, &Block)) {
LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
WaitcntData->print();
DEBUG(dbgs() << '\n';);
@@ -1773,6 +1788,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
TrackedWaitcntSet.clear();
BlockVisitedSet.clear();
VCCZBugHandledSet.clear();
+ LoopWaitcntDataMap.clear();
// Walk over the blocks in reverse post-dominator order, inserting
// s_waitcnt where needed.
@@ -1799,21 +1815,30 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
// If we are walking into the block from before the loop, then guarantee
// at least 1 re-walk over the loop to propagate the information, even if
// no S_WAITCNT instructions were generated.
- if (ContainingLoop && ContainingLoop->getHeader() == &MBB && J < I &&
- (!BlockWaitcntProcessedSet.count(&MBB))) {
- BlockWaitcntBracketsMap[&MBB]->setRevisitLoop(true);
- DEBUG(dbgs() << "set-revisit: Block"
- << ContainingLoop->getHeader()->getNumber() << '\n';);
+ if (ContainingLoop && ContainingLoop->getHeader() == &MBB) {
+ unsigned Count = countNumBottomBlocks(ContainingLoop);
+
+ // If the loop has multiple back-edges, and so more than one "bottom"
+ // basic block, we have to guarantee a re-walk over every blocks.
+ if ((std::count(BlockWaitcntProcessedSet.begin(),
+ BlockWaitcntProcessedSet.end(), &MBB) < Count)) {
+ BlockWaitcntBracketsMap[&MBB]->setRevisitLoop(true);
+ DEBUG(dbgs() << "set-revisit: Block"
+ << ContainingLoop->getHeader()->getNumber() << '\n';);
+ }
}
// Walk over the instructions.
insertWaitcntInBlock(MF, MBB);
// Flag that waitcnts have been processed at least once.
- BlockWaitcntProcessedSet.insert(&MBB);
+ BlockWaitcntProcessedSet.push_back(&MBB);
- // See if we want to revisit the loop.
- if (ContainingLoop && loopBottom(ContainingLoop) == &MBB) {
+ // See if we want to revisit the loop. If a loop has multiple back-edges,
+ // we shouldn't revisit the same "bottom" basic block.
+ if (ContainingLoop && isLoopBottom(ContainingLoop, &MBB) &&
+ std::count(BlockWaitcntProcessedSet.begin(),
+ BlockWaitcntProcessedSet.end(), &MBB) == 1) {
MachineBasicBlock *EntryBB = ContainingLoop->getHeader();
BlockWaitcntBrackets *EntrySB = BlockWaitcntBracketsMap[EntryBB].get();
if (EntrySB && EntrySB->getRevisitLoop()) {
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-back-edge-loop.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-back-edge-loop.mir
new file mode 100644
index 00000000000..2d9ec03132b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-back-edge-loop.mir
@@ -0,0 +1,59 @@
+# RUN: llc -o - %s -march=amdgcn -mcpu=fiji -run-pass=si-insert-waitcnts -verify-machineinstrs | FileCheck -check-prefix=GCN %s
+
+# GCN-LABEL: waitcnt-back-edge-loop
+# GCN: bb.2
+# GCN: S_WAITCNT 112
+# GCN: $vgpr5 = V_CVT_I32_F32_e32 killed $vgpr5, implicit $exec
+
+---
+name: waitcnt-back-edge-loop
+body: |
+ bb.0:
+ successors: %bb.1
+
+ $vgpr1 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr1_vgpr2
+ $vgpr2 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr1_vgpr2
+ $vgpr4 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load 4 from `float addrspace(1)* null`, addrspace 1)
+ $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load 4 from `float addrspace(1)* null`, addrspace 1)
+ $sgpr0_sgpr1 = V_CMP_EQ_U32_e64 3, killed $sgpr4, implicit $exec
+ $vgpr3 = V_CNDMASK_B32_e64 -1082130432, 1065353216, killed $sgpr0_sgpr1, implicit $exec
+ $vgpr5 = V_MOV_B32_e32 $vgpr0, implicit $exec, implicit $exec
+ S_BRANCH %bb.1
+
+ bb.3:
+ successors: %bb.1
+
+ $vgpr5 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load 4 from `float addrspace(1)* null`, addrspace 1)
+
+ bb.1:
+ successors: %bb.5, %bb.2
+
+ $vgpr5 = V_CVT_I32_F32_e32 killed $vgpr5, implicit $exec
+ V_CMP_NE_U32_e32 0, $vgpr5, implicit-def $vcc, implicit $exec
+ $vcc = S_AND_B64 $exec, killed $vcc, implicit-def dead $scc
+ S_CBRANCH_VCCZ %bb.5, implicit killed $vcc
+
+ bb.2:
+ successors: %bb.4, %bb.3
+
+ V_CMP_EQ_U32_e32 9, killed $vgpr5, implicit-def $vcc, implicit $exec
+ $vcc = S_AND_B64 $exec, killed $vcc, implicit-def dead $scc
+ S_CBRANCH_VCCZ %bb.3, implicit killed $vcc
+
+ bb.4:
+ successors: %bb.3, %bb.1
+
+ $vgpr5 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load 4 from `float addrspace(1)* null`, addrspace 1)
+ $vgpr4 = V_CVT_I32_F32_e32 $vgpr5, implicit $exec
+ V_CMP_EQ_U32_e32 2, killed $vgpr4, implicit-def $vcc, implicit $exec
+ $vcc = S_AND_B64 $exec, killed $vcc, implicit-def dead $scc
+ $vgpr4 = V_MOV_B32_e32 $vgpr5, implicit $exec, implicit $exec
+ S_CBRANCH_VCCZ %bb.1, implicit killed $vcc
+ S_BRANCH %bb.3
+
+ bb.5:
+
+ $vgpr4 = V_MAC_F32_e32 killed $vgpr0, killed $vgpr3, killed $vgpr4, implicit $exec
+ EXP_DONE 12, killed $vgpr4, undef $vgpr0, undef $vgpr0, undef $vgpr0, 0, 0, 15, implicit $exec
+ S_ENDPGM
+...
OpenPOWER on IntegriCloud