[AMDGPU][Waitcnt] Fix handling of flat instrs

On GFX9 and earlier, flat memory ops may decrement VMCNT out-of-order as well as LGKMCNT out-of-order.

Differential Revision: https://reviews.llvm.org/D46616
llvm-svn: 333926
author: Mark Searles <m.c.searles@gmail.com> 2018-06-04 16:51:59 +0000
committer: Mark Searles <m.c.searles@gmail.com> 2018-06-04 16:51:59 +0000
commit: f0b93f1e9e649dde693c40ce4fc929fba068abae (patch)
tree: 43f6f7511a6824b1eed456d8eafa0160d48e4234 /llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
parent: 7c000d426795332ba1c11bb8798c891fb6687279 (diff)
download: bcm5719-llvm-f0b93f1e9e649dde693c40ce4fc929fba068abae.tar.gz
bcm5719-llvm-f0b93f1e9e649dde693c40ce4fc929fba068abae.zip
1 files changed, 10 insertions, 6 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index e0e07a937e0..fb24d528c33 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -136,7 +136,7 @@ enum RegisterMapping {
 // "s_waitcnt 0" before use.
 class BlockWaitcntBrackets {
 public:
-  BlockWaitcntBrackets() {
+  BlockWaitcntBrackets(const SISubtarget *SubTarget) : ST(SubTarget) {
     for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
          T = (enum InstCounterType)(T + 1)) {
       memset(VgprScores[T], 0, sizeof(VgprScores[T]));
@@ -314,6 +314,7 @@ public:
   void dump() { print(dbgs()); }
 
 private:
+  const SISubtarget *ST = nullptr;
   bool WaitAtBeginning = false;
   bool RevisitLoop = false;
   bool MixedExpTypes = false;
@@ -735,9 +736,12 @@ unsigned int BlockWaitcntBrackets::updateByWait(InstCounterType T,
   const int32_t LB = getScoreLB(T);
   const int32_t UB = getScoreUB(T);
   if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
-    if (T == VM_CNT && hasPendingFlat()) {
-      // If there is a pending FLAT operation, and this is a VM waitcnt,
-      // then we need to force a waitcnt 0 for VM.
+    if ((T == VM_CNT || T == LGKM_CNT) &&
+        hasPendingFlat() &&
+        !ST->hasFlatLgkmVMemCountInOrder()) {
+      // If there is a pending FLAT operation, and this is a VMem or LGKM
+      // waitcnt and the target can report early completion, then we need
+      // to force a waitcnt 0.
       NeedWait = CNT_MASK(T);
       setScoreLB(T, getScoreUB(T));
     } else if (counterOutOfOrder(T)) {
@@ -1200,7 +1204,7 @@ void SIInsertWaitcnts::generateWaitcntInstBefore(
           if (!ScoreBracket) {
             assert(!BlockVisitedSet.count(TBB));
             BlockWaitcntBracketsMap[TBB] =
-                llvm::make_unique<BlockWaitcntBrackets>();
+                llvm::make_unique<BlockWaitcntBrackets>(ST);
             ScoreBracket = BlockWaitcntBracketsMap[TBB].get();
           }
           ScoreBracket->setRevisitLoop(true);
@@ -1879,7 +1883,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
 
     BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
     if (!ScoreBrackets) {
-      BlockWaitcntBracketsMap[&MBB] = llvm::make_unique<BlockWaitcntBrackets>();
+      BlockWaitcntBracketsMap[&MBB] = llvm::make_unique<BlockWaitcntBrackets>(ST);
       ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
     }
     ScoreBrackets->setPostOrder(MBB.getNumber());
author	Mark Searles <m.c.searles@gmail.com>	2018-06-04 16:51:59 +0000
committer	Mark Searles <m.c.searles@gmail.com>	2018-06-04 16:51:59 +0000
commit	f0b93f1e9e649dde693c40ce4fc929fba068abae (patch)
tree	43f6f7511a6824b1eed456d8eafa0160d48e4234 /llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
parent	7c000d426795332ba1c11bb8798c891fb6687279 (diff)
download	bcm5719-llvm-f0b93f1e9e649dde693c40ce4fc929fba068abae.tar.gz bcm5719-llvm-f0b93f1e9e649dde693c40ce4fc929fba068abae.zip