AMDGPU/SI: Add llvm.amdgcn.s.waitcnt.all intrinsic

Summary:
So it appears that to guarantee some of the ordering requirements of a GLSL memoryBarrier() executed in the shader, we need to emit an s_waitcnt. (We can't use an s_barrier, because memoryBarrier() may appear anywhere in the shader; in particular, it may appear in non-uniform control flow.)

Reviewers: arsenm, mareko, tstellarAMD

Subscribers: arsenm, llvm-commits

Differential Revision: http://reviews.llvm.org/D19203

llvm-svn: 267729
author: Nicolai Haehnle <nhaehnle@gmail.com> 2016-04-27 15:46:01 +0000
committer: Nicolai Haehnle <nhaehnle@gmail.com> 2016-04-27 15:46:01 +0000
commit: f66bdb5ea8658fdcbac75f96a73e906f380e5741 (patch)
tree: 2d5dcce4c69fb7cf858793c93b5d7a1dc115413a /llvm/lib/Target/AMDGPU/SIInsertWaits.cpp
parent: 514f05543f2493450e8265a850292c576d8be3ad (diff)
download: bcm5719-llvm-f66bdb5ea8658fdcbac75f96a73e906f380e5741.tar.gz
bcm5719-llvm-f66bdb5ea8658fdcbac75f96a73e906f380e5741.zip
1 files changed, 70 insertions, 13 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp
index bf0d6a74336..15884732c12 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp
@@ -68,6 +68,10 @@ private:
   /// \brief Counter values we have already waited on.
   Counters WaitedOn;
 
+  /// \brief Counter values that we must wait on before the next counter
+  /// increase.
+  Counters DelayedWaitOn;
+
   /// \brief Counter values for last instruction issued.
   Counters LastIssued;
 
@@ -103,13 +107,17 @@ private:
 
   /// \brief Handle instructions async components
   void pushInstruction(MachineBasicBlock &MBB,
-                       MachineBasicBlock::iterator I);
+                       MachineBasicBlock::iterator I,
+                       const Counters& Increment);
 
   /// \brief Insert the actual wait instruction
   bool insertWait(MachineBasicBlock &MBB,
                   MachineBasicBlock::iterator I,
                   const Counters &Counts);
 
+  /// \brief Handle existing wait instructions (from intrinsics)
+  void handleExistingWait(MachineBasicBlock::iterator I);
+
   /// \brief Do we need def2def checks?
   bool unorderedDefines(MachineInstr &MI);
 
@@ -287,10 +295,10 @@ RegInterval SIInsertWaits::getRegInterval(const TargetRegisterClass *RC,
 }
 
 void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB,
-                                    MachineBasicBlock::iterator I) {
+                                    MachineBasicBlock::iterator I,
+                                    const Counters &Increment) {
 
   // Get the hardware counter increments and sum them up
-  Counters Increment = getHwCounts(*I);
   Counters Limit = ZeroCounts;
   unsigned Sum = 0;
 
@@ -430,16 +438,38 @@ static void increaseCounters(Counters &Dst, const Counters &Src) {
     Dst.Array[i] = std::max(Dst.Array[i], Src.Array[i]);
 }
 
+/// \brief Return true if at least one of the three counters is non-zero.
+static bool countersNonZero(const Counters &Counter) {
+  for (unsigned i = 0; i < 3; ++i) // visit each entry of Counter.Array
+    if (Counter.Array[i])
+      return true;
+  return false;
+}
+
+void SIInsertWaits::handleExistingWait(MachineBasicBlock::iterator I) {
+  assert(I->getOpcode() == AMDGPU::S_WAITCNT); // caller must pass S_WAITCNT
+
+  unsigned Imm = I->getOperand(0).getImm(); // packed counter immediate
+  Counters Counts, WaitOn;
+
+  Counts.Named.VM = Imm & 0xF;          // bits 0-3 of the immediate
+  Counts.Named.EXP = (Imm >> 4) & 0x7;  // bits 4-6 of the immediate
+  Counts.Named.LGKM = (Imm >> 8) & 0xF; // bits 8-11 of the immediate
+
+  for (unsigned i = 0; i < 3; ++i) {
+    if (Counts.Array[i] <= LastIssued.Array[i])
+      WaitOn.Array[i] = LastIssued.Array[i] - Counts.Array[i];
+    else
+      WaitOn.Array[i] = 0; // requested count exceeds LastIssued: clamp to 0
+  }
+
+  increaseCounters(DelayedWaitOn, WaitOn); // fold in via element-wise max
+}
+
 Counters SIInsertWaits::handleOperands(MachineInstr &MI) {
 
   Counters Result = ZeroCounts;
 
-  // S_SENDMSG implicitly waits for all outstanding LGKM transfers to finish,
-  // but we also want to wait for any other outstanding transfers before
-  // signalling other hardware blocks
-  if (MI.getOpcode() == AMDGPU::S_SENDMSG)
-    return LastIssued;
-
   // For each register affected by this instruction increase the result
   // sequence.
   //
@@ -544,6 +574,7 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
   MRI = &MF.getRegInfo();
 
   WaitedOn = ZeroCounts;
+  DelayedWaitOn = ZeroCounts;
   LastIssued = ZeroCounts;
   LastOpcodeType = OTHER;
   LastInstWritesM0 = false;
@@ -552,6 +583,8 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
   memset(&UsedRegs, 0, sizeof(UsedRegs));
   memset(&DefinedRegs, 0, sizeof(DefinedRegs));
 
+  SmallVector<MachineInstr *, 4> RemoveMI;
+
   for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
        BI != BE; ++BI) {
 
@@ -607,13 +640,34 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
           I->getOpcode() == AMDGPU::V_READFIRSTLANE_B32)
         TII->insertWaitStates(MBB, std::next(I), 4);
 
+      // Record pre-existing, explicitly requested waits
+      if (I->getOpcode() == AMDGPU::S_WAITCNT) {
+        handleExistingWait(*I);
+        RemoveMI.push_back(I);
+        continue;
+      }
+
+      Counters Required;
+
       // Wait for everything before a barrier.
-      if (I->getOpcode() == AMDGPU::S_BARRIER)
-        Changes |= insertWait(MBB, I, LastIssued);
+      //
+      // S_SENDMSG implicitly waits for all outstanding LGKM transfers to finish,
+      // but we also want to wait for any other outstanding transfers before
+      // signalling other hardware blocks
+      if (I->getOpcode() == AMDGPU::S_BARRIER ||
+          I->getOpcode() == AMDGPU::S_SENDMSG)
+        Required = LastIssued;
       else
-        Changes |= insertWait(MBB, I, handleOperands(*I));
+        Required = handleOperands(*I);
+
+      Counters Increment = getHwCounts(*I);
 
-      pushInstruction(MBB, I);
+      if (countersNonZero(Required) || countersNonZero(Increment))
+        increaseCounters(Required, DelayedWaitOn);
+
+      Changes |= insertWait(MBB, I, Required);
+
+      pushInstruction(MBB, I, Increment);
       handleSendMsg(MBB, I);
     }
 
@@ -621,5 +675,8 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
     Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued);
   }
 
+  for (MachineInstr *I : RemoveMI)
+    I->eraseFromParent();
+
   return Changes;
 }
author	Nicolai Haehnle <nhaehnle@gmail.com>	2016-04-27 15:46:01 +0000
committer	Nicolai Haehnle <nhaehnle@gmail.com>	2016-04-27 15:46:01 +0000
commit	f66bdb5ea8658fdcbac75f96a73e906f380e5741 (patch)
tree	2d5dcce4c69fb7cf858793c93b5d7a1dc115413a /llvm/lib/Target/AMDGPU/SIInsertWaits.cpp
parent	514f05543f2493450e8265a850292c576d8be3ad (diff)
download	bcm5719-llvm-f66bdb5ea8658fdcbac75f96a73e906f380e5741.tar.gz bcm5719-llvm-f66bdb5ea8658fdcbac75f96a73e906f380e5741.zip