summaryrefslogtreecommitdiffstats
path: root/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp
diff options
context:
space:
mode:
author Nicolai Haehnle <nhaehnle@gmail.com> 2016-04-27 15:46:01 +0000
committer Nicolai Haehnle <nhaehnle@gmail.com> 2016-04-27 15:46:01 +0000
commit f66bdb5ea8658fdcbac75f96a73e906f380e5741 (patch)
tree 2d5dcce4c69fb7cf858793c93b5d7a1dc115413a /llvm/lib/Target/AMDGPU/SIInsertWaits.cpp
parent 514f05543f2493450e8265a850292c576d8be3ad (diff)
downloadbcm5719-llvm-f66bdb5ea8658fdcbac75f96a73e906f380e5741.tar.gz
bcm5719-llvm-f66bdb5ea8658fdcbac75f96a73e906f380e5741.zip
AMDGPU/SI: Add llvm.amdgcn.s.waitcnt.all intrinsic
Summary: So it appears that to guarantee some of the ordering requirements of a GLSL memoryBarrier() executed in the shader, we need to emit an s_waitcnt. (We can't use an s_barrier, because memoryBarrier() may appear anywhere in the shader; in particular, it may appear in non-uniform control flow.)
Reviewers: arsenm, mareko, tstellarAMD
Subscribers: arsenm, llvm-commits
Differential Revision: http://reviews.llvm.org/D19203
llvm-svn: 267729
Diffstat (limited to 'llvm/lib/Target/AMDGPU/SIInsertWaits.cpp')
-rw-r--r--llvm/lib/Target/AMDGPU/SIInsertWaits.cpp83
1 files changed, 70 insertions, 13 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp
index bf0d6a74336..15884732c12 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp
@@ -68,6 +68,10 @@ private:
/// \brief Counter values we have already waited on.
Counters WaitedOn;
+ /// \brief Counter values that we must wait on before the next counter
+ /// increase.
+ Counters DelayedWaitOn;
+
/// \brief Counter values for last instruction issued.
Counters LastIssued;
@@ -103,13 +107,17 @@ private:
/// \brief Handle instructions async components
void pushInstruction(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I);
+ MachineBasicBlock::iterator I,
+ const Counters& Increment);
/// \brief Insert the actual wait instruction
bool insertWait(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
const Counters &Counts);
+ /// \brief Handle existing wait instructions (from intrinsics)
+ void handleExistingWait(MachineBasicBlock::iterator I);
+
/// \brief Do we need def2def checks?
bool unorderedDefines(MachineInstr &MI);
@@ -287,10 +295,10 @@ RegInterval SIInsertWaits::getRegInterval(const TargetRegisterClass *RC,
}
void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I) {
+ MachineBasicBlock::iterator I,
+ const Counters &Increment) {
// Get the hardware counter increments and sum them up
- Counters Increment = getHwCounts(*I);
Counters Limit = ZeroCounts;
unsigned Sum = 0;
@@ -430,16 +438,38 @@ static void increaseCounters(Counters &Dst, const Counters &Src) {
Dst.Array[i] = std::max(Dst.Array[i], Src.Array[i]);
}
+/// \brief Report whether at least one of the three counters in \p Counter
+/// holds a non-zero value.
+static bool countersNonZero(const Counters &Counter) {
+  bool AnySet = false;
+  // Stop scanning as soon as a non-zero entry has been seen.
+  for (unsigned Idx = 0; Idx != 3 && !AnySet; ++Idx)
+    AnySet = Counter.Array[Idx] != 0;
+  return AnySet;
+}
+
+/// \brief Fold an S_WAITCNT that is already present in the instruction stream
+/// into DelayedWaitOn.
+///
+/// The immediate operand of \p I is split into its three counter fields. For
+/// each counter the requested value is subtracted from the matching
+/// LastIssued entry (yielding zero when the request exceeds LastIssued), and
+/// the result is merged into DelayedWaitOn through increaseCounters.
+void SIInsertWaits::handleExistingWait(MachineBasicBlock::iterator I) {
+  assert(I->getOpcode() == AMDGPU::S_WAITCNT);
+
+  // Unpack the immediate: bits 0-3 -> VM, bits 4-6 -> EXP, bits 8-11 -> LGKM.
+  unsigned Encoded = I->getOperand(0).getImm();
+  Counters Requested;
+  Requested.Named.VM = Encoded & 0xF;
+  Requested.Named.EXP = (Encoded >> 4) & 0x7;
+  Requested.Named.LGKM = (Encoded >> 8) & 0xF;
+
+  // Convert each requested count into a position relative to LastIssued;
+  // a request larger than LastIssued imposes no wait for that counter.
+  Counters Pending;
+  for (unsigned Idx = 0; Idx != 3; ++Idx)
+    Pending.Array[Idx] = Requested.Array[Idx] > LastIssued.Array[Idx]
+                             ? 0
+                             : LastIssued.Array[Idx] - Requested.Array[Idx];
+
+  increaseCounters(DelayedWaitOn, Pending);
+}
+
Counters SIInsertWaits::handleOperands(MachineInstr &MI) {
Counters Result = ZeroCounts;
- // S_SENDMSG implicitly waits for all outstanding LGKM transfers to finish,
- // but we also want to wait for any other outstanding transfers before
- // signalling other hardware blocks
- if (MI.getOpcode() == AMDGPU::S_SENDMSG)
- return LastIssued;
-
// For each register affected by this instruction increase the result
// sequence.
//
@@ -544,6 +574,7 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
MRI = &MF.getRegInfo();
WaitedOn = ZeroCounts;
+ DelayedWaitOn = ZeroCounts;
LastIssued = ZeroCounts;
LastOpcodeType = OTHER;
LastInstWritesM0 = false;
@@ -552,6 +583,8 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
memset(&UsedRegs, 0, sizeof(UsedRegs));
memset(&DefinedRegs, 0, sizeof(DefinedRegs));
+ SmallVector<MachineInstr *, 4> RemoveMI;
+
for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
BI != BE; ++BI) {
@@ -607,13 +640,34 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
I->getOpcode() == AMDGPU::V_READFIRSTLANE_B32)
TII->insertWaitStates(MBB, std::next(I), 4);
+ // Record pre-existing, explicitly requested waits
+ if (I->getOpcode() == AMDGPU::S_WAITCNT) {
+ handleExistingWait(*I);
+ RemoveMI.push_back(I);
+ continue;
+ }
+
+ Counters Required;
+
// Wait for everything before a barrier.
- if (I->getOpcode() == AMDGPU::S_BARRIER)
- Changes |= insertWait(MBB, I, LastIssued);
+ //
+ // S_SENDMSG implicitly waits for all outstanding LGKM transfers to finish,
+ // but we also want to wait for any other outstanding transfers before
+ // signalling other hardware blocks
+ if (I->getOpcode() == AMDGPU::S_BARRIER ||
+ I->getOpcode() == AMDGPU::S_SENDMSG)
+ Required = LastIssued;
else
- Changes |= insertWait(MBB, I, handleOperands(*I));
+ Required = handleOperands(*I);
+
+ Counters Increment = getHwCounts(*I);
- pushInstruction(MBB, I);
+ if (countersNonZero(Required) || countersNonZero(Increment))
+ increaseCounters(Required, DelayedWaitOn);
+
+ Changes |= insertWait(MBB, I, Required);
+
+ pushInstruction(MBB, I, Increment);
handleSendMsg(MBB, I);
}
@@ -621,5 +675,8 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued);
}
+ for (MachineInstr *I : RemoveMI)
+ I->eraseFromParent();
+
return Changes;
}
OpenPOWER on IntegriCloud