diff options
author | Nicolai Haehnle <nhaehnle@gmail.com> | 2016-04-27 15:46:01 +0000 |
---|---|---|
committer | Nicolai Haehnle <nhaehnle@gmail.com> | 2016-04-27 15:46:01 +0000 |
commit | f66bdb5ea8658fdcbac75f96a73e906f380e5741 (patch) | |
tree | 2d5dcce4c69fb7cf858793c93b5d7a1dc115413a /llvm/lib/Target/AMDGPU/SIInsertWaits.cpp | |
parent | 514f05543f2493450e8265a850292c576d8be3ad (diff) | |
download | bcm5719-llvm-f66bdb5ea8658fdcbac75f96a73e906f380e5741.tar.gz bcm5719-llvm-f66bdb5ea8658fdcbac75f96a73e906f380e5741.zip |
AMDGPU/SI: Add llvm.amdgcn.s.waitcnt.all intrinsic
Summary:
So it appears that to guarantee some of the ordering requirements of a GLSL
memoryBarrier() executed in the shader, we need to emit an s_waitcnt.
(We can't use an s_barrier, because memoryBarrier() may appear anywhere in
the shader; in particular, it may appear in non-uniform control flow.)
Reviewers: arsenm, mareko, tstellarAMD
Subscribers: arsenm, llvm-commits
Differential Revision: http://reviews.llvm.org/D19203
llvm-svn: 267729
Diffstat (limited to 'llvm/lib/Target/AMDGPU/SIInsertWaits.cpp')
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIInsertWaits.cpp | 83 |
1 files changed, 70 insertions, 13 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp index bf0d6a74336..15884732c12 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp @@ -68,6 +68,10 @@ private: /// \brief Counter values we have already waited on. Counters WaitedOn; + /// \brief Counter values that we must wait on before the next counter + /// increase. + Counters DelayedWaitOn; + /// \brief Counter values for last instruction issued. Counters LastIssued; @@ -103,13 +107,17 @@ private: /// \brief Handle instructions async components void pushInstruction(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I); + MachineBasicBlock::iterator I, + const Counters& Increment); /// \brief Insert the actual wait instruction bool insertWait(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const Counters &Counts); + /// \brief Handle existing wait instructions (from intrinsics) + void handleExistingWait(MachineBasicBlock::iterator I); + /// \brief Do we need def2def checks? 
bool unorderedDefines(MachineInstr &MI); @@ -287,10 +295,10 @@ RegInterval SIInsertWaits::getRegInterval(const TargetRegisterClass *RC, } void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I) { + MachineBasicBlock::iterator I, + const Counters &Increment) { // Get the hardware counter increments and sum them up - Counters Increment = getHwCounts(*I); Counters Limit = ZeroCounts; unsigned Sum = 0; @@ -430,16 +438,38 @@ static void increaseCounters(Counters &Dst, const Counters &Src) { Dst.Array[i] = std::max(Dst.Array[i], Src.Array[i]); } +/// \brief check whether any of the counters is non-zero +static bool countersNonZero(const Counters &Counter) { + for (unsigned i = 0; i < 3; ++i) + if (Counter.Array[i]) + return true; + return false; +} + +void SIInsertWaits::handleExistingWait(MachineBasicBlock::iterator I) { + assert(I->getOpcode() == AMDGPU::S_WAITCNT); + + unsigned Imm = I->getOperand(0).getImm(); + Counters Counts, WaitOn; + + Counts.Named.VM = Imm & 0xF; + Counts.Named.EXP = (Imm >> 4) & 0x7; + Counts.Named.LGKM = (Imm >> 8) & 0xF; + + for (unsigned i = 0; i < 3; ++i) { + if (Counts.Array[i] <= LastIssued.Array[i]) + WaitOn.Array[i] = LastIssued.Array[i] - Counts.Array[i]; + else + WaitOn.Array[i] = 0; + } + + increaseCounters(DelayedWaitOn, WaitOn); +} + Counters SIInsertWaits::handleOperands(MachineInstr &MI) { Counters Result = ZeroCounts; - // S_SENDMSG implicitly waits for all outstanding LGKM transfers to finish, - // but we also want to wait for any other outstanding transfers before - // signalling other hardware blocks - if (MI.getOpcode() == AMDGPU::S_SENDMSG) - return LastIssued; - // For each register affected by this instruction increase the result // sequence. 
// @@ -544,6 +574,7 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) { MRI = &MF.getRegInfo(); WaitedOn = ZeroCounts; + DelayedWaitOn = ZeroCounts; LastIssued = ZeroCounts; LastOpcodeType = OTHER; LastInstWritesM0 = false; @@ -552,6 +583,8 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) { memset(&UsedRegs, 0, sizeof(UsedRegs)); memset(&DefinedRegs, 0, sizeof(DefinedRegs)); + SmallVector<MachineInstr *, 4> RemoveMI; + for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE; ++BI) { @@ -607,13 +640,34 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) { I->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) TII->insertWaitStates(MBB, std::next(I), 4); + // Record pre-existing, explicitly requested waits + if (I->getOpcode() == AMDGPU::S_WAITCNT) { + handleExistingWait(*I); + RemoveMI.push_back(I); + continue; + } + + Counters Required; + // Wait for everything before a barrier. - if (I->getOpcode() == AMDGPU::S_BARRIER) - Changes |= insertWait(MBB, I, LastIssued); + // + // S_SENDMSG implicitly waits for all outstanding LGKM transfers to finish, + // but we also want to wait for any other outstanding transfers before + // signalling other hardware blocks + if (I->getOpcode() == AMDGPU::S_BARRIER || + I->getOpcode() == AMDGPU::S_SENDMSG) + Required = LastIssued; else - Changes |= insertWait(MBB, I, handleOperands(*I)); + Required = handleOperands(*I); + + Counters Increment = getHwCounts(*I); - pushInstruction(MBB, I); + if (countersNonZero(Required) || countersNonZero(Increment)) + increaseCounters(Required, DelayedWaitOn); + + Changes |= insertWait(MBB, I, Required); + + pushInstruction(MBB, I, Increment); handleSendMsg(MBB, I); } @@ -621,5 +675,8 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) { Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued); } + for (MachineInstr *I : RemoveMI) + I->eraseFromParent(); + return Changes; } |