author    Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>  2019-05-03 21:53:53 +0000
committer Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>  2019-05-03 21:53:53 +0000
commit    d9dcf392c7bd55823501323a8215dfd831e66e0c (patch)
tree      5d0d795366c3525c2714d98b7d49b2579a1d0735 /llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
parent    41bbe101a2ca6e65ca3a0abc8054e3d6a64c41c2 (diff)
[AMDGPU] gfx1010 wait count insertion
Differential Revision: https://reviews.llvm.org/D61534

llvm-svn: 359938
Diffstat (limited to 'llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp')
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp  200
1 file changed, 144 insertions(+), 56 deletions(-)
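
For context: gfx1010 splits the legacy vector-memory counter in two. Loads (and atomics that return data) still retire through vmcnt, while stores (and no-return atomics) retire through a new vscnt counter that is waited on with a separate s_waitcnt_vscnt instruction. The patch models this with a new VS_CNT score bracket and distinct read/write wait events. Below is a minimal standalone sketch, not code from the commit, of the mask-driven event-to-counter mapping it introduces; the LGKM_CNT and EXP_CNT mask entries are stubbed out for brevity.

  #include <cassert>
  #include <cstdint>

  enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, VS_CNT, NUM_INST_CNTS };
  enum WaitEventType { VMEM_ACCESS, VMEM_READ_ACCESS, VMEM_WRITE_ACCESS };

  // Each counter owns a bitmask of the events it tracks. Read events
  // score against VM_CNT; write events score against the new VS_CNT.
  static const uint32_t WaitEventMaskForInst[NUM_INST_CNTS] = {
      (1u << VMEM_ACCESS) | (1u << VMEM_READ_ACCESS), // VM_CNT
      0,                                              // LGKM_CNT (elided)
      0,                                              // EXP_CNT (elided)
      (1u << VMEM_WRITE_ACCESS),                      // VS_CNT
  };

  // The patch replaces the old `E == VMEM_ACCESS` special case with a
  // table lookup, so new event kinds need no extra branches here.
  InstCounterType eventCounter(WaitEventType E) {
    if (WaitEventMaskForInst[VM_CNT] & (1u << E))
      return VM_CNT;
    if (WaitEventMaskForInst[VS_CNT] & (1u << E))
      return VS_CNT;
    assert(WaitEventMaskForInst[EXP_CNT] & (1u << E));
    return EXP_CNT;
  }

  int main() {
    assert(eventCounter(VMEM_READ_ACCESS) == VM_CNT);
    assert(eventCounter(VMEM_WRITE_ACCESS) == VS_CNT);
    return 0;
  }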
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 31e06fd1c3e..34513d3450a 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -100,7 +100,7 @@ public:
#define CNT_MASK(t) (1u << (t))
-enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, NUM_INST_CNTS };
+enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, VS_CNT, NUM_INST_CNTS };
iterator_range<enum_iterator<InstCounterType>> inst_counter_types() {
return make_range(enum_iterator<InstCounterType>(VM_CNT),
@@ -113,6 +113,7 @@ struct {
uint32_t VmcntMax;
uint32_t ExpcntMax;
uint32_t LgkmcntMax;
+ uint32_t VscntMax;
int32_t NumVGPRsMax;
int32_t NumSGPRsMax;
} HardwareLimits;
@@ -126,6 +127,8 @@ struct {
enum WaitEventType {
VMEM_ACCESS, // vector-memory read & write
+ VMEM_READ_ACCESS, // vector-memory read
+ VMEM_WRITE_ACCESS, // vector-memory write
LDS_ACCESS, // lds read & write
GDS_ACCESS, // gds read & write
SQ_MESSAGE, // send message
@@ -139,11 +142,12 @@ enum WaitEventType {
};
static const uint32_t WaitEventMaskForInst[NUM_INST_CNTS] = {
- (1 << VMEM_ACCESS),
+ (1 << VMEM_ACCESS) | (1 << VMEM_READ_ACCESS),
(1 << SMEM_ACCESS) | (1 << LDS_ACCESS) | (1 << GDS_ACCESS) |
(1 << SQ_MESSAGE),
(1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) |
(1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS),
+ (1 << VMEM_WRITE_ACCESS)
};
// The mapping is:
@@ -171,6 +175,9 @@ void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
case LGKM_CNT:
Wait.LgkmCnt = std::min(Wait.LgkmCnt, Count);
break;
+ case VS_CNT:
+ Wait.VsCnt = std::min(Wait.VsCnt, Count);
+ break;
default:
llvm_unreachable("bad InstCounterType");
}
@@ -199,6 +206,8 @@ public:
return HardwareLimits.LgkmcntMax;
case EXP_CNT:
return HardwareLimits.ExpcntMax;
+ case VS_CNT:
+ return HardwareLimits.VscntMax;
default:
break;
}
@@ -221,10 +230,12 @@ public:
// Mapping from event to counter.
InstCounterType eventCounter(WaitEventType E) {
- if (E == VMEM_ACCESS)
+ if (WaitEventMaskForInst[VM_CNT] & (1 << E))
return VM_CNT;
if (WaitEventMaskForInst[LGKM_CNT] & (1 << E))
return LGKM_CNT;
+ if (WaitEventMaskForInst[VS_CNT] & (1 << E))
+ return VS_CNT;
assert(WaitEventMaskForInst[EXP_CNT] & (1 << E));
return EXP_CNT;
}
@@ -665,6 +676,9 @@ void WaitcntBrackets::print(raw_ostream &OS) {
case EXP_CNT:
OS << " EXP_CNT(" << UB - LB << "): ";
break;
+ case VS_CNT:
+ OS << " VS_CNT(" << UB - LB << "): ";
+ break;
default:
OS << " UNKNOWN(" << UB - LB << "): ";
break;
@@ -704,7 +718,8 @@ void WaitcntBrackets::print(raw_ostream &OS) {
bool WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
return simplifyWaitcnt(VM_CNT, Wait.VmCnt) |
simplifyWaitcnt(EXP_CNT, Wait.ExpCnt) |
- simplifyWaitcnt(LGKM_CNT, Wait.LgkmCnt);
+ simplifyWaitcnt(LGKM_CNT, Wait.LgkmCnt) |
+ simplifyWaitcnt(VS_CNT, Wait.VsCnt);
}
bool WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
@@ -747,6 +762,7 @@ void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
applyWaitcnt(VM_CNT, Wait.VmCnt);
applyWaitcnt(EXP_CNT, Wait.ExpCnt);
applyWaitcnt(LGKM_CNT, Wait.LgkmCnt);
+ applyWaitcnt(VS_CNT, Wait.VsCnt);
}
void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
@@ -817,7 +833,9 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
// TODO: Handle other cases of NeedsWaitcntVmBefore()
if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
- MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL) {
+ MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL ||
+ MI.getOpcode() == AMDGPU::BUFFER_GL0_INV ||
+ MI.getOpcode() == AMDGPU::BUFFER_GL1_INV) {
Wait.VmCnt = 0;
}
@@ -826,7 +844,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
// with knowledge of the called routines.
if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
MI.getOpcode() == AMDGPU::S_SETPC_B64_return) {
- Wait = AMDGPU::Waitcnt::allZero(IV);
+ Wait = Wait.combined(AMDGPU::Waitcnt::allZero(IV));
}
// Resolve vm waits before gs-done.
else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
@@ -998,7 +1016,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
// requiring a WAITCNT beforehand.
if (MI.getOpcode() == AMDGPU::S_BARRIER &&
!ST->hasAutoWaitcntBeforeBarrier()) {
- Wait = AMDGPU::Waitcnt::allZero(IV);
+ Wait = Wait.combined(AMDGPU::Waitcnt::allZero(IV));
}
// TODO: Remove this work-around, enable the assert for Bug 457939
@@ -1016,15 +1034,25 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
if (!ScoreBrackets.simplifyWaitcnt(Wait) && !IsForceEmitWaitcnt) {
bool Modified = false;
if (OldWaitcntInstr) {
- if (TrackedWaitcntSet.count(OldWaitcntInstr)) {
- TrackedWaitcntSet.erase(OldWaitcntInstr);
- OldWaitcntInstr->eraseFromParent();
- Modified = true;
- } else {
- int64_t Imm = OldWaitcntInstr->getOperand(0).getImm();
- ScoreBrackets.applyWaitcnt(AMDGPU::decodeWaitcnt(IV, Imm));
+ for (auto II = OldWaitcntInstr->getIterator(), NextI = std::next(II);
+ &*II != &MI; II = NextI, ++NextI) {
+ if (II->isDebugInstr())
+ continue;
+
+ if (TrackedWaitcntSet.count(&*II)) {
+ TrackedWaitcntSet.erase(&*II);
+ II->eraseFromParent();
+ Modified = true;
+ } else if (II->getOpcode() == AMDGPU::S_WAITCNT) {
+ int64_t Imm = II->getOperand(0).getImm();
+ ScoreBrackets.applyWaitcnt(AMDGPU::decodeWaitcnt(IV, Imm));
+ } else {
+ assert(II->getOpcode() == AMDGPU::S_WAITCNT_VSCNT);
+ assert(II->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
+ ScoreBrackets.applyWaitcnt(
+ AMDGPU::Waitcnt(0, 0, 0, II->getOperand(1).getImm()));
+ }
}
- Modified = true;
}
return Modified;
}
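
A note on the two instruction shapes this hunk (and the larger one below) iterates over, inferred from the operand accesses in the new code: S_WAITCNT packs vmcnt, expcnt, and lgkmcnt into a single immediate operand, while S_WAITCNT_VSCNT takes a register operand first (the null SGPR when no register dependency is expressed) followed by the vscnt immediate, which is why the new code reads getOperand(1). A self-contained toy sketch of that distinction, with plain structs standing in for MachineInstr and MachineOperand:

  #include <cassert>
  #include <cstdint>
  #include <vector>

  enum Opcode { S_WAITCNT, S_WAITCNT_VSCNT };
  struct Operand { bool IsReg; uint64_t Value; }; // register id or immediate
  struct Inst { Opcode Op; std::vector<Operand> Ops; };

  // S_WAITCNT: operand 0 is the packed immediate.
  // S_WAITCNT_VSCNT: operand 0 is a register, operand 1 the vscnt value.
  uint64_t waitValueOf(const Inst &I) {
    if (I.Op == S_WAITCNT)
      return I.Ops[0].Value;
    assert(I.Op == S_WAITCNT_VSCNT && I.Ops[0].IsReg);
    return I.Ops[1].Value;
  }

  int main() {
    Inst Legacy{S_WAITCNT, {{false, 0x3f70}}};      // packed immediate
    Inst Vs{S_WAITCNT_VSCNT,
            {{true, 0 /* stand-in id for the null SGPR */}, {false, 0}}};
    assert(waitValueOf(Legacy) == 0x3f70);
    assert(waitValueOf(Vs) == 0);
    return 0;
  }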
@@ -1038,39 +1066,88 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
Wait.ExpCnt = 0;
if (ForceEmitWaitcnt[LGKM_CNT])
Wait.LgkmCnt = 0;
+ if (ForceEmitWaitcnt[VS_CNT])
+ Wait.VsCnt = 0;
ScoreBrackets.applyWaitcnt(Wait);
AMDGPU::Waitcnt OldWait;
+ bool Modified = false;
+
if (OldWaitcntInstr) {
- OldWait =
- AMDGPU::decodeWaitcnt(IV, OldWaitcntInstr->getOperand(0).getImm());
- }
- if (OldWait.dominates(Wait))
- return false;
+ for (auto II = OldWaitcntInstr->getIterator(), NextI = std::next(II);
+ &*II != &MI; II = NextI, NextI++) {
+ if (II->isDebugInstr())
+ continue;
- if (OldWaitcntInstr && !TrackedWaitcntSet.count(OldWaitcntInstr))
- Wait = Wait.combined(OldWait);
+ if (II->getOpcode() == AMDGPU::S_WAITCNT) {
+ unsigned IEnc = II->getOperand(0).getImm();
+ AMDGPU::Waitcnt IWait = AMDGPU::decodeWaitcnt(IV, IEnc);
+ OldWait = OldWait.combined(IWait);
+ if (!TrackedWaitcntSet.count(&*II))
+ Wait = Wait.combined(IWait);
+ unsigned NewEnc = AMDGPU::encodeWaitcnt(IV, Wait);
+ if (IEnc != NewEnc) {
+ II->getOperand(0).setImm(NewEnc);
+ Modified = true;
+ }
+ Wait.VmCnt = ~0u;
+ Wait.LgkmCnt = ~0u;
+ Wait.ExpCnt = ~0u;
+ } else {
+ assert(II->getOpcode() == AMDGPU::S_WAITCNT_VSCNT);
+ assert(II->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
+
+ unsigned ICnt = II->getOperand(1).getImm();
+ OldWait.VsCnt = std::min(OldWait.VsCnt, ICnt);
+ if (!TrackedWaitcntSet.count(&*II))
+ Wait.VsCnt = std::min(Wait.VsCnt, ICnt);
+ if (Wait.VsCnt != ICnt) {
+ II->getOperand(1).setImm(Wait.VsCnt);
+ Modified = true;
+ }
+ Wait.VsCnt = ~0u;
+ }
- unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
- if (OldWaitcntInstr) {
- OldWaitcntInstr->getOperand(0).setImm(Enc);
+ LLVM_DEBUG(dbgs() << "updateWaitcntInBlock\n"
+ << "Old Instr: " << MI << '\n'
+ << "New Instr: " << *II << '\n');
- LLVM_DEBUG(dbgs() << "updateWaitcntInBlock\n"
- << "Old Instr: " << MI << '\n'
- << "New Instr: " << *OldWaitcntInstr << '\n');
- } else {
+ if (!Wait.hasWait())
+ return Modified;
+ }
+ }
+
+ if (Wait.VmCnt != ~0u || Wait.LgkmCnt != ~0u || Wait.ExpCnt != ~0u) {
+ unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(),
MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
.addImm(Enc);
TrackedWaitcntSet.insert(SWaitInst);
+ Modified = true;
LLVM_DEBUG(dbgs() << "insertWaitcntInBlock\n"
<< "Old Instr: " << MI << '\n'
<< "New Instr: " << *SWaitInst << '\n');
}
- return true;
+ if (Wait.VsCnt != ~0u) {
+ assert(ST->hasVscnt());
+
+ auto SWaitInst =
+ BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
+ TII->get(AMDGPU::S_WAITCNT_VSCNT))
+ .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
+ .addImm(Wait.VsCnt);
+ TrackedWaitcntSet.insert(SWaitInst);
+ Modified = true;
+
+ LLVM_DEBUG(dbgs() << "insertWaitcntInBlock\n"
+ << "Old Instr: " << MI << '\n'
+ << "New Instr: " << *SWaitInst << '\n');
+ }
+
+ return Modified;
}
// This is a flat memory operation. Check to see if it has memory
@@ -1105,8 +1182,15 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
} else if (TII->isFLAT(Inst)) {
assert(Inst.mayLoad() || Inst.mayStore());
- if (TII->usesVM_CNT(Inst))
- ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
+ if (TII->usesVM_CNT(Inst)) {
+ if (!ST->hasVscnt())
+ ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
+ else if (Inst.mayLoad() &&
+ AMDGPU::getAtomicRetOp(Inst.getOpcode()) == -1)
+ ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_READ_ACCESS, Inst);
+ else
+ ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_WRITE_ACCESS, Inst);
+ }
if (TII->usesLGKM_CNT(Inst)) {
ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
@@ -1121,8 +1205,19 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
// TODO: get a better carve out.
Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1 &&
Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_SC &&
- Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_VOL) {
- ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
+ Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_VOL &&
+ Inst.getOpcode() != AMDGPU::BUFFER_GL0_INV &&
+ Inst.getOpcode() != AMDGPU::BUFFER_GL1_INV) {
+ if (!ST->hasVscnt())
+ ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
+ else if ((Inst.mayLoad() &&
+ AMDGPU::getAtomicRetOp(Inst.getOpcode()) == -1) ||
+ /* IMAGE_GET_RESINFO / IMAGE_GET_LOD */
+ (TII->isMIMG(Inst) && !Inst.mayLoad() && !Inst.mayStore()))
+ ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_READ_ACCESS, Inst);
+ else if (Inst.mayStore())
+ ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_WRITE_ACCESS, Inst);
+
if (ST->vmemWriteNeedsExpWaitcnt() &&
(Inst.mayStore() || AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1)) {
ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
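
Taken together, the FLAT and VMEM hunks above reduce to one classification rule on targets with vscnt: loads and atomics that return a value, plus MIMG instructions that touch no memory (IMAGE_GET_RESINFO / IMAGE_GET_LOD), score a read event against vmcnt; stores and no-return atomics score a write event against vscnt. A hedged distillation of that rule as a pure function, with illustrative names rather than LLVM API:

  enum class VMemEvent { LegacyAccess, ReadAccess, WriteAccess, None };

  // Decision rule extracted from the updateEventWaitcntAfter hunks above.
  VMemEvent classifyVMemEvent(bool HasVscnt, bool MayLoad, bool MayStore,
                              bool IsAtomicNoRet, bool IsMIMGQuery) {
    if (!HasVscnt)
      return VMemEvent::LegacyAccess; // pre-gfx10: single VMEM_ACCESS event
    if ((MayLoad && !IsAtomicNoRet) || IsMIMGQuery)
      return VMemEvent::ReadAccess;   // loads, returning atomics: vmcnt
    if (MayStore)
      return VMemEvent::WriteAccess;  // stores, no-return atomics: vscnt
    return VMemEvent::None;           // e.g. the cache-invalidate carve-outs
  }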
@@ -1243,27 +1338,13 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
Iter != E;) {
MachineInstr &Inst = *Iter;
- // Remove any previously existing waitcnts.
- if (Inst.getOpcode() == AMDGPU::S_WAITCNT) {
- if (OldWaitcntInstr) {
- if (TrackedWaitcntSet.count(OldWaitcntInstr)) {
- TrackedWaitcntSet.erase(OldWaitcntInstr);
- OldWaitcntInstr->eraseFromParent();
- OldWaitcntInstr = nullptr;
- } else if (!TrackedWaitcntSet.count(&Inst)) {
- // Two successive s_waitcnt's, both of which are pre-existing and
- // are therefore preserved.
- int64_t Imm = OldWaitcntInstr->getOperand(0).getImm();
- ScoreBrackets.applyWaitcnt(AMDGPU::decodeWaitcnt(IV, Imm));
- } else {
- ++Iter;
- Inst.eraseFromParent();
- Modified = true;
- continue;
- }
- }
-
- OldWaitcntInstr = &Inst;
+ // Track pre-existing waitcnts from earlier iterations.
+ if (Inst.getOpcode() == AMDGPU::S_WAITCNT ||
+ (Inst.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
+ Inst.getOperand(0).isReg() &&
+ Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL)) {
+ if (!OldWaitcntInstr)
+ OldWaitcntInstr = &Inst;
++Iter;
continue;
}
@@ -1320,7 +1401,8 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
// Restore the vccz bit. Any time a value is written to vcc, the vcc
// bit is updated, so we can restore the bit by reading the value of
// vcc and then writing it back to the register.
- BuildMI(Block, Inst, Inst.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
+ BuildMI(Block, Inst, Inst.getDebugLoc(),
+ TII->get(AMDGPU::S_MOV_B64),
AMDGPU::VCC)
.addReg(AMDGPU::VCC);
VCCZBugHandledSet.insert(&Inst);
@@ -1348,6 +1430,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV);
HardwareLimits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
HardwareLimits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV);
+ HardwareLimits.VscntMax = ST->hasVscnt() ? 63 : 0;
HardwareLimits.NumVGPRsMax = ST->getAddressableNumVGPRs();
HardwareLimits.NumSGPRsMax = ST->getAddressableNumSGPRs();
@@ -1483,6 +1566,11 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
// TODO: Could insert earlier and schedule more liberally with operations
// that only use caller preserved registers.
MachineBasicBlock &EntryBB = MF.front();
+ if (ST->hasVscnt())
+ BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(),
+ TII->get(AMDGPU::S_WAITCNT_VSCNT))
+ .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
+ .addImm(0);
BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
.addImm(0);