summaryrefslogtreecommitdiffstats
path: root/llvm
diff options
context:
space:
mode:
Diffstat (limited to 'llvm')
-rw-r--r--llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp34
-rw-r--r--llvm/test/CodeGen/AMDGPU/waitcnt-no-redundant.mir23
2 files changed, 28 insertions, 29 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index a6ce7aaa6cb..8b5b24a1f50 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1125,7 +1125,8 @@ void SIInsertWaitcnts::generateSWaitCntInstBefore(
if (EmitSwaitcnt != 0) {
MachineInstr *OldWaitcnt = ScoreBrackets->getWaitcnt();
int Imm = (!OldWaitcnt) ? 0 : OldWaitcnt->getOperand(0).getImm();
- if (!OldWaitcnt || (AMDGPU::decodeVmcnt(IV, Imm) !=
+ if (!OldWaitcnt ||
+ (AMDGPU::decodeVmcnt(IV, Imm) !=
(CntVal[VM_CNT] & AMDGPU::getVmcntBitMask(IV))) ||
(AMDGPU::decodeExpcnt(IV, Imm) !=
(CntVal[EXP_CNT] & AMDGPU::getExpcntBitMask(IV))) ||
@@ -1143,7 +1144,7 @@ void SIInsertWaitcnts::generateSWaitCntInstBefore(
ScoreBracket = BlockWaitcntBracketsMap[TBB].get();
}
ScoreBracket->setRevisitLoop(true);
- DEBUG(dbgs() << "set-revisit: block"
+ DEBUG(dbgs() << "set-revisit: Block"
<< ContainingLoop->getHeader()->getNumber() << '\n';);
}
}
@@ -1151,18 +1152,14 @@ void SIInsertWaitcnts::generateSWaitCntInstBefore(
// Update an existing waitcount, or make a new one.
unsigned Enc = AMDGPU::encodeWaitcnt(IV, CntVal[VM_CNT],
CntVal[EXP_CNT], CntVal[LGKM_CNT]);
- // We don't (yet) track waitcnts that existed prior to the waitcnt
- // pass (we just skip over them); because the waitcnt pass is ignorant
- // of them, it may insert a redundant waitcnt. To avoid this, check
- // the prev instr. If it and the to-be-inserted waitcnt are the
- // same, keep the prev waitcnt and skip the insertion. We assume that
- // whomever. e.g., for memory model, inserted the prev waitcnt really
- // wants it there.
+ // We don't remove waitcnts that existed prior to the waitcnt
+ // pass. Check if the waitcnt to-be-inserted can be avoided
+ // or if the prev waitcnt can be updated.
bool insertSWaitInst = true;
for (MachineBasicBlock::iterator I = MI.getIterator(),
B = MI.getParent()->begin();
insertSWaitInst && I != B; --I) {
- if (I == MI.getIterator())
+ if (I == MI.getIterator())
continue;
switch (I->getOpcode()) {
@@ -1585,15 +1582,16 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
MachineInstr &Inst = *Iter;
// Remove any previously existing waitcnts.
if (Inst.getOpcode() == AMDGPU::S_WAITCNT) {
- // TODO: Register the old waitcnt and optimize the following waitcnts.
- // Leaving the previously existing waitcnts is conservatively correct.
+ // Leave pre-existing waitcnts, but note their existence via setWaitcnt.
+ // Remove the waitcnt-pass-generated waitcnts; the pass will add them back
+ // as needed.
if (!TrackedWaitcntSet.count(&Inst))
++Iter;
else {
- ScoreBrackets->setWaitcnt(&Inst);
++Iter;
Inst.removeFromParent();
}
+ ScoreBrackets->setWaitcnt(&Inst);
continue;
}
@@ -1684,8 +1682,8 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
if (WaitcntData->getIterCnt() > 2) {
// To ensure convergence, need to make wait events at loop footer be no
// more than those from the previous iteration.
- // As a simplification, Instead of tracking individual scores and
- // generate the precise wait count, just wait on 0.
+ // As a simplification, instead of tracking individual scores and
+ // generating the precise wait count, just wait on 0.
bool HasPending = false;
MachineInstr *SWaitInst = WaitcntData->getWaitcnt();
for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
@@ -1722,7 +1720,7 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
// Add this waitcnt to the block. It is either newly created or
// created in previous iterations and added back since block traversal
- // always remove waitcnt.
+ // always removes waitcnts.
insertWaitcntBeforeCF(Block, SWaitInst);
WaitcntData->setWaitcnt(SWaitInst);
}
@@ -1788,7 +1786,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
if (ContainingLoop && ContainingLoop->getHeader() == &MBB && J < I &&
(!BlockWaitcntProcessedSet.count(&MBB))) {
BlockWaitcntBracketsMap[&MBB]->setRevisitLoop(true);
- DEBUG(dbgs() << "set-revisit: block"
+ DEBUG(dbgs() << "set-revisit: Block"
<< ContainingLoop->getHeader()->getNumber() << '\n';);
}
@@ -1819,7 +1817,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
}
LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
WaitcntData->incIterCnt();
- DEBUG(dbgs() << "revisit: block" << EntryBB->getNumber() << '\n';);
+ DEBUG(dbgs() << "revisit: Block" << EntryBB->getNumber() << '\n';);
continue;
} else {
LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-no-redundant.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-no-redundant.mir
index 188f0151a70..92e73f6db39 100644
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-no-redundant.mir
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-no-redundant.mir
@@ -1,24 +1,25 @@
# RUN: llc -mtriple=amdgcn -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck %s
# Check that the waitcnt pass does *not* insert a redundant waitcnt instr.
-# In this testcase, ensure that pass does not insert redundant S_WAITCNT 127
-# or S_WAITCNT 3952
+# In this testcase, ensure that pass does not insert redundant S_WAITCNT 3952
...
# CHECK-LABEL: name: waitcnt-no-redundant
-# CHECK: DS_READ_B64
-# CHECK-NEXT: S_WAITCNT 127
+# CHECK: S_WAITCNT 3952
# CHECK-NEXT: FLAT_ATOMIC_CMPSWAP
# CHECK-NEXT: S_WAITCNT 3952
-# CHECK-NEXT: BUFFER_WBINVL1_VOL
+# CHECK-NEXT: BUFFER_WBINVL1
name: waitcnt-no-redundant
body: |
- bb.0:
- renamable $vgpr0_vgpr1 = DS_READ_B64 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec
- S_WAITCNT 127
- FLAT_ATOMIC_CMPSWAP killed renamable $vgpr0_vgpr1, killed renamable $vgpr3_vgpr4, 0, 0, implicit $exec, implicit $flat_scr
+ bb.0:
+ renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+
+ bb.1:
+ S_WAITCNT 3952
+ FLAT_ATOMIC_CMPSWAP undef renamable $vgpr0_vgpr1, renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
S_WAITCNT 3952
- BUFFER_WBINVL1_VOL implicit $exec
- S_ENDPGM
+ BUFFER_WBINVL1 implicit $exec
+ S_BRANCH %bb.1
+
...
OpenPOWER on IntegriCloud