summary | refs | log | tree | commit | diff | stats
path: root/llvm/lib/Target/AMDGPU
diff options
context:
space:
mode:
author: Matt Arsenault <Matthew.Arsenault@amd.com> 2019-06-20 20:54:32 +0000
committer: Matt Arsenault <Matthew.Arsenault@amd.com> 2019-06-20 20:54:32 +0000
commit: 8ad1decf453e299dd7d011a218212dc6480ea6fd (patch)
tree: 0c1c16b815d16670eed781859e6a4cb1786f4059 /llvm/lib/Target/AMDGPU
parent: 9589db7a98ea78a2285133fa5b5e4927f6920539 (diff)
download: bcm5719-llvm-8ad1decf453e299dd7d011a218212dc6480ea6fd.tar.gz
bcm5719-llvm-8ad1decf453e299dd7d011a218212dc6480ea6fd.zip
AMDGPU: Insert mem_viol check loop around GWS pre-GFX9
It is necessary to emit this loop around GWS operations in case the wave is preempted pre-GFX9. llvm-svn: 363979
Diffstat (limited to 'llvm/lib/Target/AMDGPU')
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h 9
-rw-r--r-- llvm/lib/Target/AMDGPU/DSInstructions.td 2
-rw-r--r-- llvm/lib/Target/AMDGPU/SIDefines.h 2
-rw-r--r-- llvm/lib/Target/AMDGPU/SIISelLowering.cpp 132
-rw-r--r-- llvm/lib/Target/AMDGPU/SIISelLowering.h 3
5 files changed, 129 insertions, 19 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 3b1ed618677..e1be0105aea 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -715,6 +715,15 @@ public:
return getGeneration() < GFX9;
}
+ // True if the hardware rewinds and replays GWS operations when a wave is
+ // preempted.
+ //
+ // If this is false, a GWS operation requires testing whether a nack set the
+ // MEM_VIOL bit, and repeating the operation if so.
+ bool hasGWSAutoReplay() const {
+ return getGeneration() >= GFX9; // true from the GFX9 generation onward
+ }
+
bool hasAddNoCarry() const {
return AddNoCarryInsts;
}
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index 248f6599f7e..0ef5d79d66a 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -467,7 +467,7 @@ defm DS_WRXCHG_RTN_B64 : DS_1A1D_RET_mc<"ds_wrxchg_rtn_b64", VReg_64>;
defm DS_WRXCHG2_RTN_B64 : DS_1A2D_Off8_RET_mc<"ds_wrxchg2_rtn_b64", VReg_128, VReg_64>;
defm DS_WRXCHG2ST64_RTN_B64 : DS_1A2D_Off8_RET_mc<"ds_wrxchg2st64_rtn_b64", VReg_128, VReg_64>;
-let isConvergent = 1 in {
+let isConvergent = 1, usesCustomInserter = 1 in {
def DS_GWS_INIT : DS_GWS_1D<"ds_gws_init"> {
let mayLoad = 0;
}
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index c21f88f52d1..cc96f1de43a 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -323,6 +323,8 @@ enum Offset : unsigned { // Offset, (5) [10:6]
OFFSET_WIDTH_ = 5,
OFFSET_MASK_ = (((1 << OFFSET_WIDTH_) - 1) << OFFSET_SHIFT_),
+ OFFSET_MEM_VIOL = 8,
+
OFFSET_SRC_SHARED_BASE = 16,
OFFSET_SRC_PRIVATE_BASE = 0
};
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index a159f208aca..fbe88ca2246 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2922,6 +2922,109 @@ MachineBasicBlock *SITargetLowering::splitKillBlock(MachineInstr &MI,
return SplitBB;
}
+// Split block \p MBB at \p MI so that a loop can be inserted. Two new blocks,
+// a loop body and a remainder, are created and laid out directly after \p MBB.
+// If \p InstInLoop is true, \p MI becomes the only instruction in the loop body
+// block. Otherwise, \p MI becomes the first instruction in the remainder block.
+/// \returns { LoopBody, Remainder }
+static std::pair<MachineBasicBlock *, MachineBasicBlock *>
+splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop) {
+ MachineFunction *MF = MBB.getParent();
+ MachineBasicBlock::iterator I(&MI); // position of MI inside MBB
+
+ // To insert the loop we need to split the block. Move everything after this
+ // point to a new block, and insert a new empty block between the two.
+ MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
+ MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
+ MachineFunction::iterator MBBI(MBB);
+ ++MBBI; // now refers to whatever block follows MBB in layout order
+
+ MF->insert(MBBI, LoopBB);
+ MF->insert(MBBI, RemainderBB); // layout is now MBB, LoopBB, RemainderBB
+
+ LoopBB->addSuccessor(LoopBB); // back edge: the loop body may branch to itself
+ LoopBB->addSuccessor(RemainderBB); // loop exit edge
+
+ // Move the rest of the block into a new block.
+ RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB); // RemainderBB takes over MBB's successors
+
+ if (InstInLoop) {
+ auto Next = std::next(I); // captured before MI is spliced out of MBB
+
+ // Move instruction to loop body.
+ LoopBB->splice(LoopBB->begin(), &MBB, I, Next);
+
+ // Move the rest of the block.
+ RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end());
+ } else {
+ RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end()); // MI leads the remainder block
+ }
+
+ MBB.addSuccessor(LoopBB); // the truncated MBB is given LoopBB as a successor
+
+ return std::make_pair(LoopBB, RemainderBB);
+}
+
+MachineBasicBlock *
+SITargetLowering::emitGWSMemViolTestLoop(MachineInstr &MI,
+ MachineBasicBlock *BB) const { // Wraps GWS instruction MI in a loop that repeats while TRAP_STS.MEM_VIOL reads non-zero; returns the remainder block.
+ const DebugLoc &DL = MI.getDebugLoc();
+
+ MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+
+ MachineBasicBlock *LoopBB;
+ MachineBasicBlock *RemainderBB;
+ const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+
+ MachineBasicBlock::iterator Prev = std::prev(MI.getIterator()); // remembered because MI is about to leave BB. NOTE(review): assumes MI is not the first instruction of BB - confirm
+
+ std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, *BB, true); // MI becomes the only instruction in LoopBB
+
+ MachineBasicBlock::iterator I = LoopBB->end(); // insertion point for everything emitted after MI
+
+ MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0);
+ assert(Src && "missing operand from GWS instruction");
+
+ const unsigned EncodedReg = AMDGPU::Hwreg::encodeHwreg(
+ AMDGPU::Hwreg::ID_TRAPSTS, AMDGPU::Hwreg::OFFSET_MEM_VIOL, 1); // presumably (id, bit offset, width): the 1-bit MEM_VIOL field of TRAPSTS - TODO confirm argument order
+
+ // Clear TRAP_STS.MEM_VIOL
+ BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
+ .addImm(0)
+ .addImm(EncodedReg);
+
+ // This is a pain, but we're not allowed to have physical register live-ins
+ // yet. Insert a pair of copies if the VGPR0 hack is necessary.
+ if (TargetRegisterInfo::isPhysicalRegister(Src->getReg())) {
+ unsigned Data0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ BuildMI(*BB, std::next(Prev), DL, TII->get(AMDGPU::COPY), Data0) // emitted in BB right after Prev: save the physical register into Data0
+ .add(*Src);
+
+ BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::COPY), Src->getReg()) // emitted at the top of LoopBB, ahead of the S_SETREG above: restore it each iteration
+ .addReg(Data0);
+
+ MRI.setSimpleHint(Data0, Src->getReg());
+ }
+
+ BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_WAITCNT)) // placed after MI, before the status register is read back
+ .addImm(0);
+
+ unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+
+ // Load and check TRAP_STS.MEM_VIOL
+ BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
+ .addImm(EncodedReg);
+
+ // FIXME: Do we need to use an isel pseudo that may clobber scc?
+ BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32)) // compares the value read back against 0
+ .addReg(Reg, RegState::Kill)
+ .addImm(0);
+ BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1)) // branch back to the top of LoopBB to retry
+ .addMBB(LoopBB);
+
+ return RemainderBB; // block that now holds the instructions that followed MI
+}
+
// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
// wavefront. If the value is uniform and just happens to be in a VGPR, this
// will only do one iteration. In the worst case, this will loop 64 times.
@@ -3061,24 +3164,9 @@ static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII,
BuildMI(MBB, I, DL, TII->get(MovExecOpc), SaveExec)
.addReg(Exec);
- // To insert the loop we need to split the block. Move everything after this
- // point to a new block, and insert a new empty block between the two.
- MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
- MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
- MachineFunction::iterator MBBI(MBB);
- ++MBBI;
-
- MF->insert(MBBI, LoopBB);
- MF->insert(MBBI, RemainderBB);
-
- LoopBB->addSuccessor(LoopBB);
- LoopBB->addSuccessor(RemainderBB);
-
- // Move the rest of the block into a new block.
- RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
- RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
-
- MBB.addSuccessor(LoopBB);
+ MachineBasicBlock *LoopBB;
+ MachineBasicBlock *RemainderBB;
+ std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, MBB, false);
const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
@@ -3630,6 +3718,14 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
MI.eraseFromParent();
return BB;
}
+ case AMDGPU::DS_GWS_INIT:
+ case AMDGPU::DS_GWS_SEMA_V:
+ case AMDGPU::DS_GWS_SEMA_BR:
+ case AMDGPU::DS_GWS_SEMA_P:
+ case AMDGPU::DS_GWS_BARRIER:
+ if (getSubtarget()->hasGWSAutoReplay())
+ return BB;
+ return emitGWSMemViolTestLoop(MI, BB);
default:
return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 7d9c05d81ad..b3762dc3483 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -313,6 +313,9 @@ public:
MachineBasicBlock *splitKillBlock(MachineInstr &MI,
MachineBasicBlock *BB) const;
+ MachineBasicBlock *emitGWSMemViolTestLoop(MachineInstr &MI,
+ MachineBasicBlock *BB) const;
+
MachineBasicBlock *
EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *BB) const override;
OpenPOWER on IntegriCloud