diff options
| author | Matt Arsenault <Matthew.Arsenault@amd.com> | 2019-06-20 20:54:32 +0000 |
|---|---|---|
| committer | Matt Arsenault <Matthew.Arsenault@amd.com> | 2019-06-20 20:54:32 +0000 |
| commit | 8ad1decf453e299dd7d011a218212dc6480ea6fd (patch) | |
| tree | 0c1c16b815d16670eed781859e6a4cb1786f4059 /llvm/lib/Target/AMDGPU | |
| parent | 9589db7a98ea78a2285133fa5b5e4927f6920539 (diff) | |
| download | bcm5719-llvm-8ad1decf453e299dd7d011a218212dc6480ea6fd.tar.gz bcm5719-llvm-8ad1decf453e299dd7d011a218212dc6480ea6fd.zip | |
AMDGPU: Insert mem_viol check loop around GWS pre-GFX9
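Before GFX9, the hardware does not automatically replay a GWS operation
if the wave is preempted. On those targets, emit a loop around each GWS
operation that tests the MEM_VIOL bit and repeats the operation if it was set.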
llvm-svn: 363979
Diffstat (limited to 'llvm/lib/Target/AMDGPU')
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h | 9 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/DSInstructions.td | 2 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIDefines.h | 2 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 132 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIISelLowering.h | 3 |
5 files changed, 129 insertions, 19 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h index 3b1ed618677..e1be0105aea 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -715,6 +715,15 @@ public: return getGeneration() < GFX9; } + // True if the hardware rewinds and replays GWS operations if a wave is + // preempted. + // + // If this is false, a GWS operation requires testing if a nack set the + // MEM_VIOL bit, and repeating if so. + bool hasGWSAutoReplay() const { + return getGeneration() >= GFX9; + } + bool hasAddNoCarry() const { return AddNoCarryInsts; } diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td index 248f6599f7e..0ef5d79d66a 100644 --- a/llvm/lib/Target/AMDGPU/DSInstructions.td +++ b/llvm/lib/Target/AMDGPU/DSInstructions.td @@ -467,7 +467,7 @@ defm DS_WRXCHG_RTN_B64 : DS_1A1D_RET_mc<"ds_wrxchg_rtn_b64", VReg_64>; defm DS_WRXCHG2_RTN_B64 : DS_1A2D_Off8_RET_mc<"ds_wrxchg2_rtn_b64", VReg_128, VReg_64>; defm DS_WRXCHG2ST64_RTN_B64 : DS_1A2D_Off8_RET_mc<"ds_wrxchg2st64_rtn_b64", VReg_128, VReg_64>; -let isConvergent = 1 in { +let isConvergent = 1, usesCustomInserter = 1 in { def DS_GWS_INIT : DS_GWS_1D<"ds_gws_init"> { let mayLoad = 0; } diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h index c21f88f52d1..cc96f1de43a 100644 --- a/llvm/lib/Target/AMDGPU/SIDefines.h +++ b/llvm/lib/Target/AMDGPU/SIDefines.h @@ -323,6 +323,8 @@ enum Offset : unsigned { // Offset, (5) [10:6] OFFSET_WIDTH_ = 5, OFFSET_MASK_ = (((1 << OFFSET_WIDTH_) - 1) << OFFSET_SHIFT_), + OFFSET_MEM_VIOL = 8, + OFFSET_SRC_SHARED_BASE = 16, OFFSET_SRC_PRIVATE_BASE = 0 }; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index a159f208aca..fbe88ca2246 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -2922,6 +2922,109 @@ MachineBasicBlock 
*SITargetLowering::splitKillBlock(MachineInstr &MI, return SplitBB; } +// Split block \p MBB at \p MI, as to insert a loop. If \p InstInLoop is true, +// \p MI will be the only instruction in the loop body block. Otherwise, it will +// be the first instruction in the remainder block. +// +/// \returns { LoopBody, Remainder } +static std::pair<MachineBasicBlock *, MachineBasicBlock *> +splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop) { + MachineFunction *MF = MBB.getParent(); + MachineBasicBlock::iterator I(&MI); + + // To insert the loop we need to split the block. Move everything after this + // point to a new block, and insert a new empty block between the two. + MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock(); + MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock(); + MachineFunction::iterator MBBI(MBB); + ++MBBI; + + MF->insert(MBBI, LoopBB); + MF->insert(MBBI, RemainderBB); + + LoopBB->addSuccessor(LoopBB); + LoopBB->addSuccessor(RemainderBB); + + // Move the rest of the block into a new block. + RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB); + + if (InstInLoop) { + auto Next = std::next(I); + + // Move instruction to loop body. + LoopBB->splice(LoopBB->begin(), &MBB, I, Next); + + // Move the rest of the block. 
+ RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end()); + } else { + RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end()); + } + + MBB.addSuccessor(LoopBB); + + return std::make_pair(LoopBB, RemainderBB); +} + +MachineBasicBlock * +SITargetLowering::emitGWSMemViolTestLoop(MachineInstr &MI, + MachineBasicBlock *BB) const { + const DebugLoc &DL = MI.getDebugLoc(); + + MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); + + MachineBasicBlock *LoopBB; + MachineBasicBlock *RemainderBB; + const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); + + MachineBasicBlock::iterator Prev = std::prev(MI.getIterator()); + + std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, *BB, true); + + MachineBasicBlock::iterator I = LoopBB->end(); + + MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0); + assert(Src && "missing operand from GWS instruction"); + + const unsigned EncodedReg = AMDGPU::Hwreg::encodeHwreg( + AMDGPU::Hwreg::ID_TRAPSTS, AMDGPU::Hwreg::OFFSET_MEM_VIOL, 1); + + // Clear TRAP_STS.MEM_VIOL + BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32)) + .addImm(0) + .addImm(EncodedReg); + + // This is a pain, but we're not allowed to have physical register live-ins + // yet. Insert a pair of copies if the VGPR0 hack is necessary. 
+ if (TargetRegisterInfo::isPhysicalRegister(Src->getReg())) { + unsigned Data0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + BuildMI(*BB, std::next(Prev), DL, TII->get(AMDGPU::COPY), Data0) + .add(*Src); + + BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::COPY), Src->getReg()) + .addReg(Data0); + + MRI.setSimpleHint(Data0, Src->getReg()); + } + + BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_WAITCNT)) + .addImm(0); + + unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + + // Load and check TRAP_STS.MEM_VIOL + BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg) + .addImm(EncodedReg); + + // FIXME: Do we need to use an isel pseudo that may clobber scc? + BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32)) + .addReg(Reg, RegState::Kill) + .addImm(0); + BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1)) + .addMBB(LoopBB); + + return RemainderBB; +} + // Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the // wavefront. If the value is uniform and just happens to be in a VGPR, this // will only do one iteration. In the worst case, this will loop 64 times. @@ -3061,24 +3164,9 @@ static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, BuildMI(MBB, I, DL, TII->get(MovExecOpc), SaveExec) .addReg(Exec); - // To insert the loop we need to split the block. Move everything after this - // point to a new block, and insert a new empty block between the two. - MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock(); - MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock(); - MachineFunction::iterator MBBI(MBB); - ++MBBI; - - MF->insert(MBBI, LoopBB); - MF->insert(MBBI, RemainderBB); - - LoopBB->addSuccessor(LoopBB); - LoopBB->addSuccessor(RemainderBB); - - // Move the rest of the block into a new block. 
- RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB); - RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end()); - - MBB.addSuccessor(LoopBB); + MachineBasicBlock *LoopBB; + MachineBasicBlock *RemainderBB; + std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, MBB, false); const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx); @@ -3630,6 +3718,14 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( MI.eraseFromParent(); return BB; } + case AMDGPU::DS_GWS_INIT: + case AMDGPU::DS_GWS_SEMA_V: + case AMDGPU::DS_GWS_SEMA_BR: + case AMDGPU::DS_GWS_SEMA_P: + case AMDGPU::DS_GWS_BARRIER: + if (getSubtarget()->hasGWSAutoReplay()) + return BB; + return emitGWSMemViolTestLoop(MI, BB); default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index 7d9c05d81ad..b3762dc3483 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -313,6 +313,9 @@ public: MachineBasicBlock *splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const; + MachineBasicBlock *emitGWSMemViolTestLoop(MachineInstr &MI, + MachineBasicBlock *BB) const; + MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override; |

