diff options
Diffstat (limited to 'llvm/lib')
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 170 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h | 4 |
2 files changed, 174 insertions, 0 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 42742a649b3..24e0a03667d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -13,9 +13,11 @@ #include "AMDGPURegisterBankInfo.h" #include "AMDGPUInstrInfo.h" +#include "AMDGPUSubtarget.h" #include "SIMachineFunctionInfo.h" #include "SIRegisterInfo.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" #include "llvm/CodeGen/GlobalISel/RegisterBank.h" #include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" @@ -328,6 +330,170 @@ static LLT getHalfSizedType(LLT Ty) { return LLT::scalar(Ty.getSizeInBits() / 2); } +/// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If +/// any of the required SGPR operands are VGPRs, perform a waterfall loop to +/// execute the instruction for each unique combination of values in all lanes +/// in the wave. The block containing \p MI is split: \p MI is moved into a new loop block and the instructions after it into a new remainder block. +void AMDGPURegisterBankInfo::executeInWaterfallLoop( + MachineInstr &MI, MachineRegisterInfo &MRI, + ArrayRef<unsigned> OpIndices) const { + MachineFunction *MF = MI.getParent()->getParent(); + const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); + const SIInstrInfo *TII = ST.getInstrInfo(); + MachineBasicBlock::iterator I(MI); + + MachineBasicBlock &MBB = *MI.getParent(); + const DebugLoc &DL = MI.getDebugLoc(); + + assert(OpIndices.size() == 1 && + "need to implement support for multiple operands"); + + // Use a set to avoid extra readfirstlanes in the case where multiple operands + // are the same register. 
+ SmallSet<unsigned, 4> SGPROperandRegs; + for (unsigned Op : OpIndices) { + assert(MI.getOperand(Op).isUse()); + unsigned Reg = MI.getOperand(Op).getReg(); + const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI); + if (OpBank->getID() == AMDGPU::VGPRRegBankID) + SGPROperandRegs.insert(Reg); + } + + // No operands need to be replaced, so no need to loop. + if (SGPROperandRegs.empty()) + return; + + MachineIRBuilder B(MI); + SmallVector<unsigned, 4> ResultRegs; + SmallVector<unsigned, 4> InitResultRegs; + SmallVector<unsigned, 4> PhiRegs; + for (MachineOperand &Def : MI.defs()) { + LLT ResTy = MRI.getType(Def.getReg()); + const RegisterBank *DefBank = getRegBank(Def.getReg(), MRI, *TRI); + ResultRegs.push_back(Def.getReg()); + unsigned InitReg = B.buildUndef(ResTy).getReg(0); + unsigned PhiReg = MRI.createGenericVirtualRegister(ResTy); + InitResultRegs.push_back(InitReg); + PhiRegs.push_back(PhiReg); + MRI.setRegBank(PhiReg, *DefBank); + MRI.setRegBank(InitReg, *DefBank); + } + + unsigned SaveExecReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); + unsigned InitSaveExecReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); + + // Don't bother using generic instructions/registers for the exec mask. + B.buildInstr(TargetOpcode::IMPLICIT_DEF) + .addDef(InitSaveExecReg); + + // Save the EXEC mask + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64_term), SaveExecReg) + .addReg(AMDGPU::EXEC); + + unsigned PhiExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + unsigned NewExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + unsigned CondReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + + // To insert the loop we need to split the block. Move everything from this + // point onward to a new block, and insert new empty blocks ahead of it. 
+ MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock(); + MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock(); + MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock(); + MachineFunction::iterator MBBI(MBB); + ++MBBI; + MF->insert(MBBI, LoopBB); + MF->insert(MBBI, RestoreExecBB); + MF->insert(MBBI, RemainderBB); + + LoopBB->addSuccessor(RestoreExecBB); + LoopBB->addSuccessor(LoopBB); + + // Move the rest of the block into a new block. + RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB); + RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end()); + + MBB.addSuccessor(LoopBB); + RestoreExecBB->addSuccessor(RemainderBB); + + B.setInsertPt(*LoopBB, LoopBB->end()); + + B.buildInstr(TargetOpcode::PHI) + .addDef(PhiExec) + .addReg(InitSaveExecReg) + .addMBB(&MBB) + .addReg(NewExec) + .addMBB(LoopBB); + + for (auto Result : zip(InitResultRegs, ResultRegs, PhiRegs)) { + B.buildInstr(TargetOpcode::G_PHI) + .addDef(std::get<2>(Result)) + .addReg(std::get<0>(Result)) // Initial value / implicit_def + .addMBB(&MBB) + .addReg(std::get<1>(Result)) // Mid-loop value. + .addMBB(LoopBB); + } + + // Move the instruction into the loop. + LoopBB->splice(LoopBB->end(), &MBB, I); + I = std::prev(LoopBB->end()); + + for (MachineOperand &Op : MI.uses()) { + if (!Op.isReg()) + continue; + + assert(!Op.isDef()); + if (SGPROperandRegs.count(Op.getReg())) { + unsigned CurrentLaneOpReg + = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + MRI.setType(CurrentLaneOpReg, LLT::scalar(32)); // FIXME + + assert(MRI.getType(Op.getReg())== LLT::scalar(32) && + "need to implement support for other types"); + + constrainGenericRegister(Op.getReg(), AMDGPU::VGPR_32RegClass, MRI); + + // Read the next variant <- also loop target. + BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), + CurrentLaneOpReg) + .addReg(Op.getReg()); + + // FIXME: Need to AND together each condition + + // Compare the just read SGPR value to all possible operand values. 
+ B.buildInstr(AMDGPU::V_CMP_EQ_U32_e64) + .addDef(CondReg) + .addReg(CurrentLaneOpReg) + .addReg(Op.getReg()); + Op.setReg(CurrentLaneOpReg); + } + } + + // Update EXEC, save the original EXEC value to NewExec. + B.buildInstr(AMDGPU::S_AND_SAVEEXEC_B64) + .addDef(NewExec) + .addReg(CondReg, RegState::Kill); + + MRI.setSimpleHint(NewExec, CondReg); + + // Update EXEC, switch all done bits to 0 and all todo bits to 1. + B.buildInstr(AMDGPU::S_XOR_B64_term) + .addDef(AMDGPU::EXEC) + .addReg(AMDGPU::EXEC) + .addReg(NewExec); + + // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use + // s_cbranch_scc0? + + // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover. + B.buildInstr(AMDGPU::S_CBRANCH_EXECNZ) + .addMBB(LoopBB); + + // Restore the EXEC mask. NOTE(review): B's insertion point was last set to the end of LoopBB and RestoreExecBB is never given instructions here - confirm the restore is meant to be emitted in LoopBB. + B.buildInstr(AMDGPU::S_MOV_B64_term) + .addDef(AMDGPU::EXEC) + .addReg(SaveExecReg); +} + void AMDGPURegisterBankInfo::applyMappingImpl( const OperandsMapper &OpdMapper) const { MachineInstr &MI = OpdMapper.getMI(); @@ -436,6 +602,10 @@ void AMDGPURegisterBankInfo::applyMappingImpl( MI.eraseFromParent(); return; } + case AMDGPU::G_EXTRACT_VECTOR_ELT: + applyDefaultMapping(OpdMapper); + executeInWaterfallLoop(MI, MRI, { 2 }); + return; default: break; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h index 18c15cd5efe..6000943b3aa 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h @@ -37,6 +37,10 @@ protected: class AMDGPURegisterBankInfo : public AMDGPUGenRegisterBankInfo { const SIRegisterInfo *TRI; + void executeInWaterfallLoop(MachineInstr &MI, + MachineRegisterInfo &MRI, + ArrayRef<unsigned> OpIndices) const; + /// See RegisterBankInfo::applyMapping. void applyMappingImpl(const OperandsMapper &OpdMapper) const override; |

