Diffstat (limited to 'llvm/lib')
-rw-r--r--   llvm/lib/Target/AMDGPU/SIFrameLowering.cpp       | 508
-rw-r--r--   llvm/lib/Target/AMDGPU/SIFrameLowering.h         |   4
-rw-r--r--   llvm/lib/Target/AMDGPU/SIISelLowering.cpp        |  17
-rw-r--r--   llvm/lib/Target/AMDGPU/SIInstrInfo.cpp           |   7
-rw-r--r--   llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp |  34
-rw-r--r--   llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h   |   8
6 files changed, 436 insertions, 142 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index 7b3105b6790..25d6b2fd7c4 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -21,6 +21,9 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "frame-info"
+
+
 static ArrayRef<MCPhysReg> getAllSGPR128(const GCNSubtarget &ST,
                                          const MachineFunction &MF) {
   return makeArrayRef(AMDGPU::SGPR_128RegClass.begin(),
@@ -33,6 +36,150 @@ static ArrayRef<MCPhysReg> getAllSGPRs(const GCNSubtarget &ST,
                      ST.getMaxNumSGPRs(MF));
 }
 
+// Find a scratch register that we can use at the start of the prologue to
+// re-align the stack pointer. We avoid using callee-save registers since they
+// may appear to be free when this is called from canUseAsPrologue (during
+// shrink wrapping), but then no longer be free when this is called from
+// emitPrologue.
+//
+// FIXME: This is a bit conservative, since in the above case we could use one
+// of the callee-save registers as a scratch temp to re-align the stack pointer,
+// but we would then have to make sure that we were in fact saving at least one
+// callee-save register in the prologue, which is additional complexity that
+// doesn't seem worth the benefit.
+static unsigned findScratchNonCalleeSaveRegister(MachineRegisterInfo &MRI,
+                                                 LivePhysRegs &LiveRegs,
+                                                 const TargetRegisterClass &RC,
+                                                 bool Unused = false) {
+  // Mark callee saved registers as used so we will not choose them.
+  const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
+  for (unsigned i = 0; CSRegs[i]; ++i)
+    LiveRegs.addReg(CSRegs[i]);
+
+  if (Unused) {
+    // We are looking for a register that can be used throughout the entire
+    // function, so any use is unacceptable.
+    for (unsigned Reg : RC) {
+      if (!MRI.isPhysRegUsed(Reg) && LiveRegs.available(MRI, Reg))
+        return Reg;
+    }
+  } else {
+    for (unsigned Reg : RC) {
+      if (LiveRegs.available(MRI, Reg))
+        return Reg;
+    }
+  }
+
+  // Callers asking for an unused register do so in contexts where failure is
+  // an option and they have an alternative plan. In other contexts, this must
+  // succeed.
+  if (!Unused)
+    report_fatal_error("failed to find free scratch register");
+
+  return AMDGPU::NoRegister;
+}
+
+static MCPhysReg findUnusedSGPRNonCalleeSaved(MachineRegisterInfo &MRI) {
+  LivePhysRegs LiveRegs;
+  LiveRegs.init(*MRI.getTargetRegisterInfo());
+  return findScratchNonCalleeSaveRegister(
+      MRI, LiveRegs, AMDGPU::SReg_32_XM0_XEXECRegClass, true);
+}
+
+// We need to specially emit stack operations here because a different frame
+// register is used than the one getFrameRegister would return for the rest of
+// the function.
+static void buildPrologSpill(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB,
+                             MachineBasicBlock::iterator I,
+                             const SIInstrInfo *TII, unsigned SpillReg,
+                             unsigned ScratchRsrcReg, unsigned SPReg, int FI) {
+  MachineFunction *MF = MBB.getParent();
+  MachineFrameInfo &MFI = MF->getFrameInfo();
+
+  int64_t Offset = MFI.getObjectOffset(FI);
+
+  MachineMemOperand *MMO = MF->getMachineMemOperand(
+      MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOStore, 4,
+      MFI.getObjectAlignment(FI));
+
+  if (isUInt<12>(Offset)) {
+    BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::BUFFER_STORE_DWORD_OFFSET))
+      .addReg(SpillReg, RegState::Kill)
+      .addReg(ScratchRsrcReg)
+      .addReg(SPReg)
+      .addImm(Offset)
+      .addImm(0) // glc
+      .addImm(0) // slc
+      .addImm(0) // tfe
+      .addImm(0) // dlc
+      .addMemOperand(MMO);
+    return;
+  }
+
+  MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister(
+      MF->getRegInfo(), LiveRegs, AMDGPU::VGPR_32RegClass);
+
+  BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), OffsetReg)
+    .addImm(Offset);
+
+  BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::BUFFER_STORE_DWORD_OFFEN))
+    .addReg(SpillReg, RegState::Kill)
+    .addReg(OffsetReg, RegState::Kill)
+    .addReg(ScratchRsrcReg)
+    .addReg(SPReg)
+    .addImm(0)
+    .addImm(0) // glc
+    .addImm(0) // slc
+    .addImm(0) // tfe
+    .addImm(0) // dlc
+    .addMemOperand(MMO);
+}
+
+static void buildEpilogReload(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator I,
+                              const SIInstrInfo *TII, unsigned SpillReg,
+                              unsigned ScratchRsrcReg, unsigned SPReg, int FI) {
+  MachineFunction *MF = MBB.getParent();
+  MachineFrameInfo &MFI = MF->getFrameInfo();
+  int64_t Offset = MFI.getObjectOffset(FI);
+
+  MachineMemOperand *MMO = MF->getMachineMemOperand(
+      MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOLoad, 4,
+      MFI.getObjectAlignment(FI));
+
+  if (isUInt<12>(Offset)) {
+    BuildMI(MBB, I, DebugLoc(),
+            TII->get(AMDGPU::BUFFER_LOAD_DWORD_OFFSET), SpillReg)
+      .addReg(ScratchRsrcReg)
+      .addReg(SPReg)
+      .addImm(Offset)
+      .addImm(0) // glc
+      .addImm(0) // slc
+      .addImm(0) // tfe
+      .addImm(0) // dlc
+      .addMemOperand(MMO);
+    return;
+  }
+
+  MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister(
+      MF->getRegInfo(), LiveRegs, AMDGPU::VGPR_32RegClass);
+
+  BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), OffsetReg)
+    .addImm(Offset);
+
+  BuildMI(MBB, I, DebugLoc(),
+          TII->get(AMDGPU::BUFFER_LOAD_DWORD_OFFEN), SpillReg)
+    .addReg(OffsetReg, RegState::Kill)
+    .addReg(ScratchRsrcReg)
+    .addReg(SPReg)
+    .addImm(0)
+    .addImm(0) // glc
+    .addImm(0) // slc
+    .addImm(0) // tfe
+    .addImm(0) // dlc
+    .addMemOperand(MMO);
+}
+
 void SIFrameLowering::emitFlatScratchInit(const GCNSubtarget &ST,
                                           MachineFunction &MF,
                                           MachineBasicBlock &MBB) const {
@@ -511,35 +658,6 @@ void SIFrameLowering::emitEntryFunctionScratchSetup(const GCNSubtarget &ST,
   }
 }
 
-// Find a scratch register that we can use at the start of the prologue to
-// re-align the stack pointer. We avoid using callee-save registers since they
-// may appear to be free when this is called from canUseAsPrologue (during
-// shrink wrapping), but then no longer be free when this is called from
-// emitPrologue.
-//
-// FIXME: This is a bit conservative, since in the above case we could use one
-// of the callee-save registers as a scratch temp to re-align the stack pointer,
-// but we would then have to make sure that we were in fact saving at least one
-// callee-save register in the prologue, which is additional complexity that
-// doesn't seem worth the benefit.
-static unsigned findScratchNonCalleeSaveRegister(MachineFunction &MF,
-                                                 LivePhysRegs &LiveRegs,
-                                                 const TargetRegisterClass &RC) {
-  MachineRegisterInfo &MRI = MF.getRegInfo();
-
-  // Mark callee saved registers as used so we will not choose them.
-  const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
-  for (unsigned i = 0; CSRegs[i]; ++i)
-    LiveRegs.addReg(CSRegs[i]);
-
-  for (unsigned Reg : RC) {
-    if (LiveRegs.available(MRI, Reg))
-      return Reg;
-  }
-
-  return AMDGPU::NoRegister;
-}
-
 bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const {
   switch (ID) {
   case TargetStackID::Default:
@@ -559,6 +677,7 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
   }
 
   const MachineFrameInfo &MFI = MF.getFrameInfo();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   const SIInstrInfo *TII = ST.getInstrInfo();
   const SIRegisterInfo &TRI = TII->getRegisterInfo();
@@ -573,20 +692,90 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
   bool HasFP = false;
   uint32_t NumBytes = MFI.getStackSize();
   uint32_t RoundedSize = NumBytes;
+  // To avoid clobbering VGPRs in lanes that weren't active on function entry,
+  // turn on all lanes before doing the spill to memory.
+  unsigned ScratchExecCopy = AMDGPU::NoRegister;
+
+  // Emit the copy if we need an FP, and are using a free SGPR to save it.
+  if (FuncInfo->SGPRForFPSaveRestoreCopy != AMDGPU::NoRegister) {
+    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY),
+            FuncInfo->SGPRForFPSaveRestoreCopy)
+      .addReg(FramePtrReg)
+      .setMIFlag(MachineInstr::FrameSetup);
+  }
+
+  for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
+         : FuncInfo->getSGPRSpillVGPRs()) {
+    if (!Reg.FI.hasValue())
+      continue;
+
+    if (ScratchExecCopy == AMDGPU::NoRegister) {
+      if (LiveRegs.empty()) {
+        LiveRegs.init(TRI);
+        LiveRegs.addLiveIns(MBB);
+        if (FuncInfo->SGPRForFPSaveRestoreCopy)
+          LiveRegs.removeReg(FuncInfo->SGPRForFPSaveRestoreCopy);
+      }
+
+      ScratchExecCopy
+        = findScratchNonCalleeSaveRegister(MRI, LiveRegs,
+                                           *TRI.getWaveMaskRegClass());
+      assert(FuncInfo->SGPRForFPSaveRestoreCopy != ScratchExecCopy);
+
+      const unsigned OrSaveExec = ST.isWave32() ?
+        AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
+      BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec),
+              ScratchExecCopy)
+        .addImm(-1);
+    }
+
+    buildPrologSpill(LiveRegs, MBB, MBBI, TII, Reg.VGPR,
+                     FuncInfo->getScratchRSrcReg(),
+                     StackPtrReg,
+                     Reg.FI.getValue());
+  }
+
+  if (ScratchExecCopy != AMDGPU::NoRegister) {
+    // FIXME: Split block and make terminator.
+    unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+    unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+    BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec)
+      .addReg(ScratchExecCopy, RegState::Kill);
+    LiveRegs.addReg(ScratchExecCopy);
+  }
+
+
+  if (FuncInfo->FramePointerSaveIndex) {
+    const int FI = FuncInfo->FramePointerSaveIndex.getValue();
+    assert(!MFI.isDeadObjectIndex(FI) &&
+           MFI.getStackID(FI) == TargetStackID::SGPRSpill);
+    ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill
+      = FuncInfo->getSGPRToVGPRSpills(FI);
+    assert(Spill.size() == 1);
+
+    // Save FP before setting it up.
+    // FIXME: This should respect spillSGPRToVGPR;
+    BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
+            Spill[0].VGPR)
+      .addReg(FramePtrReg)
+      .addImm(Spill[0].Lane)
+      .addReg(Spill[0].VGPR, RegState::Undef);
+  }
 
   if (TRI.needsStackRealignment(MF)) {
     HasFP = true;
     const unsigned Alignment = MFI.getMaxAlignment();
 
     RoundedSize += Alignment;
+    if (LiveRegs.empty()) {
+      LiveRegs.init(TRI);
+      LiveRegs.addLiveIns(MBB);
+      LiveRegs.addReg(FuncInfo->SGPRForFPSaveRestoreCopy);
+    }
 
-    LiveRegs.init(TRI);
-    LiveRegs.addLiveIns(MBB);
-
-    unsigned ScratchSPReg
-      = findScratchNonCalleeSaveRegister(MF, LiveRegs,
-                                         AMDGPU::SReg_32_XM0RegClass);
-    assert(ScratchSPReg != AMDGPU::NoRegister);
+    unsigned ScratchSPReg = findScratchNonCalleeSaveRegister(
+        MRI, LiveRegs, AMDGPU::SReg_32_XM0RegClass);
+    assert(ScratchSPReg != AMDGPU::NoRegister &&
+           ScratchSPReg != FuncInfo->SGPRForFPSaveRestoreCopy);
 
     // s_add_u32 tmp_reg, s32, NumBytes
     // s_and_b32 s32, tmp_reg, 0b111...0000
@@ -616,44 +805,13 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
       .setMIFlag(MachineInstr::FrameSetup);
   }
 
-  // To avoid clobbering VGPRs in lanes that weren't active on function entry,
-  // turn on all lanes before doing the spill to memory.
-  unsigned ScratchExecCopy = AMDGPU::NoRegister;
-
-  for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
-         : FuncInfo->getSGPRSpillVGPRs()) {
-    if (!Reg.FI.hasValue())
-      continue;
-
-    if (ScratchExecCopy == AMDGPU::NoRegister) {
-      if (LiveRegs.empty()) {
-        LiveRegs.init(TRI);
-        LiveRegs.addLiveIns(MBB);
-      }
-
-      ScratchExecCopy
-        = findScratchNonCalleeSaveRegister(MF, LiveRegs,
-                                           *TRI.getWaveMaskRegClass());
-
-      const unsigned OrSaveExec = ST.isWave32() ?
-        AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
-      BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec),
-              ScratchExecCopy)
-        .addImm(-1);
-    }
-
-    TII->storeRegToStackSlot(MBB, MBBI, Reg.VGPR, true,
-                             Reg.FI.getValue(), &AMDGPU::VGPR_32RegClass,
-                             &TII->getRegisterInfo());
-  }
+  assert((!HasFP || FuncInfo->SGPRForFPSaveRestoreCopy != AMDGPU::NoRegister ||
+          FuncInfo->FramePointerSaveIndex) &&
+         "Needed to save FP but didn't save it anywhere");
 
-  if (ScratchExecCopy != AMDGPU::NoRegister) {
-    // FIXME: Split block and make terminator.
-    unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
-    unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
-    BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec)
-      .addReg(ScratchExecCopy);
-  }
+  assert((HasFP || (FuncInfo->SGPRForFPSaveRestoreCopy == AMDGPU::NoRegister &&
+                    !FuncInfo->FramePointerSaveIndex)) &&
+         "Saved FP but didn't need it");
 }
 
 void SIFrameLowering::emitEpilogue(MachineFunction &MF,
@@ -664,9 +822,45 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
 
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   const SIInstrInfo *TII = ST.getInstrInfo();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
   MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
+  LivePhysRegs LiveRegs;
   DebugLoc DL;
 
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+  uint32_t NumBytes = MFI.getStackSize();
+  uint32_t RoundedSize = FuncInfo->isStackRealigned() ?
+    NumBytes + MFI.getMaxAlignment() : NumBytes;
+
+  if (RoundedSize != 0 && hasFP(MF)) {
+    const unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg();
+    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_SUB_U32), StackPtrReg)
+      .addReg(StackPtrReg)
+      .addImm(RoundedSize * ST.getWavefrontSize())
+      .setMIFlag(MachineInstr::FrameDestroy);
+  }
+
+  if (FuncInfo->SGPRForFPSaveRestoreCopy != AMDGPU::NoRegister) {
+    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY),
+            FuncInfo->getFrameOffsetReg())
+      .addReg(FuncInfo->SGPRForFPSaveRestoreCopy)
+      .setMIFlag(MachineInstr::FrameSetup);
+  }
+
+  if (FuncInfo->FramePointerSaveIndex) {
+    const int FI = FuncInfo->FramePointerSaveIndex.getValue();
+    const MachineFrameInfo &MFI = MF.getFrameInfo();
+
+    assert(!MFI.isDeadObjectIndex(FI));
+    assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
+    ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill
+      = FuncInfo->getSGPRToVGPRSpills(FI);
+    assert(Spill.size() == 1);
+    BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
+            FuncInfo->getFrameOffsetReg())
+      .addReg(Spill[0].VGPR)
+      .addImm(Spill[0].Lane);
+  }
+
   unsigned ScratchExecCopy = AMDGPU::NoRegister;
 
   for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
          : FuncInfo->getSGPRSpillVGPRs()) {
@@ -676,24 +870,26 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
     const SIRegisterInfo &TRI = TII->getRegisterInfo();
     if (ScratchExecCopy == AMDGPU::NoRegister) {
       // See emitPrologue
-      LivePhysRegs LiveRegs(*ST.getRegisterInfo());
-      LiveRegs.addLiveOuts(MBB);
-      LiveRegs.stepBackward(*MBBI);
+      if (LiveRegs.empty()) {
+        LiveRegs.init(*ST.getRegisterInfo());
+        LiveRegs.addLiveOuts(MBB);
+        LiveRegs.stepBackward(*MBBI);
+      }
 
-      ScratchExecCopy
-        = findScratchNonCalleeSaveRegister(MF, LiveRegs,
-                                           *TRI.getWaveMaskRegClass());
+      ScratchExecCopy = findScratchNonCalleeSaveRegister(
+          MRI, LiveRegs, *TRI.getWaveMaskRegClass());
+      LiveRegs.removeReg(ScratchExecCopy);
 
-      const unsigned OrSaveExec = ST.isWave32() ?
-        AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
+      const unsigned OrSaveExec =
+          ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
 
       BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), ScratchExecCopy)
         .addImm(-1);
     }
 
-    TII->loadRegFromStackSlot(MBB, MBBI, Reg.VGPR,
-                              Reg.FI.getValue(), &AMDGPU::VGPR_32RegClass,
-                              &TII->getRegisterInfo());
+    buildEpilogReload(LiveRegs, MBB, MBBI, TII, Reg.VGPR,
+                      FuncInfo->getScratchRSrcReg(),
+                      FuncInfo->getStackPtrOffsetReg(), Reg.FI.getValue());
   }
 
   if (ScratchExecCopy != AMDGPU::NoRegister) {
@@ -701,25 +897,12 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
     unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
     unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
     BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec)
-      .addReg(ScratchExecCopy);
-  }
-
-  const MachineFrameInfo &MFI = MF.getFrameInfo();
-  uint32_t NumBytes = MFI.getStackSize();
-  uint32_t RoundedSize = FuncInfo->isStackRealigned() ?
-    NumBytes + MFI.getMaxAlignment() : NumBytes;
-
-  if (RoundedSize != 0 && hasFP(MF)) {
-    const unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg();
-    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_SUB_U32), StackPtrReg)
-      .addReg(StackPtrReg)
-      .addImm(RoundedSize * ST.getWavefrontSize())
-      .setMIFlag(MachineInstr::FrameDestroy);
+      .addReg(ScratchExecCopy, RegState::Kill);
   }
 }
 
 // Note SGPRSpill stack IDs should only be used for SGPR spilling to VGPRs, not
-// memory.
+// memory. They should have been removed by now.
 static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) {
   for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
        I != E; ++I) {
@@ -730,6 +913,23 @@ static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) {
   return true;
 }
 
+
+#ifndef NDEBUG
+static bool allSGPRSpillsAreDead(const MachineFrameInfo &MFI,
+                                 Optional<int> FramePointerSaveIndex) {
+  for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
+       I != E; ++I) {
+    if (!MFI.isDeadObjectIndex(I) &&
+        MFI.getStackID(I) == TargetStackID::SGPRSpill &&
+        FramePointerSaveIndex && I != FramePointerSaveIndex) {
+      return false;
+    }
+  }
+
+  return true;
+}
+#endif
+
 int SIFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
                                             unsigned &FrameReg) const {
   const SIRegisterInfo *RI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
@@ -743,15 +943,12 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized(
   RegScavenger *RS) const {
   MachineFrameInfo &MFI = MF.getFrameInfo();
 
-  if (!MFI.hasStackObjects())
-    return;
-
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
-  const SIInstrInfo *TII = ST.getInstrInfo();
-  const SIRegisterInfo &TRI = TII->getRegisterInfo();
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
   SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
-  FuncInfo->removeSGPRToVGPRFrameIndices(MFI);
+
+  assert(allSGPRSpillsAreDead(MFI, None) &&
+         "SGPR spill should have been removed in SILowerSGPRSpills");
 
   // FIXME: The other checks should be redundant with allStackObjectsAreDead,
   // but currently hasNonSpillStackObjects is set only from source
@@ -761,12 +958,12 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized(
 
   if (FuncInfo->isEntryFunction()) {
     int ScavengeFI = MFI.CreateFixedObject(
-      TRI.getSpillSize(AMDGPU::SGPR_32RegClass), 0, false);
+      TRI->getSpillSize(AMDGPU::SGPR_32RegClass), 0, false);
     RS->addScavengingFrameIndex(ScavengeFI);
   } else {
     int ScavengeFI = MFI.CreateStackObject(
-      TRI.getSpillSize(AMDGPU::SGPR_32RegClass),
-      TRI.getSpillAlignment(AMDGPU::SGPR_32RegClass),
+      TRI->getSpillSize(AMDGPU::SGPR_32RegClass),
+      TRI->getSpillAlignment(AMDGPU::SGPR_32RegClass),
       false);
     RS->addScavengingFrameIndex(ScavengeFI);
   }
@@ -775,17 +972,76 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized(
 
 // Only report VGPRs to generic code.
 void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
-                                           BitVector &SavedRegs,
+                                           BitVector &SavedVGPRs,
                                            RegScavenger *RS) const {
-  TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
+  TargetFrameLowering::determineCalleeSaves(MF, SavedVGPRs, RS);
+
+  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
 
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   const SIRegisterInfo *TRI = ST.getRegisterInfo();
-  SavedRegs.clearBitsNotInMask(TRI->getAllVGPRRegMask());
 
-  // VGPRs used for SGPR spilling need to be specially inserted in the prolog.
-  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  // Ignore the SGPRs the default implementation found.
+  SavedVGPRs.clearBitsNotInMask(TRI->getAllVGPRRegMask());
+
+  // hasFP only knows about stack objects that already exist. We're now
+  // determining the stack slots that will be created, so we have to predict
+  // them. Stack objects force FP usage with calls.
+  //
+  // Note a new VGPR CSR may be introduced if one is used for the spill, but we
+  // don't want to report it here.
+  //
+  // FIXME: Is this really hasReservedCallFrame?
+  const bool WillHaveFP =
+      FrameInfo.hasCalls() &&
+      (SavedVGPRs.any() || !allStackObjectsAreDead(FrameInfo));
+
+  // VGPRs used for SGPR spilling need to be specially inserted in the prolog,
+  // so don't allow the default insertion to handle them.
   for (auto SSpill : MFI->getSGPRSpillVGPRs())
-    SavedRegs.reset(SSpill.VGPR);
+    SavedVGPRs.reset(SSpill.VGPR);
+
+  const bool HasFP = WillHaveFP || hasFP(MF);
+  if (!HasFP)
+    return;
+
+  if (MFI->haveFreeLanesForSGPRSpill(MF, 1)) {
+    int NewFI = MF.getFrameInfo().CreateStackObject(4, 4, true, nullptr,
+                                                    TargetStackID::SGPRSpill);
+
+    // If there is already a VGPR with free lanes, use it. We may already have
+    // to pay the penalty for spilling a CSR VGPR.
+    if (!MFI->allocateSGPRSpillToVGPR(MF, NewFI))
+      llvm_unreachable("allocate SGPR spill should have worked");
+
+    MFI->FramePointerSaveIndex = NewFI;
+
+    LLVM_DEBUG(
+      auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front();
+      dbgs() << "Spilling FP to " << printReg(Spill.VGPR, TRI)
+             << ':' << Spill.Lane << '\n');
+    return;
+  }
+
+  MFI->SGPRForFPSaveRestoreCopy = findUnusedSGPRNonCalleeSaved(MF.getRegInfo());
+
+  if (!MFI->SGPRForFPSaveRestoreCopy) {
+    // There's no free lane to spill, and no free register to save FP, so we're
+    // forced to spill another VGPR to use for the spill.
+    int NewFI = MF.getFrameInfo().CreateStackObject(4, 4, true, nullptr,
+                                                    TargetStackID::SGPRSpill);
+    if (!MFI->allocateSGPRSpillToVGPR(MF, NewFI))
+      llvm_unreachable("allocate SGPR spill should have worked");
+    MFI->FramePointerSaveIndex = NewFI;
+
+    LLVM_DEBUG(
+      auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front();
+      dbgs() << "FP requires fallback spill to " << printReg(Spill.VGPR, TRI)
+             << ':' << Spill.Lane << '\n';);
+  } else {
+    LLVM_DEBUG(dbgs() << "Saving FP with copy to " <<
+               printReg(MFI->SGPRForFPSaveRestoreCopy, TRI) << '\n');
+  }
 }
 
 void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF,
@@ -802,6 +1058,27 @@ void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF,
   SavedRegs.clearBitsInMask(TRI->getAllVGPRRegMask());
 }
 
+bool SIFrameLowering::assignCalleeSavedSpillSlots(
+    MachineFunction &MF, const TargetRegisterInfo *TRI,
+    std::vector<CalleeSavedInfo> &CSI) const {
+  if (CSI.empty())
+    return true; // Early exit if no callee saved registers are modified!
+
+  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+  if (!FuncInfo->SGPRForFPSaveRestoreCopy)
+    return false;
+
+  for (auto &CS : CSI) {
+    if (CS.getReg() == FuncInfo->getFrameOffsetReg()) {
+      if (FuncInfo->SGPRForFPSaveRestoreCopy != AMDGPU::NoRegister)
+        CS.setDstReg(FuncInfo->SGPRForFPSaveRestoreCopy);
+      break;
+    }
+  }
+
+  return false;
+}
+
 MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
   MachineFunction &MF,
   MachineBasicBlock &MBB,
@@ -841,6 +1118,9 @@ bool SIFrameLowering::hasFP(const MachineFunction &MF) const {
   if (MFI.hasCalls()) {
     // All offsets are unsigned, so need to be addressed in the same direction
     // as stack growth.
+
+    // FIXME: This function is pretty broken, since it can be called before the
+    // frame layout is determined or CSR spills are inserted.
     if (MFI.getStackSize() != 0)
       return true;
 
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
index 3e4260f9ed4..19543287148 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
@@ -38,6 +38,10 @@ public:
                             RegScavenger *RS = nullptr) const override;
   void determineCalleeSavesSGPR(MachineFunction &MF, BitVector &SavedRegs,
                                 RegScavenger *RS = nullptr) const;
+  bool
+  assignCalleeSavedSpillSlots(MachineFunction &MF,
+                              const TargetRegisterInfo *TRI,
+                              std::vector<CalleeSavedInfo> &CSI) const override;
 
   bool isSupportedStackID(TargetStackID::Value ID) const override;
 
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 2360eaced01..0be746cf2b8 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2694,8 +2694,6 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
   MachineFrameInfo &MFI = MF.getFrameInfo();
   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
 
-  SDValue CallerSavedFP;
-
   // Adjust the stack pointer for the new arguments...
   // These operations are automatically eliminated by the prolog/epilog pass
   if (!IsSibCall) {
@@ -2708,15 +2706,6 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
       = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
     RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
     CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
-
-    if (!Info->isEntryFunction()) {
-      // Avoid clobbering this function's FP value. In the current convention
-      // callee will overwrite this, so do save/restore around the call site.
-      CallerSavedFP = DAG.getCopyFromReg(Chain, DL,
-                                         Info->getFrameOffsetReg(), MVT::i32);
-      CopyFromChains.push_back(CallerSavedFP.getValue(1));
-    }
-
     Chain = DAG.getTokenFactor(DL, CopyFromChains);
   }
 
@@ -2905,12 +2894,6 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
   Chain = Call.getValue(0);
   InFlag = Call.getValue(1);
 
-  if (CallerSavedFP) {
-    SDValue FPReg = DAG.getRegister(Info->getFrameOffsetReg(), MVT::i32);
-    Chain = DAG.getCopyToReg(Chain, DL, FPReg, CallerSavedFP, InFlag);
-    InFlag = Chain.getValue(1);
-  }
-
   uint64_t CalleePopBytes = NumBytes;
   Chain = DAG.getCALLSEQ_END(Chain, DAG.getTargetConstant(0, DL, MVT::i32),
                              DAG.getTargetConstant(CalleePopBytes, DL, MVT::i32),
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 12787f2ce9a..8605932330e 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -957,8 +957,8 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
     // Add the scratch resource registers as implicit uses because we may end up
     // needing them, and need to ensure that the reserved registers are
    // correctly handled.
-
-    FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
+    if (RI.spillSGPRToVGPR())
+      FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
     if (ST.hasScalarStores()) {
       // m0 is used for offset to scalar stores if used to spill.
       Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine | RegState::Dead);
@@ -1052,7 +1052,8 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
       MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass);
     }
 
-    FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
+    if (RI.spillSGPRToVGPR())
+      FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
     MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc, DestReg)
       .addFrameIndex(FrameIndex) // addr
       .addMemOperand(MMO)
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index b73feadd521..3cbd4c3ae13 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -71,7 +71,9 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
     // required for scratch access.
     ScratchRSrcReg = AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3;
     ScratchWaveOffsetReg = AMDGPU::SGPR33;
-    FrameOffsetReg = AMDGPU::SGPR5;
+
+    // TODO: Pick a high register, and shift down, similar to a kernel.
+    FrameOffsetReg = AMDGPU::SGPR34;
     StackPtrOffsetReg = AMDGPU::SGPR32;
 
     ArgInfo.PrivateSegmentBuffer =
@@ -245,6 +247,17 @@ static bool isCalleeSavedReg(const MCPhysReg *CSRegs, MCPhysReg Reg) {
   return false;
 }
 
+/// \returns true if \p NumNeed slots are available in VGPRs already used for
+/// SGPR spilling.
+//
+// FIXME: This only works after processFunctionBeforeFrameFinalized
+bool SIMachineFunctionInfo::haveFreeLanesForSGPRSpill(const MachineFunction &MF,
+                                                      unsigned NumNeed) const {
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  unsigned WaveSize = ST.getWavefrontSize();
+  return NumVGPRSpillLanes + NumNeed <= WaveSize * SpillVGPRs.size();
+}
+
 /// Reserve a slice of a VGPR to support spilling for FrameIndex \p FI.
 bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF,
                                                     int FI) {
@@ -307,13 +320,18 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF,
 }
 
 void SIMachineFunctionInfo::removeSGPRToVGPRFrameIndices(MachineFrameInfo &MFI) {
-  for (auto &R : SGPRToVGPRSpills)
-    MFI.RemoveStackObject(R.first);
-  // All other SPGRs must be allocated on the default stack, so reset
-  // the stack ID.
-  for (unsigned i = MFI.getObjectIndexBegin(), e = MFI.getObjectIndexEnd();
-       i != e; ++i)
-    MFI.setStackID(i, 0);
+  // The FP spill hasn't been inserted yet, so keep it around.
+  for (auto &R : SGPRToVGPRSpills) {
+    if (R.first != FramePointerSaveIndex)
+      MFI.RemoveStackObject(R.first);
+  }
+
+  // All other SGPRs must be allocated on the default stack, so reset the stack
+  // ID.
+  for (int i = MFI.getObjectIndexBegin(), e = MFI.getObjectIndexEnd(); i != e;
+       ++i)
+    if (i != FramePointerSaveIndex)
+      MFI.setStackID(i, TargetStackID::Default);
 }
 
 MCPhysReg SIMachineFunctionInfo::getNextUserSGPR() const {
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 73393634b31..2cbca8930a6 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -429,6 +429,12 @@ private:
   unsigned NumVGPRSpillLanes = 0;
   SmallVector<SGPRSpillVGPRCSR, 2> SpillVGPRs;
 
+public: // FIXME
+  /// If this is set, an SGPR used to save/restore the register that holds the
+  /// frame pointer.
+  unsigned SGPRForFPSaveRestoreCopy = 0;
+  Optional<int> FramePointerSaveIndex;
+
 public:
   SIMachineFunctionInfo(const MachineFunction &MF);
 
@@ -448,6 +454,8 @@ public:
     return Mode;
   }
 
+  bool haveFreeLanesForSGPRSpill(const MachineFunction &MF,
+                                 unsigned NumLane) const;
   bool allocateSGPRSpillToVGPR(MachineFunction &MF, int FI);
   void removeSGPRToVGPRFrameIndices(MachineFrameInfo &MFI);
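The new buildPrologSpill/buildEpilogReload helpers choose between two MUBUF addressing forms: when the frame object's byte offset passes isUInt<12>, it fits the unsigned 12-bit immediate offset field and the _OFFSET variant is emitted directly; otherwise the offset is first materialized into a scratch VGPR with V_MOV_B32 and the _OFFEN variant is used. A minimal standalone sketch of that selection rule, with invented names (fitsImmediateOffset is not an LLVM API):

#include <cstdint>
#include <iostream>

// Models the isUInt<12>(Offset) check from the patch: MUBUF immediate
// offsets are unsigned 12-bit, so only 0..4095 can be encoded inline.
static bool fitsImmediateOffset(int64_t Offset) {
  return Offset >= 0 && Offset < (int64_t(1) << 12);
}

int main() {
  for (int64_t Offset : {0, 4095, 4096, 70000}) {
    std::cout << Offset << " -> "
              << (fitsImmediateOffset(Offset)
                      ? "BUFFER_STORE_DWORD_OFFSET (inline immediate)"
                      : "V_MOV_B32 + BUFFER_STORE_DWORD_OFFEN (VGPR offset)")
              << '\n';
  }
}

The register-offset fallback is why both helpers take LiveRegs: findScratchNonCalleeSaveRegister must find a VGPR that is dead at the spill point to hold the materialized offset.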

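determineCalleeSaves now picks among three FP save strategies, in decreasing order of preference: reuse a free lane of a VGPR already reserved for SGPR spills, copy the FP into a free non-callee-saved SGPR found by findUnusedSGPRNonCalleeSaved, or, as a last resort, create a new SGPRSpill stack object even though that forces one more CSR VGPR to be spilled. A simplified standalone model of that decision, with the lane accounting mirroring haveFreeLanesForSGPRSpill (all names here are illustrative, not the actual SIMachineFunctionInfo interface):

#include <iostream>

struct FPSavePlan {
  bool UseExistingLane = false; // writelane into an already-reserved VGPR
  bool UseSGPRCopy = false;     // plain COPY to a free non-CSR SGPR
  bool UseNewVGPRLane = false;  // fallback: reserve (and spill) a new VGPR
};

// Mirrors haveFreeLanesForSGPRSpill: each spill VGPR provides WaveSize lanes.
static bool haveFreeLanes(unsigned UsedLanes, unsigned NumSpillVGPRs,
                          unsigned WaveSize, unsigned NumNeed) {
  return UsedLanes + NumNeed <= WaveSize * NumSpillVGPRs;
}

static FPSavePlan planFPSave(unsigned UsedLanes, unsigned NumSpillVGPRs,
                             unsigned WaveSize, bool HaveFreeSGPR) {
  FPSavePlan P;
  if (haveFreeLanes(UsedLanes, NumSpillVGPRs, WaveSize, 1))
    P.UseExistingLane = true; // cheapest: the CSR VGPR is already paid for
  else if (HaveFreeSGPR)
    P.UseSGPRCopy = true;     // next: SGPR-to-SGPR copy, no memory traffic
  else
    P.UseNewVGPRLane = true;  // worst case: one extra CSR VGPR spill
  return P;
}

int main() {
  // One spill VGPR with 63 of 64 lanes used still leaves room for the FP.
  FPSavePlan P = planFPSave(63, 1, 64, /*HaveFreeSGPR=*/true);
  std::cout << P.UseExistingLane << P.UseSGPRCopy << P.UseNewVGPRLane << '\n';
}

This ordering also explains the assignCalleeSavedSpillSlots override: when the SGPR-copy strategy is chosen, generic CSR insertion must be redirected to save the FP into SGPRForFPSaveRestoreCopy rather than into a stack slot.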

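Both the prologue spills and the epilogue reloads sit inside the same exec-mask bracket: S_OR_SAVEEXEC_B32/B64 copies the entry mask into a scratch SGPR while turning on all lanes, so VGPR values belonging to lanes that were inactive at function entry are saved and restored too, and S_MOV_B32/B64 puts the mask back afterwards. A hedged sketch of just the wave32/wave64 opcode selection that ST.isWave32() drives, printing the emitted bracket as text rather than building real MachineInstrs (the s_scratch name is made up):

#include <iostream>
#include <string>

struct ExecOps {
  std::string OrSaveExec; // save exec, then OR it with an immediate
  std::string ExecMov;    // plain move used to restore exec
  std::string ExecReg;    // exec register name for this wave size
};

static ExecOps selectExecOps(bool IsWave32) {
  if (IsWave32)
    return {"s_or_saveexec_b32", "s_mov_b32", "exec_lo"};
  return {"s_or_saveexec_b64", "s_mov_b64", "exec"};
}

int main() {
  for (bool Wave32 : {false, true}) {
    ExecOps Ops = selectExecOps(Wave32);
    std::cout << Ops.OrSaveExec << " s_scratch, -1   ; enable all lanes\n"
              << "    ... buildPrologSpill / buildEpilogReload ...\n"
              << Ops.ExecMov << ' ' << Ops.ExecReg
              << ", s_scratch        ; restore entry mask\n\n";
  }
}

The "FIXME: Split block and make terminator" comments presumably note that rewriting exec mid-block changes which lanes execute the remaining instructions, which is tolerable here only because the bracketed spill code is uniform across lanes.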