diff options
author | Matt Arsenault <Matthew.Arsenault@amd.com> | 2019-06-05 22:20:47 +0000 |
---|---|---|
committer | Matt Arsenault <Matthew.Arsenault@amd.com> | 2019-06-05 22:20:47 +0000 |
commit | b812b7a45ed159fcc4b1b26f9200885d93b68fc5 (patch) | |
tree | 24ec293bca8be1a5b15f1e58c37f2615b2e667ab /llvm/lib | |
parent | 8f500a6f9ca0e56e41513435d0257c84ddabb566 (diff) | |
download | bcm5719-llvm-b812b7a45ed159fcc4b1b26f9200885d93b68fc5.tar.gz bcm5719-llvm-b812b7a45ed159fcc4b1b26f9200885d93b68fc5.zip |
AMDGPU: Invert frame index offset interpretation
Since the beginning, the offset of a frame index has been consistently
interpreted backwards. It was treated as an offset from the
scratch wave offset register, used as the frame register. The correct
interpretation is the offset from the SP on entry to the function,
before the prolog. Frame index elimination then should select either
SP or another register as an FP.
Treat the scratch wave offset on kernel entry as the pre-incremented
SP. Rely more heavily on the standard hasFP and frame pointer
elimination logic, and clean up the private reservation code. This
saves a copy in most callee functions.
The kernel prolog emission code is still kind of a mess relying on
checking the uses of physical registers, which I would prefer to
eliminate.
Currently selection directly emits MUBUF instructions, which require
using a reference to some register. Use the register chosen for SP,
and then ignore this later. This should probably be cleaned up to use
pseudos that don't refer to any specific base register until frame
index elimination.
Add a workaround for shaders using large numbers of SGPRs. I'm not
sure these cases were ever working correctly, since as far as I can
tell the logic for figuring out which SGPR is the scratch wave offset
doesn't match up with the shader input initialization in the shader
programming guide.
llvm-svn: 362661
Diffstat (limited to 'llvm/lib')
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 8 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp | 5 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIFrameLowering.cpp | 175 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIFrameLowering.h | 10 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 142 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 14 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h | 10 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 61 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIRegisterInfo.h | 2 |
9 files changed, 218 insertions, 209 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 59a27ab1401..2c104758047 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -1361,10 +1361,10 @@ std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const SDValue TFI = CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0)); - // If we can resolve this to a frame index access, this is relative to the - // frame pointer SGPR. - return std::make_pair(TFI, CurDAG->getRegister(Info->getFrameOffsetReg(), - MVT::i32)); + // If we can resolve this to a frame index access, this will be relative to + // either the stack or frame pointer SGPR. + return std::make_pair( + TFI, CurDAG->getRegister(Info->getStackPtrOffsetReg(), MVT::i32)); } // If we don't know this private access is a local stack object, it needs to diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp index 60f23f7d778..140ca6e33fe 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp @@ -83,6 +83,9 @@ const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF, } unsigned SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const { + const SIFrameLowering *TFI = + MF.getSubtarget<GCNSubtarget>().getFrameLowering(); const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); - return FuncInfo->getFrameOffsetReg(); + return TFI->hasFP(MF) ? 
FuncInfo->getFrameOffsetReg() + : FuncInfo->getStackPtrOffsetReg(); } diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index 4b2124b14c0..7f3150bdd01 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -164,34 +164,29 @@ unsigned SIFrameLowering::getReservedPrivateSegmentBufferReg( return ScratchRsrcReg; } -// Shift down registers reserved for the scratch wave offset and stack pointer -// SGPRs. -std::pair<unsigned, unsigned> -SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg( - const GCNSubtarget &ST, - const SIInstrInfo *TII, - const SIRegisterInfo *TRI, - SIMachineFunctionInfo *MFI, - MachineFunction &MF) const { +// Shift down registers reserved for the scratch wave offset. +unsigned SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg( + const GCNSubtarget &ST, const SIInstrInfo *TII, const SIRegisterInfo *TRI, + SIMachineFunctionInfo *MFI, MachineFunction &MF) const { MachineRegisterInfo &MRI = MF.getRegInfo(); unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg(); + assert(MFI->isEntryFunction()); + // No replacement necessary. 
if (ScratchWaveOffsetReg == AMDGPU::NoRegister || - !MRI.isPhysRegUsed(ScratchWaveOffsetReg)) { - assert(MFI->getStackPtrOffsetReg() == AMDGPU::SP_REG); - return std::make_pair(AMDGPU::NoRegister, AMDGPU::NoRegister); + (!hasFP(MF) && !MRI.isPhysRegUsed(ScratchWaveOffsetReg))) { + return AMDGPU::NoRegister; } - unsigned SPReg = MFI->getStackPtrOffsetReg(); if (ST.hasSGPRInitBug()) - return std::make_pair(ScratchWaveOffsetReg, SPReg); + return ScratchWaveOffsetReg; unsigned NumPreloaded = MFI->getNumPreloadedSGPRs(); ArrayRef<MCPhysReg> AllSGPRs = getAllSGPRs(ST, MF); if (NumPreloaded > AllSGPRs.size()) - return std::make_pair(ScratchWaveOffsetReg, SPReg); + return ScratchWaveOffsetReg; AllSGPRs = AllSGPRs.slice(NumPreloaded); @@ -212,7 +207,7 @@ SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg( unsigned ReservedRegCount = 13; if (AllSGPRs.size() < ReservedRegCount) - return std::make_pair(ScratchWaveOffsetReg, SPReg); + return ScratchWaveOffsetReg; bool HandledScratchWaveOffsetReg = ScratchWaveOffsetReg != TRI->reservedPrivateSegmentWaveByteOffsetReg(MF); @@ -225,14 +220,20 @@ SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg( HandledScratchWaveOffsetReg = true; MRI.replaceRegWith(ScratchWaveOffsetReg, Reg); + if (MFI->getScratchWaveOffsetReg() == MFI->getStackPtrOffsetReg()) { + assert(!hasFP(MF)); + MFI->setStackPtrOffsetReg(Reg); + } + MFI->setScratchWaveOffsetReg(Reg); + MFI->setFrameOffsetReg(Reg); ScratchWaveOffsetReg = Reg; break; } } } - return std::make_pair(ScratchWaveOffsetReg, SPReg); + return ScratchWaveOffsetReg; } void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, @@ -265,38 +266,11 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, if (MFI->hasFlatScratchInit()) emitFlatScratchInit(ST, MF, MBB); - unsigned SPReg = MFI->getStackPtrOffsetReg(); - if (SPReg != AMDGPU::SP_REG) { - assert(MRI.isReserved(SPReg) && "SPReg used but not reserved"); - - DebugLoc DL; - const MachineFrameInfo 
&FrameInfo = MF.getFrameInfo(); - int64_t StackSize = FrameInfo.getStackSize(); - - if (StackSize == 0) { - BuildMI(MBB, MBB.begin(), DL, TII->get(AMDGPU::COPY), SPReg) - .addReg(MFI->getScratchWaveOffsetReg()); - } else { - BuildMI(MBB, MBB.begin(), DL, TII->get(AMDGPU::S_ADD_U32), SPReg) - .addReg(MFI->getScratchWaveOffsetReg()) - .addImm(StackSize * ST.getWavefrontSize()); - } - } - unsigned ScratchRsrcReg = getReservedPrivateSegmentBufferReg(ST, TII, TRI, MFI, MF); - unsigned ScratchWaveOffsetReg; - std::tie(ScratchWaveOffsetReg, SPReg) - = getReservedPrivateSegmentWaveByteOffsetReg(ST, TII, TRI, MFI, MF); - - // It's possible to have uses of only ScratchWaveOffsetReg without - // ScratchRsrcReg if it's only used for the initialization of flat_scratch, - // but the inverse is not true. - if (ScratchWaveOffsetReg == AMDGPU::NoRegister) { - assert(ScratchRsrcReg == AMDGPU::NoRegister); - return; - } + unsigned ScratchWaveOffsetReg = + getReservedPrivateSegmentWaveByteOffsetReg(ST, TII, TRI, MFI, MF); // We need to insert initialization of the scratch resource descriptor. unsigned PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg( @@ -308,18 +282,19 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER); } - bool OffsetRegUsed = MRI.isPhysRegUsed(ScratchWaveOffsetReg); + bool OffsetRegUsed = ScratchWaveOffsetReg != AMDGPU::NoRegister && + MRI.isPhysRegUsed(ScratchWaveOffsetReg); bool ResourceRegUsed = ScratchRsrcReg != AMDGPU::NoRegister && MRI.isPhysRegUsed(ScratchRsrcReg); + // FIXME: Hack to not crash in situations which emitted an error. + if (PreloadedScratchWaveOffsetReg == AMDGPU::NoRegister) + return; + // We added live-ins during argument lowering, but since they were not used // they were deleted. We're adding the uses now, so add them back. 
- if (OffsetRegUsed) { - assert(PreloadedScratchWaveOffsetReg != AMDGPU::NoRegister && - "scratch wave offset input is required"); - MRI.addLiveIn(PreloadedScratchWaveOffsetReg); - MBB.addLiveIn(PreloadedScratchWaveOffsetReg); - } + MRI.addLiveIn(PreloadedScratchWaveOffsetReg); + MBB.addLiveIn(PreloadedScratchWaveOffsetReg); if (ResourceRegUsed && PreloadedPrivateBufferReg != AMDGPU::NoRegister) { assert(ST.isAmdHsaOrMesa(F) || ST.isMesaGfxShader(F)); @@ -360,11 +335,16 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, .addReg(PreloadedPrivateBufferReg, RegState::Kill); } - if (OffsetRegUsed && - PreloadedScratchWaveOffsetReg != ScratchWaveOffsetReg) { + unsigned SPReg = MFI->getStackPtrOffsetReg(); + assert(SPReg != AMDGPU::SP_REG); + + // FIXME: Remove the isPhysRegUsed checks + const bool HasFP = hasFP(MF); + + if (HasFP || OffsetRegUsed) { + assert(ScratchWaveOffsetReg); BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg) - .addReg(PreloadedScratchWaveOffsetReg, - MRI.isPhysRegUsed(ScratchWaveOffsetReg) ? 0 : RegState::Kill); + .addReg(PreloadedScratchWaveOffsetReg, HasFP ? RegState::Kill : 0); } if (CopyBuffer && !CopyBufferFirst) { @@ -372,9 +352,26 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, .addReg(PreloadedPrivateBufferReg, RegState::Kill); } - if (ResourceRegUsed) + if (ResourceRegUsed) { emitEntryFunctionScratchSetup(ST, MF, MBB, MFI, I, PreloadedPrivateBufferReg, ScratchRsrcReg); + } + + if (HasFP) { + DebugLoc DL; + const MachineFrameInfo &FrameInfo = MF.getFrameInfo(); + int64_t StackSize = FrameInfo.getStackSize(); + + // On kernel entry, the private scratch wave offset is the SP value. 
+ if (StackSize == 0) { + BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), SPReg) + .addReg(MFI->getScratchWaveOffsetReg()); + } else { + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), SPReg) + .addReg(MFI->getScratchWaveOffsetReg()) + .addImm(StackSize * ST.getWavefrontSize()); + } + } } // Emit scratch setup code for AMDPAL or Mesa, assuming ResourceRegUsed is set. @@ -567,15 +564,12 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, MachineBasicBlock::iterator MBBI = MBB.begin(); DebugLoc DL; - // XXX - Is this the right predicate? - - bool NeedFP = hasFP(MF); + bool HasFP = false; uint32_t NumBytes = MFI.getStackSize(); uint32_t RoundedSize = NumBytes; - const bool NeedsRealignment = TRI.needsStackRealignment(MF); - if (NeedsRealignment) { - assert(NeedFP); + if (TRI.needsStackRealignment(MF)) { + HasFP = true; const unsigned Alignment = MFI.getMaxAlignment(); RoundedSize += Alignment; @@ -599,7 +593,7 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, .addImm(-Alignment * ST.getWavefrontSize()) .setMIFlag(MachineInstr::FrameSetup); FuncInfo->setIsStackRealigned(true); - } else if (NeedFP) { + } else if ((HasFP = hasFP(MF))) { // If we need a base pointer, set it up here. It's whatever the value of // the stack pointer is at this point. 
Any variable size objects will be // allocated after this, so we can still use the base pointer to reference @@ -609,7 +603,7 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, .setMIFlag(MachineInstr::FrameSetup); } - if (RoundedSize != 0 && hasSP(MF)) { + if (HasFP && RoundedSize != 0) { BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), StackPtrReg) .addReg(StackPtrReg) .addImm(RoundedSize * ST.getWavefrontSize()) @@ -693,23 +687,17 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF, .addReg(ScratchExecCopy); } - unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg(); - if (StackPtrReg == AMDGPU::NoRegister) - return; - - const MachineFrameInfo &MFI = MF.getFrameInfo(); - uint32_t NumBytes = MFI.getStackSize(); - - // FIXME: Clarify distinction between no set SP and SP. For callee functions, - // it's really whether we need SP to be accurate or not. - - if (NumBytes != 0 && hasSP(MF)) { + if (hasFP(MF)) { + const MachineFrameInfo &MFI = MF.getFrameInfo(); + uint32_t NumBytes = MFI.getStackSize(); uint32_t RoundedSize = FuncInfo->isStackRealigned() ? NumBytes + MFI.getMaxAlignment() : NumBytes; + const unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg(); BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_SUB_U32), StackPtrReg) .addReg(StackPtrReg) - .addImm(RoundedSize * ST.getWavefrontSize()); + .addImm(RoundedSize * ST.getWavefrontSize()) + .setMIFlag(MachineInstr::FrameDestroy); } } @@ -849,18 +837,25 @@ MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr( } bool SIFrameLowering::hasFP(const MachineFunction &MF) const { - // All stack operations are relative to the frame offset SGPR. - // TODO: Still want to eliminate sometimes. const MachineFrameInfo &MFI = MF.getFrameInfo(); + if (MFI.hasCalls()) { + // All offsets are unsigned, so need to be addressed in the same direction + // as stack growth. 
+ if (MFI.getStackSize() != 0) + return true; + + // For the entry point, the input wave scratch offset must be copied to the + // API SP if there are calls. + if (MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) + return true; + + // Retain behavior of always omitting the FP for leaf functions when + // possible. + if (MF.getTarget().Options.DisableFramePointerElim(MF)) + return true; + } - // XXX - Is this only called after frame is finalized? Should be able to check - // frame size. - return MFI.hasStackObjects() && !allStackObjectsAreDead(MFI); -} - -bool SIFrameLowering::hasSP(const MachineFunction &MF) const { - const SIRegisterInfo *TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo(); - // All stack operations are relative to the frame offset SGPR. - const MachineFrameInfo &MFI = MF.getFrameInfo(); - return MFI.hasCalls() || MFI.hasVarSizedObjects() || TRI->needsStackRealignment(MF); + return MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken() || + MFI.hasStackMap() || MFI.hasPatchPoint() || + MF.getSubtarget<GCNSubtarget>().getRegisterInfo()->needsStackRealignment(MF); } diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/llvm/lib/Target/AMDGPU/SIFrameLowering.h index c5b707cba06..a9e765aa36e 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.h +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.h @@ -58,12 +58,9 @@ private: SIMachineFunctionInfo *MFI, MachineFunction &MF) const; - std::pair<unsigned, unsigned> getReservedPrivateSegmentWaveByteOffsetReg( - const GCNSubtarget &ST, - const SIInstrInfo *TII, - const SIRegisterInfo *TRI, - SIMachineFunctionInfo *MFI, - MachineFunction &MF) const; + unsigned getReservedPrivateSegmentWaveByteOffsetReg( + const GCNSubtarget &ST, const SIInstrInfo *TII, const SIRegisterInfo *TRI, + SIMachineFunctionInfo *MFI, MachineFunction &MF) const; // Emit scratch setup code for AMDPAL or Mesa, assuming ResourceRegUsed is set. 
void emitEntryFunctionScratchSetup(const GCNSubtarget &ST, MachineFunction &MF, @@ -73,7 +70,6 @@ private: public: bool hasFP(const MachineFunction &MF) const override; - bool hasSP(const MachineFunction &MF) const; }; } // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 1ca11da247e..8a08bc463da 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1770,6 +1770,7 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM, // should reserve the arguments and use them directly. MachineFrameInfo &MFI = MF.getFrameInfo(); bool HasStackObjects = MFI.hasStackObjects(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); // Record that we know we have non-spill stack objects so we don't need to // check all stack objects later. @@ -1785,65 +1786,85 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM, // the scratch registers to pass in. bool RequiresStackAccess = HasStackObjects || MFI.hasCalls(); - const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); - if (ST.isAmdHsaOrMesa(MF.getFunction())) { - if (RequiresStackAccess) { - // If we have stack objects, we unquestionably need the private buffer - // resource. For the Code Object V2 ABI, this will be the first 4 user - // SGPR inputs. We can reserve those and use them directly. - - unsigned PrivateSegmentBufferReg = Info.getPreloadedReg( - AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER); - Info.setScratchRSrcReg(PrivateSegmentBufferReg); - - if (MFI.hasCalls()) { - // If we have calls, we need to keep the frame register in a register - // that won't be clobbered by a call, so ensure it is copied somewhere. - - // This is not a problem for the scratch wave offset, because the same - // registers are reserved in all functions. - - // FIXME: Nothing is really ensuring this is a call preserved register, - // it's just selected from the end so it happens to be. 
- unsigned ReservedOffsetReg - = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF); - Info.setScratchWaveOffsetReg(ReservedOffsetReg); - } else { - unsigned PrivateSegmentWaveByteOffsetReg = Info.getPreloadedReg( - AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); - Info.setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg); - } - } else { - unsigned ReservedBufferReg - = TRI.reservedPrivateSegmentBufferReg(MF); - unsigned ReservedOffsetReg - = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF); - - // We tentatively reserve the last registers (skipping the last two - // which may contain VCC). After register allocation, we'll replace - // these with the ones immediately after those which were really - // allocated. In the prologue copies will be inserted from the argument - // to these reserved registers. - Info.setScratchRSrcReg(ReservedBufferReg); - Info.setScratchWaveOffsetReg(ReservedOffsetReg); - } + if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) { + // If we have stack objects, we unquestionably need the private buffer + // resource. For the Code Object V2 ABI, this will be the first 4 user + // SGPR inputs. We can reserve those and use them directly. + + unsigned PrivateSegmentBufferReg = + Info.getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER); + Info.setScratchRSrcReg(PrivateSegmentBufferReg); } else { unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF); + // We tentatively reserve the last registers (skipping the last registers + // which may contain VCC, FLAT_SCR, and XNACK). After register allocation, + // we'll replace these with the ones immediately after those which were + // really allocated. In the prologue copies will be inserted from the + // argument to these reserved registers. // Without HSA, relocations are used for the scratch pointer and the // buffer resource setup is always inserted in the prologue. Scratch wave // offset is still in an input SGPR. 
Info.setScratchRSrcReg(ReservedBufferReg); + } - if (HasStackObjects && !MFI.hasCalls()) { - unsigned ScratchWaveOffsetReg = Info.getPreloadedReg( - AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); - Info.setScratchWaveOffsetReg(ScratchWaveOffsetReg); + // This should be accurate for kernels even before the frame is finalized. + const bool HasFP = ST.getFrameLowering()->hasFP(MF); + if (HasFP) { + unsigned ReservedOffsetReg = + TRI.reservedPrivateSegmentWaveByteOffsetReg(MF); + MachineRegisterInfo &MRI = MF.getRegInfo(); + + // Try to use s32 as the SP, but move it if it would interfere with input + // arguments. This won't work with calls though. + // + // FIXME: Move SP to avoid any possible inputs, or find a way to spill input + // registers. + if (!MRI.isLiveIn(AMDGPU::SGPR32)) { + Info.setStackPtrOffsetReg(AMDGPU::SGPR32); } else { - unsigned ReservedOffsetReg - = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF); - Info.setScratchWaveOffsetReg(ReservedOffsetReg); + assert(AMDGPU::isShader(MF.getFunction().getCallingConv())); + + if (MFI.hasCalls()) + report_fatal_error("call in graphics shader with too many input SGPRs"); + + for (unsigned Reg : AMDGPU::SGPR_32RegClass) { + if (!MRI.isLiveIn(Reg)) { + Info.setStackPtrOffsetReg(Reg); + break; + } + } + + if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG) + report_fatal_error("failed to find register for SP"); } + + Info.setScratchWaveOffsetReg(ReservedOffsetReg); + Info.setFrameOffsetReg(ReservedOffsetReg); + } else if (RequiresStackAccess) { + assert(!MFI.hasCalls()); + // We know there are accesses and they will be done relative to SP, so just + // pin it to the input. + // + // FIXME: Should not do this if inline asm is reading/writing these + // registers. 
+ unsigned PreloadedSP = Info.getPreloadedReg( + AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); + + Info.setStackPtrOffsetReg(PreloadedSP); + Info.setScratchWaveOffsetReg(PreloadedSP); + Info.setFrameOffsetReg(PreloadedSP); + } else { + assert(!MFI.hasCalls()); + + // There may not be stack access at all. There may still be spills, or + // access of a constant pointer (in which cases an extra copy will be + // emitted in the prolog). + unsigned ReservedOffsetReg + = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF); + Info.setStackPtrOffsetReg(ReservedOffsetReg); + Info.setScratchWaveOffsetReg(ReservedOffsetReg); + Info.setFrameOffsetReg(ReservedOffsetReg); } } @@ -9939,7 +9960,6 @@ SITargetLowering::getConstraintType(StringRef Constraint) const { void SITargetLowering::finalizeLowering(MachineFunction &MF) const { MachineRegisterInfo &MRI = MF.getRegInfo(); SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); - const MachineFrameInfo &MFI = MF.getFrameInfo(); const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); if (Info->isEntryFunction()) { @@ -9947,24 +9967,10 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const { reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info); } - // We have to assume the SP is needed in case there are calls in the function - // during lowering. Calls are only detected after the function is - // lowered. We're about to reserve registers, so don't bother using it if we - // aren't really going to use it. 
- bool NeedSP = !Info->isEntryFunction() || - MFI.hasVarSizedObjects() || - MFI.hasCalls(); - - if (NeedSP) { - unsigned ReservedStackPtrOffsetReg = TRI->reservedStackPtrOffsetReg(MF); - Info->setStackPtrOffsetReg(ReservedStackPtrOffsetReg); - - assert(Info->getStackPtrOffsetReg() != Info->getFrameOffsetReg()); - assert(!TRI->isSubRegister(Info->getScratchRSrcReg(), - Info->getStackPtrOffsetReg())); - if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG) - MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg()); - } + assert(!TRI->isSubRegister(Info->getScratchRSrcReg(), + Info->getStackPtrOffsetReg())); + if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG) + MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg()); // We need to worry about replacing the default register with itself in case // of MIR testcases missing the MFI. diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 1c3c52ba02c..48257b01b86 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -928,7 +928,7 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, .addFrameIndex(FrameIndex) // addr .addMemOperand(MMO) .addReg(MFI->getScratchRSrcReg(), RegState::Implicit) - .addReg(MFI->getFrameOffsetReg(), RegState::Implicit); + .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit); // Add the scratch resource registers as implicit uses because we may end up // needing them, and need to ensure that the reserved registers are // correctly handled. 
@@ -950,7 +950,7 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, .addReg(SrcReg, getKillRegState(isKill)) // data .addFrameIndex(FrameIndex) // addr .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc - .addReg(MFI->getFrameOffsetReg()) // scratch_offset + .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset .addImm(0) // offset .addMemOperand(MMO); } @@ -1032,7 +1032,7 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, .addFrameIndex(FrameIndex) // addr .addMemOperand(MMO) .addReg(MFI->getScratchRSrcReg(), RegState::Implicit) - .addReg(MFI->getFrameOffsetReg(), RegState::Implicit); + .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit); if (ST.hasScalarStores()) { // m0 is used for offset to scalar stores if used to spill. @@ -1046,10 +1046,10 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, unsigned Opcode = getVGPRSpillRestoreOpcode(SpillSize); BuildMI(MBB, MI, DL, get(Opcode), DestReg) - .addFrameIndex(FrameIndex) // vaddr - .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc - .addReg(MFI->getFrameOffsetReg()) // scratch_offset - .addImm(0) // offset + .addFrameIndex(FrameIndex) // vaddr + .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc + .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset + .addImm(0) // offset .addMemOperand(MMO); } diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h index 7e09f41aa8d..bfe6182a7c1 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -444,7 +444,8 @@ public: } unsigned getPreloadedReg(AMDGPUFunctionArgInfo::PreloadedValue Value) const { - return ArgInfo.getPreloadedValue(Value).first->getRegister(); + auto Arg = ArgInfo.getPreloadedValue(Value).first; + return Arg ? 
Arg->getRegister() : 0; } unsigned getGITPtrHigh() const { @@ -486,6 +487,11 @@ public: return FrameOffsetReg; } + void setFrameOffsetReg(unsigned Reg) { + assert(Reg != 0 && "Should never be unset"); + FrameOffsetReg = Reg; + } + void setStackPtrOffsetReg(unsigned Reg) { assert(Reg != 0 && "Should never be unset"); StackPtrOffsetReg = Reg; @@ -502,8 +508,6 @@ public: void setScratchWaveOffsetReg(unsigned Reg) { assert(Reg != 0 && "Should never be unset"); ScratchWaveOffsetReg = Reg; - if (isEntryFunction()) - FrameOffsetReg = ScratchWaveOffsetReg; } unsigned getQueuePtrUserSGPR() const { diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 2e96b986667..520d5198c5f 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -138,11 +138,6 @@ unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg( return AMDGPU::SGPR_32RegClass.getRegister(Reg); } -unsigned SIRegisterInfo::reservedStackPtrOffsetReg( - const MachineFunction &MF) const { - return AMDGPU::SGPR32; -} - BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); @@ -718,6 +713,8 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, if (SpillToSMEM && OnlyToVGPR) return false; + unsigned FrameReg = getFrameRegister(*MF); + assert(SpillToVGPR || (SuperReg != MFI->getStackPtrOffsetReg() && SuperReg != MFI->getFrameOffsetReg() && SuperReg != MFI->getScratchWaveOffsetReg())); @@ -777,11 +774,11 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int64_t Offset = (ST.getWavefrontSize() * FrOffset) + (EltSize * i); if (Offset != 0) { BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg) - .addReg(MFI->getFrameOffsetReg()) + .addReg(FrameReg) .addImm(Offset); } else { BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg) - .addReg(MFI->getFrameOffsetReg()); + .addReg(FrameReg); } BuildMI(*MBB, MI, DL, 
TII->get(ScalarStoreOp)) @@ -849,11 +846,11 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore, EltSize, MinAlign(Align, EltSize * i)); BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_SAVE)) - .addReg(TmpReg, RegState::Kill) // src - .addFrameIndex(Index) // vaddr - .addReg(MFI->getScratchRSrcReg()) // srrsrc - .addReg(MFI->getFrameOffsetReg()) // soffset - .addImm(i * 4) // offset + .addReg(TmpReg, RegState::Kill) // src + .addFrameIndex(Index) // vaddr + .addReg(MFI->getScratchRSrcReg()) // srrsrc + .addReg(MFI->getStackPtrOffsetReg()) // soffset + .addImm(i * 4) // offset .addMemOperand(MMO); } } @@ -909,6 +906,8 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, unsigned EltSize = 4; unsigned ScalarLoadOp; + unsigned FrameReg = getFrameRegister(*MF); + const TargetRegisterClass *RC = getPhysRegClass(SuperReg); if (SpillToSMEM && isSGPRClass(RC)) { // XXX - if private_element_size is larger than 4 it might be useful to be @@ -940,11 +939,11 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, int64_t Offset = (ST.getWavefrontSize() * FrOffset) + (EltSize * i); if (Offset != 0) { BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg) - .addReg(MFI->getFrameOffsetReg()) + .addReg(FrameReg) .addImm(Offset); } else { BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg) - .addReg(MFI->getFrameOffsetReg()); + .addReg(FrameReg); } auto MIB = @@ -988,10 +987,10 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, MinAlign(Align, EltSize * i)); BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_RESTORE), TmpReg) - .addFrameIndex(Index) // vaddr - .addReg(MFI->getScratchRSrcReg()) // srsrc - .addReg(MFI->getFrameOffsetReg()) // soffset - .addImm(i * 4) // offset + .addFrameIndex(Index) // vaddr + .addReg(MFI->getScratchRSrcReg()) // srsrc + .addReg(MFI->getStackPtrOffsetReg()) // soffset + .addImm(i * 4) // offset 
.addMemOperand(MMO); auto MIB = @@ -1056,6 +1055,8 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, MachineOperand &FIOp = MI->getOperand(FIOperandNum); int Index = MI->getOperand(FIOperandNum).getIndex(); + unsigned FrameReg = getFrameRegister(*MF); + switch (MI->getOpcode()) { // SGPR register spill case AMDGPU::SI_SPILL_S512_SAVE: @@ -1091,11 +1092,14 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, case AMDGPU::SI_SPILL_V32_SAVE: { const MachineOperand *VData = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata); + assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() == + MFI->getStackPtrOffsetReg()); + buildSpillLoadStore(MI, AMDGPU::BUFFER_STORE_DWORD_OFFSET, Index, VData->getReg(), VData->isKill(), TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)->getReg(), - TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg(), + FrameReg, TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), *MI->memoperands_begin(), RS); @@ -1112,12 +1116,14 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, case AMDGPU::SI_SPILL_V512_RESTORE: { const MachineOperand *VData = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata); + assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() == + MFI->getStackPtrOffsetReg()); buildSpillLoadStore(MI, AMDGPU::BUFFER_LOAD_DWORD_OFFSET, Index, VData->getReg(), VData->isKill(), TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)->getReg(), - TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg(), + FrameReg, TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), *MI->memoperands_begin(), RS); @@ -1129,13 +1135,12 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, const DebugLoc &DL = MI->getDebugLoc(); bool IsMUBUF = TII->isMUBUF(*MI); - if (!IsMUBUF && - MFI->getFrameOffsetReg() != MFI->getScratchWaveOffsetReg()) { + if (!IsMUBUF && !MFI->isEntryFunction()) { // Convert to an absolute stack address by 
finding the offset from the // scratch wave base and scaling by the wave size. // - // In an entry function/kernel the stack address is already the - // absolute address relative to the scratch wave offset. + // In an entry function/kernel the offset is already the absolute + // address relative to the frame register. unsigned DiffReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); @@ -1146,7 +1151,7 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), DiffReg) - .addReg(MFI->getFrameOffsetReg()) + .addReg(FrameReg) .addReg(MFI->getScratchWaveOffsetReg()); int64_t Offset = FrameInfo.getObjectOffset(Index); @@ -1196,8 +1201,10 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::vaddr)); - assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() - == MFI->getFrameOffsetReg()); + assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() == + MFI->getStackPtrOffsetReg()); + + TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->setReg(FrameReg); int64_t Offset = FrameInfo.getObjectOffset(Index); int64_t OldImm diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h index de10e92c965..9780824683b 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -56,8 +56,6 @@ public: unsigned reservedPrivateSegmentWaveByteOffsetReg( const MachineFunction &MF) const; - unsigned reservedStackPtrOffsetReg(const MachineFunction &MF) const; - BitVector getReservedRegs(const MachineFunction &MF) const override; const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override; |