author     Matt Arsenault <Matthew.Arsenault@amd.com>    2018-03-29 21:30:06 +0000
committer  Matt Arsenault <Matthew.Arsenault@amd.com>    2018-03-29 21:30:06 +0000
commit     03ae399d50890edd031f8d889a10fa36cee8d101 (patch)
tree       0c04c7a253603c951701b47d08099ebfd07f9a42 /llvm/lib
parent     50635dab263c96a8b8ccde24f2fc09ceffe5ef20 (diff)
AMDGPU: Support realigning stack
While the stack access instructions don't care about
alignments greater than 4, some transformations on the pointer
calculation assume the low bits of the pointer are 0. If a
stack object ends up being accessed through its absolute
address (relative to the kernel scratch wave offset), the
addressing expression may depend on the stack frame being
properly aligned. This was breaking a testcase due to the
add->or combine.
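
As a rough illustration (not part of this patch; the function and names below
are made up), the add->or combine is only sound when the base address really
has zero low bits, which is exactly what realigning the stack guarantees:

    #include <cassert>
    #include <cstdint>

    // Hypothetical sketch: folding "base + offset" into "base | offset" is
    // only valid when the offset fits entirely in the base's known-zero low
    // bits, i.e. when the base is genuinely aligned.
    uint32_t addViaOr(uint32_t base, uint32_t offset) {
      assert((base & 15u) == 0 && offset < 16u &&
             "fold requires a 16-byte-aligned base and a small offset");
      return base | offset; // equals base + offset under the assertion
    }
    // If the frame is not actually realigned, the premise is false and
    // base | offset can differ from base + offset -- the miscompile
    // described above.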
I think some of the SP/FP handling logic is still backwards,
and too simplistic to support all of the stack features. Code
that modifies the SP with inline asm, for example, or that uses
variable sized objects will probably require redoing this.
llvm-svn: 328831
Diffstat (limited to 'llvm/lib')
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp      2
-rw-r--r--  llvm/lib/Target/AMDGPU/SIFrameLowering.cpp      82
-rw-r--r--  llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h   9
3 files changed, 85 insertions, 8 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 9e35c5f7340..d64e6555d03 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -587,6 +587,8 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
   Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
   Info.PrivateSegmentSize = FrameInfo.getStackSize();
+  if (MFI->isStackRealigned())
+    Info.PrivateSegmentSize += FrameInfo.getMaxAlignment();
 
   Info.UsesVCC = MRI.isPhysRegUsed(AMDGPU::VCC_LO) ||
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index a3f107b10fc..2eaace44c41 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -13,6 +13,7 @@
 #include "SIMachineFunctionInfo.h"
 #include "SIRegisterInfo.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -487,9 +488,43 @@ void SIFrameLowering::emitEntryFunctionScratchSetup(const SISubtarget &ST,
   }
 }
 
+// Find a scratch register that we can use at the start of the prologue to
+// re-align the stack pointer. We avoid using callee-save registers since they
+// may appear to be free when this is called from canUseAsPrologue (during
+// shrink wrapping), but then no longer be free when this is called from
+// emitPrologue.
+//
+// FIXME: This is a bit conservative, since in the above case we could use one
+// of the callee-save registers as a scratch temp to re-align the stack pointer,
+// but we would then have to make sure that we were in fact saving at least one
+// callee-save register in the prologue, which is additional complexity that
+// doesn't seem worth the benefit.
+static unsigned findScratchNonCalleeSaveRegister(MachineBasicBlock &MBB) {
+  MachineFunction *MF = MBB.getParent();
+
+  const SISubtarget &Subtarget = MF->getSubtarget<SISubtarget>();
+  const SIRegisterInfo &TRI = *Subtarget.getRegisterInfo();
+  LivePhysRegs LiveRegs(TRI);
+  LiveRegs.addLiveIns(MBB);
+
+  // Mark callee saved registers as used so we will not choose them.
+  const MCPhysReg *CSRegs = TRI.getCalleeSavedRegs(MF);
+  for (unsigned i = 0; CSRegs[i]; ++i)
+    LiveRegs.addReg(CSRegs[i]);
+
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+
+  for (unsigned Reg : AMDGPU::SReg_32_XM0RegClass) {
+    if (LiveRegs.available(MRI, Reg))
+      return Reg;
+  }
+
+  return AMDGPU::NoRegister;
+}
+
 void SIFrameLowering::emitPrologue(MachineFunction &MF,
                                    MachineBasicBlock &MBB) const {
-  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
   if (FuncInfo->isEntryFunction()) {
     emitEntryFunctionPrologue(MF, MBB);
     return;
@@ -498,6 +533,7 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
   const MachineFrameInfo &MFI = MF.getFrameInfo();
   const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
   const SIInstrInfo *TII = ST.getInstrInfo();
+  const SIRegisterInfo &TRI = TII->getRegisterInfo();
 
   unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg();
   unsigned FramePtrReg = FuncInfo->getFrameOffsetReg();
@@ -505,8 +541,36 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
   MachineBasicBlock::iterator MBBI = MBB.begin();
   DebugLoc DL;
 
+  // XXX - Is this the right predicate?
+
   bool NeedFP = hasFP(MF);
-  if (NeedFP) {
+  uint32_t NumBytes = MFI.getStackSize();
+  uint32_t RoundedSize = NumBytes;
+  const bool NeedsRealignment = TRI.needsStackRealignment(MF);
+
+  if (NeedsRealignment) {
+    assert(NeedFP);
+    const unsigned Alignment = MFI.getMaxAlignment();
+    const unsigned ZeroLowBits = countTrailingZeros(Alignment);
+    assert(ZeroLowBits > 1);
+
+    RoundedSize += Alignment;
+
+    unsigned ScratchSPReg = findScratchNonCalleeSaveRegister(MBB);
+    assert(ScratchSPReg != AMDGPU::NoRegister);
+
+    // s_add_u32 tmp_reg, s32, NumBytes
+    // s_and_b32 s32, tmp_reg, 0b111...0000
+    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), ScratchSPReg)
+      .addReg(StackPtrReg)
+      .addImm((Alignment - 1) * ST.getWavefrontSize())
+      .setMIFlag(MachineInstr::FrameSetup);
+    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_AND_B32), FramePtrReg)
+      .addReg(ScratchSPReg, RegState::Kill)
+      .addImm(-Alignment * ST.getWavefrontSize())
+      .setMIFlag(MachineInstr::FrameSetup);
+    FuncInfo->setIsStackRealigned(true);
+  } else if (NeedFP) {
     // If we need a base pointer, set it up here. It's whatever the value of
     // the stack pointer is at this point. Any variable size objects will be
     // allocated after this, so we can still use the base pointer to reference
@@ -516,11 +580,10 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
       .setMIFlag(MachineInstr::FrameSetup);
   }
 
-  uint32_t NumBytes = MFI.getStackSize();
-  if (NumBytes != 0 && hasSP(MF)) {
+  if (RoundedSize != 0 && hasSP(MF)) {
     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), StackPtrReg)
       .addReg(StackPtrReg)
-      .addImm(NumBytes * ST.getWavefrontSize())
+      .addImm(RoundedSize * ST.getWavefrontSize())
      .setMIFlag(MachineInstr::FrameSetup);
   }
 
@@ -566,10 +629,12 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
   // it's really whether we need SP to be accurate or not.
 
   if (NumBytes != 0 && hasSP(MF)) {
+    uint32_t RoundedSize = FuncInfo->isStackRealigned() ?
+      NumBytes + MFI.getMaxAlignment() : NumBytes;
+
     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_SUB_U32), StackPtrReg)
       .addReg(StackPtrReg)
-      .addImm(NumBytes * ST.getWavefrontSize())
-      .setMIFlag(MachineInstr::FrameDestroy);
+      .addImm(RoundedSize * ST.getWavefrontSize());
   }
 }
 
@@ -759,7 +824,8 @@ bool SIFrameLowering::hasFP(const MachineFunction &MF) const {
 }
 
 bool SIFrameLowering::hasSP(const MachineFunction &MF) const {
+  const SIRegisterInfo *TRI = MF.getSubtarget<SISubtarget>().getRegisterInfo();
   // All stack operations are relative to the frame offset SGPR.
   const MachineFrameInfo &MFI = MF.getFrameInfo();
-  return MFI.hasCalls() || MFI.hasVarSizedObjects();
+  return MFI.hasCalls() || MFI.hasVarSizedObjects() || TRI->needsStackRealignment(MF);
 }
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 781eea0f94b..8c38cdae5d9 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -142,6 +142,7 @@ private:
   bool HasSpilledSGPRs = false;
   bool HasSpilledVGPRs = false;
   bool HasNonSpillStackObjects = false;
+  bool IsStackRealigned = false;
 
   unsigned NumSpilledSGPRs = 0;
   unsigned NumSpilledVGPRs = 0;
@@ -495,6 +496,14 @@ public:
     HasNonSpillStackObjects = StackObject;
   }
 
+  bool isStackRealigned() const {
+    return IsStackRealigned;
+  }
+
+  void setIsStackRealigned(bool Realigned = true) {
+    IsStackRealigned = Realigned;
+  }
+
   unsigned getNumSpilledSGPRs() const {
     return NumSpilledSGPRs;
   }
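
For readers unfamiliar with the scratch addressing here, the prologue and
epilogue sequences above can be modeled roughly as follows. This is a sketch
with invented names, not the compiler code, and it models only the realigning
path (TRI.needsStackRealignment(MF) true); the immediates are scaled by the
wavefront size because the SGPR stack/frame pointers track per-wave scratch
byte offsets while frame sizes and alignments are per-lane quantities:

    #include <cstdint>

    // Hypothetical model of the prologue/epilogue arithmetic in this patch.
    // 'alignment' plays the role of MFI.getMaxAlignment() (a power of two)
    // and 'waveSize' the role of ST.getWavefrontSize().
    struct FrameModel {
      uint32_t SP = 0; // stack pointer (per-wave scratch byte offset)
      uint32_t FP = 0; // frame pointer

      void prologue(uint32_t numBytes, uint32_t alignment, uint32_t waveSize) {
        // s_add_u32 tmp, SP, (alignment - 1) * waveSize
        uint32_t tmp = SP + (alignment - 1) * waveSize;
        // s_and_b32 FP, tmp, -(alignment * waveSize): clear the low bits,
        // i.e. round the frame pointer up to the requested per-lane alignment.
        FP = tmp & ~(alignment * waveSize - 1);
        // Over-allocate by 'alignment' so the aligned FP still has numBytes
        // of per-lane room below the new SP.
        uint32_t roundedSize = numBytes + alignment;
        SP += roundedSize * waveSize;
      }

      void epilogue(uint32_t numBytes, uint32_t alignment, uint32_t waveSize) {
        // s_sub_u32 SP, SP, roundedSize * waveSize restores the incoming SP.
        uint32_t roundedSize = numBytes + alignment;
        SP -= roundedSize * waveSize;
      }
    };

In the actual patch the extra 'alignment' term is only added when the function
was realigned (FuncInfo->isStackRealigned()), so the epilogue subtraction
always mirrors what the prologue added.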