diff options
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIFrameLowering.cpp | 91 | ||||
| -rw-r--r-- | llvm/test/CodeGen/AMDGPU/si-sgpr-spill.ll | 5 |
2 files changed, 57 insertions, 39 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index 80ba15a64a5..68bac8af3e4 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -383,6 +383,16 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF, } +static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) { + for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd(); + I != E; ++I) { + if (!MFI.isDeadObjectIndex(I)) + return false; + } + + return true; +} + void SIFrameLowering::processFunctionBeforeFrameFinalized( MachineFunction &MF, RegScavenger *RS) const { @@ -391,8 +401,51 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized( if (!MFI.hasStackObjects()) return; - bool MayNeedScavengingEmergencySlot = MFI.hasStackObjects(); - if (MayNeedScavengingEmergencySlot) { + const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const SIInstrInfo *TII = ST.getInstrInfo(); + const SIRegisterInfo &TRI = TII->getRegisterInfo(); + SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); + bool AllSGPRSpilledToVGPRs = false; + + if (TRI.spillSGPRToVGPR() && FuncInfo->hasSpilledSGPRs()) { + AllSGPRSpilledToVGPRs = true; + + // Process all SGPR spills before frame offsets are finalized. Ideally SGPRs + // are spilled to VGPRs, in which case we can eliminate the stack usage. + // + // XXX - This operates under the assumption that only other SGPR spills are + // users of the frame index. I'm not 100% sure this is correct. The + // StackColoring pass has a comment saying a future improvement would be to + // merging of allocas with spill slots, but for now according to + // MachineFrameInfo isSpillSlot can't alias any other object. + for (MachineBasicBlock &MBB : MF) { + MachineBasicBlock::iterator Next; + for (auto I = MBB.begin(), E = MBB.end(); I != E; I = Next) { + MachineInstr &MI = *I; + Next = std::next(I); + + if (TII->isSGPRSpill(MI)) { + int FI = TII->getNamedOperand(MI, AMDGPU::OpName::addr)->getIndex(); + if (FuncInfo->allocateSGPRSpillToVGPR(MF, FI)) { + bool Spilled = TRI.eliminateSGPRToVGPRSpillFrameIndex(MI, FI, RS); + (void)Spilled; + assert(Spilled && "failed to spill SGPR to VGPR when allocated"); + } else + AllSGPRSpilledToVGPRs = false; + } + } + } + + FuncInfo->removeSGPRToVGPRFrameIndices(MFI); + } + + // FIXME: The other checks should be redundant with allStackObjectsAreDead, + // but currently hasNonSpillStackObjects is set only from source + // allocas. Stack temps produced from legalization are not counted currently. + if (FuncInfo->hasNonSpillStackObjects() || FuncInfo->hasSpilledVGPRs() || + !AllSGPRSpilledToVGPRs || !allStackObjectsAreDead(MFI)) { + assert(RS && "RegScavenger required if spilling"); + // We force this to be at offset 0 so no user object ever has 0 as an // address, so we may use 0 as an invalid pointer value. This is because // LLVM assumes 0 is an invalid pointer in address space 0. Because alloca @@ -410,40 +463,6 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized( AMDGPU::SGPR_32RegClass.getSize(), 0, false); RS->addScavengingFrameIndex(ScavengeFI); } - - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); - const SIInstrInfo *TII = ST.getInstrInfo(); - const SIRegisterInfo &TRI = TII->getRegisterInfo(); - if (!TRI.spillSGPRToVGPR()) - return; - - SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); - if (!FuncInfo->hasSpilledSGPRs()) - return; - - // Process all SGPR spills before frame offsets are finalized. Ideally SGPRs - // are spilled to VGPRs, in which case we can eliminate the stack usage. - // - // XXX - This operates under the assumption that only other SGPR spills are - // users of the frame index. I'm not 100% sure this is correct. The - // StackColoring pass has a comment saying a future improvement would be to - // merging of allocas with spill slots, but for now according to - // MachineFrameInfo isSpillSlot can't alias any other object. - for (MachineBasicBlock &MBB : MF) { - MachineBasicBlock::iterator Next; - for (auto I = MBB.begin(), E = MBB.end(); I != E; I = Next) { - MachineInstr &MI = *I; - Next = std::next(I); - - if (TII->isSGPRSpill(MI)) { - int FI = TII->getNamedOperand(MI, AMDGPU::OpName::addr)->getIndex(); - if (FuncInfo->allocateSGPRSpillToVGPR(MF, FI)) - TRI.eliminateSGPRToVGPRSpillFrameIndex(MI, FI, RS); - } - } - } - - FuncInfo->removeSGPRToVGPRFrameIndices(MFI); } void SIFrameLowering::emitDebuggerPrologue(MachineFunction &MF, diff --git a/llvm/test/CodeGen/AMDGPU/si-sgpr-spill.ll b/llvm/test/CodeGen/AMDGPU/si-sgpr-spill.ll index bccddcd9687..bcaaba72403 100644 --- a/llvm/test/CodeGen/AMDGPU/si-sgpr-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/si-sgpr-spill.ll @@ -18,13 +18,12 @@ ; GCN: s_mov_b32 m0 ; Make sure scratch space isn't being used for SGPR->VGPR spills -; FIXME: Seem to be leaving behind unused emergency slot. ; Writing to M0 from an SMRD instruction will hang the GPU. ; GCN-NOT: s_buffer_load_dword m0 ; GCN: s_endpgm -; TOVGPR: ScratchSize: 4{{$}} +; TOVGPR: ScratchSize: 0{{$}} define amdgpu_ps void @main([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) { main_body: %tmp = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %arg, i64 0, i32 0 @@ -768,7 +767,7 @@ ENDIF66: ; preds = %LOOP65 ; GCN-LABEL: {{^}}main1: ; GCN: s_endpgm -; TOVGPR: ScratchSize: 4{{$}} +; TOVGPR: ScratchSize: 0{{$}} define amdgpu_ps void @main1([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) { main_body: %tmp = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %arg, i64 0, i32 0 |

