author     Matt Arsenault <Matthew.Arsenault@amd.com>    2015-11-30 21:16:03 +0000
committer  Matt Arsenault <Matthew.Arsenault@amd.com>    2015-11-30 21:16:03 +0000
commit     26f8f3db39f8fd856cfdce0689c5e5b84536eb9a (patch)
tree       a7241314f78b6bf81416571bf7b298cd29b6a884 /llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
parent     ac234b604da563cbe53364116cf84abe4abbdb04 (diff)
AMDGPU: Rework how private buffer passed for HSA
If we know we have stack objects, we reserve the registers in which
the private buffer resource and wave offset are passed, and use them
directly.

If not, reserve the last 5 SGPRs just in case we need to spill.

After register allocation, try to pick the next available registers
instead of the last SGPRs, and then insert copies from the inputs to
the reserved registers in the prologue.

This also selectively enables only the input registers that are
actually required, instead of always enabling all of them.
llvm-svn: 254331
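
For context on the numbers above: the "last 5 SGPRs" are the four consecutive SGPRs of the 128-bit private segment buffer resource descriptor plus one more SGPR for the scratch wave offset. The sketch below is editorial, not code from this patch; the SGPR file size and the exact placement of the two values within the reserved range are assumptions made for illustration.

```cpp
// A minimal sketch (editorial, not from this patch) of why five SGPRs cover
// the fallback: the private segment buffer resource is an SReg_128 (four
// consecutive SGPRs whose first index is a multiple of 4) and the scratch
// wave offset is one more SGPR.
constexpr unsigned NumSGPRs = 104;                   // assumed SGPR file size

// Highest 4-aligned start index such that s[N:N+3] still fits in the file.
constexpr unsigned RSrcFirst = (NumSGPRs - 4) & ~3u; // s[100:103]
// One further SGPR below the tuple holds the scratch wave offset.
constexpr unsigned WaveOffset = RSrcFirst - 1;       // s99

static_assert(RSrcFirst % 4 == 0, "SReg_128 tuples are 4-aligned");
static_assert(NumSGPRs - WaveOffset == 5, "exactly the last 5 SGPRs");
```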
Diffstat (limited to 'llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp')
llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp | 80
1 file changed, 71 insertions(+), 9 deletions(-)
```diff
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index d042844aa13..935aad42719 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -30,15 +30,33 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
   : AMDGPUMachineFunction(MF),
     TIDReg(AMDGPU::NoRegister),
     ScratchRSrcReg(AMDGPU::NoRegister),
+    ScratchWaveOffsetReg(AMDGPU::NoRegister),
+    PrivateSegmentBufferUserSGPR(AMDGPU::NoRegister),
+    DispatchPtrUserSGPR(AMDGPU::NoRegister),
+    QueuePtrUserSGPR(AMDGPU::NoRegister),
+    KernargSegmentPtrUserSGPR(AMDGPU::NoRegister),
+    DispatchIDUserSGPR(AMDGPU::NoRegister),
+    FlatScratchInitUserSGPR(AMDGPU::NoRegister),
+    PrivateSegmentSizeUserSGPR(AMDGPU::NoRegister),
+    GridWorkGroupCountXUserSGPR(AMDGPU::NoRegister),
+    GridWorkGroupCountYUserSGPR(AMDGPU::NoRegister),
+    GridWorkGroupCountZUserSGPR(AMDGPU::NoRegister),
+    WorkGroupIDXSystemSGPR(AMDGPU::NoRegister),
+    WorkGroupIDYSystemSGPR(AMDGPU::NoRegister),
+    WorkGroupIDZSystemSGPR(AMDGPU::NoRegister),
+    WorkGroupInfoSystemSGPR(AMDGPU::NoRegister),
+    PrivateSegmentWaveByteOffsetSystemSGPR(AMDGPU::NoRegister),
     LDSWaveSpillSize(0),
     PSInputAddr(0),
     NumUserSGPRs(0),
+    NumSystemSGPRs(0),
     HasSpilledSGPRs(false),
     HasSpilledVGPRs(false),
+    PrivateSegmentBuffer(false),
     DispatchPtr(false),
     QueuePtr(false),
     DispatchID(false),
-    KernargSegmentPtr(true),
+    KernargSegmentPtr(false),
     FlatScratchInit(false),
     GridWorkgroupCountX(false),
     GridWorkgroupCountY(false),
@@ -47,13 +65,17 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
     WorkGroupIDY(false),
     WorkGroupIDZ(false),
     WorkGroupInfo(false),
+    PrivateSegmentWaveByteOffset(false),
     WorkItemIDX(true),
     WorkItemIDY(false),
     WorkItemIDZ(false) {
+  const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
   const Function *F = MF.getFunction();
 
-  if (F->hasFnAttribute("amdgpu-dispatch-ptr"))
-    DispatchPtr = true;
+  const MachineFrameInfo *FrameInfo = MF.getFrameInfo();
+
+  if (getShaderType() == ShaderType::COMPUTE)
+    KernargSegmentPtr = true;
 
   if (F->hasFnAttribute("amdgpu-work-group-id-y"))
     WorkGroupIDY = true;
@@ -66,14 +88,54 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
   if (F->hasFnAttribute("amdgpu-work-item-id-z"))
     WorkItemIDZ = true;
 
+  bool MaySpill = ST.isVGPRSpillingEnabled(this);
+  bool HasStackObjects = FrameInfo->hasStackObjects();
+
+  if (HasStackObjects || MaySpill)
+    PrivateSegmentWaveByteOffset = true;
+
+  if (ST.isAmdHsaOS()) {
+    if (HasStackObjects || MaySpill)
+      PrivateSegmentBuffer = true;
+
+    if (F->hasFnAttribute("amdgpu-dispatch-ptr"))
+      DispatchPtr = true;
+  }
+
+  // X, XY, and XYZ are the only supported combinations, so make sure Y is
+  // enabled if Z is.
+  if (WorkItemIDZ)
+    WorkItemIDY = true;
+}
+
+unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer(
+  const SIRegisterInfo &TRI) {
+  PrivateSegmentBufferUserSGPR = TRI.getMatchingSuperReg(
+    getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_128RegClass);
+  NumUserSGPRs += 4;
+  return PrivateSegmentBufferUserSGPR;
+}
+
+unsigned SIMachineFunctionInfo::addDispatchPtr(const SIRegisterInfo &TRI) {
+  DispatchPtrUserSGPR = TRI.getMatchingSuperReg(
+    getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass);
+  NumUserSGPRs += 2;
+  return DispatchPtrUserSGPR;
+}
+
+unsigned SIMachineFunctionInfo::addQueuePtr(const SIRegisterInfo &TRI) {
+  QueuePtrUserSGPR = TRI.getMatchingSuperReg(
+    getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass);
+  NumUserSGPRs += 2;
+  return QueuePtrUserSGPR;
 }
 
-void SIMachineFunctionInfo::setScratchRSrcReg(const SIRegisterInfo *TRI) {
-  // We need to round up to next multiple of 4.
-  unsigned NextSReg128 = RoundUpToAlignment(NumUserSGPRs + 5, 4);
-  unsigned RegSub0 = AMDGPU::SReg_32RegClass.getRegister(NextSReg128);
-  ScratchRSrcReg = TRI->getMatchingSuperReg(RegSub0, AMDGPU::sub0,
-                                            &AMDGPU::SReg_128RegClass);
+unsigned SIMachineFunctionInfo::addKernargSegmentPtr(const SIRegisterInfo &TRI) {
+  KernargSegmentPtrUserSGPR = TRI.getMatchingSuperReg(
+    getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass);
+  NumUserSGPRs += 2;
+  return KernargSegmentPtrUserSGPR;
 }
 
 SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg(
```
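
To make the removed arithmetic concrete: with NumUserSGPRs = 2, `RoundUpToAlignment(2 + 5, 4)` returns 8, so the old `setScratchRSrcReg` always pinned the scratch resource at s[8:11], whether or not the inputs were live. The new `add*` helpers instead advance a running `NumUserSGPRs` counter as each required input claims its registers. The toy model below is editorial; `UserSGPRAllocator` is invented for illustration and is not an LLVM type.

```cpp
#include <cstdio>

// Toy model of the user-SGPR accounting in the new add* helpers: each enabled
// input claims a contiguous run of SGPRs starting at the next free index.
struct UserSGPRAllocator {
  unsigned NumUserSGPRs = 0;

  // Analogue of getNextUserSGPR(): index of the first unclaimed SGPR.
  unsigned next() const { return NumUserSGPRs; }

  // Claim Size consecutive SGPRs (4 for the 128-bit buffer resource,
  // 2 for each 64-bit pointer) and return the first index.
  unsigned add(unsigned Size) {
    unsigned First = next();
    NumUserSGPRs += Size;
    return First;
  }
};

int main() {
  UserSGPRAllocator A;
  unsigned Buf = A.add(4); // private segment buffer -> s[0:3]
  unsigned Dsp = A.add(2); // dispatch ptr           -> s[4:5]
  unsigned Krn = A.add(2); // kernarg segment ptr    -> s[6:7]
  std::printf("buffer s[%u:%u], dispatch s[%u:%u], kernarg s[%u:%u]\n",
              Buf, Buf + 3, Dsp, Dsp + 1, Krn, Krn + 1);
}
```

In the model, as in the helpers above, the buffer resource is claimed first, so its four-register tuple starts at s0 and stays 4-aligned without any explicit rounding step.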