diff options
author | Matt Arsenault <Matthew.Arsenault@amd.com> | 2018-05-29 19:35:00 +0000 |
---|---|---|
committer | Matt Arsenault <Matthew.Arsenault@amd.com> | 2018-05-29 19:35:00 +0000 |
commit | 1ea0402e82f6b159a8e95e29fd33472efa17169b (patch) | |
tree | d9fce38de26dd111cc57425432af70a8d469c3d5 /llvm/lib/Target/AMDGPU | |
parent | 97684419e8306e8a80b93cfb56af20e73d0898a3 (diff) | |
download | bcm5719-llvm-1ea0402e82f6b159a8e95e29fd33472efa17169b.tar.gz bcm5719-llvm-1ea0402e82f6b159a8e95e29fd33472efa17169b.zip |
AMDGPU: Round up kernel argument allocation size
AFAIK the driver's allocation will actually have to round this
up anyway. It is useful to track the rounded up size, so that
the end of the kernel segment is known to be dereferenceable so
a wider s_load_dword can be used for a short argument at the end
of the segment.
llvm-svn: 333456
Diffstat (limited to 'llvm/lib/Target/AMDGPU')
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h | 1 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 12 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp | 5 |
3 files changed, 13 insertions, 5 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h index 6f50fca8831..bcc0e77a545 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h @@ -20,6 +20,7 @@ class AMDGPUMachineFunction : public MachineFunctionInfo { /// local memory space. SmallDenseMap<const GlobalValue *, unsigned, 4> LocalMemoryObjects; +protected: uint64_t KernArgSize; unsigned MaxKernArgAlign; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index a52b1137203..97fc6493b95 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -414,12 +414,16 @@ bool SISubtarget::isVGPRSpillingEnabled(const Function& F) const { unsigned SISubtarget::getKernArgSegmentSize(const Function &F, unsigned ExplicitArgBytes) const { + uint64_t TotalSize = ExplicitArgBytes; unsigned ImplicitBytes = getImplicitArgNumBytes(F); - if (ImplicitBytes == 0) - return ExplicitArgBytes; - unsigned Alignment = getAlignmentForImplicitArgPtr(); - return alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes; + if (ImplicitBytes != 0) { + unsigned Alignment = getAlignmentForImplicitArgPtr(); + TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes; + } + + // Being able to dereference past the end is useful for emitting scalar loads. 
+ return alignTo(TotalSize, 4); } unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const { diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 495a8534865..61b6cb33fd1 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -71,8 +71,11 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) if (F.hasFnAttribute("amdgpu-implicitarg-ptr")) ImplicitArgPtr = true; } else { - if (F.hasFnAttribute("amdgpu-implicitarg-ptr")) + if (F.hasFnAttribute("amdgpu-implicitarg-ptr")) { KernargSegmentPtr = true; + assert(MaxKernArgAlign == 0); + MaxKernArgAlign = ST.getAlignmentForImplicitArgPtr(); + } } CallingConv::ID CC = F.getCallingConv(); |