diff options
author | Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com> | 2017-02-01 22:59:50 +0000 |
---|---|---|
committer | Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com> | 2017-02-01 22:59:50 +0000 |
commit | 2b913b1f493107ae7ffb6c11e59094952d595e1e (patch) | |
tree | ec857df4a9a75225a11a6e275fef0730ffa311ec /llvm/lib | |
parent | c5eb8e29d05c2887a4097be074edee4a1ba6d493 (diff) | |
download | bcm5719-llvm-2b913b1f493107ae7ffb6c11e59094952d595e1e.tar.gz bcm5719-llvm-2b913b1f493107ae7ffb6c11e59094952d595e1e.zip |
[AMDGPU] Account workgroup size in LDS occupancy limits
Functions matching LDS use to occupancy return results for a workgroup
of 64 workitems. The numbers has to be adjusted for bigger workgroups.
For example a workgroup of size 256 already occupies 4 waves just by
itself. Given that all numbers of LDS use in the compiler are per
workgroup, occupancy shall be multiplied by 4 in this case. Each 64
workitems still limited by the same number, but 4 subrgoups 64 workitems
each can afford 4 times more LDS to get the same occupancy.
In addition change initializes LDS size in the subtarget to a real value
for SI+ targets. This is required since LDS size is a variable in these
calculations.
Differential Revision: https://reviews.llvm.org/D29423
llvm-svn: 293837
Diffstat (limited to 'llvm/lib')
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 5 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 70 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h | 5 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 3 |
4 files changed, 25 insertions, 58 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index b928e887192..ca25634afdb 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -204,7 +204,8 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) { } } - unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage); + unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage, + F); // Restrict local memory usage so that we don't drastically reduce occupancy, // unless it is already significantly reduced. @@ -225,7 +226,7 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) { // Round up to the next tier of usage. unsigned MaxSizeWithWaveCount - = ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy); + = ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F); // Program is possibly broken by using more local mem than available. if (CurrentLocalMemUsage > MaxSizeWithWaveCount) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index c85d2159bdb..c413f574cd4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -132,62 +132,26 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS, initializeSubtargetDependencies(TT, GPU, FS); } -// FIXME: These limits are for SI. Did they change with the larger maximum LDS -// size? -unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves) const { - switch (NWaves) { - case 10: - return 1638; - case 9: - return 1820; - case 8: - return 2048; - case 7: - return 2340; - case 6: - return 2730; - case 5: - return 3276; - case 4: - return 4096; - case 3: - return 5461; - case 2: - return 8192; - default: +unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves, + const Function &F) const { + if (NWaves == 1) return getLocalMemorySize(); - } + unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second; + unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize); + unsigned MaxWaves = getMaxWavesPerEU(); + return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves; } -unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes) const { - if (Bytes <= 1638) - return 10; - - if (Bytes <= 1820) - return 9; - - if (Bytes <= 2048) - return 8; - - if (Bytes <= 2340) - return 7; - - if (Bytes <= 2730) - return 6; - - if (Bytes <= 3276) - return 5; - - if (Bytes <= 4096) - return 4; - - if (Bytes <= 5461) - return 3; - - if (Bytes <= 8192) - return 2; - - return 1; +unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes, + const Function &F) const { + unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second; + unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize); + unsigned MaxWaves = getMaxWavesPerEU(); + unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu; + unsigned NumWaves = Limit / (Bytes ? Bytes : 1u); + NumWaves = std::min(NumWaves, MaxWaves); + NumWaves = std::max(NumWaves, 1u); + return NumWaves; } std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes( diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h index f66ebd6afc2..83eda9bfbb6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -274,11 +274,12 @@ public: /// Return the amount of LDS that can be used that will not restrict the /// occupancy lower than WaveCount. - unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount) const; + unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount, + const Function &) const; /// Inverse of getMaxLocalMemWithWaveCount. Return the maximum wavecount if /// the given LDS memory size is the only constraint. - unsigned getOccupancyWithLocalMemSize(uint32_t Bytes) const; + unsigned getOccupancyWithLocalMemSize(uint32_t Bytes, const Function &) const; bool hasFP16Denormals() const { return FP64FP16Denormals; diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index e714eeffad0..cc0cd8b2984 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -35,7 +35,8 @@ static unsigned getMaxWaves(unsigned SGPRs, unsigned VGPRs, unsigned MinRegOccupancy = std::min(ST.getOccupancyWithNumSGPRs(SGPRs), ST.getOccupancyWithNumVGPRs(VGPRs)); return std::min(MinRegOccupancy, - ST.getOccupancyWithLocalMemSize(MFI->getLDSSize())); + ST.getOccupancyWithLocalMemSize(MFI->getLDSSize(), + *MF.getFunction())); } void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU, |