diff options
author | Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com> | 2017-02-01 22:59:50 +0000 |
---|---|---|
committer | Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com> | 2017-02-01 22:59:50 +0000 |
commit | 2b913b1f493107ae7ffb6c11e59094952d595e1e (patch) | |
tree | ec857df4a9a75225a11a6e275fef0730ffa311ec /llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp | |
parent | c5eb8e29d05c2887a4097be074edee4a1ba6d493 (diff) | |
download | bcm5719-llvm-2b913b1f493107ae7ffb6c11e59094952d595e1e.tar.gz bcm5719-llvm-2b913b1f493107ae7ffb6c11e59094952d595e1e.zip |
[AMDGPU] Account workgroup size in LDS occupancy limits
Functions matching LDS use to occupancy return results for a workgroup
of 64 workitems. The numbers has to be adjusted for bigger workgroups.
For example a workgroup of size 256 already occupies 4 waves just by
itself. Given that all numbers of LDS use in the compiler are per
workgroup, occupancy shall be multiplied by 4 in this case. Each 64
workitems still limited by the same number, but 4 subrgoups 64 workitems
each can afford 4 times more LDS to get the same occupancy.
In addition change initializes LDS size in the subtarget to a real value
for SI+ targets. This is required since LDS size is a variable in these
calculations.
Differential Revision: https://reviews.llvm.org/D29423
llvm-svn: 293837
Diffstat (limited to 'llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp')
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 70 |
1 files changed, 17 insertions, 53 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index c85d2159bdb..c413f574cd4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -132,62 +132,26 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS, initializeSubtargetDependencies(TT, GPU, FS); } -// FIXME: These limits are for SI. Did they change with the larger maximum LDS -// size? -unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves) const { - switch (NWaves) { - case 10: - return 1638; - case 9: - return 1820; - case 8: - return 2048; - case 7: - return 2340; - case 6: - return 2730; - case 5: - return 3276; - case 4: - return 4096; - case 3: - return 5461; - case 2: - return 8192; - default: +unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves, + const Function &F) const { + if (NWaves == 1) return getLocalMemorySize(); - } + unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second; + unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize); + unsigned MaxWaves = getMaxWavesPerEU(); + return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves; } -unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes) const { - if (Bytes <= 1638) - return 10; - - if (Bytes <= 1820) - return 9; - - if (Bytes <= 2048) - return 8; - - if (Bytes <= 2340) - return 7; - - if (Bytes <= 2730) - return 6; - - if (Bytes <= 3276) - return 5; - - if (Bytes <= 4096) - return 4; - - if (Bytes <= 5461) - return 3; - - if (Bytes <= 8192) - return 2; - - return 1; +unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes, + const Function &F) const { + unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second; + unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize); + unsigned MaxWaves = getMaxWavesPerEU(); + unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu; + unsigned NumWaves = Limit / (Bytes ? Bytes : 1u); + NumWaves = std::min(NumWaves, MaxWaves); + NumWaves = std::max(NumWaves, 1u); + return NumWaves; } std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes( |