[AMDGPU] Account workgroup size in LDS occupancy limits

Functions matching LDS use to occupancy return results for a workgroup of 64 workitems. The numbers has to be adjusted for bigger workgroups. For example a workgroup of size 256 already occupies 4 waves just by itself. Given that all numbers of LDS use in the compiler are per workgroup, occupancy shall be multiplied by 4 in this case. Each 64 workitems still limited by the same number, but 4 subrgoups 64 workitems each can afford 4 times more LDS to get the same occupancy. In addition change initializes LDS size in the subtarget to a real value for SI+ targets. This is required since LDS size is a variable in these calculations. Differential Revision: https://reviews.llvm.org/D29423 llvm-svn: 293837
author: Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com> 2017-02-01 22:59:50 +0000
committer: Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com> 2017-02-01 22:59:50 +0000
commit: 2b913b1f493107ae7ffb6c11e59094952d595e1e (patch)
tree: ec857df4a9a75225a11a6e275fef0730ffa311ec /llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
parent: c5eb8e29d05c2887a4097be074edee4a1ba6d493 (diff)
download: bcm5719-llvm-2b913b1f493107ae7ffb6c11e59094952d595e1e.tar.gz
bcm5719-llvm-2b913b1f493107ae7ffb6c11e59094952d595e1e.zip
1 files changed, 17 insertions, 53 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index c85d2159bdb..c413f574cd4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -132,62 +132,26 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
   initializeSubtargetDependencies(TT, GPU, FS);
 }
 
-// FIXME: These limits are for SI. Did they change with the larger maximum LDS
-// size?
-unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves) const {
-  switch (NWaves) {
-  case 10:
-    return 1638;
-  case 9:
-    return 1820;
-  case 8:
-    return 2048;
-  case 7:
-    return 2340;
-  case 6:
-    return 2730;
-  case 5:
-    return 3276;
-  case 4:
-    return 4096;
-  case 3:
-    return 5461;
-  case 2:
-    return 8192;
-  default:
+unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
+  const Function &F) const {
+  if (NWaves == 1)
     return getLocalMemorySize();
-  }
+  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
+  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
+  unsigned MaxWaves = getMaxWavesPerEU();
+  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
 }
 
-unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes) const {
-  if (Bytes <= 1638)
-    return 10;
-
-  if (Bytes <= 1820)
-    return 9;
-
-  if (Bytes <= 2048)
-    return 8;
-
-  if (Bytes <= 2340)
-    return 7;
-
-  if (Bytes <= 2730)
-    return 6;
-
-  if (Bytes <= 3276)
-    return 5;
-
-  if (Bytes <= 4096)
-    return 4;
-
-  if (Bytes <= 5461)
-    return 3;
-
-  if (Bytes <= 8192)
-    return 2;
-
-  return 1;
+unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
+  const Function &F) const {
+  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
+  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
+  unsigned MaxWaves = getMaxWavesPerEU();
+  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
+  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
+  NumWaves = std::min(NumWaves, MaxWaves);
+  NumWaves = std::max(NumWaves, 1u);
+  return NumWaves;
 }
 
 std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
author	Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>	2017-02-01 22:59:50 +0000
committer	Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>	2017-02-01 22:59:50 +0000
commit	2b913b1f493107ae7ffb6c11e59094952d595e1e (patch)
tree	ec857df4a9a75225a11a6e275fef0730ffa311ec /llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
parent	c5eb8e29d05c2887a4097be074edee4a1ba6d493 (diff)
download	bcm5719-llvm-2b913b1f493107ae7ffb6c11e59094952d595e1e.tar.gz bcm5719-llvm-2b913b1f493107ae7ffb6c11e59094952d595e1e.zip