summaryrefslogtreecommitdiffstats
path: root/llvm/lib
diff options
context:
space:
mode:
authorStanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>2017-02-01 22:59:50 +0000
committerStanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>2017-02-01 22:59:50 +0000
commit2b913b1f493107ae7ffb6c11e59094952d595e1e (patch)
treeec857df4a9a75225a11a6e275fef0730ffa311ec /llvm/lib
parentc5eb8e29d05c2887a4097be074edee4a1ba6d493 (diff)
downloadbcm5719-llvm-2b913b1f493107ae7ffb6c11e59094952d595e1e.tar.gz
bcm5719-llvm-2b913b1f493107ae7ffb6c11e59094952d595e1e.zip
[AMDGPU] Account workgroup size in LDS occupancy limits
Functions matching LDS use to occupancy return results for a workgroup of 64 workitems. The numbers have to be adjusted for bigger workgroups. For example, a workgroup of size 256 already occupies 4 waves just by itself. Given that all numbers of LDS use in the compiler are per workgroup, occupancy shall be multiplied by 4 in this case. Each 64 workitems are still limited by the same number, but 4 subgroups of 64 workitems each can afford 4 times more LDS to get the same occupancy. In addition, this change initializes LDS size in the subtarget to a real value for SI+ targets. This is required since LDS size is a variable in these calculations. Differential Revision: https://reviews.llvm.org/D29423 llvm-svn: 293837
Diffstat (limited to 'llvm/lib')
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp5
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp70
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h5
-rw-r--r--llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp3
4 files changed, 25 insertions, 58 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index b928e887192..ca25634afdb 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -204,7 +204,8 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
}
}
- unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage);
+ unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage,
+ F);
// Restrict local memory usage so that we don't drastically reduce occupancy,
// unless it is already significantly reduced.
@@ -225,7 +226,7 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
// Round up to the next tier of usage.
unsigned MaxSizeWithWaveCount
- = ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy);
+ = ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F);
// Program is possibly broken by using more local mem than available.
if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index c85d2159bdb..c413f574cd4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -132,62 +132,26 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
initializeSubtargetDependencies(TT, GPU, FS);
}
-// FIXME: These limits are for SI. Did they change with the larger maximum LDS
-// size?
-unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves) const {
- switch (NWaves) {
- case 10:
- return 1638;
- case 9:
- return 1820;
- case 8:
- return 2048;
- case 7:
- return 2340;
- case 6:
- return 2730;
- case 5:
- return 3276;
- case 4:
- return 4096;
- case 3:
- return 5461;
- case 2:
- return 8192;
- default:
+unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
+ const Function &F) const {
+ if (NWaves == 1)
return getLocalMemorySize();
- }
+ unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
+ unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
+ unsigned MaxWaves = getMaxWavesPerEU();
+ return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}
-unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes) const {
- if (Bytes <= 1638)
- return 10;
-
- if (Bytes <= 1820)
- return 9;
-
- if (Bytes <= 2048)
- return 8;
-
- if (Bytes <= 2340)
- return 7;
-
- if (Bytes <= 2730)
- return 6;
-
- if (Bytes <= 3276)
- return 5;
-
- if (Bytes <= 4096)
- return 4;
-
- if (Bytes <= 5461)
- return 3;
-
- if (Bytes <= 8192)
- return 2;
-
- return 1;
+unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
+ const Function &F) const {
+ unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
+ unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
+ unsigned MaxWaves = getMaxWavesPerEU();
+ unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
+ unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
+ NumWaves = std::min(NumWaves, MaxWaves);
+ NumWaves = std::max(NumWaves, 1u);
+ return NumWaves;
}
std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index f66ebd6afc2..83eda9bfbb6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -274,11 +274,12 @@ public:
/// Return the amount of LDS that can be used that will not restrict the
/// occupancy lower than WaveCount.
- unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount) const;
+ unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
+ const Function &) const;
/// Inverse of getMaxLocalMemWithWaveCount. Return the maximum wavecount if
/// the given LDS memory size is the only constraint.
- unsigned getOccupancyWithLocalMemSize(uint32_t Bytes) const;
+ unsigned getOccupancyWithLocalMemSize(uint32_t Bytes, const Function &) const;
bool hasFP16Denormals() const {
return FP64FP16Denormals;
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index e714eeffad0..cc0cd8b2984 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -35,7 +35,8 @@ static unsigned getMaxWaves(unsigned SGPRs, unsigned VGPRs,
unsigned MinRegOccupancy = std::min(ST.getOccupancyWithNumSGPRs(SGPRs),
ST.getOccupancyWithNumVGPRs(VGPRs));
return std::min(MinRegOccupancy,
- ST.getOccupancyWithLocalMemSize(MFI->getLDSSize()));
+ ST.getOccupancyWithLocalMemSize(MFI->getLDSSize(),
+ *MF.getFunction()));
}
void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
OpenPOWER on IntegriCloud