summaryrefslogtreecommitdiffstats
path: root/llvm/lib
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib')
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp8
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp18
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h21
-rw-r--r--llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp3
-rw-r--r--llvm/lib/Target/AMDGPU/SIProgramInfo.h3
5 files changed, 43 insertions, 10 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 743ac64b8f1..a429c7cd833 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -507,6 +507,10 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
Twine(CurrentProgramInfo.NumVGPRsForWavesPerEU), false);
OutStreamer->emitRawComment(
+ " Occupancy: " +
+ Twine(CurrentProgramInfo.Occupancy), false);
+
+ OutStreamer->emitRawComment(
" WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false);
OutStreamer->emitRawComment(
@@ -1057,6 +1061,10 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
// For AMDHSA, LDS_SIZE must be zero, as it is populated by the CP.
S_00B84C_LDS_SIZE(STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks) |
S_00B84C_EXCP_EN(0);
+
+ ProgInfo.Occupancy = STM.computeOccupancy(MF, ProgInfo.LDSSize,
+ ProgInfo.NumSGPRsForWavesPerEU,
+ ProgInfo.NumVGPRsForWavesPerEU);
}
static unsigned getRsrcReg(CallingConv::ID CallConv) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 716c3879478..f9a9679ac68 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -175,6 +175,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
HasFminFmaxLegacy(true),
EnablePromoteAlloca(false),
HasTrigReducedRange(false),
+ MaxWavesPerEU(10),
LocalMemorySize(0),
WavefrontSize(0)
{ }
@@ -278,6 +279,7 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
TLInfo(TM, *this),
FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
+ MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
@@ -566,7 +568,7 @@ bool GCNSubtarget::hasMadF16() const {
unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
if (getGeneration() >= AMDGPUSubtarget::GFX10)
- return 10;
+ return getMaxWavesPerEU();
if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
if (SGPRs <= 80)
@@ -616,6 +618,20 @@ unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
return 2; // VCC.
}
+unsigned GCNSubtarget::computeOccupancy(const MachineFunction &MF,
+ unsigned LDSSize,
+ unsigned NumSGPRs,
+ unsigned NumVGPRs) const {
+ unsigned Occupancy =
+ std::min(getMaxWavesPerEU(),
+ getOccupancyWithLocalMemSize(LDSSize, MF.getFunction()));
+ if (NumSGPRs)
+ Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
+ if (NumVGPRs)
+ Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
+ return Occupancy;
+}
+
unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
const Function &F = MF.getFunction();
const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 1f30d76de0d..bc100915ac7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -75,6 +75,7 @@ protected:
bool HasFminFmaxLegacy;
bool EnablePromoteAlloca;
bool HasTrigReducedRange;
+ unsigned MaxWavesPerEU;
int LocalMemorySize;
unsigned WavefrontSize;
@@ -223,7 +224,9 @@ public:
/// subtarget.
virtual unsigned getMinWavesPerEU() const = 0;
- unsigned getMaxWavesPerEU() const { return 10; }
+ /// \returns Maximum number of waves per execution unit supported by the
+ /// subtarget without any kind of limitation.
+ unsigned getMaxWavesPerEU() const { return MaxWavesPerEU; }
/// Creates value range metadata on an workitemid.* inrinsic call or load.
bool makeLIDRangeMetadata(Instruction *I) const;
@@ -245,6 +248,9 @@ public:
class GCNSubtarget : public AMDGPUGenSubtargetInfo,
public AMDGPUSubtarget {
+
+ using AMDGPUSubtarget::getMaxWavesPerEU;
+
public:
enum TrapHandlerAbi {
TrapHandlerAbiNone = 0,
@@ -881,12 +887,6 @@ public:
return AMDGPU::IsaInfo::getMaxWavesPerCU(this, FlatWorkGroupSize);
}
- /// \returns Maximum number of waves per execution unit supported by the
- /// subtarget without any kind of limitation.
- unsigned getMaxWavesPerEU() const {
- return AMDGPU::IsaInfo::getMaxWavesPerEU(this);
- }
-
/// \returns Number of waves per work group supported by the subtarget and
/// limited by given \p FlatWorkGroupSize.
unsigned getWavesPerWorkGroup(unsigned FlatWorkGroupSize) const {
@@ -1036,6 +1036,13 @@ public:
/// VGPRs
unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const;
+ /// Return occupancy for the given function. Used LDS and a number of
+ /// registers if provided.
+ /// Note, occupancy can be affected by the scratch allocation as well, but
+ /// we do not have enough information to compute it.
+ unsigned computeOccupancy(const MachineFunction &MF, unsigned LDSSize = 0,
+ unsigned NumSGPRs = 0, unsigned NumVGPRs = 0) const;
+
/// \returns true if the flat_scratch register should be initialized with the
/// pointer to the wave's scratch memory rather than a size and offset.
bool flatScratchIsPointer() const {
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 46da974a2f4..d9068d6d22f 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -53,8 +53,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F);
WavesPerEU = ST.getWavesPerEU(F);
- Occupancy = getMaxWavesPerEU();
- limitOccupancy(MF);
+ Occupancy = ST.computeOccupancy(MF, getLDSSize());
CallingConv::ID CC = F.getCallingConv();
if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL) {
diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.h b/llvm/lib/Target/AMDGPU/SIProgramInfo.h
index 168f05f8fdd..94ebe693feb 100644
--- a/llvm/lib/Target/AMDGPU/SIProgramInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.h
@@ -51,6 +51,9 @@ struct SIProgramInfo {
// Number of VGPRs that meets number of waves per execution unit request.
uint32_t NumVGPRsForWavesPerEU = 0;
+ // Final occupancy.
+ uint32_t Occupancy = 0;
+
// Whether there is recursion, dynamic allocas, indirect calls or some other
// reason there may be statically unknown stack usage.
bool DynamicCallStack = false;
OpenPOWER on IntegriCloud