6 files changed, 94 insertions, 63 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 321323fb21c..007321d4704 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -496,10 +496,12 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
 
   DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n");
 
-  // FIXME: This is the maximum work group size.  We should try to get
-  // value from the reqd_work_group_size function attribute if it is
-  // available.
-  unsigned WorkGroupSize = 256;
+  const Function &ContainingFunction = *I.getParent()->getParent();
+
+  // FIXME: We should also try to get this value from the reqd_work_group_size
+  // function attribute if it is available.
+  unsigned WorkGroupSize = AMDGPU::getMaximumWorkGroupSize(ContainingFunction);
+
   int AllocaSize =
       WorkGroupSize * Mod->getDataLayout().getTypeAllocSize(AllocaTy);
 
@@ -520,7 +522,7 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
 
   Function *F = I.getParent()->getParent();
 
-  Type *GVTy = ArrayType::get(I.getAllocatedType(), 256);
+  Type *GVTy = ArrayType::get(I.getAllocatedType(), WorkGroupSize);
   GlobalVariable *GV = new GlobalVariable(
       *Mod, GVTy, false, GlobalValue::InternalLinkage,
       UndefValue::get(GVTy),
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index ef413cb1d1a..43f4b9f7dde 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -48,6 +48,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
     PrivateSegmentWaveByteOffsetSystemSGPR(AMDGPU::NoRegister),
     PSInputAddr(0),
     ReturnsVoid(true),
+    MaximumWorkGroupSize(0),
     LDSWaveSpillSize(0),
     PSInputEna(0),
     NumUserSGPRs(0),
@@ -123,6 +124,11 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
   if (HasStackObjects && ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS &&
       ST.isAmdHsaOS())
     FlatScratchInit = true;
+
+  if (AMDGPU::isCompute(F->getCallingConv()))
+    MaximumWorkGroupSize = AMDGPU::getMaximumWorkGroupSize(*F);
+  else
+    MaximumWorkGroupSize = ST.getWavefrontSize();
 }
 
 unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer(
@@ -202,10 +208,5 @@ SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg(
 
 unsigned SIMachineFunctionInfo::getMaximumWorkGroupSize(
                                               const MachineFunction &MF) const {
-  const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
-  // FIXME: We should get this information from kernel attributes if it
-  // is available.
-  if (AMDGPU::isCompute(MF.getFunction()->getCallingConv()))
-    return 256;
-  return ST.getWavefrontSize();
+  return MaximumWorkGroupSize;
 }
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index ac3497c31d1..acbd276848f 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -60,6 +60,8 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
   unsigned PSInputAddr;
   bool ReturnsVoid;
 
+  unsigned MaximumWorkGroupSize;
+
 public:
   // FIXME: Make private
   unsigned LDSWaveSpillSize;
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 7090a582a98..7edd41832ec 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -23,6 +23,53 @@
 
 using namespace llvm;
 
+static unsigned getMaxWaveCountPerSIMD(const MachineFunction &MF) {
+  const SIMachineFunctionInfo& MFI = *MF.getInfo<SIMachineFunctionInfo>();
+  const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
+  unsigned SIMDPerCU = 4;
+
+  unsigned MaxInvocationsPerWave = SIMDPerCU * ST.getWavefrontSize();
+  return alignTo(MFI.getMaximumWorkGroupSize(MF), MaxInvocationsPerWave) /
+           MaxInvocationsPerWave;
+}
+
+static unsigned getMaxWorkGroupSGPRCount(const MachineFunction &MF) {
+  const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
+  unsigned MaxWaveCountPerSIMD = getMaxWaveCountPerSIMD(MF);
+
+  unsigned TotalSGPRCountPerSIMD, AddressableSGPRCount, SGPRUsageAlignment;
+  unsigned ReservedSGPRCount;
+
+  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+    TotalSGPRCountPerSIMD = 800;
+    AddressableSGPRCount = 102;
+    SGPRUsageAlignment = 16;
+    ReservedSGPRCount = 6; // VCC, FLAT_SCRATCH, XNACK
+  } else {
+    TotalSGPRCountPerSIMD = 512;
+    AddressableSGPRCount = 104;
+    SGPRUsageAlignment = 8;
+    ReservedSGPRCount = 2; // VCC
+  }
+
+  unsigned MaxSGPRCount = (TotalSGPRCountPerSIMD / MaxWaveCountPerSIMD);
+  MaxSGPRCount = alignDown(MaxSGPRCount, SGPRUsageAlignment);
+
+  if (ST.hasSGPRInitBug())
+    MaxSGPRCount = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG;
+
+  return std::min(MaxSGPRCount - ReservedSGPRCount, AddressableSGPRCount);
+}
+
+static unsigned getMaxWorkGroupVGPRCount(const MachineFunction &MF) {
+  unsigned MaxWaveCountPerSIMD = getMaxWaveCountPerSIMD(MF);
+  unsigned TotalVGPRCountPerSIMD = 256;
+  unsigned VGPRUsageAlignment = 4;
+
+  return alignDown(TotalVGPRCountPerSIMD / MaxWaveCountPerSIMD,
+                   VGPRUsageAlignment);
+}
+
 static bool hasPressureSet(const int *PSets, unsigned PSetID) {
   for (unsigned i = 0; PSets[i] != -1; ++i) {
     if (PSets[i] == (int)PSetID)
@@ -71,38 +118,27 @@ void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved, unsigned Reg) co
 
 unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg(
   const MachineFunction &MF) const {
-  const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
-  if (ST.hasSGPRInitBug()) {
-    // Leave space for flat_scr, xnack_mask, vcc, and alignment
-    unsigned BaseIdx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 8 - 4;
-    unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
-    return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass);
-  }
-
-  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
-    // 96/97 need to be reserved for flat_scr, 98/99 for xnack_mask, and
-    // 100/101 for vcc. This is the next sgpr128 down.
-    return AMDGPU::SGPR92_SGPR93_SGPR94_SGPR95;
-  }
-
-  return AMDGPU::SGPR96_SGPR97_SGPR98_SGPR99;
+  unsigned BaseIdx = alignDown(getMaxWorkGroupSGPRCount(MF), 4) - 4;
+  unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
+  return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass);
 }
 
 unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg(
   const MachineFunction &MF) const {
-  const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
-  if (ST.hasSGPRInitBug()) {
-    unsigned Idx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 6 - 1;
-    return AMDGPU::SGPR_32RegClass.getRegister(Idx);
-  }
-
-  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
-    // Next register before reservations for flat_scr, xnack_mask, vcc,
-    // and scratch resource.
-    return AMDGPU::SGPR91;
+  unsigned RegCount = getMaxWorkGroupSGPRCount(MF);
+  unsigned Reg;
+
+  // Try to place it in a hole after PrivateSegmentbufferReg.
+  if (RegCount & 3) {
+    // We cannot put the segment buffer in (Idx - 4) ... (Idx - 1) due to
+    // alignment constraints, so we have a hole where can put the wave offset.
+    Reg = RegCount - 1;
+  } else {
+    // We can put the segment buffer in (Idx - 4) ... (Idx - 1) and put the
+    // wave offset before it.
+    Reg = RegCount - 5;
   }
-
-  return AMDGPU::SGPR95;
+  return AMDGPU::SGPR_32RegClass.getRegister(Reg);
 }
 
 BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
@@ -124,35 +160,20 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
   reserveRegisterTuples(Reserved, AMDGPU::TTMP8_TTMP9);
   reserveRegisterTuples(Reserved, AMDGPU::TTMP10_TTMP11);
 
-  // Reserve the last 2 registers so we will always have at least 2 more that
-  // will physically contain VCC.
-  reserveRegisterTuples(Reserved, AMDGPU::SGPR102_SGPR103);
+  unsigned MaxWorkGroupSGPRCount = getMaxWorkGroupSGPRCount(MF);
+  unsigned MaxWorkGroupVGPRCount = getMaxWorkGroupVGPRCount(MF);
 
-  const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
-
-  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
-    // SI/CI have 104 SGPRs. VI has 102. We need to shift down the reservation
-    // for VCC/XNACK_MASK/FLAT_SCR.
-    //
-    // TODO The SGPRs that alias to XNACK_MASK could be used as general purpose
-    // SGPRs when the XNACK feature is not used. This is currently not done
-    // because the code that counts SGPRs cannot account for such holes.
-    reserveRegisterTuples(Reserved, AMDGPU::SGPR96_SGPR97);
-    reserveRegisterTuples(Reserved, AMDGPU::SGPR98_SGPR99);
-    reserveRegisterTuples(Reserved, AMDGPU::SGPR100_SGPR101);
+  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
+  unsigned NumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
+  for (unsigned i = MaxWorkGroupSGPRCount; i < NumSGPRs; ++i) {
+    unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i);
+    reserveRegisterTuples(Reserved, Reg);
   }
 
-  // Tonga and Iceland can only allocate a fixed number of SGPRs due
-  // to a hw bug.
-  if (ST.hasSGPRInitBug()) {
-    unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
-    // Reserve some SGPRs for FLAT_SCRATCH, XNACK_MASK, and VCC (6 SGPRs).
-    unsigned Limit = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 6;
 
-    for (unsigned i = Limit; i < NumSGPRs; ++i) {
-      unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i);
-      reserveRegisterTuples(Reserved, Reg);
-    }
+  for (unsigned i = MaxWorkGroupVGPRCount; i < NumVGPRs; ++i) {
+    unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i);
+    reserveRegisterTuples(Reserved, Reg);
   }
 
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 5e3498f86d6..52f764876f3 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -124,6 +124,10 @@ static unsigned getIntegerAttribute(const Function &F, const char *Name,
   return Result;
 }
 
+unsigned getMaximumWorkGroupSize(const Function &F) {
+  return getIntegerAttribute(F, "amdgpu-max-work-group-size", 256);
+}
+
 unsigned getInitialPSInputAddr(const Function &F) {
   return getIntegerAttribute(F, "InitialPSInputAddr", 0);
 }
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index d229dec8036..4feb57c9e34 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -45,6 +45,7 @@ bool isGroupSegment(const GlobalValue *GV);
 bool isGlobalSegment(const GlobalValue *GV);
 bool isReadOnlySegment(const GlobalValue *GV);
 
+unsigned getMaximumWorkGroupSize(const Function &F);
 unsigned getInitialPSInputAddr(const Function &F);
 
 bool isShader(CallingConv::ID cc);