diff options
Diffstat (limited to 'llvm/lib/Target')
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPU.h | 3 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 2 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/CMakeLists.txt | 1 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp | 797 | 
4 files changed, 803 insertions, 0 deletions
| diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index 2f79687de85..e975bd63197 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -221,6 +221,9 @@ ModulePass *createAMDGPUOpenCLEnqueuedBlockLoweringPass();  void initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(PassRegistry &);  extern char &AMDGPUOpenCLEnqueuedBlockLoweringID; +void initializeGCNRegBankReassignPass(PassRegistry &); +extern char &GCNRegBankReassignID; +  void initializeGCNNSAReassignPass(PassRegistry &);  extern char &GCNNSAReassignID; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 3c41ecdf597..ac996f197b6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -234,6 +234,7 @@ extern "C" void LLVMInitializeAMDGPUTarget() {    initializeAMDGPUUseNativeCallsPass(*PR);    initializeAMDGPUSimplifyLibCallsPass(*PR);    initializeAMDGPUInlinerPass(*PR); +  initializeGCNRegBankReassignPass(*PR);    initializeGCNNSAReassignPass(*PR);  } @@ -937,6 +938,7 @@ void GCNPassConfig::addOptimizedRegAlloc() {  bool GCNPassConfig::addPreRewrite() {    if (EnableRegReassign) {      addPass(&GCNNSAReassignID); +    addPass(&GCNRegBankReassignID);    }    return true;  } diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index 08a64cd9512..a75a73ba6e4 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -116,6 +116,7 @@ add_llvm_target(AMDGPUCodeGen    SIShrinkInstructions.cpp    SIWholeQuadMode.cpp    GCNILPSched.cpp +  GCNRegBankReassign.cpp    GCNNSAReassign.cpp    GCNDPPCombine.cpp    SIModeRegister.cpp diff --git a/llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp b/llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp new file mode 100644 index 00000000000..fcbc3cddf6e --- /dev/null +++ b/llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp @@ -0,0 +1,797 @@ +//===-- GCNRegBankReassign.cpp - Reassign registers after regalloc --------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Try to reassign registers on GFX10+ to reduce register bank +/// conflicts. +/// +/// On GFX10 registers are organized in banks. VGPRs have 4 banks assigned in +/// a round-robin fashion: v0, v4, v8... belong to bank 0. v1, v5, v9... to +/// bank 1, etc. SGPRs have 8 banks and allocated in pairs, so that s0:s1, +/// s16:s17, s32:s33 are at bank 0. s2:s3, s18:s19, s34:s35 are at bank 1 etc. +/// +/// The shader can read one dword from each of these banks once per cycle. +/// If an instruction has to read more register operands from the same bank +/// an additional cycle is needed. HW attempts to pre-load registers through +/// input operand gathering, but a stall cycle may occur if that fails. For +/// example V_FMA_F32 V111 = V0 + V4 * V8 will need 3 cycles to read operands, +/// potentially incuring 2 stall cycles. +/// +/// The pass tries to reassign registers to reduce bank conflicts. +/// +/// In this pass bank numbers 0-3 are VGPR banks and 4-11 are SGPR banks, so +/// that 4 has to be subtracted from an SGPR bank number to get the real value. +/// This also corresponds to bit numbers in bank masks used in the pass. +/// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "SIInstrInfo.h" +#include "SIMachineFunctionInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/LiveInterval.h" +#include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/LiveRegMatrix.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/VirtRegMap.h" +#include "llvm/Support/MathExtras.h" + +using namespace llvm; + +static cl::opt<unsigned> VerifyStallCycles("amdgpu-verify-regbanks-reassign", +  cl::desc("Verify stall cycles in the regbanks reassign pass"), +  cl::value_desc("0|1|2"), +  cl::init(0), cl::Hidden); + +#define DEBUG_TYPE "amdgpu-regbanks-reassign" + +#define NUM_VGPR_BANKS 4 +#define NUM_SGPR_BANKS 8 +#define NUM_BANKS (NUM_VGPR_BANKS + NUM_SGPR_BANKS) +#define SGPR_BANK_OFFSET NUM_VGPR_BANKS +#define VGPR_BANK_MASK 0xf +#define SGPR_BANK_MASK 0xff0 +#define SGPR_BANK_SHIFTED_MASK (SGPR_BANK_MASK >> SGPR_BANK_OFFSET) + +STATISTIC(NumStallsDetected, +          "Number of operand read stalls detected"); +STATISTIC(NumStallsRecovered, +          "Number of operand read stalls recovered"); + +namespace { + +class GCNRegBankReassign : public MachineFunctionPass { + +  class OperandMask { +  public: +    OperandMask(unsigned r, unsigned s, unsigned m) +      : Reg(r), SubReg(s), Mask(m) {} +    unsigned Reg; +    unsigned SubReg; +    unsigned Mask; +  }; + +  class Candidate { +  public: +    Candidate(MachineInstr *mi, unsigned reg, unsigned freebanks, +              unsigned weight) +      : MI(mi), Reg(reg), FreeBanks(freebanks), Weight(weight) {} + +    bool operator< (const Candidate& RHS) const { return Weight < RHS.Weight; } + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +    void dump(const GCNRegBankReassign *P) const { +      MI->dump(); +      dbgs() << P->printReg(Reg) << " to banks "; +      dumpFreeBanks(FreeBanks); +      dbgs() << " weight " << Weight << '\n'; +    } +#endif + +    MachineInstr *MI; +    unsigned Reg; +    unsigned FreeBanks; +    unsigned Weight; +  }; + +  class CandidateList : public std::list<Candidate> { +  public: +    // Speedup subsequent sort. +    void push(const Candidate&& C) { +      if (C.Weight) push_back(C); +      else push_front(C); +    } +  }; + +public: +  static char ID; + +public: +  GCNRegBankReassign() : MachineFunctionPass(ID) { +    initializeGCNRegBankReassignPass(*PassRegistry::getPassRegistry()); +  } + +  bool runOnMachineFunction(MachineFunction &MF) override; + +  StringRef getPassName() const override { return "GCN RegBank Reassign"; } + +  void getAnalysisUsage(AnalysisUsage &AU) const override { +    AU.addRequired<MachineLoopInfo>(); +    AU.addRequired<LiveIntervals>(); +    AU.addRequired<VirtRegMap>(); +    AU.addRequired<LiveRegMatrix>(); +    AU.setPreservesAll(); +    MachineFunctionPass::getAnalysisUsage(AU); +  } + +private: +  const GCNSubtarget *ST; + +  const MachineRegisterInfo *MRI; + +  const SIRegisterInfo *TRI; + +  MachineLoopInfo *MLI; + +  VirtRegMap *VRM; + +  LiveRegMatrix *LRM; + +  LiveIntervals *LIS; + +  unsigned MaxNumVGPRs; + +  unsigned MaxNumSGPRs; + +  BitVector RegsUsed; + +  SmallVector<OperandMask, 8> OperandMasks; + +  CandidateList Candidates; + +  const MCPhysReg *CSRegs; + +  // Returns bank for a phys reg. +  unsigned getPhysRegBank(unsigned Reg) const; + +  // Return a bit set for each register bank used. 4 banks for VGPRs and +  // 8 banks for SGPRs. +  // Registers already processed and recorded in RegsUsed are excluded. +  // If Bank is not -1 assume Reg:SubReg to belong to that Bank. +  unsigned getRegBankMask(unsigned Reg, unsigned SubReg, int Bank); + +  // Return number of stalls in the instructions. +  // UsedBanks has bits set for the banks used by all operands. +  // If Reg and Bank provided substitute the Reg with the Bank. +  unsigned analyzeInst(const MachineInstr& MI, unsigned& UsedBanks, +                       unsigned Reg = AMDGPU::NoRegister, int Bank = -1); + +  // Return true if register is regular VGPR or SGPR or their tuples. +  // Returns false for special registers like m0, vcc etc. +  bool isReassignable(unsigned Reg) const; + +  // Check if registers' defs are old and may be pre-loaded. +  // Returns 0 if both registers are old enough, 1 or 2 if one or both +  // registers will not likely be pre-loaded. +  unsigned getOperandGatherWeight(const MachineInstr& MI, +                                  unsigned Reg1, +                                  unsigned Reg2, +                                  unsigned StallCycles) const; + + +  // Find all bank bits in UsedBanks where Mask can be relocated to. +  unsigned getFreeBanks(unsigned Mask, unsigned UsedBanks) const; + +  // Find all bank bits in UsedBanks where Mask can be relocated to. +  // Bank is relative to the register and not its subregister component. +  // Returns 0 is a register is not reassignable. +  unsigned getFreeBanks(unsigned Reg, unsigned SubReg, unsigned Mask, +                        unsigned UsedBanks) const; + +  // Add cadidate instruction to the work list. +  void collectCandidates(MachineInstr& MI, unsigned UsedBanks, +                         unsigned StallCycles); + +  // Collect cadidate instructions across function. Returns a number stall +  // cycles detected. Only counts stalls if Collect is false. +  unsigned collectCandidates(MachineFunction &MF, bool Collect = true); + +  // Remove all candidates that read specified register. +  void removeCandidates(unsigned Reg); + +  // Compute stalls within the uses of SrcReg replaced by a register from +  // Bank. If Bank is -1 does not perform substitution. If Collect is set +  // candidates are collected and added to work list. +  unsigned computeStallCycles(unsigned SrcReg, +                              unsigned Reg = AMDGPU::NoRegister, +                              int Bank = -1, bool Collect = false); + +  // Search for a register in Bank unused within LI. +  // Returns phys reg or NoRegister. +  unsigned scavengeReg(LiveInterval& LI, unsigned Bank) const; + +  // Try to reassign candidate. Returns number or stall cycles saved. +  unsigned tryReassign(Candidate &C); + +  bool verifyCycles(MachineFunction &MF, +                    unsigned OriginalCycles, unsigned CyclesSaved); + + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +public: +  Printable printReg(unsigned Reg, unsigned SubReg = 0) const { +    return Printable([Reg, SubReg, this](raw_ostream &OS) { +      if (TargetRegisterInfo::isPhysicalRegister(Reg)) { +        OS << llvm::printReg(Reg, TRI); +        return; +      } +      if (!VRM->isAssignedReg(Reg)) +        OS << "<unassigned> " << llvm::printReg(Reg, TRI); +      else +        OS << llvm::printReg(Reg, TRI) << '(' +           << llvm::printReg(VRM->getPhys(Reg), TRI) << ')'; +      if (SubReg) +        OS << ':' << TRI->getSubRegIndexName(SubReg); +    }); +  } + +  static Printable printBank(unsigned Bank) { +    return Printable([Bank](raw_ostream &OS) { +      OS << ((Bank >= SGPR_BANK_OFFSET) ? Bank - SGPR_BANK_OFFSET : Bank); +    }); +  } + +  static void dumpFreeBanks(unsigned FreeBanks) { +    for (unsigned L = 0; L < NUM_BANKS; ++L) +      if (FreeBanks & (1 << L)) +        dbgs() << printBank(L) << ' '; +  } +#endif +}; + +} // End anonymous namespace. + +INITIALIZE_PASS_BEGIN(GCNRegBankReassign, DEBUG_TYPE, "GCN RegBank Reassign", +                      false, false) +INITIALIZE_PASS_DEPENDENCY(LiveIntervals) +INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) +INITIALIZE_PASS_DEPENDENCY(VirtRegMap) +INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix) +INITIALIZE_PASS_END(GCNRegBankReassign, DEBUG_TYPE, "GCN RegBank Reassign", +                    false, false) + + +char GCNRegBankReassign::ID = 0; + +char &llvm::GCNRegBankReassignID = GCNRegBankReassign::ID; + +unsigned GCNRegBankReassign::getPhysRegBank(unsigned Reg) const { +  assert (TargetRegisterInfo::isPhysicalRegister(Reg)); + +  const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); +  unsigned Size = TRI->getRegSizeInBits(*RC); +  if (Size > 32) +    Reg = TRI->getSubReg(Reg, AMDGPU::sub0); + +  if (TRI->hasVGPRs(RC)) { +    Reg -= AMDGPU::VGPR0; +    return Reg % NUM_VGPR_BANKS; +  } + +  Reg = TRI->getEncodingValue(Reg) / 2; +  return Reg % NUM_SGPR_BANKS + SGPR_BANK_OFFSET; +} + +unsigned GCNRegBankReassign::getRegBankMask(unsigned Reg, unsigned SubReg, +                                            int Bank) { +  if (TargetRegisterInfo::isVirtualRegister(Reg)) { +    if (!VRM->isAssignedReg(Reg)) +      return 0; + +    Reg = VRM->getPhys(Reg); +    if (!Reg) +      return 0; +    if (SubReg) +      Reg = TRI->getSubReg(Reg, SubReg); +  } + +  const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); +  unsigned Size = TRI->getRegSizeInBits(*RC) / 32; +  if (Size > 1) +    Reg = TRI->getSubReg(Reg, AMDGPU::sub0); + +  if (TRI->hasVGPRs(RC)) { +    // VGPRs have 4 banks assigned in a round-robin fashion. +    Reg -= AMDGPU::VGPR0; +    unsigned Mask = (1 << Size) - 1; +    unsigned Used = 0; +    // Bitmask lacks an extract method +    for (unsigned I = 0; I < Size; ++I) +      if (RegsUsed.test(Reg + I)) +        Used |= 1 << I; +    RegsUsed.set(Reg, Reg + Size); +    Mask &= ~Used; +    Mask <<= (Bank == -1) ? Reg % NUM_VGPR_BANKS : unsigned(Bank); +    return (Mask | (Mask >> NUM_VGPR_BANKS)) & VGPR_BANK_MASK; +  } + +  // SGPRs have 8 banks holding 2 consequitive registers each. +  Reg = TRI->getEncodingValue(Reg) / 2; +  unsigned StartBit = AMDGPU::VGPR_32RegClass.getNumRegs(); +  if (Reg + StartBit >= RegsUsed.size()) +    return 0; + +  if (Size > 1) +    Size /= 2; +  unsigned Mask = (1 << Size) - 1; +  unsigned Used = 0; +  for (unsigned I = 0; I < Size; ++I) +    if (RegsUsed.test(StartBit + Reg + I)) +      Used |= 1 << I; +  RegsUsed.set(StartBit + Reg, StartBit + Reg + Size); +  Mask &= ~Used; +  Mask <<= (Bank == -1) ? Reg % NUM_SGPR_BANKS +                        : unsigned(Bank - SGPR_BANK_OFFSET); +  Mask = (Mask | (Mask >> NUM_SGPR_BANKS)) & SGPR_BANK_SHIFTED_MASK; +  // Reserve 4 bank ids for VGPRs. +  return Mask << SGPR_BANK_OFFSET; +} + +unsigned GCNRegBankReassign::analyzeInst(const MachineInstr& MI, +                                         unsigned& UsedBanks, +                                         unsigned Reg, +                                         int Bank) { +  unsigned StallCycles = 0; +  UsedBanks = 0; + +  if (MI.isDebugValue()) +    return 0; + +  RegsUsed.reset(); +  OperandMasks.clear(); +  for (const auto& Op : MI.explicit_uses()) { +    // Undef can be assigned to any register, so two vregs can be assigned +    // the same phys reg within the same instruction. +    if (!Op.isReg() || Op.isUndef()) +      continue; + +    unsigned R = Op.getReg(); +    unsigned ShiftedBank = Bank; + +    if (Bank != -1 && R == Reg && Op.getSubReg()) { +      unsigned LM = TRI->getSubRegIndexLaneMask(Op.getSubReg()).getAsInteger(); +      if (!(LM & 1) && (Bank < NUM_VGPR_BANKS)) { +        // If a register spans all banks we cannot shift it to avoid conflict. +        if (countPopulation(LM) >= NUM_VGPR_BANKS) +          continue; +        ShiftedBank = (Bank + countTrailingZeros(LM)) % NUM_VGPR_BANKS; +      } else if (!(LM & 3) && (Bank >= SGPR_BANK_OFFSET)) { +        // If a register spans all banks we cannot shift it to avoid conflict. +        if (countPopulation(LM) / 2 >= NUM_SGPR_BANKS) +          continue; +        ShiftedBank = SGPR_BANK_OFFSET + (Bank - SGPR_BANK_OFFSET + +                                          (countTrailingZeros(LM) >> 1)) % +                                             NUM_SGPR_BANKS; +      } +    } + +    unsigned Mask = getRegBankMask(R, Op.getSubReg(), +                                   (Reg == R) ? ShiftedBank : -1); +    StallCycles += countPopulation(UsedBanks & Mask); +    UsedBanks |= Mask; +    OperandMasks.push_back(OperandMask(Op.getReg(), Op.getSubReg(), Mask)); +  } + +  return StallCycles; +} + +unsigned GCNRegBankReassign::getOperandGatherWeight(const MachineInstr& MI, +                                                    unsigned Reg1, +                                                    unsigned Reg2, +                                                    unsigned StallCycles) const +{ +  unsigned Defs = 0; +  MachineBasicBlock::const_instr_iterator Def(MI.getIterator()); +  MachineBasicBlock::const_instr_iterator B(MI.getParent()->instr_begin()); +  for (unsigned S = StallCycles; S && Def != B && Defs != 3; --S) { +    if (MI.isDebugInstr()) +      continue; +    --Def; +    if (Def->getOpcode() == TargetOpcode::IMPLICIT_DEF) +      continue; +    if (Def->modifiesRegister(Reg1, TRI)) +      Defs |= 1; +    if (Def->modifiesRegister(Reg2, TRI)) +      Defs |= 2; +  } +  return countPopulation(Defs); +} + +bool GCNRegBankReassign::isReassignable(unsigned Reg) const { +  if (TargetRegisterInfo::isPhysicalRegister(Reg) || !VRM->isAssignedReg(Reg)) +    return false; + +  const MachineInstr *Def = MRI->getUniqueVRegDef(Reg); + +  unsigned PhysReg = VRM->getPhys(Reg); + +  if (Def && Def->isCopy() && Def->getOperand(1).getReg() == PhysReg) +    return false; + +  for (auto U : MRI->use_nodbg_operands(Reg)) { +    if (U.isImplicit()) +      return false; +    const MachineInstr *UseInst = U.getParent(); +    if (UseInst->isCopy() && UseInst->getOperand(0).getReg() == PhysReg) +      return false; +  } + +  const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(PhysReg); +  if (TRI->hasVGPRs(RC)) +    return true; + +  unsigned Size = TRI->getRegSizeInBits(*RC); +  if (Size > 32) +    PhysReg = TRI->getSubReg(PhysReg, AMDGPU::sub0); + +  return AMDGPU::SGPR_32RegClass.contains(PhysReg); +} + +unsigned GCNRegBankReassign::getFreeBanks(unsigned Mask, +                                          unsigned UsedBanks) const { +  unsigned Size = countPopulation(Mask); +  unsigned FreeBanks = 0; +  unsigned Bank = findFirstSet(Mask); + +  UsedBanks &= ~Mask; + +  // Find free VGPR banks +  if ((Mask & VGPR_BANK_MASK) && (Size < NUM_VGPR_BANKS)) { +    for (unsigned I = 0; I < NUM_VGPR_BANKS; ++I) { +      if (Bank == I) +        continue; +      unsigned NewMask = ((1 << Size) - 1) << I; +      NewMask = (NewMask | (NewMask >> NUM_VGPR_BANKS)) & VGPR_BANK_MASK; +      if (!(UsedBanks & NewMask)) +        FreeBanks |= 1 << I; +    } +    return FreeBanks; +  } + +  // Find free SGPR banks +  // SGPR tuples must be aligned, so step is size in banks it +  // crosses. +  Bank -= SGPR_BANK_OFFSET; +  for (unsigned I = 0; I < NUM_SGPR_BANKS; I += Size) { +    if (Bank == I) +      continue; +    unsigned NewMask = ((1 << Size) - 1) << I; +    NewMask = (NewMask | (NewMask >> NUM_SGPR_BANKS)) & SGPR_BANK_SHIFTED_MASK; +    if (!(UsedBanks & (NewMask << SGPR_BANK_OFFSET))) +      FreeBanks |= (1 << SGPR_BANK_OFFSET) << I; +  } + +  return FreeBanks; +} + +unsigned GCNRegBankReassign::getFreeBanks(unsigned Reg, +                                          unsigned SubReg, +                                          unsigned Mask, +                                          unsigned UsedBanks) const { +  if (!isReassignable(Reg)) +    return 0; + +  unsigned FreeBanks = getFreeBanks(Mask, UsedBanks); + +  unsigned LM = TRI->getSubRegIndexLaneMask(SubReg).getAsInteger(); +  if (!(LM & 1) && (Mask & VGPR_BANK_MASK)) { +    unsigned Shift = countTrailingZeros(LM); +    if (Shift >= NUM_VGPR_BANKS) +      return 0; +    unsigned VB = FreeBanks & VGPR_BANK_MASK; +    FreeBanks = ((VB >> Shift) | (VB << (NUM_VGPR_BANKS - Shift))) & +                VGPR_BANK_MASK; +  } else if (!(LM & 3) && (Mask & SGPR_BANK_MASK)) { +    unsigned Shift = countTrailingZeros(LM) >> 1; +    if (Shift >= NUM_SGPR_BANKS) +      return 0; +    unsigned SB = FreeBanks >> SGPR_BANK_OFFSET; +    FreeBanks = ((SB >> Shift) | (SB << (NUM_SGPR_BANKS - Shift))) & +                SGPR_BANK_SHIFTED_MASK; +    FreeBanks <<= SGPR_BANK_OFFSET; +  } + +  LLVM_DEBUG(if (FreeBanks) { +          dbgs() << "Potential reassignments of " << printReg(Reg, SubReg) +                 << " to banks: "; dumpFreeBanks(FreeBanks); +          dbgs() << '\n'; }); + +  return FreeBanks; +} + +void GCNRegBankReassign::collectCandidates(MachineInstr& MI, +                                           unsigned UsedBanks, +                                           unsigned StallCycles) { +  LLVM_DEBUG(MI.dump()); + +  if (!StallCycles) +    return; + +  LLVM_DEBUG(dbgs() << "Stall cycles = " << StallCycles << '\n'); + +  for (unsigned I = 0, E = OperandMasks.size(); I + 1 < E; ++I) { +    for (unsigned J = I + 1; J != E; ++J) { +      if (!(OperandMasks[I].Mask & OperandMasks[J].Mask)) +        continue; + +      unsigned Reg1 = OperandMasks[I].Reg; +      unsigned Reg2 = OperandMasks[J].Reg; +      unsigned SubReg1 = OperandMasks[I].SubReg; +      unsigned SubReg2 = OperandMasks[J].SubReg; +      unsigned Mask1 = OperandMasks[I].Mask; +      unsigned Mask2 = OperandMasks[J].Mask; +      unsigned Size1 = countPopulation(Mask1); +      unsigned Size2 = countPopulation(Mask2); + +      LLVM_DEBUG(dbgs() << "Conflicting operands: " << printReg(Reg1, SubReg1) << +                      " and " << printReg(Reg2, SubReg2) << '\n'); + +      unsigned Weight = getOperandGatherWeight(MI, Reg1, Reg2, StallCycles); +      Weight += MLI->getLoopDepth(MI.getParent()) * 10; + +      LLVM_DEBUG(dbgs() << "Stall weight = " << Weight << '\n'); + +      unsigned FreeBanks1 = getFreeBanks(Reg1, SubReg1, Mask1, UsedBanks); +      unsigned FreeBanks2 = getFreeBanks(Reg2, SubReg2, Mask2, UsedBanks); +      if (FreeBanks1) +        Candidates.push(Candidate(&MI, Reg1, FreeBanks1, Weight +                                    + ((Size2 > Size1) ? 1 : 0))); +      if (FreeBanks2) +        Candidates.push(Candidate(&MI, Reg2, FreeBanks2, Weight +                                    + ((Size1 > Size2) ? 1 : 0))); +    } +  } +} + +unsigned GCNRegBankReassign::computeStallCycles(unsigned SrcReg, +                                                unsigned Reg, int Bank, +                                                bool Collect) { +  unsigned TotalStallCycles = 0; +  unsigned UsedBanks = 0; +  SmallSet<const MachineInstr *, 16> Visited; + +  for (auto &MI : MRI->use_nodbg_instructions(SrcReg)) { +    if (MI.isBundle()) +      continue; +    if (!Visited.insert(&MI).second) +      continue; +    unsigned StallCycles = analyzeInst(MI, UsedBanks, Reg, Bank); +    TotalStallCycles += StallCycles; +    if (Collect) +      collectCandidates(MI, UsedBanks, StallCycles); +  } + +  return TotalStallCycles; +} + +unsigned GCNRegBankReassign::scavengeReg(LiveInterval& LI, +                                         unsigned Bank) const { +  const TargetRegisterClass *RC = MRI->getRegClass(LI.reg); +  unsigned MaxNumRegs = (Bank < NUM_VGPR_BANKS) ? MaxNumVGPRs +                                                : MaxNumSGPRs; +  unsigned MaxReg = MaxNumRegs + (Bank < NUM_VGPR_BANKS ? AMDGPU::VGPR0 +                                                        : AMDGPU::SGPR0); + +  for (unsigned Reg : RC->getRegisters()) { +    // Check occupancy limit. +    if (TRI->isSubRegisterEq(Reg, MaxReg)) +      break; + +    if (!MRI->isAllocatable(Reg) || getPhysRegBank(Reg) != Bank) +      continue; + +    for (unsigned I = 0; CSRegs[I]; ++I) +      if (TRI->isSubRegisterEq(Reg, CSRegs[I]) && +          !LRM->isPhysRegUsed(CSRegs[I])) +        return AMDGPU::NoRegister; + +    LLVM_DEBUG(dbgs() << "Trying register " << printReg(Reg) << '\n'); + +    if (!LRM->checkInterference(LI, Reg)) +      return Reg; +  } + +  return AMDGPU::NoRegister; +} + +unsigned GCNRegBankReassign::tryReassign(Candidate &C) { +  if (!LIS->hasInterval(C.Reg)) +    return 0; + +  LiveInterval &LI = LIS->getInterval(C.Reg); +  LLVM_DEBUG(dbgs() << "Try reassign " << printReg(C.Reg) << " in "; C.MI->dump(); +             LI.dump()); + +  // For each candidate bank walk all instructions in the range of live +  // interval and check if replacing the register with one belonging to +  // the candidate bank reduces conflicts. + +  unsigned OrigStalls = computeStallCycles(C.Reg); +  LLVM_DEBUG(dbgs() << "--- Stall cycles in range = " << OrigStalls << '\n'); +  if (!OrigStalls) +    return 0; + +  struct BankStall { +    BankStall(unsigned b, unsigned s) : Bank(b), Stalls(s) {}; +    bool operator< (const BankStall &RHS) const { return Stalls > RHS.Stalls; } +    unsigned Bank; +    unsigned Stalls; +  }; +  SmallVector<BankStall, 8> BankStalls; + +  for (int Bank = 0; Bank < NUM_BANKS; ++Bank) { +    if (C.FreeBanks & (1 << Bank)) { +      LLVM_DEBUG(dbgs() << "Trying bank " << printBank(Bank) << '\n'); +      unsigned Stalls = computeStallCycles(C.Reg, C.Reg, Bank); +      if (Stalls < OrigStalls) { +        LLVM_DEBUG(dbgs() << "With bank " << printBank(Bank) << " -> " +                     << Stalls << '\n'); +        BankStalls.push_back(BankStall((unsigned)Bank, Stalls)); +      } +    } +  } +  std::sort(BankStalls.begin(), BankStalls.end()); + +  unsigned OrigReg = VRM->getPhys(C.Reg); +  LRM->unassign(LI); +  while (!BankStalls.empty()) { +    BankStall BS = BankStalls.pop_back_val(); +    unsigned Reg = scavengeReg(LI, BS.Bank); +    if (Reg == AMDGPU::NoRegister) { +      LLVM_DEBUG(dbgs() << "No free registers in bank " << printBank(BS.Bank) +                   << '\n'); +      continue; +    } +    LLVM_DEBUG(dbgs() << "Found free register " << printReg(Reg) +                 << (LRM->isPhysRegUsed(Reg) ? "" : " (new)") +                 << " in bank " << printBank(BS.Bank) << '\n'); + +    LRM->assign(LI, Reg); + +    LLVM_DEBUG(dbgs() << "--- Cycles saved: " << OrigStalls - BS.Stalls << '\n'); + +    return OrigStalls - BS.Stalls; +  } +  LRM->assign(LI, OrigReg); + +  return 0; +} + +unsigned GCNRegBankReassign::collectCandidates(MachineFunction &MF, +                                               bool Collect) { +  unsigned TotalStallCycles = 0; + +  for (MachineBasicBlock &MBB : MF) { + +    LLVM_DEBUG(if (Collect) { +            if (MBB.getName().empty()) dbgs() << "bb." << MBB.getNumber(); +            else dbgs() << MBB.getName(); dbgs() << ":\n"; +          }); + +    for (MachineInstr &MI : MBB.instrs()) { +      if (MI.isBundle()) +          continue; // we analyze the instructions inside the bundle individually + +      unsigned UsedBanks = 0; +      unsigned StallCycles = analyzeInst(MI, UsedBanks); + +      if (Collect) +        collectCandidates(MI, UsedBanks, StallCycles); + +      TotalStallCycles += StallCycles; +    } + +    LLVM_DEBUG(if (Collect) { dbgs() << '\n'; }); +  } + +  return TotalStallCycles; +} + +void GCNRegBankReassign::removeCandidates(unsigned Reg) { +  Candidates.remove_if([Reg, this](const Candidate& C) { +    return C.MI->readsRegister(Reg, TRI); +  }); +} + +bool GCNRegBankReassign::verifyCycles(MachineFunction &MF, +                                      unsigned OriginalCycles, +                                      unsigned CyclesSaved) { +  unsigned StallCycles = collectCandidates(MF, false); +  LLVM_DEBUG(dbgs() << "=== After the pass " << StallCycles +               << " stall cycles left\n"); +  return StallCycles + CyclesSaved == OriginalCycles; +} + +bool GCNRegBankReassign::runOnMachineFunction(MachineFunction &MF) { +  ST = &MF.getSubtarget<GCNSubtarget>(); +  if (!ST->hasRegisterBanking() || skipFunction(MF.getFunction())) +    return false; + +  MRI = &MF.getRegInfo(); +  TRI = ST->getRegisterInfo(); +  MLI = &getAnalysis<MachineLoopInfo>(); +  VRM = &getAnalysis<VirtRegMap>(); +  LRM = &getAnalysis<LiveRegMatrix>(); +  LIS = &getAnalysis<LiveIntervals>(); + +  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); +  unsigned Occupancy = MFI->getOccupancy(); +  MaxNumVGPRs = ST->getMaxNumVGPRs(MF); +  MaxNumSGPRs = ST->getMaxNumSGPRs(MF); +  MaxNumVGPRs = std::min(ST->getMaxNumVGPRs(Occupancy), MaxNumVGPRs); +  MaxNumSGPRs = std::min(ST->getMaxNumSGPRs(Occupancy, true), MaxNumSGPRs); + +  CSRegs = TRI->getCalleeSavedRegs(&MF); + +  RegsUsed.resize(AMDGPU::VGPR_32RegClass.getNumRegs() + +                  TRI->getEncodingValue(AMDGPU::SGPR_NULL) / 2 + 1); + +  LLVM_DEBUG(dbgs() << "=== RegBanks reassign analysis on function " << MF.getName() +               << '\n'); + +  unsigned StallCycles = collectCandidates(MF); +  NumStallsDetected += StallCycles; + +  LLVM_DEBUG(dbgs() << "=== " << StallCycles << " stall cycles detected in " +                  "function " << MF.getName() << '\n'); + +  Candidates.sort(); + +  LLVM_DEBUG(dbgs() << "\nCandidates:\n\n"; +        for (auto C : Candidates) C.dump(this); +        dbgs() << "\n\n"); + +  unsigned CyclesSaved = 0; +  while (!Candidates.empty()) { +    Candidate C = Candidates.back(); +    unsigned LocalCyclesSaved = tryReassign(C); +    CyclesSaved += LocalCyclesSaved; + +    if (VerifyStallCycles > 1 && !verifyCycles(MF, StallCycles, CyclesSaved)) +      report_fatal_error("RegBank reassign stall cycles verification failed."); + +    Candidates.pop_back(); +    if (LocalCyclesSaved) { +      removeCandidates(C.Reg); +      computeStallCycles(C.Reg, AMDGPU::NoRegister, -1, true); +      Candidates.sort(); + +      LLVM_DEBUG(dbgs() << "\nCandidates:\n\n"; +            for (auto C : Candidates) +              C.dump(this); +            dbgs() << "\n\n"); +    } +  } +  NumStallsRecovered += CyclesSaved; + +  LLVM_DEBUG(dbgs() << "=== After the pass " << CyclesSaved +               << " cycles saved in function " << MF.getName() << '\n'); + +  Candidates.clear(); + +  if (VerifyStallCycles == 1 && !verifyCycles(MF, StallCycles, CyclesSaved)) +    report_fatal_error("RegBank reassign stall cycles verification failed."); + +  RegsUsed.clear(); + +  return CyclesSaved > 0; +} | 

