| author | Matt Arsenault <Matthew.Arsenault@amd.com> | 2019-07-08 19:03:38 +0000 |
|---|---|---|
| committer | Matt Arsenault <Matthew.Arsenault@amd.com> | 2019-07-08 19:03:38 +0000 |
| commit | 71dfb7ec5c2ca8a433e05385047a36a9d4ef53e4 | |
| tree | 160cce173c40a62b1aa78731c2d7894dcb142f22 /llvm/lib/Target | |
| parent | ed499a36b67cf46cbf66052cfe374c80a595f1c1 | |
AMDGPU: Make s34 the FP register
Make the FP register callee saved.
This is tricky because now the FP needs to be spilled in the prolog
relative to the incoming SP register, rather than the frame register
used throughout the rest of the function. I don't like how this
bypasses the standard mechanism for CSR spills just to get the
correct insert point. I may look for a better solution, since all CSR
VGPRs may also need to have all lanes activated. Another option might
be to make getFrameIndexReference change the base register if the
frame index is a CSR, and then try to figure out the right insertion
point in emitProlog.
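Concretely, the old FP value has to be saved before the new FP is established, so the prologue addresses these saves off the incoming SP (s32) rather than going through getFrameRegister. Below is a condensed sketch of the order emitPrologue now emits things in, lifted from the patch further down; the LiveRegs bookkeeping, assertions, and EXEC-mask handling are elided:

```cpp
// 1. If a free call-clobbered SGPR was found, save the old FP with a copy.
if (FuncInfo->SGPRForFPSaveRestoreCopy != AMDGPU::NoRegister)
  BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY),
          FuncInfo->SGPRForFPSaveRestoreCopy)
      .addReg(FramePtrReg)
      .setMIFlag(MachineInstr::FrameSetup);

// 2. Spill CSR VGPRs with buildPrologSpill, which addresses the slot off the
//    incoming SP (StackPtrReg), since the frame register is not valid yet.

// 3. Otherwise, write the old FP into its reserved lane of a spill VGPR.
if (FuncInfo->FramePointerSaveIndex) {
  const int FI = FuncInfo->FramePointerSaveIndex.getValue();
  auto Spill = FuncInfo->getSGPRToVGPRSpills(FI).front();
  BuildMI(MBB, MBBI, DL,
          TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32), Spill.VGPR)
      .addReg(FramePtrReg) // old FP value, still live here
      .addImm(Spill.Lane)
      .addReg(Spill.VGPR, RegState::Undef);
}

// 4. Only now is the new FP established (s34 = s32, possibly realigned).
```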
If there is a free VGPR lane available for SGPR spilling, try to use
it for the FP. If that would require introducing a new VGPR spill,
try to use a free call-clobbered SGPR. Only fall back to introducing a
new VGPR spill as a last resort.
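A minimal sketch of that fallback chain, condensed from the determineCalleeSaves changes in the diff below (debug output and the surrounding HasFP checks elided):

```cpp
// Pick a save location for the FP (s34), cheapest option first.
if (MFI->haveFreeLanesForSGPRSpill(MF, 1)) {
  // 1. A VGPR already used for SGPR spilling has a free lane; its CSR cost
  //    is already being paid, so reuse it.
  int NewFI = MF.getFrameInfo().CreateStackObject(4, 4, true, nullptr,
                                                  TargetStackID::SGPRSpill);
  MFI->allocateSGPRSpillToVGPR(MF, NewFI);
  MFI->FramePointerSaveIndex = NewFI;
} else if ((MFI->SGPRForFPSaveRestoreCopy =
                findUnusedSGPRNonCalleeSaved(MF.getRegInfo()))) {
  // 2. A call-clobbered SGPR is free for the whole function; save the FP
  //    with a plain copy instead of touching a VGPR.
} else {
  // 3. Last resort: introduce a new VGPR spill just to hold the FP.
  int NewFI = MF.getFrameInfo().CreateStackObject(4, 4, true, nullptr,
                                                  TargetStackID::SGPRSpill);
  MFI->allocateSGPRSpillToVGPR(MF, NewFI);
  MFI->FramePointerSaveIndex = NewFI;
}
```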
This also doesn't attempt to handle SGPR spilling with scalar stores.
llvm-svn: 365372
Diffstat (limited to 'llvm/lib/Target')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIFrameLowering.cpp | 508 |
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIFrameLowering.h | 4 |
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 17 |
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 7 |
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp | 34 |
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h | 8 |
6 files changed, 436 insertions, 142 deletions
```diff
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index 7b3105b6790..25d6b2fd7c4 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -21,6 +21,9 @@
 using namespace llvm;
 
+#define DEBUG_TYPE "frame-info"
+
+
 static ArrayRef<MCPhysReg> getAllSGPR128(const GCNSubtarget &ST,
                                          const MachineFunction &MF) {
   return makeArrayRef(AMDGPU::SGPR_128RegClass.begin(),
@@ -33,6 +36,150 @@ static ArrayRef<MCPhysReg> getAllSGPRs(const GCNSubtarget &ST,
                      ST.getMaxNumSGPRs(MF));
 }
 
+// Find a scratch register that we can use at the start of the prologue to
+// re-align the stack pointer. We avoid using callee-save registers since they
+// may appear to be free when this is called from canUseAsPrologue (during
+// shrink wrapping), but then no longer be free when this is called from
+// emitPrologue.
+//
+// FIXME: This is a bit conservative, since in the above case we could use one
+// of the callee-save registers as a scratch temp to re-align the stack pointer,
+// but we would then have to make sure that we were in fact saving at least one
+// callee-save register in the prologue, which is additional complexity that
+// doesn't seem worth the benefit.
+static unsigned findScratchNonCalleeSaveRegister(MachineRegisterInfo &MRI,
+                                                 LivePhysRegs &LiveRegs,
+                                                 const TargetRegisterClass &RC,
+                                                 bool Unused = false) {
+  // Mark callee saved registers as used so we will not choose them.
+  const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
+  for (unsigned i = 0; CSRegs[i]; ++i)
+    LiveRegs.addReg(CSRegs[i]);
+
+  if (Unused) {
+    // We are looking for a register that can be used throughout the entire
+    // function, so any use is unacceptable.
+    for (unsigned Reg : RC) {
+      if (!MRI.isPhysRegUsed(Reg) && LiveRegs.available(MRI, Reg))
+        return Reg;
+    }
+  } else {
+    for (unsigned Reg : RC) {
+      if (LiveRegs.available(MRI, Reg))
+        return Reg;
+    }
+  }
+
+  // If we require an unused register, this is used in contexts where failure
+  // is an option and has an alternative plan. In other contexts, this must
+  // succeed.
+  if (!Unused)
+    report_fatal_error("failed to find free scratch register");
+
+  return AMDGPU::NoRegister;
+}
+
+static MCPhysReg findUnusedSGPRNonCalleeSaved(MachineRegisterInfo &MRI) {
+  LivePhysRegs LiveRegs;
+  LiveRegs.init(*MRI.getTargetRegisterInfo());
+  return findScratchNonCalleeSaveRegister(
+    MRI, LiveRegs, AMDGPU::SReg_32_XM0_XEXECRegClass, true);
+}
+
+// We need to specially emit stack operations here because a different frame
+// register is used than in the rest of the function, as getFrameRegister
+// would use.
+static void buildPrologSpill(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB,
+                             MachineBasicBlock::iterator I,
+                             const SIInstrInfo *TII, unsigned SpillReg,
+                             unsigned ScratchRsrcReg, unsigned SPReg, int FI) {
+  MachineFunction *MF = MBB.getParent();
+  MachineFrameInfo &MFI = MF->getFrameInfo();
+
+  int64_t Offset = MFI.getObjectOffset(FI);
+
+  MachineMemOperand *MMO = MF->getMachineMemOperand(
+    MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOStore, 4,
+    MFI.getObjectAlignment(FI));
+
+  if (isUInt<12>(Offset)) {
+    BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::BUFFER_STORE_DWORD_OFFSET))
+      .addReg(SpillReg, RegState::Kill)
+      .addReg(ScratchRsrcReg)
+      .addReg(SPReg)
+      .addImm(Offset)
+      .addImm(0) // glc
+      .addImm(0) // slc
+      .addImm(0) // tfe
+      .addImm(0) // dlc
+      .addMemOperand(MMO);
+    return;
+  }
+
+  MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister(
+    MF->getRegInfo(), LiveRegs, AMDGPU::VGPR_32RegClass);
+
+  BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), OffsetReg)
+    .addImm(Offset);
+
+  BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::BUFFER_STORE_DWORD_OFFEN))
+    .addReg(SpillReg, RegState::Kill)
+    .addReg(OffsetReg, RegState::Kill)
+    .addReg(ScratchRsrcReg)
+    .addReg(SPReg)
+    .addImm(0)
+    .addImm(0) // glc
+    .addImm(0) // slc
+    .addImm(0) // tfe
+    .addImm(0) // dlc
+    .addMemOperand(MMO);
+}
+
+static void buildEpilogReload(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator I,
+                              const SIInstrInfo *TII, unsigned SpillReg,
+                              unsigned ScratchRsrcReg, unsigned SPReg, int FI) {
+  MachineFunction *MF = MBB.getParent();
+  MachineFrameInfo &MFI = MF->getFrameInfo();
+  int64_t Offset = MFI.getObjectOffset(FI);
+
+  MachineMemOperand *MMO = MF->getMachineMemOperand(
+    MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOLoad, 4,
+    MFI.getObjectAlignment(FI));
+
+  if (isUInt<12>(Offset)) {
+    BuildMI(MBB, I, DebugLoc(),
+            TII->get(AMDGPU::BUFFER_LOAD_DWORD_OFFSET), SpillReg)
+      .addReg(ScratchRsrcReg)
+      .addReg(SPReg)
+      .addImm(Offset)
+      .addImm(0) // glc
+      .addImm(0) // slc
+      .addImm(0) // tfe
+      .addImm(0) // dlc
+      .addMemOperand(MMO);
+    return;
+  }
+
+  MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister(
+    MF->getRegInfo(), LiveRegs, AMDGPU::VGPR_32RegClass);
+
+  BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), OffsetReg)
+    .addImm(Offset);
+
+  BuildMI(MBB, I, DebugLoc(),
+          TII->get(AMDGPU::BUFFER_LOAD_DWORD_OFFEN), SpillReg)
+    .addReg(OffsetReg, RegState::Kill)
+    .addReg(ScratchRsrcReg)
+    .addReg(SPReg)
+    .addImm(0)
+    .addImm(0) // glc
+    .addImm(0) // slc
+    .addImm(0) // tfe
+    .addImm(0) // dlc
+    .addMemOperand(MMO);
+}
+
 void SIFrameLowering::emitFlatScratchInit(const GCNSubtarget &ST,
                                           MachineFunction &MF,
                                           MachineBasicBlock &MBB) const {
@@ -511,35 +658,6 @@ void SIFrameLowering::emitEntryFunctionScratchSetup(const GCNSubtarget &ST,
   }
 }
 
-// Find a scratch register that we can use at the start of the prologue to
-// re-align the stack pointer. We avoid using callee-save registers since they
-// may appear to be free when this is called from canUseAsPrologue (during
-// shrink wrapping), but then no longer be free when this is called from
-// emitPrologue.
-//
-// FIXME: This is a bit conservative, since in the above case we could use one
-// of the callee-save registers as a scratch temp to re-align the stack pointer,
-// but we would then have to make sure that we were in fact saving at least one
-// callee-save register in the prologue, which is additional complexity that
-// doesn't seem worth the benefit.
-static unsigned findScratchNonCalleeSaveRegister(MachineFunction &MF,
-                                                 LivePhysRegs &LiveRegs,
-                                                 const TargetRegisterClass &RC) {
-  MachineRegisterInfo &MRI = MF.getRegInfo();
-
-  // Mark callee saved registers as used so we will not choose them.
-  const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
-  for (unsigned i = 0; CSRegs[i]; ++i)
-    LiveRegs.addReg(CSRegs[i]);
-
-  for (unsigned Reg : RC) {
-    if (LiveRegs.available(MRI, Reg))
-      return Reg;
-  }
-
-  return AMDGPU::NoRegister;
-}
-
 bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const {
   switch (ID) {
   case TargetStackID::Default:
@@ -559,6 +677,7 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
   }
 
   const MachineFrameInfo &MFI = MF.getFrameInfo();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   const SIInstrInfo *TII = ST.getInstrInfo();
   const SIRegisterInfo &TRI = TII->getRegisterInfo();
@@ -573,20 +692,90 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
   bool HasFP = false;
   uint32_t NumBytes = MFI.getStackSize();
   uint32_t RoundedSize = NumBytes;
+  // To avoid clobbering VGPRs in lanes that weren't active on function entry,
+  // turn on all lanes before doing the spill to memory.
+  unsigned ScratchExecCopy = AMDGPU::NoRegister;
+
+  // Emit the copy if we need an FP, and are using a free SGPR to save it.
+  if (FuncInfo->SGPRForFPSaveRestoreCopy != AMDGPU::NoRegister) {
+    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY),
+            FuncInfo->SGPRForFPSaveRestoreCopy)
+      .addReg(FramePtrReg)
+      .setMIFlag(MachineInstr::FrameSetup);
+  }
+
+  for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
+         : FuncInfo->getSGPRSpillVGPRs()) {
+    if (!Reg.FI.hasValue())
+      continue;
+
+    if (ScratchExecCopy == AMDGPU::NoRegister) {
+      if (LiveRegs.empty()) {
+        LiveRegs.init(TRI);
+        LiveRegs.addLiveIns(MBB);
+        if (FuncInfo->SGPRForFPSaveRestoreCopy)
+          LiveRegs.removeReg(FuncInfo->SGPRForFPSaveRestoreCopy);
+      }
+
+      ScratchExecCopy
+        = findScratchNonCalleeSaveRegister(MRI, LiveRegs,
+                                           *TRI.getWaveMaskRegClass());
+      assert(FuncInfo->SGPRForFPSaveRestoreCopy != ScratchExecCopy);
+
+      const unsigned OrSaveExec = ST.isWave32() ?
+        AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
+      BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec),
+              ScratchExecCopy)
+        .addImm(-1);
+    }
+
+    buildPrologSpill(LiveRegs, MBB, MBBI, TII, Reg.VGPR,
+                     FuncInfo->getScratchRSrcReg(),
+                     StackPtrReg,
+                     Reg.FI.getValue());
+  }
+
+  if (ScratchExecCopy != AMDGPU::NoRegister) {
+    // FIXME: Split block and make terminator.
+    unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+    unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+    BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec)
+      .addReg(ScratchExecCopy, RegState::Kill);
+    LiveRegs.addReg(ScratchExecCopy);
+  }
+
+
+  if (FuncInfo->FramePointerSaveIndex) {
+    const int FI = FuncInfo->FramePointerSaveIndex.getValue();
+    assert(!MFI.isDeadObjectIndex(FI) &&
+           MFI.getStackID(FI) == TargetStackID::SGPRSpill);
+    ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill
+      = FuncInfo->getSGPRToVGPRSpills(FI);
+    assert(Spill.size() == 1);
+
+    // Save FP before setting it up.
+    // FIXME: This should respect spillSGPRToVGPR.
+    BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
+            Spill[0].VGPR)
+      .addReg(FramePtrReg)
+      .addImm(Spill[0].Lane)
+      .addReg(Spill[0].VGPR, RegState::Undef);
+  }
 
   if (TRI.needsStackRealignment(MF)) {
     HasFP = true;
     const unsigned Alignment = MFI.getMaxAlignment();
 
     RoundedSize += Alignment;
+    if (LiveRegs.empty()) {
+      LiveRegs.init(TRI);
+      LiveRegs.addLiveIns(MBB);
+      LiveRegs.addReg(FuncInfo->SGPRForFPSaveRestoreCopy);
+    }
 
-    LiveRegs.init(TRI);
-    LiveRegs.addLiveIns(MBB);
-
-    unsigned ScratchSPReg
-      = findScratchNonCalleeSaveRegister(MF, LiveRegs,
-                                         AMDGPU::SReg_32_XM0RegClass);
-    assert(ScratchSPReg != AMDGPU::NoRegister);
+    unsigned ScratchSPReg = findScratchNonCalleeSaveRegister(
+      MRI, LiveRegs, AMDGPU::SReg_32_XM0RegClass);
+    assert(ScratchSPReg != AMDGPU::NoRegister &&
+           ScratchSPReg != FuncInfo->SGPRForFPSaveRestoreCopy);
 
     // s_add_u32 tmp_reg, s32, NumBytes
     // s_and_b32 s32, tmp_reg, 0b111...0000
@@ -616,44 +805,13 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
       .setMIFlag(MachineInstr::FrameSetup);
   }
 
-  // To avoid clobbering VGPRs in lanes that weren't active on function entry,
-  // turn on all lanes before doing the spill to memory.
-  unsigned ScratchExecCopy = AMDGPU::NoRegister;
-
-  for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
-         : FuncInfo->getSGPRSpillVGPRs()) {
-    if (!Reg.FI.hasValue())
-      continue;
-
-    if (ScratchExecCopy == AMDGPU::NoRegister) {
-      if (LiveRegs.empty()) {
-        LiveRegs.init(TRI);
-        LiveRegs.addLiveIns(MBB);
-      }
-
-      ScratchExecCopy
-        = findScratchNonCalleeSaveRegister(MF, LiveRegs,
-                                           *TRI.getWaveMaskRegClass());
-
-      const unsigned OrSaveExec = ST.isWave32() ?
-        AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
-      BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec),
-              ScratchExecCopy)
-        .addImm(-1);
-    }
-
-    TII->storeRegToStackSlot(MBB, MBBI, Reg.VGPR, true,
-                             Reg.FI.getValue(), &AMDGPU::VGPR_32RegClass,
-                             &TII->getRegisterInfo());
-  }
+  assert(!HasFP || (FuncInfo->SGPRForFPSaveRestoreCopy != AMDGPU::NoRegister ||
+                    FuncInfo->FramePointerSaveIndex) &&
+         "Needed to save FP but didn't save it anywhere");
 
-  if (ScratchExecCopy != AMDGPU::NoRegister) {
-    // FIXME: Split block and make terminator.
-    unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
-    unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
-    BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec)
-      .addReg(ScratchExecCopy);
-  }
+  assert(HasFP || (FuncInfo->SGPRForFPSaveRestoreCopy == AMDGPU::NoRegister &&
+                   !FuncInfo->FramePointerSaveIndex) &&
+         "Saved FP but didn't need it");
 }
 
 void SIFrameLowering::emitEpilogue(MachineFunction &MF,
@@ -664,9 +822,45 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
 
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   const SIInstrInfo *TII = ST.getInstrInfo();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
   MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
+  LivePhysRegs LiveRegs;
   DebugLoc DL;
 
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+  uint32_t NumBytes = MFI.getStackSize();
+  uint32_t RoundedSize = FuncInfo->isStackRealigned() ?
+    NumBytes + MFI.getMaxAlignment() : NumBytes;
+
+  if (RoundedSize != 0 && hasFP(MF)) {
+    const unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg();
+    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_SUB_U32), StackPtrReg)
+      .addReg(StackPtrReg)
+      .addImm(RoundedSize * ST.getWavefrontSize())
+      .setMIFlag(MachineInstr::FrameDestroy);
+  }
+
+  if (FuncInfo->SGPRForFPSaveRestoreCopy != AMDGPU::NoRegister) {
+    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY),
+            FuncInfo->getFrameOffsetReg())
+      .addReg(FuncInfo->SGPRForFPSaveRestoreCopy)
+      .setMIFlag(MachineInstr::FrameSetup);
+  }
+
+  if (FuncInfo->FramePointerSaveIndex) {
+    const int FI = FuncInfo->FramePointerSaveIndex.getValue();
+    const MachineFrameInfo &MFI = MF.getFrameInfo();
+
+    assert(!MFI.isDeadObjectIndex(FI));
+    assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
+    ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill
+      = FuncInfo->getSGPRToVGPRSpills(FI);
+    assert(Spill.size() == 1);
+    BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
+            FuncInfo->getFrameOffsetReg())
+      .addReg(Spill[0].VGPR)
+      .addImm(Spill[0].Lane);
+  }
+
   unsigned ScratchExecCopy = AMDGPU::NoRegister;
   for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
          : FuncInfo->getSGPRSpillVGPRs()) {
@@ -676,24 +870,26 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
     const SIRegisterInfo &TRI = TII->getRegisterInfo();
     if (ScratchExecCopy == AMDGPU::NoRegister) {
       // See emitPrologue
-      LivePhysRegs LiveRegs(*ST.getRegisterInfo());
-      LiveRegs.addLiveOuts(MBB);
-      LiveRegs.stepBackward(*MBBI);
+      if (LiveRegs.empty()) {
+        LiveRegs.init(*ST.getRegisterInfo());
+        LiveRegs.addLiveOuts(MBB);
+        LiveRegs.stepBackward(*MBBI);
+      }
 
-      ScratchExecCopy
-        = findScratchNonCalleeSaveRegister(MF, LiveRegs,
-                                           *TRI.getWaveMaskRegClass());
+      ScratchExecCopy = findScratchNonCalleeSaveRegister(
+        MRI, LiveRegs, *TRI.getWaveMaskRegClass());
+      LiveRegs.removeReg(ScratchExecCopy);
 
-      const unsigned OrSaveExec = ST.isWave32() ?
-        AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
+      const unsigned OrSaveExec =
+        ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
       BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), ScratchExecCopy)
         .addImm(-1);
     }
 
-    TII->loadRegFromStackSlot(MBB, MBBI, Reg.VGPR,
-                              Reg.FI.getValue(), &AMDGPU::VGPR_32RegClass,
-                              &TII->getRegisterInfo());
+    buildEpilogReload(LiveRegs, MBB, MBBI, TII, Reg.VGPR,
+                      FuncInfo->getScratchRSrcReg(),
+                      FuncInfo->getStackPtrOffsetReg(), Reg.FI.getValue());
   }
 
   if (ScratchExecCopy != AMDGPU::NoRegister) {
@@ -701,25 +897,12 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
     unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
     unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
     BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec)
-      .addReg(ScratchExecCopy);
-  }
-
-  const MachineFrameInfo &MFI = MF.getFrameInfo();
-  uint32_t NumBytes = MFI.getStackSize();
-  uint32_t RoundedSize = FuncInfo->isStackRealigned() ?
-    NumBytes + MFI.getMaxAlignment() : NumBytes;
-
-  if (RoundedSize != 0 && hasFP(MF)) {
-    const unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg();
-    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_SUB_U32), StackPtrReg)
-      .addReg(StackPtrReg)
-      .addImm(RoundedSize * ST.getWavefrontSize())
-      .setMIFlag(MachineInstr::FrameDestroy);
+      .addReg(ScratchExecCopy, RegState::Kill);
   }
 }
 
 // Note SGPRSpill stack IDs should only be used for SGPR spilling to VGPRs, not
-// memory.
+// memory. They should have been removed by now.
 static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) {
   for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
        I != E; ++I) {
@@ -730,6 +913,23 @@ static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) {
   return true;
 }
 
+
+#ifndef NDEBUG
+static bool allSGPRSpillsAreDead(const MachineFrameInfo &MFI,
+                                 Optional<int> FramePointerSaveIndex) {
+  for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
+       I != E; ++I) {
+    if (!MFI.isDeadObjectIndex(I) &&
+        MFI.getStackID(I) == TargetStackID::SGPRSpill &&
+        FramePointerSaveIndex && I != FramePointerSaveIndex) {
+      return false;
+    }
+  }
+
+  return true;
+}
+#endif
+
 int SIFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
                                             unsigned &FrameReg) const {
   const SIRegisterInfo *RI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
@@ -743,15 +943,12 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized(
   RegScavenger *RS) const {
   MachineFrameInfo &MFI = MF.getFrameInfo();
 
-  if (!MFI.hasStackObjects())
-    return;
-
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
-  const SIInstrInfo *TII = ST.getInstrInfo();
-  const SIRegisterInfo &TRI = TII->getRegisterInfo();
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
   SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
 
-  FuncInfo->removeSGPRToVGPRFrameIndices(MFI);
+  assert(allSGPRSpillsAreDead(MFI, None) &&
+         "SGPR spill should have been removed in SILowerSGPRSpills");
 
   // FIXME: The other checks should be redundant with allStackObjectsAreDead,
   // but currently hasNonSpillStackObjects is set only from source
@@ -761,12 +958,12 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized(
 
     if (FuncInfo->isEntryFunction()) {
       int ScavengeFI = MFI.CreateFixedObject(
-        TRI.getSpillSize(AMDGPU::SGPR_32RegClass), 0, false);
+        TRI->getSpillSize(AMDGPU::SGPR_32RegClass), 0, false);
       RS->addScavengingFrameIndex(ScavengeFI);
     } else {
       int ScavengeFI = MFI.CreateStackObject(
-        TRI.getSpillSize(AMDGPU::SGPR_32RegClass),
-        TRI.getSpillAlignment(AMDGPU::SGPR_32RegClass),
+        TRI->getSpillSize(AMDGPU::SGPR_32RegClass),
+        TRI->getSpillAlignment(AMDGPU::SGPR_32RegClass),
         false);
       RS->addScavengingFrameIndex(ScavengeFI);
     }
@@ -775,17 +972,76 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized(
 
 // Only report VGPRs to generic code.
 void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
-                                           BitVector &SavedRegs,
+                                           BitVector &SavedVGPRs,
                                            RegScavenger *RS) const {
-  TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
+  TargetFrameLowering::determineCalleeSaves(MF, SavedVGPRs, RS);
+
+  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   const SIRegisterInfo *TRI = ST.getRegisterInfo();
-  SavedRegs.clearBitsNotInMask(TRI->getAllVGPRRegMask());
 
-  // VGPRs used for SGPR spilling need to be specially inserted in the prolog.
-  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  // Ignore the SGPRs the default implementation found.
+  SavedVGPRs.clearBitsNotInMask(TRI->getAllVGPRRegMask());
+
+  // hasFP only knows about stack objects that already exist. We're now
+  // determining the stack slots that will be created, so we have to predict
+  // them. Stack objects force FP usage with calls.
+  //
+  // Note a new VGPR CSR may be introduced if one is used for the spill, but we
+  // don't want to report it here.
+  //
+  // FIXME: Is this really hasReservedCallFrame?
+  const bool WillHaveFP =
+    FrameInfo.hasCalls() &&
+    (SavedVGPRs.any() || !allStackObjectsAreDead(FrameInfo));
+
+  // VGPRs used for SGPR spilling need to be specially inserted in the prolog,
+  // so don't allow the default insertion to handle them.
   for (auto SSpill : MFI->getSGPRSpillVGPRs())
-    SavedRegs.reset(SSpill.VGPR);
+    SavedVGPRs.reset(SSpill.VGPR);
+
+  const bool HasFP = WillHaveFP || hasFP(MF);
+  if (!HasFP)
+    return;
+
+  if (MFI->haveFreeLanesForSGPRSpill(MF, 1)) {
+    int NewFI = MF.getFrameInfo().CreateStackObject(4, 4, true, nullptr,
+                                                    TargetStackID::SGPRSpill);
+
+    // If there is already a VGPR with free lanes, use it. We may already have
+    // to pay the penalty for spilling a CSR VGPR.
+    if (!MFI->allocateSGPRSpillToVGPR(MF, NewFI))
+      llvm_unreachable("allocate SGPR spill should have worked");
+
+    MFI->FramePointerSaveIndex = NewFI;
+
+    LLVM_DEBUG(
+      auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front();
+      dbgs() << "Spilling FP to " << printReg(Spill.VGPR, TRI)
+             << ':' << Spill.Lane << '\n');
+    return;
+  }
+
+  MFI->SGPRForFPSaveRestoreCopy = findUnusedSGPRNonCalleeSaved(MF.getRegInfo());
+
+  if (!MFI->SGPRForFPSaveRestoreCopy) {
+    // There's no free lane to spill, and no free register to save FP, so we're
+    // forced to spill another VGPR to use for the spill.
+    int NewFI = MF.getFrameInfo().CreateStackObject(4, 4, true, nullptr,
+                                                    TargetStackID::SGPRSpill);
+    if (!MFI->allocateSGPRSpillToVGPR(MF, NewFI))
+      llvm_unreachable("allocate SGPR spill should have worked");
+    MFI->FramePointerSaveIndex = NewFI;
+
+    LLVM_DEBUG(
+      auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front();
+      dbgs() << "FP requires fallback spill to " << printReg(Spill.VGPR, TRI)
+             << ':' << Spill.Lane << '\n';);
+  } else {
+    LLVM_DEBUG(dbgs() << "Saving FP with copy to " <<
+               printReg(MFI->SGPRForFPSaveRestoreCopy, TRI) << '\n');
+  }
 }
 
 void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF,
@@ -802,6 +1058,27 @@ void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF,
   SavedRegs.clearBitsInMask(TRI->getAllVGPRRegMask());
 }
 
+bool SIFrameLowering::assignCalleeSavedSpillSlots(
+    MachineFunction &MF, const TargetRegisterInfo *TRI,
+    std::vector<CalleeSavedInfo> &CSI) const {
+  if (CSI.empty())
+    return true; // Early exit if no callee saved registers are modified!
+
+  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+  if (!FuncInfo->SGPRForFPSaveRestoreCopy)
+    return false;
+
+  for (auto &CS : CSI) {
+    if (CS.getReg() == FuncInfo->getFrameOffsetReg()) {
+      if (FuncInfo->SGPRForFPSaveRestoreCopy != AMDGPU::NoRegister)
+        CS.setDstReg(FuncInfo->SGPRForFPSaveRestoreCopy);
+      break;
+    }
+  }
+
+  return false;
+}
+
 MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
   MachineFunction &MF,
   MachineBasicBlock &MBB,
@@ -841,6 +1118,9 @@ bool SIFrameLowering::hasFP(const MachineFunction &MF) const {
   if (MFI.hasCalls()) {
     // All offsets are unsigned, so need to be addressed in the same direction
     // as stack growth.
+
+    // FIXME: This function is pretty broken, since it can be called before the
+    // frame layout is determined or CSR spills are inserted.
     if (MFI.getStackSize() != 0)
       return true;
 
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
index 3e4260f9ed4..19543287148 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
@@ -38,6 +38,10 @@ public:
                             RegScavenger *RS = nullptr) const override;
   void determineCalleeSavesSGPR(MachineFunction &MF, BitVector &SavedRegs,
                                 RegScavenger *RS = nullptr) const;
+  bool
+  assignCalleeSavedSpillSlots(MachineFunction &MF,
+                              const TargetRegisterInfo *TRI,
+                              std::vector<CalleeSavedInfo> &CSI) const override;
 
   bool isSupportedStackID(TargetStackID::Value ID) const override;
 
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 2360eaced01..0be746cf2b8 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2694,8 +2694,6 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
   MachineFrameInfo &MFI = MF.getFrameInfo();
   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
 
-  SDValue CallerSavedFP;
-
   // Adjust the stack pointer for the new arguments...
   // These operations are automatically eliminated by the prolog/epilog pass
   if (!IsSibCall) {
@@ -2708,15 +2706,6 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
       = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
     RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
     CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
-
-    if (!Info->isEntryFunction()) {
-      // Avoid clobbering this function's FP value. In the current convention
-      // callee will overwrite this, so do save/restore around the call site.
-      CallerSavedFP = DAG.getCopyFromReg(Chain, DL,
-                                         Info->getFrameOffsetReg(), MVT::i32);
-      CopyFromChains.push_back(CallerSavedFP.getValue(1));
-    }
-
     Chain = DAG.getTokenFactor(DL, CopyFromChains);
   }
 
@@ -2905,12 +2894,6 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
   Chain = Call.getValue(0);
   InFlag = Call.getValue(1);
 
-  if (CallerSavedFP) {
-    SDValue FPReg = DAG.getRegister(Info->getFrameOffsetReg(), MVT::i32);
-    Chain = DAG.getCopyToReg(Chain, DL, FPReg, CallerSavedFP, InFlag);
-    InFlag = Chain.getValue(1);
-  }
-
   uint64_t CalleePopBytes = NumBytes;
   Chain = DAG.getCALLSEQ_END(Chain, DAG.getTargetConstant(0, DL, MVT::i32),
                              DAG.getTargetConstant(CalleePopBytes, DL, MVT::i32),
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 12787f2ce9a..8605932330e 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -957,8 +957,8 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
     // Add the scratch resource registers as implicit uses because we may end up
     // needing them, and need to ensure that the reserved registers are
     // correctly handled.
-
-    FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
+    if (RI.spillSGPRToVGPR())
+      FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
     if (ST.hasScalarStores()) {
       // m0 is used for offset to scalar stores if used to spill.
       Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine | RegState::Dead);
@@ -1052,7 +1052,8 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
       MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass);
     }
 
-    FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
+    if (RI.spillSGPRToVGPR())
+      FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
     MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc, DestReg)
       .addFrameIndex(FrameIndex) // addr
       .addMemOperand(MMO)
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index b73feadd521..3cbd4c3ae13 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -71,7 +71,9 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
     // required for scratch access.
     ScratchRSrcReg = AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3;
     ScratchWaveOffsetReg = AMDGPU::SGPR33;
-    FrameOffsetReg = AMDGPU::SGPR5;
+
+    // TODO: Pick a high register, and shift down, similar to a kernel.
+    FrameOffsetReg = AMDGPU::SGPR34;
     StackPtrOffsetReg = AMDGPU::SGPR32;
 
     ArgInfo.PrivateSegmentBuffer =
@@ -245,6 +247,17 @@ static bool isCalleeSavedReg(const MCPhysReg *CSRegs, MCPhysReg Reg) {
   return false;
 }
 
+/// \returns true if \p NumLanes slots are available in VGPRs already used for
+/// SGPR spilling.
+//
+// FIXME: This only works after processFunctionBeforeFrameFinalized
+bool SIMachineFunctionInfo::haveFreeLanesForSGPRSpill(const MachineFunction &MF,
+                                                      unsigned NumNeed) const {
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  unsigned WaveSize = ST.getWavefrontSize();
+  return NumVGPRSpillLanes + NumNeed <= WaveSize * SpillVGPRs.size();
+}
+
 /// Reserve a slice of a VGPR to support spilling for FrameIndex \p FI.
 bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF,
                                                     int FI) {
@@ -307,13 +320,18 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF,
 }
 
 void SIMachineFunctionInfo::removeSGPRToVGPRFrameIndices(MachineFrameInfo &MFI) {
-  for (auto &R : SGPRToVGPRSpills)
-    MFI.RemoveStackObject(R.first);
-  // All other SPGRs must be allocated on the default stack, so reset
-  // the stack ID.
-  for (unsigned i = MFI.getObjectIndexBegin(), e = MFI.getObjectIndexEnd();
-       i != e; ++i)
-    MFI.setStackID(i, 0);
+  // The FP spill hasn't been inserted yet, so keep it around.
+  for (auto &R : SGPRToVGPRSpills) {
+    if (R.first != FramePointerSaveIndex)
+      MFI.RemoveStackObject(R.first);
+  }
+
+  // All other SGPRs must be allocated on the default stack, so reset the stack
+  // ID.
+  for (int i = MFI.getObjectIndexBegin(), e = MFI.getObjectIndexEnd(); i != e;
+       ++i)
+    if (i != FramePointerSaveIndex)
+      MFI.setStackID(i, TargetStackID::Default);
 }
 
 MCPhysReg SIMachineFunctionInfo::getNextUserSGPR() const {
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 73393634b31..2cbca8930a6 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -429,6 +429,12 @@ private:
   unsigned NumVGPRSpillLanes = 0;
   SmallVector<SGPRSpillVGPRCSR, 2> SpillVGPRs;
 
+public: // FIXME
+  /// If this is set, an SGPR used for save/restore of the register used for
+  /// the frame pointer.
+  unsigned SGPRForFPSaveRestoreCopy = 0;
+  Optional<int> FramePointerSaveIndex;
+
 public:
   SIMachineFunctionInfo(const MachineFunction &MF);
 
@@ -448,6 +454,8 @@ public:
     return Mode;
   }
 
+  bool haveFreeLanesForSGPRSpill(const MachineFunction &MF,
+                                 unsigned NumLane) const;
   bool allocateSGPRSpillToVGPR(MachineFunction &MF, int FI);
   void removeSGPRToVGPRFrameIndices(MachineFrameInfo &MFI);
```

