-rw-r--r--  llvm/lib/Target/AMDGPU/SIFrameLowering.cpp            | 25
-rw-r--r--  llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp      | 22
-rw-r--r--  llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h        | 19
-rw-r--r--  llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll         | 12
-rw-r--r--  llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll |  8
-rw-r--r--  llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll        | 23
-rw-r--r--  llvm/test/CodeGen/AMDGPU/nested-calls.ll              | 20
7 files changed, 105 insertions, 24 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index 79bae0aa1f0..f7e5cb03b3e 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -454,6 +454,15 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
      .addImm(NumBytes * ST.getWavefrontSize())
      .setMIFlag(MachineInstr::FrameSetup);
  }
+
+  for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
+         : FuncInfo->getSGPRSpillVGPRs()) {
+    if (!Reg.FI.hasValue())
+      continue;
+    TII->storeRegToStackSlot(MBB, MBBI, Reg.VGPR, true,
+                             Reg.FI.getValue(), &AMDGPU::VGPR_32RegClass,
+                             &TII->getRegisterInfo());
+  }
}

void SIFrameLowering::emitEpilogue(MachineFunction &MF,
@@ -462,6 +471,19 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
  if (FuncInfo->isEntryFunction())
    return;

+  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
+
+  for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
+         : FuncInfo->getSGPRSpillVGPRs()) {
+    if (!Reg.FI.hasValue())
+      continue;
+    TII->loadRegFromStackSlot(MBB, MBBI, Reg.VGPR,
+                              Reg.FI.getValue(), &AMDGPU::VGPR_32RegClass,
+                              &TII->getRegisterInfo());
+  }
+
  unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg();
  if (StackPtrReg == AMDGPU::NoRegister)
    return;
@@ -469,9 +491,6 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  uint32_t NumBytes = MFI.getStackSize();

-  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
-  const SIInstrInfo *TII = ST.getInstrInfo();
-  MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
  DebugLoc DL;

  // FIXME: Clarify distinction between no set SP and SP. For callee functions,
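
The two hunks above are mirror images: the prologue stores every SGPR-spill VGPR that was assigned a frame index, and the epilogue reloads each one before the block's first terminator. A minimal standalone model of the pattern in plain C++17, with std::optional standing in for llvm::Optional and printf stand-ins for storeRegToStackSlot/loadRegFromStackSlot (the names and register numbers here are illustrative, not LLVM API):

#include <cstdio>
#include <optional>
#include <vector>

// Mirrors SIMachineFunctionInfo::SGPRSpillVGPRCSR: a VGPR used for SGPR
// spill lanes, plus a save slot that exists only when the VGPR is a CSR.
struct SpillVGPR {
  unsigned VGPR;
  std::optional<int> FI;
};

// Prologue side: store exactly the VGPRs that own a save slot.
void saveSpillVGPRs(const std::vector<SpillVGPR> &SpillVGPRs) {
  for (const SpillVGPR &Reg : SpillVGPRs) {
    if (!Reg.FI)
      continue; // not callee-saved: free to clobber, nothing to save
    std::printf("  store v%u -> frame index %d\n", Reg.VGPR, *Reg.FI);
  }
}

// Epilogue side: the mirror image, reloading before the return.
void restoreSpillVGPRs(const std::vector<SpillVGPR> &SpillVGPRs) {
  for (const SpillVGPR &Reg : SpillVGPRs) {
    if (!Reg.FI)
      continue;
    std::printf("  reload v%u <- frame index %d\n", Reg.VGPR, *Reg.FI);
  }
}

int main() {
  // v32 is callee-saved and received slot 0; v33 needed no slot.
  std::vector<SpillVGPR> Spills = {{32, 0}, {33, std::nullopt}};
  std::printf("prologue:\n");
  saveSpillVGPRs(Spills);
  std::printf("epilogue:\n");
  restoreSpillVGPRs(Spills);
}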
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 04e57bedb21..cfc9fe5fa51 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -237,6 +237,15 @@ unsigned SIMachineFunctionInfo::addImplicitBufferPtr(const SIRegisterInfo &TRI)
  return ImplicitBufferPtrUserSGPR;
}

+static bool isCalleeSavedReg(const MCPhysReg *CSRegs, MCPhysReg Reg) {
+  for (unsigned I = 0; CSRegs[I]; ++I) {
+    if (CSRegs[I] == Reg)
+      return true;
+  }
+
+  return false;
+}
+
/// Reserve a slice of a VGPR to support spilling for FrameIndex \p FI.
bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF,
                                                    int FI) {
@@ -258,6 +267,8 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF,
  int NumLanes = Size / 4;

+  const MCPhysReg *CSRegs = TRI->getCalleeSavedRegs(&MF);
+
  // Make sure to handle the case where a wide SGPR spill may span between two
  // VGPRs.
  for (int I = 0; I < NumLanes; ++I, ++NumVGPRSpillLanes) {
@@ -274,14 +285,21 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF,
        return false;
      }

-      SpillVGPRs.push_back(LaneVGPR);
+      Optional<int> CSRSpillFI;
+      if (FrameInfo.hasCalls() && CSRegs && isCalleeSavedReg(CSRegs, LaneVGPR)) {
+        // TODO: Should this be a CreateSpillStackObject? This is technically a
+        // weird CSR spill.
+        CSRSpillFI = FrameInfo.CreateStackObject(4, 4, false);
+      }
+
+      SpillVGPRs.push_back(SGPRSpillVGPRCSR(LaneVGPR, CSRSpillFI));

      // Add this register as live-in to all blocks to avoid machine verifier
      // complaining about use of an undefined physical register.
      for (MachineBasicBlock &BB : MF)
        BB.addLiveIn(LaneVGPR);
    } else {
-      LaneVGPR = SpillVGPRs.back();
+      LaneVGPR = SpillVGPRs.back().VGPR;
    }

    SpillLanes.push_back(SpilledReg(LaneVGPR, VGPRIndex));
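
Two details in these hunks are easy to miss: the list returned by getCalleeSavedRegs() is terminated by a zero entry, which is what the bare `CSRegs[I]` loop condition relies on, and the save slot is an ordinary 4-byte, 4-aligned stack object rather than a dedicated spill object (the TODO flags exactly that). A small self-contained model of the decision in plain C++17; the helper name maybeCreateSaveSlot and the register numbers are hypothetical:

#include <optional>

// Scan a null-terminated callee-saved register list, as returned by
// TargetRegisterInfo::getCalleeSavedRegs(); a 0 entry marks the end.
static bool isCalleeSavedReg(const unsigned *CSRegs, unsigned Reg) {
  for (unsigned I = 0; CSRegs[I]; ++I)
    if (CSRegs[I] == Reg)
      return true;
  return false;
}

// Only a callee-saved lane VGPR in a function that makes calls gets a
// slot; NextFrameIndex++ models FrameInfo.CreateStackObject(4, 4, false).
std::optional<int> maybeCreateSaveSlot(bool HasCalls, const unsigned *CSRegs,
                                       unsigned LaneVGPR, int &NextFrameIndex) {
  if (HasCalls && CSRegs && isCalleeSavedReg(CSRegs, LaneVGPR))
    return NextFrameIndex++;
  return std::nullopt;
}

int main() {
  const unsigned CSRegs[] = {32, 33, 34, 0}; // hypothetical CSR VGPRs
  int NextFI = 0;
  auto FI = maybeCreateSaveSlot(/*HasCalls=*/true, CSRegs, /*LaneVGPR=*/32,
                                NextFI);
  return FI ? 0 : 1; // v32 is on the list, so it receives slot 0
}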
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 8511403ebc3..94145c46e10 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -211,6 +211,19 @@ public:
    bool hasReg() { return VGPR != AMDGPU::NoRegister;}
  };

+  struct SGPRSpillVGPRCSR {
+    // VGPR used for SGPR spills
+    unsigned VGPR;
+
+    // If the VGPR is a CSR, the stack slot used to save/restore it in the
+    // prolog/epilog.
+    Optional<int> FI;
+
+    SGPRSpillVGPRCSR(unsigned V, Optional<int> F) :
+      VGPR(V),
+      FI(F) {}
+  };
+
private:
  // SGPR->VGPR spilling support.
  typedef std::pair<unsigned, unsigned> SpillRegMask;
@@ -219,7 +232,7 @@ private:
  // frameindex key.
  DenseMap<int, std::vector<SpilledReg>> SGPRToVGPRSpills;
  unsigned NumVGPRSpillLanes = 0;
-  SmallVector<unsigned, 2> SpillVGPRs;
+  SmallVector<SGPRSpillVGPRCSR, 2> SpillVGPRs;

public:
@@ -231,6 +244,10 @@ public:
      ArrayRef<SpilledReg>() : makeArrayRef(I->second);
  }

+  ArrayRef<SGPRSpillVGPRCSR> getSGPRSpillVGPRs() const {
+    return SpillVGPRs;
+  }
+
  bool allocateSGPRSpillToVGPR(MachineFunction &MF, int FI);
  void removeSGPRToVGPRFrameIndices(MachineFrameInfo &MFI);
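
The new getSGPRSpillVGPRs() accessor follows the usual LLVM idiom of handing out a non-owning ArrayRef view of private SmallVector storage, which is what lets the frame-lowering loops above iterate the spill VGPRs without copying or exposing the container. A minimal sketch of the idiom using LLVM's ADT types; the SpillInfo class is an invented stand-in, not the real SIMachineFunctionInfo:

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"

class SpillInfo {
  llvm::SmallVector<int, 2> SpillVGPRs; // internal storage stays private

public:
  void add(int V) { SpillVGPRs.push_back(V); }

  // ArrayRef is a cheap (pointer, length) view; callers iterate it without
  // depending on the SmallVector type or its inline element count.
  llvm::ArrayRef<int> getSpillVGPRs() const { return SpillVGPRs; }
};

int main() {
  SpillInfo SI;
  SI.add(32);
  for (int V : SI.getSpillVGPRs())
    (void)V; // read-only traversal of the view
}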
diff --git a/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll
index 44648dfc3c2..6ae5aabb5ca 100644
--- a/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll
+++ b/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll
@@ -30,10 +30,11 @@ entry:
; GCN-LABEL: {{^}}void_func_byval_struct_non_leaf:
; GCN: s_mov_b32 s5, s32
-; GCN: buffer_store_dword v32
+; GCN-DAG: buffer_store_dword v32
+; GCN-DAG: buffer_store_dword v33
; GCN: v_writelane_b32
-; GCN-DAG: s_add_u32 s32, s32, 0xa00{{$}}
+; GCN-DAG: s_add_u32 s32, s32, 0xb00{{$}}
; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s5 offset:4{{$}}
; GCN: v_add_i32_e32 [[ADD0:v[0-9]+]], vcc, 1, [[LOAD0]]
@@ -48,7 +49,8 @@ entry:
; GCN: v_readlane_b32
; GCN: buffer_load_dword v32,
-; GCN: s_sub_u32 s32, s32, 0xa00{{$}}
+; GCN: buffer_load_dword v33,
+; GCN: s_sub_u32 s32, s32, 0xb00{{$}}
; GCN: s_setpc_b64
define void @void_func_byval_struct_non_leaf(%struct.ByValStruct* byval noalias nocapture align 4 %arg0, %struct.ByValStruct* byval noalias nocapture align 4 %arg1) #1 {
entry:
@@ -67,7 +69,7 @@ entry:
; GCN-LABEL: {{^}}call_void_func_byval_struct_func:
; GCN: s_mov_b32 s5, s32
-; GCN: s_add_u32 s32, s32, 0xa00{{$}}
+; GCN: s_add_u32 s32, s32, 0xc00{{$}}
; GCN: v_writelane_b32
; GCN-DAG: s_add_u32 s32, s32, 0x800{{$}}
@@ -103,7 +105,7 @@ entry:
; GCN: v_readlane_b32
-; GCN: s_sub_u32 s32, s32, 0xa00{{$}}
+; GCN: s_sub_u32 s32, s32, 0xc00{{$}}
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @call_void_func_byval_struct_func() #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
index 60616639ea8..7c39831e73d 100644
--- a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
@@ -146,7 +146,7 @@ define void @use_stack1() #1 {
}
; GCN-LABEL: {{^}}indirect_use_stack:
-; GCN: ScratchSize: 2120
+; GCN: ScratchSize: 2124
define void @indirect_use_stack() #1 {
%alloca = alloca [16 x i32], align 4
call void asm sideeffect "; use $0", "v"([16 x i32]* %alloca) #0
@@ -156,7 +156,7 @@ define void @indirect_use_stack() #1 {
; GCN-LABEL: {{^}}indirect_2_level_use_stack:
; GCN: is_dynamic_callstack = 0
-; GCN: ScratchSize: 2120
+; GCN: ScratchSize: 2124
define amdgpu_kernel void @indirect_2_level_use_stack() #0 {
call void @indirect_use_stack()
ret void
@@ -199,7 +199,7 @@ define amdgpu_kernel void @usage_external_recurse() #0 {
}
; GCN-LABEL: {{^}}direct_recursion_use_stack:
-; GCN: ScratchSize: 2052
+; GCN: ScratchSize: 2056
define void @direct_recursion_use_stack(i32 %val) #2 {
%alloca = alloca [512 x i32], align 4
call void asm sideeffect "; use $0", "v"([512 x i32]* %alloca) #0
@@ -218,7 +218,7 @@ ret:
; GCN-LABEL: {{^}}usage_direct_recursion:
; GCN: is_ptr64 = 1
; GCN: is_dynamic_callstack = 1
-; GCN: workitem_private_segment_byte_size = 2052
+; GCN: workitem_private_segment_byte_size = 2056
define amdgpu_kernel void @usage_direct_recursion(i32 %n) #0 {
call void @direct_recursion_use_stack(i32 %n)
ret void
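
The numeric churn in this test and the byval test above follows from a single fact: each saved CSR VGPR costs one 4-byte per-lane slot. ScratchSize and workitem_private_segment_byte_size are reported per work item, so they grow by exactly 4 (2120 to 2124, 2052 to 2056), while the s32 stack pointer operates on wave-scaled byte offsets (see `NumBytes * ST.getWavefrontSize()` in the prologue hunk), so on these wave64 subtargets each slot moves the s_add_u32/s_sub_u32 immediates by 4 * 64 = 0x100: 0xa00 to 0xb00 for one saved VGPR, 0xa00 to 0xc00 for two, and 0x1100 to 0x1200 in nested-calls.ll below. The same arithmetic as compile-time checks:

// Sketch of the scratch-size arithmetic behind the updated CHECK lines.
constexpr unsigned SlotBytesPerLane = 4; // CreateStackObject(4, 4, false)
constexpr unsigned WavefrontSize = 64;   // wave64 on these subtargets

// Per-work-item scratch grows by the slot size itself.
static_assert(2120 + SlotBytesPerLane == 2124, "indirect_use_stack");
static_assert(2052 + SlotBytesPerLane == 2056, "direct_recursion_use_stack");

// The wave-level SP immediate grows by slot size times lane count.
static_assert(SlotBytesPerLane * WavefrontSize == 0x100, "per-slot SP delta");
static_assert(0xa00 + 1 * 0x100 == 0xb00, "one saved VGPR");
static_assert(0xa00 + 2 * 0x100 == 0xc00, "two saved VGPRs");
static_assert(0x1100 + 0x100 == 0x1200, "nested-calls stack use");

int main() {} // nothing to run; the asserts above are the point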
diff --git a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
index a07199c1a09..d0edcf8fcbb 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
@@ -36,14 +36,15 @@ define void @callee_with_stack() #0 {
; GCN-LABEL: {{^}}callee_with_stack_and_call:
; GCN: ; BB#0:
; GCN-NEXT: s_waitcnt
+; GCN: s_mov_b32 s5, s32
+; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:8

-; GCN-DAG: s_mov_b32 s5, s32
; GCN-DAG: v_writelane_b32 v32, s33,
; GCN-DAG: v_writelane_b32 v32, s34,
; GCN-DAG: v_writelane_b32 v32, s35,
-; GCN-DAG: buffer_store_dword v0, off, s[0:3], s5 offset:4{{$}}
-; GCN-DAG: s_add_u32 s32, s32, 0x200{{$}}
+; GCN-DAG: s_add_u32 s32, s32, 0x300{{$}}
; GCN-DAG: v_mov_b32_e32 v0, 0{{$}}
+; GCN: buffer_store_dword v0, off, s[0:3], s5 offset:4{{$}}
; GCN-DAG: s_mov_b32 s33, s5
@@ -52,6 +53,7 @@ define void @callee_with_stack() #0 {
; GCN-DAG: v_readlane_b32 s35,
; GCN-DAG: v_readlane_b32 s34,
; GCN-DAG: v_readlane_b32 s33,
+; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:8
; GCN: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @callee_with_stack_and_call() #0 {
@@ -64,13 +66,24 @@ define void @callee_with_stack_and_call() #0 {
; Should be able to copy incoming stack pointer directly to inner
; call's stack pointer argument.
+; There is stack usage only because of the need to evict a VGPR for
+; spilling CSR SGPRs.
+
; GCN-LABEL: {{^}}callee_no_stack_with_call:
; GCN: s_waitcnt
-; GCN-NOT: s32
+; GCN: s_mov_b32 s5, s32
+; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:4
+; GCN-DAG: v_writelane_b32 v32, s33, 0
+; GCN-DAG: v_writelane_b32 v32, s34, 1
; GCN: s_mov_b32 s33, s5
; GCN: s_swappc_b64
; GCN: s_mov_b32 s5, s33
-; GCN-NOT: s32
+
+; GCN-DAG: v_readlane_b32 s34, v32, 1
+; GCN-DAG: v_readlane_b32 s33, v32, 0
+; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4
+; GCN: s_sub_u32 s32, s32, 0x200
+
; GCN: s_setpc_b64
define void @callee_no_stack_with_call() #0 {
call void @external_void_func_void()
diff --git a/llvm/test/CodeGen/AMDGPU/nested-calls.ll b/llvm/test/CodeGen/AMDGPU/nested-calls.ll
index 2d8d666a26b..f8ce8186e45 100644
--- a/llvm/test/CodeGen/AMDGPU/nested-calls.ll
+++ b/llvm/test/CodeGen/AMDGPU/nested-calls.ll
@@ -9,9 +9,21 @@ declare void @external_void_func_i32(i32) #0
; GCN-LABEL: {{^}}test_func_call_external_void_func_i32_imm:
; GCN: s_waitcnt
-; GCN-NOT: s32
+; GCN: s_mov_b32 s5, s32
+; Spill CSR VGPR used for SGPR spilling
+; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:4
+; GCN-DAG: s_add_u32 s32, s32, 0x200
+; GCN-DAG: v_writelane_b32 v32, s33, 0
+; GCN-DAG: v_writelane_b32 v32, s34, 1
+; GCN-DAG: v_writelane_b32 v32, s35, 2
+
; GCN: s_swappc_b64
-; GCN-NOT: s32
+
+; GCN: v_readlane_b32 s35, v32, 2
+; GCN: v_readlane_b32 s34, v32, 1
+; GCN: v_readlane_b32 s33, v32, 0
+; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4
+; GCN: s_sub_u32 s32, s32, 0x200
; GCN: s_setpc_b64
define void @test_func_call_external_void_func_i32_imm() #0 {
call void @external_void_func_i32(i32 42)
@@ -21,10 +33,10 @@ define void @test_func_call_external_void_func_i32_imm() #0 {
; GCN-LABEL: {{^}}test_func_call_external_void_func_i32_imm_stack_use:
; GCN: s_waitcnt
; GCN: s_mov_b32 s5, s32
-; GCN: s_add_u32 s32, s32, 0x1100{{$}}
+; GCN: s_add_u32 s32, s32, 0x1200{{$}}
; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s5 offset
; GCN: s_swappc_b64
-; GCN: s_sub_u32 s32, s32, 0x1100{{$}}
+; GCN: s_sub_u32 s32, s32, 0x1200{{$}}
; GCN: s_setpc_b64
define void @test_func_call_external_void_func_i32_imm_stack_use() #0 {
%alloca = alloca [16 x i32], align 4