 llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp    |   2 +
 llvm/lib/Target/AMDGPU/SIFrameLowering.cpp     |  82 ++++++++++++++---
 llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h |   9 ++
 llvm/test/CodeGen/AMDGPU/stack-realign.ll      | 125 +++++++++++++++++++
 4 files changed, 210 insertions(+), 8 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 9e35c5f7340..d64e6555d03 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -587,6 +587,8 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
Info.PrivateSegmentSize = FrameInfo.getStackSize();
+ if (MFI->isStackRealigned())
+ Info.PrivateSegmentSize += FrameInfo.getMaxAlignment();
Info.UsesVCC = MRI.isPhysRegUsed(AMDGPU::VCC_LO) ||
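The two added lines are worst-case accounting: once the prologue may realign the wave scratch pointer, the align-up can consume up to getMaxAlignment() extra bytes per lane, so the reported private segment size must cover that slack. A minimal sketch of the reasoning, in plain C++ rather than the LLVM API (the function name is illustrative):

  #include <cstdint>

  // Worst-case scratch usage when the frame may be realigned: the align-up
  // in the prologue can skip up to MaxAlign bytes, so reserve a full
  // MaxAlign on top of the static frame size (this matches RoundedSize in
  // SIFrameLowering::emitPrologue below).
  uint32_t reportedScratchSize(uint32_t StackSize, uint32_t MaxAlign,
                               bool StackRealigned) {
    return StackRealigned ? StackSize + MaxAlign : StackSize;
  }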
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index a3f107b10fc..2eaace44c41 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -13,6 +13,7 @@
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -487,9 +488,43 @@ void SIFrameLowering::emitEntryFunctionScratchSetup(const SISubtarget &ST,
}
}
+// Find a scratch register that we can use at the start of the prologue to
+// re-align the stack pointer. We avoid using callee-save registers since they
+// may appear to be free when this is called from canUseAsPrologue (during
+// shrink wrapping), but then no longer be free when this is called from
+// emitPrologue.
+//
+// FIXME: This is a bit conservative, since in the above case we could use one
+// of the callee-save registers as a scratch temp to re-align the stack pointer,
+// but we would then have to make sure that we were in fact saving at least one
+// callee-save register in the prologue, which is additional complexity that
+// doesn't seem worth the benefit.
+static unsigned findScratchNonCalleeSaveRegister(MachineBasicBlock &MBB) {
+ MachineFunction *MF = MBB.getParent();
+
+ const SISubtarget &Subtarget = MF->getSubtarget<SISubtarget>();
+ const SIRegisterInfo &TRI = *Subtarget.getRegisterInfo();
+ LivePhysRegs LiveRegs(TRI);
+ LiveRegs.addLiveIns(MBB);
+
+ // Mark callee saved registers as used so we will not choose them.
+ const MCPhysReg *CSRegs = TRI.getCalleeSavedRegs(MF);
+ for (unsigned i = 0; CSRegs[i]; ++i)
+ LiveRegs.addReg(CSRegs[i]);
+
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+
+ for (unsigned Reg : AMDGPU::SReg_32_XM0RegClass) {
+ if (LiveRegs.available(MRI, Reg))
+ return Reg;
+ }
+
+ return AMDGPU::NoRegister;
+}
+
void SIFrameLowering::emitPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
- const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+ SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
if (FuncInfo->isEntryFunction()) {
emitEntryFunctionPrologue(MF, MBB);
return;
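A self-contained model of the register search in findScratchNonCalleeSaveRegister above (plain C++ with registers as integers, not the LLVM API): take the first register in allocation order that is neither live-in to the block nor callee-saved, so a register chosen during the canUseAsPrologue query cannot later be invalidated by a callee-save spill.

  #include <cstdio>
  #include <set>
  #include <vector>

  constexpr unsigned NoRegister = ~0u;

  unsigned findScratchReg(const std::vector<unsigned> &AllocOrder,
                          const std::set<unsigned> &LiveIns,
                          const std::set<unsigned> &CalleeSaved) {
    for (unsigned Reg : AllocOrder)
      if (!LiveIns.count(Reg) && !CalleeSaved.count(Reg))
        return Reg; // free now, and never claimed by a CSR spill later
    return NoRegister;
  }

  int main() {
    // Hypothetical SGPR numbers, for illustration only.
    unsigned Reg = findScratchReg(/*AllocOrder=*/{4, 5, 6, 7},
                                  /*LiveIns=*/{4}, /*CalleeSaved=*/{6, 7});
    std::printf("scratch: s%u\n", Reg); // prints "scratch: s5"
  }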
@@ -498,6 +533,7 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
const MachineFrameInfo &MFI = MF.getFrameInfo();
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
+ const SIRegisterInfo &TRI = TII->getRegisterInfo();
unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg();
unsigned FramePtrReg = FuncInfo->getFrameOffsetReg();
@@ -505,8 +541,36 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
MachineBasicBlock::iterator MBBI = MBB.begin();
DebugLoc DL;
+ // XXX - Is this the right predicate?
+
bool NeedFP = hasFP(MF);
- if (NeedFP) {
+ uint32_t NumBytes = MFI.getStackSize();
+ uint32_t RoundedSize = NumBytes;
+ const bool NeedsRealignment = TRI.needsStackRealignment(MF);
+
+ if (NeedsRealignment) {
+ assert(NeedFP);
+ const unsigned Alignment = MFI.getMaxAlignment();
+ assert(countTrailingZeros(Alignment) > 1 &&
+        "stack realignment requires at least 4-byte alignment");
+
+ RoundedSize += Alignment;
+
+ unsigned ScratchSPReg = findScratchNonCalleeSaveRegister(MBB);
+ assert(ScratchSPReg != AMDGPU::NoRegister);
+
+ // s_add_u32 tmp_reg, s32, (Alignment - 1) * WavefrontSize
+ // s_and_b32 fp_reg, tmp_reg, -(Alignment * WavefrontSize)
+ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), ScratchSPReg)
+ .addReg(StackPtrReg)
+ .addImm((Alignment - 1) * ST.getWavefrontSize())
+ .setMIFlag(MachineInstr::FrameSetup);
+ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_AND_B32), FramePtrReg)
+ .addReg(ScratchSPReg, RegState::Kill)
+ .addImm(-Alignment * ST.getWavefrontSize())
+ .setMIFlag(MachineInstr::FrameSetup);
+ FuncInfo->setIsStackRealigned(true);
+ } else if (NeedFP) {
// If we need a base pointer, set it up here. It's whatever the value of
// the stack pointer is at this point. Any variable size objects will be
// allocated after this, so we can still use the base pointer to reference
@@ -516,11 +580,10 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
.setMIFlag(MachineInstr::FrameSetup);
}
- uint32_t NumBytes = MFI.getStackSize();
- if (NumBytes != 0 && hasSP(MF)) {
+ if (RoundedSize != 0 && hasSP(MF)) {
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), StackPtrReg)
.addReg(StackPtrReg)
- .addImm(NumBytes * ST.getWavefrontSize())
+ .addImm(RoundedSize * ST.getWavefrontSize())
.setMIFlag(MachineInstr::FrameSetup);
}
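The s_add_u32/s_and_b32 pair above is an align-up in the wave-scaled scratch space: with per-lane alignment A and wavefront size W it computes (SP + (A - 1) * W) & -(A * W), writing the aligned value into the frame pointer; the stack pointer is then bumped by the padded RoundedSize. The identity only rounds up correctly because SP is always a multiple of W, which holds since every scratch offset in s32 is scaled by the wavefront size. A standalone check of that claim (plain C++, not LLVM code):

  #include <cassert>
  #include <cstdint>

  int main() {
    const uint32_t W = 64;                  // wavefront size
    for (uint32_t A : {4u, 8u, 16u, 32u}) { // per-lane alignments
      for (uint32_t K = 0; K < 1024; ++K) { // SP is always a multiple of W
        uint32_t SP = K * W;
        uint32_t Aligned = (SP + (A - 1) * W) & -(A * W);
        // Reference: smallest multiple of A*W that is >= SP.
        uint32_t Expected = (SP + A * W - 1) / (A * W) * (A * W);
        assert(Aligned == Expected);
      }
    }
    return 0;
  }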
@@ -566,10 +629,12 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
// it's really whether we need SP to be accurate or not.
if (NumBytes != 0 && hasSP(MF)) {
+ uint32_t RoundedSize = FuncInfo->isStackRealigned() ?
+ NumBytes + MFI.getMaxAlignment() : NumBytes;
+
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_SUB_U32), StackPtrReg)
.addReg(StackPtrReg)
- .addImm(NumBytes * ST.getWavefrontSize())
- .setMIFlag(MachineInstr::FrameDestroy);
+ .addImm(RoundedSize * ST.getWavefrontSize())
+ .setMIFlag(MachineInstr::FrameDestroy);
}
}
@@ -759,7 +824,8 @@ bool SIFrameLowering::hasFP(const MachineFunction &MF) const {
}
bool SIFrameLowering::hasSP(const MachineFunction &MF) const {
+ const SIRegisterInfo *TRI = MF.getSubtarget<SISubtarget>().getRegisterInfo();
// All stack operations are relative to the frame offset SGPR.
const MachineFrameInfo &MFI = MF.getFrameInfo();
- return MFI.hasCalls() || MFI.hasVarSizedObjects();
+ return MFI.hasCalls() || MFI.hasVarSizedObjects() || TRI->needsStackRealignment(MF);
}
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 781eea0f94b..8c38cdae5d9 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -142,6 +142,7 @@ private:
bool HasSpilledSGPRs = false;
bool HasSpilledVGPRs = false;
bool HasNonSpillStackObjects = false;
+ bool IsStackRealigned = false;
unsigned NumSpilledSGPRs = 0;
unsigned NumSpilledVGPRs = 0;
@@ -495,6 +496,14 @@ public:
HasNonSpillStackObjects = StackObject;
}
+ bool isStackRealigned() const {
+ return IsStackRealigned;
+ }
+
+ void setIsStackRealigned(bool Realigned = true) {
+ IsStackRealigned = Realigned;
+ }
+
unsigned getNumSpilledSGPRs() const {
return NumSpilledSGPRs;
}
diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign.ll b/llvm/test/CodeGen/AMDGPU/stack-realign.ll
new file mode 100644
index 00000000000..99a218b5325
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/stack-realign.ll
@@ -0,0 +1,125 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; Check that we properly realign the stack. While 4-byte access is all
+; that is ever needed, some transformations rely on the known bits from the alignment of the pointer (e.g.
+
+
+; 128 byte object
+; 4 byte emergency stack slot
+; = 144 bytes with padding between them
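+; (e.g. with the 4-byte slot placed first: 4 bytes + 12 bytes of padding
+; to reach the object's 16-byte alignment + 128 bytes = 144)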
+
+; GCN-LABEL: {{^}}needs_align16_default_stack_align:
+; GCN: s_mov_b32 s5, s32
+; GCN-NOT: s32
+
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen
+; GCN: v_or_b32_e32 v{{[0-9]+}}, 12
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen
+
+; GCN-NOT: s32
+
+; GCN: ; ScratchSize: 144
+define void @needs_align16_default_stack_align(i32 %idx) #0 {
+ %alloca.align16 = alloca [8 x <4 x i32>], align 16, addrspace(5)
+ %gep0 = getelementptr inbounds [8 x <4 x i32>], [8 x <4 x i32>] addrspace(5)* %alloca.align16, i32 0, i32 %idx
+ store volatile <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(5)* %gep0, align 16
+ ret void
+}
+
+; GCN-LABEL: {{^}}needs_align16_stack_align4:
+; GCN: s_add_u32 [[SCRATCH_REG:s[0-9]+]], s32, 0x3c0{{$}}
+; GCN: s_and_b32 s5, [[SCRATCH_REG]], 0xfffffc00
+; GCN: s_add_u32 s32, s32, 0x2800{{$}}
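+; (Constants, assuming wave64: 0x3c0 = (16 - 1) * 64 bumps by alignment - 1
+; scaled to the wave; 0xfffffc00 = -(16 * 64) is the matching alignment
+; mask; 0x2800 = (144 + 16) * 64 is the frame size padded for realignment.)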
+
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen
+; GCN: v_or_b32_e32 v{{[0-9]+}}, 12
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen
+
+; GCN: s_sub_u32 s32, s32, 0x2800
+
+; GCN: ; ScratchSize: 160
+define void @needs_align16_stack_align4(i32 %idx) #2 {
+ %alloca.align16 = alloca [8 x <4 x i32>], align 16, addrspace(5)
+ %gep0 = getelementptr inbounds [8 x <4 x i32>], [8 x <4 x i32>] addrspace(5)* %alloca.align16, i32 0, i32 %idx
+ store volatile <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(5)* %gep0, align 16
+ ret void
+}
+
+; GCN-LABEL: {{^}}needs_align32:
+; GCN: s_add_u32 [[SCRATCH_REG:s[0-9]+]], s32, 0x7c0{{$}}
+; GCN: s_and_b32 s5, [[SCRATCH_REG]], 0xfffff800
+; GCN: s_add_u32 s32, s32, 0x3000{{$}}
+
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen
+; GCN: v_or_b32_e32 v{{[0-9]+}}, 12
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen
+
+; GCN: s_sub_u32 s32, s32, 0x3000
+
+; GCN: ; ScratchSize: 192
+define void @needs_align32(i32 %idx) #0 {
+ %alloca.align16 = alloca [8 x <4 x i32>], align 32, addrspace(5)
+ %gep0 = getelementptr inbounds [8 x <4 x i32>], [8 x <4 x i32>] addrspace(5)* %alloca.align16, i32 0, i32 %idx
+ store volatile <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(5)* %gep0, align 32
+ ret void
+}
+
+; GCN-LABEL: {{^}}force_realign4:
+; GCN: s_add_u32 [[SCRATCH_REG:s[0-9]+]], s32, 0xc0{{$}}
+; GCN: s_and_b32 s5, [[SCRATCH_REG]], 0xffffff00
+; GCN: s_add_u32 s32, s32, 0xd00{{$}}
+
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen
+; GCN: s_sub_u32 s32, s32, 0xd00
+
+; GCN: ; ScratchSize: 52
+define void @force_realign4(i32 %idx) #1 {
+ %alloca.align16 = alloca [8 x i32], align 4, addrspace(5)
+ %gep0 = getelementptr inbounds [8 x i32], [8 x i32] addrspace(5)* %alloca.align16, i32 0, i32 %idx
+ store volatile i32 3, i32 addrspace(5)* %gep0, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}kernel_call_align16_from_8:
+; GCN: s_add_u32 s32, s8, 0x400{{$}}
+; GCN-NOT: s32
+; GCN: s_swappc_b64
+define amdgpu_kernel void @kernel_call_align16_from_8() #0 {
+ %alloca = alloca i32, align 4, addrspace(5)
+ store volatile i32 2, i32 addrspace(5)* %alloca
+ call void @needs_align16_default_stack_align(i32 1)
+ ret void
+}
+
+; The call sequence should keep the stack aligned to 4 at the call site.
+; GCN-LABEL: {{^}}kernel_call_align16_from_5:
+; GCN: s_add_u32 s32, s8, 0x400
+; GCN: s_swappc_b64
+define amdgpu_kernel void @kernel_call_align16_from_5() {
+ %alloca0 = alloca i8, align 1, addrspace(5)
+ store volatile i8 2, i8 addrspace(5)* %alloca0
+
+ call void @needs_align16_default_stack_align(i32 1)
+ ret void
+}
+
+; GCN-LABEL: {{^}}kernel_call_align4_from_5:
+; GCN: s_add_u32 s32, s8, 0x400
+; GCN: s_swappc_b64
+define amdgpu_kernel void @kernel_call_align4_from_5() {
+ %alloca0 = alloca i8, align 1, addrspace(5)
+ store volatile i8 2, i8 addrspace(5)* %alloca0
+
+ call void @needs_align16_stack_align4(i32 1)
+ ret void
+}
+
+attributes #0 = { noinline nounwind }
+attributes #1 = { noinline nounwind "stackrealign" }
+attributes #2 = { noinline nounwind alignstack=4 }