-rw-r--r--  llvm/lib/Target/AArch64/AArch64FrameLowering.cpp       |  84
-rw-r--r--  llvm/test/CodeGen/AArch64/reverse-csr-restore-seq.mir  |  72
2 files changed, 142 insertions(+), 14 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index f49ca11d98f..e81eb5ea852 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -140,6 +140,11 @@ static cl::opt<bool> EnableRedZone("aarch64-redzone",
cl::desc("enable use of redzone on AArch64"),
cl::init(false), cl::Hidden);
+static cl::opt<bool>
+ ReverseCSRRestoreSeq("reverse-csr-restore-seq",
+ cl::desc("reverse the CSR restore sequence"),
+ cl::init(false), cl::Hidden);
+
STATISTIC(NumRedZoneFunctions, "Number of functions using red zone");
/// This is the biggest offset to the stack pointer we can encode in aarch64
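The new flag above is a standard hidden, off-by-default cl::opt: it is only listed under -help-hidden and is read as a plain bool later in this file. A minimal standalone sketch of that pattern, not part of the patch (the tool and flag names below are illustrative only):

    // cl_opt_sketch.cpp -- assumes an LLVM install; link against LLVMSupport.
    #include "llvm/Support/CommandLine.h"
    using namespace llvm;

    // Hidden options are omitted from -help and shown only by -help-hidden.
    static cl::opt<bool>
        DemoReverseRestores("demo-reverse-restore-seq",
                            cl::desc("reverse the demo restore sequence"),
                            cl::init(false), cl::Hidden);

    int main(int argc, char **argv) {
      cl::ParseCommandLineOptions(argc, argv);
      // A cl::opt<bool> converts to bool wherever it is read, which is how
      // restoreCalleeSavedRegisters() consults ReverseCSRRestoreSeq below.
      return DemoReverseRestores ? 0 : 1;
    }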
@@ -844,14 +849,32 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv());
unsigned FixedObject = IsWin64 ? alignTo(AFI->getVarArgsGPRSize(), 16) : 0;
+ uint64_t AfterCSRPopSize = ArgumentPopSize;
auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject;
bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes);
-
- if (!CombineSPBump && PrologueSaveSize != 0)
- convertCalleeSaveRestoreToSPPrePostIncDec(
- MBB, std::prev(MBB.getFirstTerminator()), DL, TII, PrologueSaveSize);
+ // Assume we can't combine the last pop with the sp restore.
+
+ if (!CombineSPBump && PrologueSaveSize != 0) {
+ MachineBasicBlock::iterator Pop = std::prev(MBB.getFirstTerminator());
+ // Converting the last ldp to a post-index ldp is valid only if the last
+ // ldp's offset is 0.
+ const MachineOperand &OffsetOp = Pop->getOperand(Pop->getNumOperands() - 1);
+ // If the offset is 0, convert it to a post-index ldp.
+ if (OffsetOp.getImm() == 0) {
+ convertCalleeSaveRestoreToSPPrePostIncDec(MBB, Pop, DL, TII,
+ PrologueSaveSize);
+ } else {
+ // If not, make sure to emit an add after the last ldp.
+ // We're doing this by transferring the size to be restored from the
+ // adjustment *before* the CSR pops to the adjustment *after* the CSR
+ // pops.
+ AfterCSRPopSize += PrologueSaveSize;
+ }
+ }
// Move past the restores of the callee-saved registers.
+ // If we plan on combining the sp bump of the local stack size and the callee
+ // save stack size, we might need to adjust the CSR save and restore offsets.
MachineBasicBlock::iterator LastPopI = MBB.getFirstTerminator();
MachineBasicBlock::iterator Begin = MBB.begin();
while (LastPopI != Begin) {
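The hunk above decides, for each epilogue, whether the final callee-save pop can absorb the SP restore. A standalone plain-C++ sketch of that decision, under the assumption that CombineSPBump is false (the names below are illustrative, not the backend API):

    #include <cstdint>

    struct EpilogueFixup {
      bool ConvertLastPopToPostIndex; // rewrite the last ldp as "ldp ..., [sp], #N"
      uint64_t AfterCSRPopSize;       // size of the "add sp, sp, #N" after the pops
    };

    // LastPopOffset:    immediate on the last CSR ldp in the epilogue.
    // PrologueSaveSize: callee-save area size (plus any fixed object).
    // ArgumentPopSize:  stack-argument bytes the callee pops for its caller.
    EpilogueFixup planCSRRestore(int64_t LastPopOffset, uint64_t PrologueSaveSize,
                                 uint64_t ArgumentPopSize) {
      EpilogueFixup F{false, ArgumentPopSize};
      if (PrologueSaveSize == 0)
        return F; // nothing was saved, so there is nothing to fold anywhere
      if (LastPopOffset == 0)
        F.ConvertLastPopToPostIndex = true; // ldp ..., [sp], #PrologueSaveSize
      else
        F.AfterCSRPopSize += PrologueSaveSize; // defer to the trailing add
      return F;
    }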
@@ -866,7 +889,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
// If there is a single SP update, insert it before the ret and we're done.
if (CombineSPBump) {
emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP,
- NumBytes + ArgumentPopSize, TII,
+ NumBytes + AfterCSRPopSize, TII,
MachineInstr::FrameDestroy);
return;
}
@@ -878,18 +901,18 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
bool RedZone = canUseRedZone(MF);
// If this was a redzone leaf function, we don't need to restore the
// stack pointer (but we may need to pop stack args for fastcc).
- if (RedZone && ArgumentPopSize == 0)
+ if (RedZone && AfterCSRPopSize == 0)
return;
bool NoCalleeSaveRestore = PrologueSaveSize == 0;
int StackRestoreBytes = RedZone ? 0 : NumBytes;
if (NoCalleeSaveRestore)
- StackRestoreBytes += ArgumentPopSize;
+ StackRestoreBytes += AfterCSRPopSize;
emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
StackRestoreBytes, TII, MachineInstr::FrameDestroy);
// If we were able to combine the local stack pop with the argument pop,
// then we're done.
- if (NoCalleeSaveRestore || ArgumentPopSize == 0)
+ if (NoCalleeSaveRestore || AfterCSRPopSize == 0)
return;
NumBytes = 0;
}
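A compressed view of the stack-restore arithmetic in the hunk above, written as a standalone helper (illustrative only; the real code emits the add via emitFrameOffset):

    #include <cstdint>

    // Immediate for the "add sp, sp, #N" placed before the CSR pops.
    uint64_t stackRestoreBytes(bool RedZone, bool NoCalleeSaveRestore,
                               uint64_t NumBytes, uint64_t AfterCSRPopSize) {
      // Red-zone leaf functions never moved SP for their locals, so there is
      // no local-stack allocation to undo.
      uint64_t Bytes = RedZone ? 0 : NumBytes;
      // With no callee-save pops at all, the post-pop adjustment can be folded
      // into this same add.
      if (NoCalleeSaveRestore)
        Bytes += AfterCSRPopSize;
      return Bytes;
    }

    // Example: 32 bytes of locals, no red zone, callee saves present, and a
    // 16-byte AfterCSRPopSize: stackRestoreBytes(false, false, 32, 16) == 32,
    // and the remaining 16 bytes are popped after the CSR restores.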
@@ -909,9 +932,37 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
// This must be placed after the callee-save restore code because that code
// assumes the SP is at the same location as it was after the callee-save save
// code in the prologue.
- if (ArgumentPopSize)
+ if (AfterCSRPopSize) {
+ // Sometimes (when we restore in the same order as we save), we can end up
+ // with code like this:
+ //
+ // ldp x26, x25, [sp]
+ // ldp x24, x23, [sp, #16]
+ // ldp x22, x21, [sp, #32]
+ // ldp x20, x19, [sp, #48]
+ // add sp, sp, #64
+ //
+ // In this case, it is always better to put the first ldp at the end, so
+ // that the load-store optimizer can run and merge the ldp and the add into
+ // a post-index ldp.
+ // If we managed to grab the first pop instruction, move it to the end.
+ if (LastPopI != Begin)
+ MBB.splice(MBB.getFirstTerminator(), &MBB, LastPopI);
+ // We should end up with something like this now:
+ //
+ // ldp x24, x23, [sp, #16]
+ // ldp x22, x21, [sp, #32]
+ // ldp x20, x19, [sp, #48]
+ // ldp x26, x25, [sp]
+ // add sp, sp, #64
+ //
+ // and the load-store optimizer can merge the last two instructions into:
+ //
+ // ldp x26, x25, [sp], #64
+ //
emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP,
- ArgumentPopSize, TII, MachineInstr::FrameDestroy);
+ AfterCSRPopSize, TII, MachineInstr::FrameDestroy);
+ }
}
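The splice above moves a single instruction: the offset-0 pop is re-inserted just before the terminator so that the SP adjustment emitted afterwards lands directly behind it, which is what lets the load-store optimizer form a post-index ldp. A standalone sketch of the same reordering, with std::list standing in for the MBB instruction list (illustrative only):

    #include <cstdio>
    #include <list>
    #include <string>

    int main() {
      // Epilogue as produced by restoreCalleeSavedRegisters, before the
      // trailing SP adjustment has been emitted.
      std::list<std::string> MBB = {
          "ldp x26, x25, [sp]",       // LastPopI: the offset-0 pop
          "ldp x24, x23, [sp, #16]",
          "ldp x22, x21, [sp, #32]",
          "ldp x20, x19, [sp, #48]",
          "ret"};                     // terminator
      auto LastPopI = MBB.begin();
      auto FirstTerminator = std::prev(MBB.end());

      // Equivalent of MBB.splice(MBB.getFirstTerminator(), &MBB, LastPopI):
      // move the offset-0 pop so it sits right before the terminator.
      MBB.splice(FirstTerminator, MBB, LastPopI);
      // emitFrameOffset then inserts the adjustment before the terminator,
      // i.e. right after the moved pop.
      MBB.insert(FirstTerminator, "add sp, sp, #64");

      for (const std::string &MI : MBB)
        std::printf("%s\n", MI.c_str());
      // The load-store optimizer can later merge the last ldp and the add
      // into "ldp x26, x25, [sp], #64".
    }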
/// getFrameIndexReference - Provide a base+offset reference to an FI slot for
@@ -1180,9 +1231,7 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs);
- for (auto RPII = RegPairs.begin(), RPIE = RegPairs.end(); RPII != RPIE;
- ++RPII) {
- RegPairInfo RPI = *RPII;
+ auto EmitMI = [&](const RegPairInfo &RPI) {
unsigned Reg1 = RPI.Reg1;
unsigned Reg2 = RPI.Reg2;
@@ -1221,7 +1270,14 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
MIB.addMemOperand(MF.getMachineMemOperand(
MachinePointerInfo::getFixedStack(MF, RPI.FrameIdx),
MachineMemOperand::MOLoad, 8, 8));
- }
+ };
+
+ if (ReverseCSRRestoreSeq)
+ for (const RegPairInfo &RPI : reverse(RegPairs))
+ EmitMI(RPI);
+ else
+ for (const RegPairInfo &RPI : RegPairs)
+ EmitMI(RPI);
return true;
}
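The final hunk in this file factors the per-pair emission into a lambda so the same body can be driven forward or backward depending on the flag; the reverse walk uses llvm::reverse(RegPairs), a thin wrapper over rbegin()/rend(). A minimal standalone sketch of that dispatch pattern in plain C++ (RegPairInfo and EmitMI here are stand-ins, not the backend types):

    #include <cstdio>
    #include <vector>

    struct RegPairInfo { int Reg1, Reg2; };

    int main() {
      std::vector<RegPairInfo> RegPairs = {{19, 20}, {21, 22}, {23, 24}, {25, 26}};
      bool ReverseCSRRestoreSeq = true; // stands in for -reverse-csr-restore-seq

      auto EmitMI = [&](const RegPairInfo &RPI) {
        std::printf("restore x%d, x%d\n", RPI.Reg2, RPI.Reg1);
      };

      if (ReverseCSRRestoreSeq)
        for (auto It = RegPairs.rbegin(), E = RegPairs.rend(); It != E; ++It)
          EmitMI(*It); // pairs come out in the opposite order from the saves
      else
        for (const RegPairInfo &RPI : RegPairs)
          EmitMI(RPI);
      return 0;
    }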
diff --git a/llvm/test/CodeGen/AArch64/reverse-csr-restore-seq.mir b/llvm/test/CodeGen/AArch64/reverse-csr-restore-seq.mir
new file mode 100644
index 00000000000..f0f6702992f
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/reverse-csr-restore-seq.mir
@@ -0,0 +1,72 @@
+# RUN: llc -run-pass=prologepilog -reverse-csr-restore-seq -o - -mtriple=aarch64-- %s | FileCheck %s --check-prefixes=CHECK,BEFORELDSTOPT
+# RUN: llc -start-before=prologepilog -stop-after=aarch64-ldst-opt -reverse-csr-restore-seq -o - -mtriple=aarch64-- %s | FileCheck %s --check-prefixes=CHECK,AFTERLDSTOPT
+#
+--- |
+
+ define void @foo() nounwind { entry: unreachable }
+
+ define void @bar() nounwind { entry: unreachable }
+
+...
+---
+name: foo
+# CHECK-LABEL: name: foo
+tracksRegLiveness: true
+body: |
+ bb.0:
+ $x19 = IMPLICIT_DEF
+ $x20 = IMPLICIT_DEF
+ $x21 = IMPLICIT_DEF
+ $x22 = IMPLICIT_DEF
+ $x23 = IMPLICIT_DEF
+ $x24 = IMPLICIT_DEF
+ $x25 = IMPLICIT_DEF
+ $x26 = IMPLICIT_DEF
+
+ ; The local stack size is 0, so the last ldp in the sequence will also
+ ; restore the stack.
+ ; CHECK: $x24, $x23 = frame-destroy LDPXi $sp, 2
+ ; CHECK-NEXT: $x22, $x21 = frame-destroy LDPXi $sp, 4
+ ; CHECK-NEXT: $x20, $x19 = frame-destroy LDPXi $sp, 6
+
+ ; Before running the load-store optimizer, we emit an ldp and an add.
+ ; BEFORELDSTOPT-NEXT: $x26, $x25 = frame-destroy LDPXi $sp, 0
+ ; BEFORELDSTOPT-NEXT: $sp = frame-destroy ADDXri $sp, 64, 0
+
+ ; We want to make sure that after running the load-store optimizer, the ldp
+ ; and the add get merged into a post-index ldp.
+ ; AFTERLDSTOPT-NEXT: early-clobber $sp, $x26, $x25 = frame-destroy LDPXpost $sp, 8
+
+ RET_ReallyLR
+...
+---
+name: bar
+# CHECK-LABEL: name: bar
+tracksRegLiveness: true
+stack:
+ - { id : 0, size: 8, alignment: 4,
+ stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+ local-offset: -4, di-variable: '', di-expression: '', di-location: '' }
+
+body: |
+ bb.0:
+ $x19 = IMPLICIT_DEF
+ $x20 = IMPLICIT_DEF
+ $x21 = IMPLICIT_DEF
+ $x22 = IMPLICIT_DEF
+ $x23 = IMPLICIT_DEF
+ $x24 = IMPLICIT_DEF
+ $x25 = IMPLICIT_DEF
+ $x26 = IMPLICIT_DEF
+
+ ; The local stack size is not 0, and we can combine the CSR stack size with
+ ; the local stack size. This results in rewriting the offsets for all the
+ ; save/restores and prevents us from merging the stack adjustment and the last pop.
+ ; In this case, there is no point in moving the first CSR pair to the end.
+ ; CHECK: $x26, $x25 = frame-destroy LDPXi $sp, 2
+ ; CHECK-NEXT: $x24, $x23 = frame-destroy LDPXi $sp, 4
+ ; CHECK-NEXT: $x22, $x21 = frame-destroy LDPXi $sp, 6
+ ; CHECK-NEXT: $x20, $x19 = frame-destroy LDPXi $sp, 8
+ ; CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 80, 0
+ RET_ReallyLR
+...