author     Evgenii Stepanov <eugenis@google.com>    2019-09-24 17:03:18 -0700
committer  Evgenii Stepanov <eugenis@google.com>    2020-01-08 11:02:03 -0800
commit     b675a7628ce6a21b1e4a71c079a67badfb8b073d (patch)
tree       782914626e3d3bceeffe45a783525be95ce0c6b5 /llvm/lib/Target
parent     ba181d0063e43fb56938555112ab859f48aee287 (diff)
Merge memtag instructions with adjacent stack slots.
Summary:
Detect a run of memory tagging instructions for adjacent stack frame slots,
and replace them with a shorter instruction sequence:
* replace STG + STG with ST2G
* replace STGloop + STGloop with STGloop

This code needs to run when stack slot offsets are already known, but before
FrameIndex operands in STG instructions are eliminated; that's the reason for
the new hook in PrologueEpilogue.

This change modifies STGloop and STZGloop pseudos to take the size as an
immediate integer operand, and the base address as a FrameIndex operand when
possible. This is needed to simplify recognizing an STGloop instruction as
operating on a stack slot post-regalloc.

This improves memtag code size by ~0.25%, and it looks like an additional
~0.1% is possible by rearranging the stack frame such that consecutive STG
instructions reference adjacent slots (patch pending).

Reviewers: pcc, ostannard

Subscribers: hiraditya, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D70286
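Illustration of the STG + STG -> ST2G case (a hypothetical sketch, not taken
from the commit or its tests; the tag-source register, base register and slot
offsets are assumed). Two STG stores tagging adjacent 16-byte stack slots can
be covered by a single ST2G that tags both granules:

    ; before merging: one tag store per 16-byte granule
    stg   sp, [sp, #0]
    stg   sp, [sp, #16]
    ; after merging: one paired tag store covering 32 bytes
    st2g  sp, [sp, #0]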
Diffstat (limited to 'llvm/lib/Target')
-rw-r--r--  llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp    24
-rw-r--r--  llvm/lib/Target/AArch64/AArch64FrameLowering.cpp       436
-rw-r--r--  llvm/lib/Target/AArch64/AArch64FrameLowering.h           6
-rw-r--r--  llvm/lib/Target/AArch64/AArch64InstrInfo.cpp             2
-rw-r--r--  llvm/lib/Target/AArch64/AArch64InstrInfo.td             12
-rw-r--r--  llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp         21
-rw-r--r--  llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp     18
7 files changed, 489 insertions, 30 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index 3b8f8a19fe4..97162ae2218 100644
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -349,22 +349,38 @@ bool AArch64ExpandPseudo::expandSetTagLoop(
MachineBasicBlock::iterator &NextMBBI) {
MachineInstr &MI = *MBBI;
DebugLoc DL = MI.getDebugLoc();
- Register SizeReg = MI.getOperand(2).getReg();
- Register AddressReg = MI.getOperand(3).getReg();
+ Register SizeReg = MI.getOperand(0).getReg();
+ Register AddressReg = MI.getOperand(1).getReg();
MachineFunction *MF = MBB.getParent();
bool ZeroData = MI.getOpcode() == AArch64::STZGloop;
- const unsigned OpCode =
+ const unsigned OpCode1 =
+ ZeroData ? AArch64::STZGPostIndex : AArch64::STGPostIndex;
+ const unsigned OpCode2 =
ZeroData ? AArch64::STZ2GPostIndex : AArch64::ST2GPostIndex;
+ unsigned Size = MI.getOperand(2).getImm();
+ assert(Size > 0 && Size % 16 == 0);
+ if (Size % (16 * 2) != 0) {
+ BuildMI(MBB, MBBI, DL, TII->get(OpCode1), AddressReg)
+ .addReg(AddressReg)
+ .addReg(AddressReg)
+ .addImm(1);
+ Size -= 16;
+ }
+ MachineBasicBlock::iterator I =
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVi64imm), SizeReg)
+ .addImm(Size);
+ expandMOVImm(MBB, I, 64);
+
auto LoopBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
auto DoneBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
MF->insert(++MBB.getIterator(), LoopBB);
MF->insert(++LoopBB->getIterator(), DoneBB);
- BuildMI(LoopBB, DL, TII->get(OpCode))
+ BuildMI(LoopBB, DL, TII->get(OpCode2))
.addDef(AddressReg)
.addReg(AddressReg)
.addReg(AddressReg)
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index c732106014e..39d32863f15 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -170,6 +170,11 @@ static cl::opt<bool>
cl::desc("reverse the CSR restore sequence"),
cl::init(false), cl::Hidden);
+static cl::opt<bool> StackTaggingMergeSetTag(
+ "stack-tagging-merge-settag",
+ cl::desc("merge settag instruction in function epilog"), cl::init(true),
+ cl::Hidden);
+
STATISTIC(NumRedZoneFunctions, "Number of functions using red zone");
/// This is the biggest offset to the stack pointer we can encode in aarch64
@@ -480,6 +485,39 @@ bool AArch64FrameLowering::shouldCombineCSRLocalStackBump(
return true;
}
+bool AArch64FrameLowering::shouldCombineCSRLocalStackBumpInEpilogue(
+ MachineBasicBlock &MBB, unsigned StackBumpBytes) const {
+ if (!shouldCombineCSRLocalStackBump(*MBB.getParent(), StackBumpBytes))
+ return false;
+
+ if (MBB.empty())
+ return true;
+
+ // Disable combined SP bump if the last instruction is an MTE tag store. It
+ // is almost always better to merge SP adjustment into those instructions.
+ MachineBasicBlock::iterator LastI = MBB.getFirstTerminator();
+ MachineBasicBlock::iterator Begin = MBB.begin();
+ while (LastI != Begin) {
+ --LastI;
+ if (LastI->isTransient())
+ continue;
+ if (!LastI->getFlag(MachineInstr::FrameDestroy))
+ break;
+ }
+ switch (LastI->getOpcode()) {
+ case AArch64::STGloop:
+ case AArch64::STZGloop:
+ case AArch64::STGOffset:
+ case AArch64::STZGOffset:
+ case AArch64::ST2GOffset:
+ case AArch64::STZ2GOffset:
+ return false;
+ default:
+ return true;
+ }
+ llvm_unreachable("unreachable");
+}
+
// Given a load or a store instruction, generate an appropriate unwinding SEH
// code on Windows.
static MachineBasicBlock::iterator InsertSEH(MachineBasicBlock::iterator MBBI,
@@ -1459,7 +1497,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
// function.
if (MF.hasEHFunclets())
AFI->setLocalStackSize(NumBytes - PrologueSaveSize);
- bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes);
+ bool CombineSPBump = shouldCombineCSRLocalStackBumpInEpilogue(MBB, NumBytes);
// Assume we can't combine the last pop with the sp restore.
if (!CombineSPBump && PrologueSaveSize != 0) {
@@ -2637,9 +2675,399 @@ void AArch64FrameLowering::processFunctionBeforeFrameFinalized(
.addImm(0);
}
-/// For Win64 AArch64 EH, the offset to the Unwind object is from the SP before
-/// the update. This is easily retrieved as it is exactly the offset that is set
-/// in processFunctionBeforeFrameFinalized.
+namespace {
+struct TagStoreInstr {
+ MachineInstr *MI;
+ int64_t Offset, Size;
+ explicit TagStoreInstr(MachineInstr *MI, int64_t Offset, int64_t Size)
+ : MI(MI), Offset(Offset), Size(Size) {}
+};
+
+class TagStoreEdit {
+ MachineFunction *MF;
+ MachineBasicBlock *MBB;
+ MachineRegisterInfo *MRI;
+ // Tag store instructions that are being replaced.
+ SmallVector<TagStoreInstr, 8> TagStores;
+ // Combined memref arguments of the above instructions.
+ SmallVector<MachineMemOperand *, 8> CombinedMemRefs;
+
+ // Replace allocation tags in [FrameReg + FrameRegOffset, FrameReg +
+ // FrameRegOffset + Size) with the address tag of SP.
+ Register FrameReg;
+ StackOffset FrameRegOffset;
+ int64_t Size;
+ // If not None, move FrameReg to (FrameReg + FrameRegUpdate) at the end.
+ Optional<int64_t> FrameRegUpdate;
+ // MIFlags for any FrameReg updating instructions.
+ unsigned FrameRegUpdateFlags;
+
+ // Use zeroing instruction variants.
+ bool ZeroData;
+ DebugLoc DL;
+
+ void emitUnrolled(MachineBasicBlock::iterator InsertI);
+ void emitLoop(MachineBasicBlock::iterator InsertI);
+
+public:
+ TagStoreEdit(MachineBasicBlock *MBB, bool ZeroData)
+ : MBB(MBB), ZeroData(ZeroData) {
+ MF = MBB->getParent();
+ MRI = &MF->getRegInfo();
+ }
+ // Add an instruction to be replaced. Instructions must be added in the
+ // ascending order of Offset, and have to be adjacent.
+ void addInstruction(TagStoreInstr I) {
+ assert((TagStores.empty() ||
+ TagStores.back().Offset + TagStores.back().Size == I.Offset) &&
+ "Non-adjacent tag store instructions.");
+ TagStores.push_back(I);
+ }
+ void clear() { TagStores.clear(); }
+ // Emit equivalent code at the given location, and erase the current set of
+ // instructions. May skip if the replacement is not profitable. May invalidate
+ // the input iterator and replace it with a valid one.
+ void emitCode(MachineBasicBlock::iterator &InsertI,
+ const AArch64FrameLowering *TFI, bool IsLast);
+};
+
+void TagStoreEdit::emitUnrolled(MachineBasicBlock::iterator InsertI) {
+ const AArch64InstrInfo *TII =
+ MF->getSubtarget<AArch64Subtarget>().getInstrInfo();
+
+ const int64_t kMinOffset = -256 * 16;
+ const int64_t kMaxOffset = 255 * 16;
+
+ Register BaseReg = FrameReg;
+ int64_t BaseRegOffsetBytes = FrameRegOffset.getBytes();
+ if (BaseRegOffsetBytes < kMinOffset ||
+ BaseRegOffsetBytes + (Size - Size % 32) > kMaxOffset) {
+ Register ScratchReg = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
+ emitFrameOffset(*MBB, InsertI, DL, ScratchReg, BaseReg,
+ {BaseRegOffsetBytes, MVT::i8}, TII);
+ BaseReg = ScratchReg;
+ BaseRegOffsetBytes = 0;
+ }
+
+ MachineInstr *LastI = nullptr;
+ while (Size) {
+ int64_t InstrSize = (Size > 16) ? 32 : 16;
+ unsigned Opcode =
+ InstrSize == 16
+ ? (ZeroData ? AArch64::STZGOffset : AArch64::STGOffset)
+ : (ZeroData ? AArch64::STZ2GOffset : AArch64::ST2GOffset);
+ MachineInstr *I = BuildMI(*MBB, InsertI, DL, TII->get(Opcode))
+ .addReg(AArch64::SP)
+ .addReg(BaseReg)
+ .addImm(BaseRegOffsetBytes / 16)
+ .setMemRefs(CombinedMemRefs);
+ // A store to [BaseReg, #0] should go last for an opportunity to fold the
+ // final SP adjustment in the epilogue.
+ if (BaseRegOffsetBytes == 0)
+ LastI = I;
+ BaseRegOffsetBytes += InstrSize;
+ Size -= InstrSize;
+ }
+
+ if (LastI)
+ MBB->splice(InsertI, MBB, LastI);
+}
+
+void TagStoreEdit::emitLoop(MachineBasicBlock::iterator InsertI) {
+ const AArch64InstrInfo *TII =
+ MF->getSubtarget<AArch64Subtarget>().getInstrInfo();
+
+ Register BaseReg = FrameRegUpdate
+ ? FrameReg
+ : MRI->createVirtualRegister(&AArch64::GPR64RegClass);
+ Register SizeReg = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
+
+ emitFrameOffset(*MBB, InsertI, DL, BaseReg, FrameReg, FrameRegOffset, TII);
+
+ int64_t LoopSize = Size;
+ // If the loop size is not a multiple of 32, split off one 16-byte store at
+ // the end to fold BaseReg update into.
+ if (FrameRegUpdate && *FrameRegUpdate)
+ LoopSize -= LoopSize % 32;
+ MachineInstr *LoopI =
+ BuildMI(*MBB, InsertI, DL,
+ TII->get(ZeroData ? AArch64::STZGloop : AArch64::STGloop))
+ .addDef(SizeReg)
+ .addDef(BaseReg)
+ .addImm(LoopSize)
+ .addReg(BaseReg)
+ .setMemRefs(CombinedMemRefs);
+ if (FrameRegUpdate)
+ LoopI->setFlags(FrameRegUpdateFlags);
+
+ int64_t ExtraBaseRegUpdate =
+ FrameRegUpdate ? (*FrameRegUpdate - FrameRegOffset.getBytes() - Size) : 0;
+ if (LoopSize < Size) {
+ assert(FrameRegUpdate);
+ assert(Size - LoopSize == 16);
+ // Tag 16 more bytes at BaseReg and update BaseReg.
+ BuildMI(*MBB, InsertI, DL,
+ TII->get(ZeroData ? AArch64::STZGPostIndex : AArch64::STGPostIndex))
+ .addDef(BaseReg)
+ .addReg(BaseReg)
+ .addReg(BaseReg)
+ .addImm(1 + ExtraBaseRegUpdate / 16)
+ .setMemRefs(CombinedMemRefs)
+ .setMIFlags(FrameRegUpdateFlags);
+ } else if (ExtraBaseRegUpdate) {
+ // Update BaseReg.
+ BuildMI(
+ *MBB, InsertI, DL,
+ TII->get(ExtraBaseRegUpdate > 0 ? AArch64::ADDXri : AArch64::SUBXri))
+ .addDef(BaseReg)
+ .addReg(BaseReg)
+ .addImm(std::abs(ExtraBaseRegUpdate))
+ .addImm(0)
+ .setMIFlags(FrameRegUpdateFlags);
+ }
+}
+
+// Check if *II is a register update that can be merged into STGloop that ends
+// at (Reg + Size). RemainingOffset is the required adjustment to Reg after the
+// end of the loop.
+bool canMergeRegUpdate(MachineBasicBlock::iterator II, unsigned Reg,
+ int64_t Size, int64_t *TotalOffset) {
+ MachineInstr &MI = *II;
+ if ((MI.getOpcode() == AArch64::ADDXri ||
+ MI.getOpcode() == AArch64::SUBXri) &&
+ MI.getOperand(0).getReg() == Reg && MI.getOperand(1).getReg() == Reg) {
+ unsigned Shift = AArch64_AM::getShiftValue(MI.getOperand(3).getImm());
+ int64_t Offset = MI.getOperand(2).getImm() << Shift;
+ if (MI.getOpcode() == AArch64::SUBXri)
+ Offset = -Offset;
+ int64_t AbsPostOffset = std::abs(Offset - Size);
+ const int64_t kMaxOffset =
+ 0xFFF; // Max encoding for unshifted ADDXri / SUBXri
+ if (AbsPostOffset <= kMaxOffset && AbsPostOffset % 16 == 0) {
+ *TotalOffset = Offset;
+ return true;
+ }
+ }
+ return false;
+}
+
+void mergeMemRefs(const SmallVectorImpl<TagStoreInstr> &TSE,
+ SmallVectorImpl<MachineMemOperand *> &MemRefs) {
+ MemRefs.clear();
+ for (auto &TS : TSE) {
+ MachineInstr *MI = TS.MI;
+ // An instruction without memory operands may access anything. Be
+ // conservative and return an empty list.
+ if (MI->memoperands_empty()) {
+ MemRefs.clear();
+ return;
+ }
+ MemRefs.append(MI->memoperands_begin(), MI->memoperands_end());
+ }
+}
+
+void TagStoreEdit::emitCode(MachineBasicBlock::iterator &InsertI,
+ const AArch64FrameLowering *TFI, bool IsLast) {
+ if (TagStores.empty())
+ return;
+ TagStoreInstr &FirstTagStore = TagStores[0];
+ TagStoreInstr &LastTagStore = TagStores[TagStores.size() - 1];
+ Size = LastTagStore.Offset - FirstTagStore.Offset + LastTagStore.Size;
+ DL = TagStores[0].MI->getDebugLoc();
+
+ unsigned Reg;
+ FrameRegOffset = TFI->resolveFrameOffsetReference(
+ *MF, FirstTagStore.Offset, false /*isFixed*/, false /*isSVE*/, Reg,
+ /*PreferFP=*/false, /*ForSimm=*/true);
+ FrameReg = Reg;
+ FrameRegUpdate = None;
+
+ mergeMemRefs(TagStores, CombinedMemRefs);
+
+ LLVM_DEBUG(dbgs() << "Replacing adjacent STG instructions:\n";
+ for (const auto &Instr
+ : TagStores) { dbgs() << " " << *Instr.MI; });
+
+ // Size threshold where a loop becomes shorter than a linear sequence of
+ // tagging instructions.
+ const int kSetTagLoopThreshold = 176;
+ if (Size < kSetTagLoopThreshold) {
+ if (TagStores.size() < 2)
+ return;
+ emitUnrolled(InsertI);
+ } else {
+ MachineInstr *UpdateInstr = nullptr;
+ int64_t TotalOffset;
+ if (IsLast) {
+ // See if we can merge base register update into the STGloop.
+ // This is done in AArch64LoadStoreOptimizer for "normal" stores,
+ // but STGloop is way too unusual for that, and also it only
+ // realistically happens in function epilogue. Also, STGloop is expanded
+ // before that pass.
+ if (InsertI != MBB->end() &&
+ canMergeRegUpdate(InsertI, FrameReg, FrameRegOffset.getBytes() + Size,
+ &TotalOffset)) {
+ UpdateInstr = &*InsertI++;
+ LLVM_DEBUG(dbgs() << "Folding SP update into loop:\n "
+ << *UpdateInstr);
+ }
+ }
+
+ if (!UpdateInstr && TagStores.size() < 2)
+ return;
+
+ if (UpdateInstr) {
+ FrameRegUpdate = TotalOffset;
+ FrameRegUpdateFlags = UpdateInstr->getFlags();
+ }
+ emitLoop(InsertI);
+ if (UpdateInstr)
+ UpdateInstr->eraseFromParent();
+ }
+
+ for (auto &TS : TagStores)
+ TS.MI->eraseFromParent();
+}
+
+bool isMergeableStackTaggingInstruction(MachineInstr &MI, int64_t &Offset,
+ int64_t &Size, bool &ZeroData) {
+ MachineFunction &MF = *MI.getParent()->getParent();
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+
+ unsigned Opcode = MI.getOpcode();
+ ZeroData = (Opcode == AArch64::STZGloop || Opcode == AArch64::STZGOffset ||
+ Opcode == AArch64::STZ2GOffset);
+
+ if (Opcode == AArch64::STGloop || Opcode == AArch64::STZGloop) {
+ if (!MI.getOperand(0).isDead() || !MI.getOperand(1).isDead())
+ return false;
+ if (!MI.getOperand(2).isImm() || !MI.getOperand(3).isFI())
+ return false;
+ Offset = MFI.getObjectOffset(MI.getOperand(3).getIndex());
+ Size = MI.getOperand(2).getImm();
+ return true;
+ }
+
+ if (Opcode == AArch64::STGOffset || Opcode == AArch64::STZGOffset)
+ Size = 16;
+ else if (Opcode == AArch64::ST2GOffset || Opcode == AArch64::STZ2GOffset)
+ Size = 32;
+ else
+ return false;
+
+ if (MI.getOperand(0).getReg() != AArch64::SP || !MI.getOperand(1).isFI())
+ return false;
+
+ Offset = MFI.getObjectOffset(MI.getOperand(1).getIndex()) +
+ 16 * MI.getOperand(2).getImm();
+ return true;
+}
+
+// Detect a run of memory tagging instructions for adjacent stack frame slots,
+// and replace them with a shorter instruction sequence:
+// * replace STG + STG with ST2G
+// * replace STGloop + STGloop with STGloop
+// This code needs to run when stack slot offsets are already known, but before
+// FrameIndex operands in STG instructions are eliminated.
+MachineBasicBlock::iterator tryMergeAdjacentSTG(MachineBasicBlock::iterator II,
+ const AArch64FrameLowering *TFI,
+ RegScavenger *RS) {
+ bool FirstZeroData;
+ int64_t Size, Offset;
+ MachineInstr &MI = *II;
+ MachineBasicBlock *MBB = MI.getParent();
+ MachineBasicBlock::iterator NextI = ++II;
+ if (&MI == &MBB->instr_back())
+ return II;
+ if (!isMergeableStackTaggingInstruction(MI, Offset, Size, FirstZeroData))
+ return II;
+
+ SmallVector<TagStoreInstr, 4> Instrs;
+ Instrs.emplace_back(&MI, Offset, Size);
+
+ constexpr int kScanLimit = 10;
+ int Count = 0;
+ for (MachineBasicBlock::iterator E = MBB->end();
+ NextI != E && Count < kScanLimit; ++NextI) {
+ MachineInstr &MI = *NextI;
+ bool ZeroData;
+ int64_t Size, Offset;
+ // Collect instructions that update memory tags with a FrameIndex operand
+ // and (when applicable) constant size, and whose output registers are dead
+ // (the latter is almost always the case in practice). Since these
+ // instructions effectively have no inputs or outputs, we are free to skip
+ // any non-aliasing instructions in between without tracking used registers.
+ if (isMergeableStackTaggingInstruction(MI, Offset, Size, ZeroData)) {
+ if (ZeroData != FirstZeroData)
+ break;
+ Instrs.emplace_back(&MI, Offset, Size);
+ continue;
+ }
+
+ // Only count non-transient, non-tagging instructions toward the scan
+ // limit.
+ if (!MI.isTransient())
+ ++Count;
+
+ // Just in case, stop before the epilogue code starts.
+ if (MI.getFlag(MachineInstr::FrameSetup) ||
+ MI.getFlag(MachineInstr::FrameDestroy))
+ break;
+
+ // Reject anything that may alias the collected instructions.
+ if (MI.mayLoadOrStore() || MI.hasUnmodeledSideEffects())
+ break;
+ }
+
+ // New code will be inserted after the last tagging instruction we've found.
+ MachineBasicBlock::iterator InsertI = Instrs.back().MI;
+ InsertI++;
+
+ llvm::stable_sort(Instrs,
+ [](const TagStoreInstr &Left, const TagStoreInstr &Right) {
+ return Left.Offset < Right.Offset;
+ });
+
+ // Make sure that we don't have any overlapping stores.
+ int64_t CurOffset = Instrs[0].Offset;
+ for (auto &Instr : Instrs) {
+ if (CurOffset > Instr.Offset)
+ return NextI;
+ CurOffset = Instr.Offset + Instr.Size;
+ }
+
+ // Find contiguous runs of tagged memory and emit shorter instruction
+  // sequences for them when possible.
+ TagStoreEdit TSE(MBB, FirstZeroData);
+ Optional<int64_t> EndOffset;
+ for (auto &Instr : Instrs) {
+ if (EndOffset && *EndOffset != Instr.Offset) {
+ // Found a gap.
+ TSE.emitCode(InsertI, TFI, /*IsLast = */ false);
+ TSE.clear();
+ }
+
+ TSE.addInstruction(Instr);
+ EndOffset = Instr.Offset + Instr.Size;
+ }
+
+ TSE.emitCode(InsertI, TFI, /*IsLast = */ true);
+
+ return InsertI;
+}
+} // namespace
+
+void AArch64FrameLowering::processFunctionBeforeFrameIndicesReplaced(
+ MachineFunction &MF, RegScavenger *RS = nullptr) const {
+ if (StackTaggingMergeSetTag)
+ for (auto &BB : MF)
+ for (MachineBasicBlock::iterator II = BB.begin(); II != BB.end();)
+ II = tryMergeAdjacentSTG(II, this, RS);
+}
+
+/// For Win64 AArch64 EH, the offset to the Unwind object is from the SP
+/// before the update. This is easily retrieved as it is exactly the offset
+/// that is set in processFunctionBeforeFrameFinalized.
int AArch64FrameLowering::getFrameIndexReferencePreferSP(
const MachineFunction &MF, int FI, unsigned &FrameReg,
bool IgnoreSPUpdates) const {
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/llvm/lib/Target/AArch64/AArch64FrameLowering.h
index b5719feb6b1..57a7924fb8f 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.h
@@ -77,6 +77,10 @@ public:
void processFunctionBeforeFrameFinalized(MachineFunction &MF,
RegScavenger *RS) const override;
+ void
+ processFunctionBeforeFrameIndicesReplaced(MachineFunction &MF,
+ RegScavenger *RS) const override;
+
unsigned getWinEHParentFrameOffset(const MachineFunction &MF) const override;
unsigned getWinEHFuncletFrameSize(const MachineFunction &MF) const;
@@ -107,6 +111,8 @@ private:
int64_t assignSVEStackObjectOffsets(MachineFrameInfo &MF,
int &MinCSFrameIndex,
int &MaxCSFrameIndex) const;
+ bool shouldCombineCSRLocalStackBumpInEpilogue(MachineBasicBlock &MBB,
+ unsigned StackBumpBytes) const;
};
} // End llvm namespace
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 54f3f7c1013..0ed2a678c4f 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -3458,6 +3458,8 @@ int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI,
case AArch64::ST1Fourv1d:
case AArch64::IRG:
case AArch64::IRGstack:
+ case AArch64::STGloop:
+ case AArch64::STZGloop:
return AArch64FrameOffsetCannotUpdate;
}
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index f4d340c9f06..04a23f31ffd 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -1514,17 +1514,17 @@ def TAGPstack
// register / expression for the tagged base pointer of the current function.
def : Pat<(int_aarch64_irg_sp i64:$Rm), (IRGstack SP, i64:$Rm)>;
-// Large STG to be expanded into a loop. $Rm is the size, $Rn is start address.
-// $Rn_wback is one past the end of the range.
+// Large STG to be expanded into a loop. $sz is the size, $Rn is start address.
+// $Rn_wback is one past the end of the range. $Rm is the loop counter.
let isCodeGenOnly=1, mayStore=1 in {
def STGloop
- : Pseudo<(outs GPR64common:$Rm_wback, GPR64sp:$Rn_wback), (ins GPR64common:$Rm, GPR64sp:$Rn),
- [], "$Rn = $Rn_wback,@earlyclobber $Rn_wback,$Rm = $Rm_wback,@earlyclobber $Rm_wback" >,
+ : Pseudo<(outs GPR64common:$Rm, GPR64sp:$Rn_wback), (ins i64imm:$sz, GPR64sp:$Rn),
+ [], "$Rn = $Rn_wback,@earlyclobber $Rn_wback,@earlyclobber $Rm" >,
Sched<[WriteAdr, WriteST]>;
def STZGloop
- : Pseudo<(outs GPR64common:$Rm_wback, GPR64sp:$Rn_wback), (ins GPR64common:$Rm, GPR64sp:$Rn),
- [], "$Rn = $Rn_wback,@earlyclobber $Rn_wback,$Rm = $Rm_wback,@earlyclobber $Rm_wback" >,
+ : Pseudo<(outs GPR64common:$Rm, GPR64sp:$Rn_wback), (ins i64imm:$sz, GPR64sp:$Rn),
+ [], "$Rn = $Rn_wback,@earlyclobber $Rn_wback,@earlyclobber $Rm" >,
Sched<[WriteAdr, WriteST]>;
}
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
index 14f839cd4f8..4a3778a2fd0 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -390,6 +390,10 @@ bool AArch64RegisterInfo::needsFrameBaseReg(MachineInstr *MI,
if (isFrameOffsetLegal(MI, AArch64::SP, Offset))
return false;
+ // If even offset 0 is illegal, we don't want a virtual base register.
+ if (!isFrameOffsetLegal(MI, AArch64::SP, 0))
+ return false;
+
// The offset likely isn't legal; we want to allocate a virtual base register.
return true;
}
@@ -445,6 +449,17 @@ void AArch64RegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
(void)Done;
}
+static Register getScratchRegisterForInstruction(MachineInstr &MI) {
+ // ST*Gloop can only have #fi in op3, and they have a constraint that
+ // op1==op3. Since op1 is early-clobber, it may (and also must) be used as the
+ // scratch register.
+ if (MI.getOpcode() == AArch64::STGloop || MI.getOpcode() == AArch64::STZGloop)
+ return MI.getOperand(1).getReg();
+ else
+ return MI.getMF()->getRegInfo().createVirtualRegister(
+ &AArch64::GPR64RegClass);
+}
+
void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
int SPAdj, unsigned FIOperandNum,
RegScavenger *RS) const {
@@ -501,8 +516,7 @@ void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
// in a scratch register.
Offset = TFI->resolveFrameIndexReference(
MF, FrameIndex, FrameReg, /*PreferFP=*/false, /*ForSimm=*/true);
- Register ScratchReg =
- MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
+ Register ScratchReg = getScratchRegisterForInstruction(MI);
emitFrameOffset(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg, Offset,
TII);
BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(AArch64::LDG), ScratchReg)
@@ -531,8 +545,7 @@ void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
// If we get here, the immediate doesn't fit into the instruction. We folded
// as much as possible above. Handle the rest, providing a register that is
// SP+LargeImm.
- Register ScratchReg =
- MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
+ Register ScratchReg = getScratchRegisterForInstruction(MI);
emitFrameOffset(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg, Offset, TII);
MI.getOperand(FIOperandNum).ChangeToRegister(ScratchReg, false, false, true);
}
diff --git a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
index ba61ed726e8..e050a0028ec 100644
--- a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
@@ -125,19 +125,13 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForSetTag(
return EmitUnrolledSetTag(DAG, dl, Chain, Addr, ObjSize, BaseMemOperand,
ZeroData);
- if (ObjSize % 32 != 0) {
- SDNode *St1 = DAG.getMachineNode(
- ZeroData ? AArch64::STZGPostIndex : AArch64::STGPostIndex, dl,
- {MVT::i64, MVT::Other},
- {Addr, Addr, DAG.getTargetConstant(1, dl, MVT::i64), Chain});
- DAG.setNodeMemRefs(cast<MachineSDNode>(St1), {BaseMemOperand});
- ObjSize -= 16;
- Addr = SDValue(St1, 0);
- Chain = SDValue(St1, 1);
- }
-
const EVT ResTys[] = {MVT::i64, MVT::i64, MVT::Other};
- SDValue Ops[] = {DAG.getConstant(ObjSize, dl, MVT::i64), Addr, Chain};
+
+ if (Addr.getOpcode() == ISD::FrameIndex) {
+ int FI = cast<FrameIndexSDNode>(Addr)->getIndex();
+ Addr = DAG.getTargetFrameIndex(FI, MVT::i64);
+ }
+ SDValue Ops[] = {DAG.getTargetConstant(ObjSize, dl, MVT::i64), Addr, Chain};
SDNode *St = DAG.getMachineNode(
ZeroData ? AArch64::STZGloop : AArch64::STGloop, dl, ResTys, Ops);