author    Matt Arsenault <Matthew.Arsenault@amd.com>    2016-10-06 16:20:41 +0000
committer Matt Arsenault <Matthew.Arsenault@amd.com>    2016-10-06 16:20:41 +0000
commit    6bc43d8627ca44465e7ce261a0828b70d3460e13 (patch)
tree      255944724c060d2eef50f0ccf04df7546aca245c /llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
parent    d391d6f1c32d7316cb7fa8cfa4e039f94133ccbe (diff)
BranchRelaxation: Support expanding unconditional branches
AMDGPU needs to expand unconditional branches in a new block with an indirect branch.

llvm-svn: 283464
Diffstat (limited to 'llvm/lib/Target/AMDGPU/SIInstrInfo.cpp')
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.cpp  187
1 file changed, 178 insertions, 9 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index da725dae47c..ce41d82bbbf 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -28,6 +28,13 @@
using namespace llvm;
+// Must be at least 4 to be able to branch over minimum unconditional branch
+// code. This is only for making it possible to write reasonably small tests for
+// long branches.
+static cl::opt<unsigned>
+BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
+ cl::desc("Restrict range of branch instructions (DEBUG)"));
+
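
The option added above exists so lit tests can force branch relaxation without emitting hundreds of kilobytes of padding. As a sketch of how a test could use it (this exact invocation is an assumption, not quoted from the patch's tests):

  llc -march=amdgcn -verify-machineinstrs -amdgpu-s-branch-bits=4 < branch-relaxation.ll

With only 4 offset bits, any branch that skips more than a handful of instructions is out of range and has to be expanded, which keeps the test inputs small.
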
SIInstrInfo::SIInstrInfo(const SISubtarget &ST)
: AMDGPUInstrInfo(ST), RI(), ST(ST) {}
@@ -1045,6 +1052,128 @@ bool SIInstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx0,
return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
}
+bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
+ int64_t BrOffset) const {
+ // BranchRelaxation should never have to check s_setpc_b64 because its dest
+ // block is unanalyzable.
+ assert(BranchOp != AMDGPU::S_SETPC_B64);
+
+ // Convert to dwords.
+ BrOffset /= 4;
+
+ // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
+ // from the next instruction.
+ BrOffset -= 1;
+
+ return isIntN(BranchOffsetBits, BrOffset);
+}
+
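
To make the encoding arithmetic in isBranchOffsetInRange concrete, here is a minimal standalone C++ sketch of the same check. It is an illustration only; fitsSigned stands in for llvm::isIntN, and branchOffsetInRange is not part of the patch.

#include <cstdint>
#include <iostream>

// Stand-in for llvm::isIntN: does Value fit in N signed bits?
static bool fitsSigned(unsigned N, int64_t Value) {
  return Value >= -(int64_t(1) << (N - 1)) && Value < (int64_t(1) << (N - 1));
}

// Mirrors the check above: BrOffset is a byte offset from the branch.
static bool branchOffsetInRange(unsigned BranchOffsetBits, int64_t BrOffset) {
  int64_t Words = BrOffset / 4; // the hardware scales SIMM16 by 4
  Words -= 1;                   // the offset is applied from the next instruction
  return fitsSigned(BranchOffsetBits, Words);
}

int main() {
  // With the default 16 offset bits, the direct reach is roughly +/-128 KiB.
  std::cout << std::boolalpha
            << branchOffsetInRange(16, 131068) << '\n'   // true
            << branchOffsetInRange(16, 1 << 20) << '\n'; // false: needs expansion
}
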
+MachineBasicBlock *SIInstrInfo::getBranchDestBlock(
+ const MachineInstr &MI) const {
+ if (MI.getOpcode() == AMDGPU::S_SETPC_B64) {
+ // This would be a difficult analysis to perform, but can always be legal so
+ // there's no need to analyze it.
+ return nullptr;
+ }
+
+ return MI.getOperand(0).getMBB();
+}
+
+unsigned SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock &DestBB,
+ const DebugLoc &DL,
+ int64_t BrOffset,
+ RegScavenger *RS) const {
+ assert(RS && "RegScavenger required for long branching");
+ assert(MBB.empty() &&
+ "new block should be inserted for expanding unconditional branch");
+ assert(MBB.pred_size() == 1);
+
+ MachineFunction *MF = MBB.getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+
+ // FIXME: Virtual register workaround for RegScavenger not working with empty
+ // blocks.
+ unsigned PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+
+ auto I = MBB.end();
+
+ // We need to compute the offset relative to the instruction immediately after
+ // s_getpc_b64. Insert pc arithmetic code before last terminator.
+ MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);
+
+ // TODO: Handle > 32-bit block address.
+ if (BrOffset >= 0) {
+ BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
+ .addReg(PCReg, RegState::Define, AMDGPU::sub0)
+ .addReg(PCReg, 0, AMDGPU::sub0)
+ .addMBB(&DestBB, AMDGPU::TF_LONG_BRANCH_FORWARD);
+ BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
+ .addReg(PCReg, RegState::Define, AMDGPU::sub1)
+ .addReg(PCReg, 0, AMDGPU::sub1)
+ .addImm(0);
+ } else {
+ // Backwards branch.
+ BuildMI(MBB, I, DL, get(AMDGPU::S_SUB_U32))
+ .addReg(PCReg, RegState::Define, AMDGPU::sub0)
+ .addReg(PCReg, 0, AMDGPU::sub0)
+ .addMBB(&DestBB, AMDGPU::TF_LONG_BRANCH_BACKWARD);
+ BuildMI(MBB, I, DL, get(AMDGPU::S_SUBB_U32))
+ .addReg(PCReg, RegState::Define, AMDGPU::sub1)
+ .addReg(PCReg, 0, AMDGPU::sub1)
+ .addImm(0);
+ }
+
+ // Insert the indirect branch after the other terminator.
+ BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
+ .addReg(PCReg);
+
+ // FIXME: If spilling is necessary, this will fail because this scavenger has
+ // no emergency stack slots. It is non-trivial to spill in this situation,
+ // because the restore code needs to be specially placed after the
+ // jump. BranchRelaxation then needs to be made aware of the newly inserted
+ // block.
+ //
+ // If a spill is needed for the pc register pair, we need to insert a spill
+ // restore block right before the destination block, and insert a short branch
+ // into the old destination block's fallthrough predecessor.
+ // e.g.:
+ //
+ // s_cbranch_scc0 skip_long_branch:
+ //
+ // long_branch_bb:
+ // spill s[8:9]
+ // s_getpc_b64 s[8:9]
+ // s_add_u32 s8, s8, restore_bb
+ // s_addc_u32 s9, s9, 0
+ // s_setpc_b64 s[8:9]
+ //
+ // skip_long_branch:
+ // foo;
+ //
+ // .....
+ //
+ // dest_bb_fallthrough_predecessor:
+ // bar;
+ // s_branch dest_bb
+ //
+ // restore_bb:
+ // restore s[8:9]
+ // fallthrough dest_bb
+ //
+ // dest_bb:
+ // buzz;
+
+ RS->enterBasicBlockEnd(MBB);
+ unsigned Scav = RS->scavengeRegister(&AMDGPU::SReg_64RegClass,
+ MachineBasicBlock::iterator(GetPC), 0);
+ MRI.replaceRegWith(PCReg, Scav);
+ MRI.clearVirtRegs();
+ RS->setRegUsed(Scav);
+
+ return 4 + 8 + 4 + 4;
+}
+
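
One detail of the expansion above that is easy to miss: the 64-bit PC lives in an SGPR pair, and the block-address offset is only added to (or subtracted from) the low 32-bit half, so the trailing S_ADDC_U32 / S_SUBB_U32 with an immediate 0 exists purely to propagate the carry or borrow into the high half. A minimal model of the forward case follows, with a hypothetical helper name that is not LLVM API:

#include <cstdint>
#include <cstdio>

// Models what the forward expansion does to the SGPR pair holding the PC:
// s_add_u32 adds the low words and records the carry-out in SCC, then
// s_addc_u32 adds the high words plus SCC, with an immediate 0 as the
// second source here.
static uint64_t addOffsetToPC(uint64_t PC, uint32_t Offset) {
  uint32_t Lo = uint32_t(PC);
  uint32_t Hi = uint32_t(PC >> 32);
  uint32_t NewLo = Lo + Offset;          // s_add_u32  pc.sub0, pc.sub0, offset
  uint32_t Carry = NewLo < Lo ? 1u : 0u; // carry-out lives in SCC
  uint32_t NewHi = Hi + 0u + Carry;      // s_addc_u32 pc.sub1, pc.sub1, 0
  return (uint64_t(NewHi) << 32) | NewLo;
}

int main() {
  // Crossing a 4 GiB boundary only works because the carry is propagated.
  std::printf("%llx\n", (unsigned long long)addOffsetToPC(0xFFFFFF00ull, 0x200));
}

The backward case is symmetric, using S_SUB_U32 / S_SUBB_U32 to propagate the borrow.
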
unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
switch (Cond) {
case SIInstrInfo::SCC_TRUE:
@@ -1083,15 +1212,12 @@ SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
}
}
-bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
- MachineBasicBlock *&FBB,
- SmallVectorImpl<MachineOperand> &Cond,
- bool AllowModify) const {
- MachineBasicBlock::iterator I = MBB.getFirstTerminator();
-
- if (I == MBB.end())
- return false;
-
+bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const {
if (I->getOpcode() == AMDGPU::S_BRANCH) {
// Unconditional Branch
TBB = I->getOperand(0).getMBB();
@@ -1122,6 +1248,44 @@ bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
return true;
}
+bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const {
+ MachineBasicBlock::iterator I = MBB.getFirstTerminator();
+ if (I == MBB.end())
+ return false;
+
+ if (I->getOpcode() != AMDGPU::SI_MASK_BRANCH)
+ return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
+
+ ++I;
+
+ // TODO: Should be able to treat as fallthrough?
+ if (I == MBB.end())
+ return true;
+
+ if (analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify))
+ return true;
+
+ MachineBasicBlock *MaskBrDest = I->getOperand(0).getMBB();
+
+ // Specifically handle the case where the conditional branch is to the same
+ // destination as the mask branch. e.g.
+ //
+ // si_mask_branch BB8
+ // s_cbranch_execz BB8
+ // s_cbranch BB9
+ //
+ // This is required to understand divergent loops which may need the branches
+ // to be relaxed.
+ if (TBB != MaskBrDest || Cond.empty())
+ return true;
+
+ auto Pred = Cond[0].getImm();
+ return (Pred != EXECZ && Pred != EXECNZ);
+}
+
unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB,
int *BytesRemoved) const {
MachineBasicBlock::iterator I = MBB.getFirstTerminator();
@@ -1130,6 +1294,11 @@ unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB,
unsigned RemovedSize = 0;
while (I != MBB.end()) {
MachineBasicBlock::iterator Next = std::next(I);
+ if (I->getOpcode() == AMDGPU::SI_MASK_BRANCH) {
+ I = Next;
+ continue;
+ }
+
RemovedSize += getInstSizeInBytes(*I);
I->eraseFromParent();
++Count;