author     Matt Arsenault <Matthew.Arsenault@amd.com>   2016-10-06 16:20:41 +0000
committer  Matt Arsenault <Matthew.Arsenault@amd.com>   2016-10-06 16:20:41 +0000
commit     6bc43d8627ca44465e7ce261a0828b70d3460e13
tree       255944724c060d2eef50f0ccf04df7546aca245c /llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
parent     d391d6f1c32d7316cb7fa8cfa4e039f94133ccbe
BranchRelaxation: Support expanding unconditional branches
AMDGPU needs to expand out-of-range unconditional branches
into a new block that reaches the destination with an indirect branch.
llvm-svn: 283464
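
The constraint behind the patch is the reach of the scalar branch encoding: the target is a signed 16-bit immediate counted in dwords, relative to the instruction after the branch (PC += signext(SIMM16 * 4) + 4). The sketch below restates the range check the patch adds as SIInstrInfo::isBranchOffsetInRange; the standalone helper name, default argument, and test values are illustrative only, not part of the patch.

    #include <cassert>
    #include <cstdint>

    // Standalone restatement of the range check added as
    // SIInstrInfo::isBranchOffsetInRange: the byte offset is converted to
    // dwords and adjusted by one instruction (the offset is measured from the
    // instruction after the branch) before testing the signed field width.
    static bool branchOffsetInRange(int64_t BrOffsetBytes, unsigned OffsetBits = 16) {
      int64_t Dwords = BrOffsetBytes / 4 - 1;         // relative to the next instruction
      int64_t Limit = int64_t(1) << (OffsetBits - 1); // signed N-bit range, like isIntN()
      return Dwords >= -Limit && Dwords < Limit;
    }

    int main() {
      assert(branchOffsetInRange(4));                       // branch to the next instruction
      assert(branchOffsetInRange((int64_t(1) << 15) * 4));  // just inside the forward reach
      assert(!branchOffsetInRange((int64_t(1) << 17) * 4)); // needs the long-branch expansion
      return 0;
    }

With the default 16-bit field this gives a reach of roughly +/-128 KiB; the new amdgpu-s-branch-bits option narrows that range artificially so the long-branch path can be exercised by reasonably small tests.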
Diffstat (limited to 'llvm/lib/Target/AMDGPU/SIInstrInfo.cpp')
-rw-r--r--   llvm/lib/Target/AMDGPU/SIInstrInfo.cpp   187
1 file changed, 178 insertions(+), 9 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index da725dae47c..ce41d82bbbf 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -28,6 +28,13 @@
 using namespace llvm;
 
+// Must be at least 4 to be able to branch over minimum unconditional branch
+// code. This is only for making it possible to write reasonably small tests for
+// long branches.
+static cl::opt<unsigned>
+BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
+                 cl::desc("Restrict range of branch instructions (DEBUG)"));
+
 SIInstrInfo::SIInstrInfo(const SISubtarget &ST)
   : AMDGPUInstrInfo(ST), RI(), ST(ST) {}
 
@@ -1045,6 +1052,128 @@ bool SIInstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx0,
   return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
 }
 
+bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
+                                        int64_t BrOffset) const {
+  // BranchRelaxation should never have to check s_setpc_b64 because its dest
+  // block is unanalyzable.
+  assert(BranchOp != AMDGPU::S_SETPC_B64);
+
+  // Convert to dwords.
+  BrOffset /= 4;
+
+  // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
+  // from the next instruction.
+  BrOffset -= 1;
+
+  return isIntN(BranchOffsetBits, BrOffset);
+}
+
+MachineBasicBlock *SIInstrInfo::getBranchDestBlock(
+  const MachineInstr &MI) const {
+  if (MI.getOpcode() == AMDGPU::S_SETPC_B64) {
+    // This would be a difficult analysis to perform, but can always be legal so
+    // there's no need to analyze it.
+    return nullptr;
+  }
+
+  return MI.getOperand(0).getMBB();
+}
+
+unsigned SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
+                                           MachineBasicBlock &DestBB,
+                                           const DebugLoc &DL,
+                                           int64_t BrOffset,
+                                           RegScavenger *RS) const {
+  assert(RS && "RegScavenger required for long branching");
+  assert(MBB.empty() &&
+         "new block should be inserted for expanding unconditional branch");
+  assert(MBB.pred_size() == 1);
+
+  MachineFunction *MF = MBB.getParent();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+
+  // FIXME: Virtual register workaround for RegScavenger not working with empty
+  // blocks.
+  unsigned PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+
+  auto I = MBB.end();
+
+  // We need to compute the offset relative to the instruction immediately after
+  // s_getpc_b64. Insert pc arithmetic code before last terminator.
+  MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);
+
+  // TODO: Handle > 32-bit block address.
+  if (BrOffset >= 0) {
+    BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
+      .addReg(PCReg, RegState::Define, AMDGPU::sub0)
+      .addReg(PCReg, 0, AMDGPU::sub0)
+      .addMBB(&DestBB, AMDGPU::TF_LONG_BRANCH_FORWARD);
+    BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
+      .addReg(PCReg, RegState::Define, AMDGPU::sub1)
+      .addReg(PCReg, 0, AMDGPU::sub1)
+      .addImm(0);
+  } else {
+    // Backwards branch.
+    BuildMI(MBB, I, DL, get(AMDGPU::S_SUB_U32))
+      .addReg(PCReg, RegState::Define, AMDGPU::sub0)
+      .addReg(PCReg, 0, AMDGPU::sub0)
+      .addMBB(&DestBB, AMDGPU::TF_LONG_BRANCH_BACKWARD);
+    BuildMI(MBB, I, DL, get(AMDGPU::S_SUBB_U32))
+      .addReg(PCReg, RegState::Define, AMDGPU::sub1)
+      .addReg(PCReg, 0, AMDGPU::sub1)
+      .addImm(0);
+  }
+
+  // Insert the indirect branch after the other terminator.
+  BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
+    .addReg(PCReg);
+
+  // FIXME: If spilling is necessary, this will fail because this scavenger has
+  // no emergency stack slots. It is non-trivial to spill in this situation,
+  // because the restore code needs to be specially placed after the
+  // jump. BranchRelaxation then needs to be made aware of the newly inserted
+  // block.
+  //
+  // If a spill is needed for the pc register pair, we need to insert a spill
+  // restore block right before the destination block, and insert a short branch
+  // into the old destination block's fallthrough predecessor.
+  // e.g.:
+  //
+  // s_cbranch_scc0 skip_long_branch:
+  //
+  // long_branch_bb:
+  //   spill s[8:9]
+  //   s_getpc_b64 s[8:9]
+  //   s_add_u32 s8, s8, restore_bb
+  //   s_addc_u32 s9, s9, 0
+  //   s_setpc_b64 s[8:9]
+  //
+  // skip_long_branch:
+  //   foo;
+  //
+  // .....
+  //
+  // dest_bb_fallthrough_predecessor:
+  //   bar;
+  //   s_branch dest_bb
+  //
+  // restore_bb:
+  //   restore s[8:9]
+  //   fallthrough dest_bb
+  //
+  // dest_bb:
+  //   buzz;
+
+  RS->enterBasicBlockEnd(MBB);
+  unsigned Scav = RS->scavengeRegister(&AMDGPU::SReg_64RegClass,
+                                       MachineBasicBlock::iterator(GetPC), 0);
+  MRI.replaceRegWith(PCReg, Scav);
+  MRI.clearVirtRegs();
+  RS->setRegUsed(Scav);
+
+  return 4 + 8 + 4 + 4;
+}
+
 unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
   switch (Cond) {
   case SIInstrInfo::SCC_TRUE:
@@ -1083,15 +1212,12 @@ SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
   }
 }
 
-bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
-                                MachineBasicBlock *&FBB,
-                                SmallVectorImpl<MachineOperand> &Cond,
-                                bool AllowModify) const {
-  MachineBasicBlock::iterator I = MBB.getFirstTerminator();
-
-  if (I == MBB.end())
-    return false;
-
+bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB,
+                                    MachineBasicBlock::iterator I,
+                                    MachineBasicBlock *&TBB,
+                                    MachineBasicBlock *&FBB,
+                                    SmallVectorImpl<MachineOperand> &Cond,
+                                    bool AllowModify) const {
   if (I->getOpcode() == AMDGPU::S_BRANCH) {
     // Unconditional Branch
     TBB = I->getOperand(0).getMBB();
@@ -1122,6 +1248,44 @@ bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
   return true;
 }
 
+bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+                                MachineBasicBlock *&FBB,
+                                SmallVectorImpl<MachineOperand> &Cond,
+                                bool AllowModify) const {
+  MachineBasicBlock::iterator I = MBB.getFirstTerminator();
+  if (I == MBB.end())
+    return false;
+
+  if (I->getOpcode() != AMDGPU::SI_MASK_BRANCH)
+    return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
+
+  ++I;
+
+  // TODO: Should be able to treat as fallthrough?
+  if (I == MBB.end())
+    return true;
+
+  if (analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify))
+    return true;
+
+  MachineBasicBlock *MaskBrDest = I->getOperand(0).getMBB();
+
+  // Specifically handle the case where the conditional branch is to the same
+  // destination as the mask branch. e.g.
+  //
+  // si_mask_branch BB8
+  // s_cbranch_execz BB8
+  // s_cbranch BB9
+  //
+  // This is required to understand divergent loops which may need the branches
+  // to be relaxed.
+  if (TBB != MaskBrDest || Cond.empty())
+    return true;
+
+  auto Pred = Cond[0].getImm();
+  return (Pred != EXECZ && Pred != EXECNZ);
+}
+
 unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB,
                                    int *BytesRemoved) const {
   MachineBasicBlock::iterator I = MBB.getFirstTerminator();
@@ -1130,6 +1294,11 @@ unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB,
   unsigned RemovedSize = 0;
   while (I != MBB.end()) {
     MachineBasicBlock::iterator Next = std::next(I);
+    if (I->getOpcode() == AMDGPU::SI_MASK_BRANCH) {
+      I = Next;
+      continue;
+    }
+
     RemovedSize += getInstSizeInBytes(*I);
     I->eraseFromParent();
     ++Count;
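
For readers skimming the diff, the value returned at the end of insertIndirectBranch ("return 4 + 8 + 4 + 4;") is the byte size of the sequence the function just emitted, which BranchRelaxation needs for its offset bookkeeping. The summary below restates that sequence and its sizes for the forward case (the backward case uses s_sub_u32/s_subb_u32 instead); the struct and names are illustrative only, not LLVM API.

    // Illustrative restatement of the long-branch sequence emitted by
    // insertIndirectBranch() above and of the size it reports to
    // BranchRelaxation. These names only summarize what the patch emits.
    struct EmittedInst {
      const char *Asm;      // instruction as it appears in the new block
      unsigned SizeInBytes; // encoded size
    };

    constexpr EmittedInst LongBranchSeq[] = {
      {"s_getpc_b64 s[N:N+1]",          4}, // PC of the instruction after s_getpc_b64
      {"s_add_u32   sN, sN, <dest_lo>", 8}, // 4-byte opcode + 4-byte literal (block address fixup)
      {"s_addc_u32  sN+1, sN+1, 0",     4}, // carry into the high half of the PC pair
      {"s_setpc_b64 s[N:N+1]",          4}, // indirect branch to the destination block
    };

    constexpr unsigned seqSizeInBytes() {
      unsigned Total = 0;
      for (const EmittedInst &I : LongBranchSeq)
        Total += I.SizeInBytes;
      return Total;
    }

    // Matches "return 4 + 8 + 4 + 4;" in insertIndirectBranch.
    static_assert(seqSizeInBytes() == 20, "long-branch expansion is 20 bytes");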