| author | Matt Arsenault <Matthew.Arsenault@amd.com> | 2016-10-06 16:20:41 +0000 |
|---|---|---|
| committer | Matt Arsenault <Matthew.Arsenault@amd.com> | 2016-10-06 16:20:41 +0000 |
| commit | 6bc43d8627ca44465e7ce261a0828b70d3460e13 (patch) | |
| tree | 255944724c060d2eef50f0ccf04df7546aca245c /llvm/lib | |
| parent | d391d6f1c32d7316cb7fa8cfa4e039f94133ccbe (diff) | |
BranchRelaxation: Support expanding unconditional branches
AMDGPU needs to expand unconditional branches in a new
block with an indirect branch.
llvm-svn: 283464
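
In short: when the signed 16-bit `simm16` offset of an `s_branch` cannot reach its destination, the relaxation moves the branch into its own block and the new `SIInstrInfo::insertIndirectBranch` hook materializes the target address PC-relatively and jumps through `s_setpc_b64`. A rough sketch of the sequence for a forward branch (the `s[8:9]` register pair and the labels are illustrative; the actual pair is picked by the register scavenger):

```asm
; before relaxation: the target must fit in a simm16 dword offset
  s_branch dest_bb

; after relaxation (sketch): a 64-bit PC-relative jump in a dedicated block
long_branch_bb:
  s_getpc_b64 s[8:9]                              ; s[8:9] = address of the next instruction
  s_add_u32   s8, s8, dest_bb-(long_branch_bb+4)  ; TF_LONG_BRANCH_FORWARD expression
  s_addc_u32  s9, s9, 0
  s_setpc_b64 s[8:9]                              ; indirect branch to dest_bb
```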
Diffstat (limited to 'llvm/lib')
| mode | path | lines changed |
|---|---|---|
| -rw-r--r-- | llvm/lib/CodeGen/BranchRelaxation.cpp | 89 |
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 15 |
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h | 3 |
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp | 35 |
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h | 6 |
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 1 |
| -rw-r--r-- | llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp | 15 |
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 187 |
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIInstrInfo.h | 24 |
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SOPInstructions.td | 2 |
10 files changed, 355 insertions, 22 deletions
diff --git a/llvm/lib/CodeGen/BranchRelaxation.cpp b/llvm/lib/CodeGen/BranchRelaxation.cpp
index 4f0dfaf874f..1d76831d04a 100644
--- a/llvm/lib/CodeGen/BranchRelaxation.cpp
+++ b/llvm/lib/CodeGen/BranchRelaxation.cpp
@@ -11,6 +11,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 #include "llvm/Support/Debug.h"
@@ -23,6 +24,7 @@ using namespace llvm;
 
 STATISTIC(NumSplit, "Number of basic blocks split");
 STATISTIC(NumConditionalRelaxed, "Number of conditional branches relaxed");
+STATISTIC(NumUnconditionalRelaxed, "Number of unconditional branches relaxed");
 
 #define BRANCH_RELAX_NAME "Branch relaxation pass"
 
@@ -66,17 +68,22 @@ class BranchRelaxation : public MachineFunctionPass {
   };
 
   SmallVector<BasicBlockInfo, 16> BlockInfo;
+  std::unique_ptr<RegScavenger> RS;
 
   MachineFunction *MF;
   const TargetInstrInfo *TII;
 
   bool relaxBranchInstructions();
   void scanFunction();
+
+  MachineBasicBlock *createNewBlockAfter(MachineBasicBlock &BB);
+
   MachineBasicBlock *splitBlockBeforeInstr(MachineInstr &MI);
   void adjustBlockOffsets(MachineBasicBlock &MBB);
   bool isBlockInRange(const MachineInstr &MI, const MachineBasicBlock &BB) const;
 
   bool fixupConditionalBranch(MachineInstr &MI);
+  bool fixupUnconditionalBranch(MachineInstr &MI);
   uint64_t computeBlockSize(const MachineBasicBlock &MBB) const;
   unsigned getInstrOffset(const MachineInstr &MI) const;
   void dumpBBs();
@@ -182,6 +189,19 @@ void BranchRelaxation::adjustBlockOffsets(MachineBasicBlock &Start) {
   }
 }
 
+/// Insert a new empty basic block and insert it after \BB
+MachineBasicBlock *BranchRelaxation::createNewBlockAfter(MachineBasicBlock &BB) {
+  // Create a new MBB for the code after the OrigBB.
+  MachineBasicBlock *NewBB =
+    MF->CreateMachineBasicBlock(BB.getBasicBlock());
+  MF->insert(++BB.getIterator(), NewBB);
+
+  // Insert an entry into BlockInfo to align it properly with the block numbers.
+  BlockInfo.insert(BlockInfo.begin() + NewBB->getNumber(), BasicBlockInfo());
+
+  return NewBB;
+}
+
 /// Split the basic block containing MI into two blocks, which are joined by
 /// an unconditional branch. Update data structures and renumber blocks to
 /// account for this change and returns the newly created block.
@@ -333,16 +353,55 @@ bool BranchRelaxation::fixupConditionalBranch(MachineInstr &MI) {
   return true;
 }
 
+bool BranchRelaxation::fixupUnconditionalBranch(MachineInstr &MI) {
+  MachineBasicBlock *MBB = MI.getParent();
+
+  unsigned OldBrSize = TII->getInstSizeInBytes(MI);
+  MachineBasicBlock *DestBB = TII->getBranchDestBlock(MI);
+
+  int64_t DestOffset = BlockInfo[DestBB->getNumber()].Offset;
+  int64_t SrcOffset = getInstrOffset(MI);
+
+  assert(!TII->isBranchOffsetInRange(MI.getOpcode(), DestOffset - SrcOffset));
+
+  BlockInfo[MBB->getNumber()].Size -= OldBrSize;
+
+  MachineBasicBlock *BranchBB = MBB;
+
+  // If this was an expanded conditional branch, there is already a single
+  // unconditional branch in a block.
+  if (!MBB->empty()) {
+    BranchBB = createNewBlockAfter(*MBB);
+
+    // Add live outs.
+    for (const MachineBasicBlock *Succ : MBB->successors()) {
+      for (const MachineBasicBlock::RegisterMaskPair &LiveIn : Succ->liveins())
+        BranchBB->addLiveIn(LiveIn);
+    }
+
+    BranchBB->addSuccessor(DestBB);
+    MBB->replaceSuccessor(DestBB, BranchBB);
+  }
+
+  DebugLoc DL = MI.getDebugLoc();
+  MI.eraseFromParent();
+
+  // insertUnconditionalBranch may have inserted a new block.
+  BlockInfo[MBB->getNumber()].Size += TII->insertIndirectBranch(
+    *BranchBB, *DestBB, DL, DestOffset - SrcOffset, RS.get());
+
+  computeBlockSize(*BranchBB);
+  adjustBlockOffsets(*MBB);
+  return true;
+}
+
 bool BranchRelaxation::relaxBranchInstructions() {
   bool Changed = false;
+
   // Relaxing branches involves creating new basic blocks, so re-eval
   // end() for termination.
   for (MachineFunction::iterator I = MF->begin(); I != MF->end(); ++I) {
     MachineBasicBlock &MBB = *I;
 
-    MachineBasicBlock::iterator J = MBB.getFirstTerminator();
-    if (J == MBB.end())
-      continue;
-
     MachineBasicBlock::iterator Next;
     for (MachineBasicBlock::iterator J = MBB.getFirstTerminator();
@@ -377,6 +436,21 @@ bool BranchRelaxation::relaxBranchInstructions() {
         Next = MBB.getFirstTerminator();
       }
     }
+
+    if (MI.isUnconditionalBranch()) {
+      // Unconditional branch destination might be unanalyzable, assume these
+      // are OK.
+      if (MachineBasicBlock *DestBB = TII->getBranchDestBlock(MI)) {
+        if (!isBlockInRange(MI, *DestBB)) {
+          fixupUnconditionalBranch(MI);
+          ++NumUnconditionalRelaxed;
+          Changed = true;
+        }
+      }
+
+      // Unconditional branch is the last terminator.
+      break;
+    }
   }
 }
@@ -388,7 +462,12 @@ bool BranchRelaxation::runOnMachineFunction(MachineFunction &mf) {
 
   DEBUG(dbgs() << "***** BranchRelaxation *****\n");
 
-  TII = MF->getSubtarget().getInstrInfo();
+  const TargetSubtargetInfo &ST = MF->getSubtarget();
+  TII = ST.getInstrInfo();
+
+  const TargetRegisterInfo *TRI = ST.getRegisterInfo();
+  if (TRI->trackLivenessAfterRegAlloc(*MF))
+    RS.reset(new RegScavenger());
 
   // Renumber all of the machine basic blocks in the function, guaranteeing that
   // the numbers agree with the position of the block in the function.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index ef7321402da..ef20047377f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -120,6 +120,21 @@ void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) {
   emitStartOfRuntimeMetadata(M);
 }
 
+bool AMDGPUAsmPrinter::isBlockOnlyReachableByFallthrough(
+  const MachineBasicBlock *MBB) const {
+  if (!AsmPrinter::isBlockOnlyReachableByFallthrough(MBB))
+    return false;
+
+  if (MBB->empty())
+    return true;
+
+  // If this is a block implementing a long branch, an expression relative to
+  // the start of the block is needed.
+  // XXX - Is there a smarter way to check this?
+  return (MBB->back().getOpcode() != AMDGPU::S_SETPC_B64);
+}
+
+
 void AMDGPUAsmPrinter::EmitFunctionBodyStart() {
   const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>();
   SIProgramInfo KernelInfo;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
index f4e62ca0df2..3964d43a738 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@@ -131,6 +131,9 @@ public:
 
   void EmitStartOfAsmFile(Module &M) override;
 
+  bool isBlockOnlyReachableByFallthrough(
+    const MachineBasicBlock *MBB) const override;
+
   bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
                        unsigned AsmVariant, const char *ExtraCode,
                        raw_ostream &O) override;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
index 757a9fc1835..5697a5f09ca 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -47,6 +47,27 @@ static MCSymbolRefExpr::VariantKind getVariantKind(unsigned MOFlags) {
   }
 }
 
+const MCExpr *AMDGPUMCInstLower::getLongBranchBlockExpr(
+  const MachineBasicBlock &SrcBB,
+  const MachineOperand &MO) const {
+  const MCExpr *DestBBSym
+    = MCSymbolRefExpr::create(MO.getMBB()->getSymbol(), Ctx);
+  const MCExpr *SrcBBSym = MCSymbolRefExpr::create(SrcBB.getSymbol(), Ctx);
+
+  assert(SrcBB.front().getOpcode() == AMDGPU::S_GETPC_B64 &&
+         ST.getInstrInfo()->get(AMDGPU::S_GETPC_B64).Size == 4);
+
+  // s_getpc_b64 returns the address of next instruction.
+  const MCConstantExpr *One = MCConstantExpr::create(4, Ctx);
+  SrcBBSym = MCBinaryExpr::createAdd(SrcBBSym, One, Ctx);
+
+  if (MO.getTargetFlags() == AMDGPU::TF_LONG_BRANCH_FORWARD)
+    return MCBinaryExpr::createSub(DestBBSym, SrcBBSym, Ctx);
+
+  assert(MO.getTargetFlags() == AMDGPU::TF_LONG_BRANCH_BACKWARD);
+  return MCBinaryExpr::createSub(SrcBBSym, DestBBSym, Ctx);
+}
+
 void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
 
   int MCOpcode = ST.getInstrInfo()->pseudoToMCOpcode(MI->getOpcode());
@@ -71,8 +92,14 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
       MCOp = MCOperand::createReg(AMDGPU::getMCReg(MO.getReg(), ST));
       break;
     case MachineOperand::MO_MachineBasicBlock:
-      MCOp = MCOperand::createExpr(MCSymbolRefExpr::create(
-                                   MO.getMBB()->getSymbol(), Ctx));
+      if (MO.getTargetFlags() != 0) {
+        MCOp = MCOperand::createExpr(
+          getLongBranchBlockExpr(*MI->getParent(), MO));
+      } else {
+        MCOp = MCOperand::createExpr(
+          MCSymbolRefExpr::create(MO.getMBB()->getSymbol(), Ctx));
+      }
+
       break;
     case MachineOperand::MO_GlobalAddress: {
       const GlobalValue *GV = MO.getGlobal();
@@ -93,6 +120,10 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
       MCOp = MCOperand::createExpr(Expr);
       break;
     }
+    case MachineOperand::MO_MCSymbol:
+      MCOp = MCOperand::createExpr(
+        MCSymbolRefExpr::create(MO.getMCSymbol(), Ctx));
+      break;
     }
     OutMI.addOperand(MCOp);
   }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h
index 2ea58e496cb..8cedda6a706 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h
@@ -14,8 +14,11 @@ namespace llvm {
 
 class AMDGPUSubtarget;
 class AsmPrinter;
+class MachineBasicBlock;
 class MachineInstr;
+class MachineOperand;
 class MCContext;
+class MCExpr;
 class MCInst;
 
 class AMDGPUMCInstLower {
@@ -23,6 +26,9 @@ class AMDGPUMCInstLower {
   const AMDGPUSubtarget &ST;
   const AsmPrinter &AP;
 
+  const MCExpr *getLongBranchBlockExpr(const MachineBasicBlock &SrcBB,
+                                       const MachineOperand &MO) const;
+
 public:
   AMDGPUMCInstLower(MCContext &ctx, const AMDGPUSubtarget &ST,
                     const AsmPrinter &AP);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index a6232ad8128..bc436d7ae42 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -602,6 +602,7 @@ void GCNPassConfig::addPreEmitPass() {
   addPass(createSIShrinkInstructionsPass());
   addPass(&SIInsertSkipsPassID);
   addPass(createSIDebuggerInsertNopsPass());
+  addPass(&BranchRelaxationPassID);
 }
 
 TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
index 887b073acd5..51c30d35508 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
@@ -38,12 +38,15 @@ unsigned AMDGPUELFObjectWriter::getRelocType(MCContext &Ctx,
                                              const MCValue &Target,
                                              const MCFixup &Fixup,
                                              bool IsPCRel) const {
-  // SCRATCH_RSRC_DWORD[01] is a special global variable that represents
-  // the scratch buffer.
-  if (Target.getSymA()->getSymbol().getName() == "SCRATCH_RSRC_DWORD0")
-    return ELF::R_AMDGPU_ABS32_LO;
-  if (Target.getSymA()->getSymbol().getName() == "SCRATCH_RSRC_DWORD1")
-    return ELF::R_AMDGPU_ABS32_HI;
+  if (const auto *SymA = Target.getSymA()) {
+    // SCRATCH_RSRC_DWORD[01] is a special global variable that represents
+    // the scratch buffer.
+    if (SymA->getSymbol().getName() == "SCRATCH_RSRC_DWORD0")
+      return ELF::R_AMDGPU_ABS32_LO;
+
+    if (SymA->getSymbol().getName() == "SCRATCH_RSRC_DWORD1")
+      return ELF::R_AMDGPU_ABS32_HI;
+  }
 
   switch (Target.getAccessVariant()) {
   default:
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index da725dae47c..ce41d82bbbf 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -28,6 +28,13 @@
 
 using namespace llvm;
 
+// Must be at least 4 to be able to branch over minimum unconditional branch
+// code. This is only for making it possible to write reasonably small tests for
+// long branches.
+static cl::opt<unsigned>
+BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
+                 cl::desc("Restrict range of branch instructions (DEBUG)"));
+
 SIInstrInfo::SIInstrInfo(const SISubtarget &ST)
   : AMDGPUInstrInfo(ST), RI(), ST(ST) {}
 
@@ -1045,6 +1052,128 @@ bool SIInstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx0,
   return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
 }
 
+bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
+                                        int64_t BrOffset) const {
+  // BranchRelaxation should never have to check s_setpc_b64 because its dest
+  // block is unanalyzable.
+  assert(BranchOp != AMDGPU::S_SETPC_B64);
+
+  // Convert to dwords.
+  BrOffset /= 4;
+
+  // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
+  // from the next instruction.
+  BrOffset -= 1;
+
+  return isIntN(BranchOffsetBits, BrOffset);
+}
+
+MachineBasicBlock *SIInstrInfo::getBranchDestBlock(
+  const MachineInstr &MI) const {
+  if (MI.getOpcode() == AMDGPU::S_SETPC_B64) {
+    // This would be a difficult analysis to perform, but can always be legal so
+    // there's no need to analyze it.
+    return nullptr;
+  }
+
+  return MI.getOperand(0).getMBB();
+}
+
+unsigned SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
+                                           MachineBasicBlock &DestBB,
+                                           const DebugLoc &DL,
+                                           int64_t BrOffset,
+                                           RegScavenger *RS) const {
+  assert(RS && "RegScavenger required for long branching");
+  assert(MBB.empty() &&
+         "new block should be inserted for expanding unconditional branch");
+  assert(MBB.pred_size() == 1);
+
+  MachineFunction *MF = MBB.getParent();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+
+  // FIXME: Virtual register workaround for RegScavenger not working with empty
+  // blocks.
+  unsigned PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+
+  auto I = MBB.end();
+
+  // We need to compute the offset relative to the instruction immediately after
+  // s_getpc_b64. Insert pc arithmetic code before last terminator.
+  MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);
+
+  // TODO: Handle > 32-bit block address.
+  if (BrOffset >= 0) {
+    BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
+      .addReg(PCReg, RegState::Define, AMDGPU::sub0)
+      .addReg(PCReg, 0, AMDGPU::sub0)
+      .addMBB(&DestBB, AMDGPU::TF_LONG_BRANCH_FORWARD);
+    BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
+      .addReg(PCReg, RegState::Define, AMDGPU::sub1)
+      .addReg(PCReg, 0, AMDGPU::sub1)
+      .addImm(0);
+  } else {
+    // Backwards branch.
+    BuildMI(MBB, I, DL, get(AMDGPU::S_SUB_U32))
+      .addReg(PCReg, RegState::Define, AMDGPU::sub0)
+      .addReg(PCReg, 0, AMDGPU::sub0)
+      .addMBB(&DestBB, AMDGPU::TF_LONG_BRANCH_BACKWARD);
+    BuildMI(MBB, I, DL, get(AMDGPU::S_SUBB_U32))
+      .addReg(PCReg, RegState::Define, AMDGPU::sub1)
+      .addReg(PCReg, 0, AMDGPU::sub1)
+      .addImm(0);
+  }
+
+  // Insert the indirect branch after the other terminator.
+  BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
+    .addReg(PCReg);
+
+  // FIXME: If spilling is necessary, this will fail because this scavenger has
+  // no emergency stack slots. It is non-trivial to spill in this situation,
+  // because the restore code needs to be specially placed after the
+  // jump. BranchRelaxation then needs to be made aware of the newly inserted
+  // block.
+  //
+  // If a spill is needed for the pc register pair, we need to insert a spill
+  // restore block right before the destination block, and insert a short branch
+  // into the old destination block's fallthrough predecessor.
+  // e.g.:
+  //
+  // s_cbranch_scc0 skip_long_branch:
+  //
+  // long_branch_bb:
+  //   spill s[8:9]
+  //   s_getpc_b64 s[8:9]
+  //   s_add_u32 s8, s8, restore_bb
+  //   s_addc_u32 s9, s9, 0
+  //   s_setpc_b64 s[8:9]
+  //
+  // skip_long_branch:
+  //   foo;
+  //
+  // .....
+  //
+  // dest_bb_fallthrough_predecessor:
+  //   bar;
+  //   s_branch dest_bb
+  //
+  // restore_bb:
+  //   restore s[8:9]
+  //   fallthrough dest_bb
+  ///
+  // dest_bb:
+  //   buzz;
+
+  RS->enterBasicBlockEnd(MBB);
+  unsigned Scav = RS->scavengeRegister(&AMDGPU::SReg_64RegClass,
+                                       MachineBasicBlock::iterator(GetPC), 0);
+  MRI.replaceRegWith(PCReg, Scav);
+  MRI.clearVirtRegs();
+  RS->setRegUsed(Scav);
+
+  return 4 + 8 + 4 + 4;
+}
+
 unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
   switch (Cond) {
   case SIInstrInfo::SCC_TRUE:
@@ -1083,15 +1212,12 @@ SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
   }
 }
 
-bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
-                                MachineBasicBlock *&FBB,
-                                SmallVectorImpl<MachineOperand> &Cond,
-                                bool AllowModify) const {
-  MachineBasicBlock::iterator I = MBB.getFirstTerminator();
-
-  if (I == MBB.end())
-    return false;
-
+bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB,
+                                    MachineBasicBlock::iterator I,
+                                    MachineBasicBlock *&TBB,
+                                    MachineBasicBlock *&FBB,
+                                    SmallVectorImpl<MachineOperand> &Cond,
+                                    bool AllowModify) const {
   if (I->getOpcode() == AMDGPU::S_BRANCH) {
     // Unconditional Branch
     TBB = I->getOperand(0).getMBB();
@@ -1122,6 +1248,44 @@ bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
   return true;
 }
 
+bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+                                MachineBasicBlock *&FBB,
+                                SmallVectorImpl<MachineOperand> &Cond,
+                                bool AllowModify) const {
+  MachineBasicBlock::iterator I = MBB.getFirstTerminator();
+  if (I == MBB.end())
+    return false;
+
+  if (I->getOpcode() != AMDGPU::SI_MASK_BRANCH)
+    return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
+
+  ++I;
+
+  // TODO: Should be able to treat as fallthrough?
+  if (I == MBB.end())
+    return true;
+
+  if (analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify))
+    return true;
+
+  MachineBasicBlock *MaskBrDest = I->getOperand(0).getMBB();
+
+  // Specifically handle the case where the conditional branch is to the same
+  // destination as the mask branch. e.g.
+  //
+  // si_mask_branch BB8
+  // s_cbranch_execz BB8
+  // s_cbranch BB9
+  //
+  // This is required to understand divergent loops which may need the branches
+  // to be relaxed.
+  if (TBB != MaskBrDest || Cond.empty())
+    return true;
+
+  auto Pred = Cond[0].getImm();
+  return (Pred != EXECZ && Pred != EXECNZ);
+}
+
 unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB,
                                    int *BytesRemoved) const {
   MachineBasicBlock::iterator I = MBB.getFirstTerminator();
@@ -1130,6 +1294,11 @@ unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB,
   unsigned RemovedSize = 0;
   while (I != MBB.end()) {
     MachineBasicBlock::iterator Next = std::next(I);
+    if (I->getOpcode() == AMDGPU::SI_MASK_BRANCH) {
+      I = Next;
+      continue;
+    }
+
     RemovedSize += getInstSizeInBytes(*I);
     I->eraseFromParent();
     ++Count;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 11fe8c90396..b08801cd2c3 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -158,6 +158,24 @@ public:
   bool findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
                              unsigned &SrcOpIdx2) const override;
 
+  bool isBranchOffsetInRange(unsigned BranchOpc,
+                             int64_t BrOffset) const override;
+
+  MachineBasicBlock *getBranchDestBlock(const MachineInstr &MI) const override;
+
+  unsigned insertIndirectBranch(MachineBasicBlock &MBB,
+                                MachineBasicBlock &NewDestBB,
+                                const DebugLoc &DL,
+                                int64_t BrOffset,
+                                RegScavenger *RS = nullptr) const override;
+
+  bool analyzeBranchImpl(MachineBasicBlock &MBB,
+                         MachineBasicBlock::iterator I,
+                         MachineBasicBlock *&TBB,
+                         MachineBasicBlock *&FBB,
+                         SmallVectorImpl<MachineOperand> &Cond,
+                         bool AllowModify) const;
+
   bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
                      MachineBasicBlock *&FBB,
                      SmallVectorImpl<MachineOperand> &Cond,
@@ -618,6 +636,12 @@ namespace AMDGPU {
   const uint64_t RSRC_ELEMENT_SIZE_SHIFT = (32 + 19);
   const uint64_t RSRC_INDEX_STRIDE_SHIFT = (32 + 21);
   const uint64_t RSRC_TID_ENABLE = UINT64_C(1) << (32 + 23);
+
+  // For MachineOperands.
+  enum TargetFlags {
+    TF_LONG_BRANCH_FORWARD = 1 << 0,
+    TF_LONG_BRANCH_BACKWARD = 1 << 1
+  };
 } // End namespace AMDGPU
 
 namespace SI {
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index d31002b999a..404ee4260aa 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -25,6 +25,7 @@ class SOP1_Pseudo <string opName, dag outs, dag ins,
   let SALU = 1;
   let SOP1 = 1;
   let SchedRW = [WriteSALU];
+  let Size = 4;
   let UseNamedOperandTable = 1;
 
   string Mnemonic = opName;
@@ -41,6 +42,7 @@ class SOP1_Real<bits<8> op, SOP1_Pseudo ps> :
 
   let isPseudo = 0;
   let isCodeGenOnly = 0;
+  let Size = 4;
 
   // copy relevant pseudo op flags
   let SubtargetPredicate = ps.SubtargetPredicate;

