| author | Matt Arsenault <Matthew.Arsenault@amd.com> | 2016-10-06 16:20:41 +0000 |
|---|---|---|
| committer | Matt Arsenault <Matthew.Arsenault@amd.com> | 2016-10-06 16:20:41 +0000 |
| commit | 6bc43d8627ca44465e7ce261a0828b70d3460e13 (patch) | |
| tree | 255944724c060d2eef50f0ccf04df7546aca245c /llvm/lib | |
| parent | d391d6f1c32d7316cb7fa8cfa4e039f94133ccbe (diff) | |
BranchRelaxation: Support expanding unconditional branches
AMDGPU needs to expand unconditional branches in a new
block with an indirect branch.
llvm-svn: 283464
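
In short: when the signed 16-bit `simm16` offset of an `s_branch` cannot reach its destination, the relaxation moves the branch into its own block and the new `SIInstrInfo::insertIndirectBranch` hook materializes the target address PC-relatively and jumps through `s_setpc_b64`. A rough sketch of the sequence for a forward branch (the `s[8:9]` register pair and the labels are illustrative; the actual pair is picked by the register scavenger):

```asm
; before relaxation: the target must fit in a simm16 dword offset
  s_branch dest_bb

; after relaxation (sketch): a 64-bit PC-relative jump in a dedicated block
long_branch_bb:
  s_getpc_b64 s[8:9]                              ; s[8:9] = address of the next instruction
  s_add_u32   s8, s8, dest_bb-(long_branch_bb+4)  ; TF_LONG_BRANCH_FORWARD expression
  s_addc_u32  s9, s9, 0
  s_setpc_b64 s[8:9]                              ; indirect branch to dest_bb
```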
Diffstat (limited to 'llvm/lib')
| mode | path | lines changed |
|---|---|---|
| -rw-r--r-- | llvm/lib/CodeGen/BranchRelaxation.cpp | 89 |
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 15 |
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h | 3 |
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp | 35 |
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h | 6 |
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 1 |
| -rw-r--r-- | llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp | 15 |
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 187 |
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIInstrInfo.h | 24 |
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SOPInstructions.td | 2 |
10 files changed, 355 insertions, 22 deletions
diff --git a/llvm/lib/CodeGen/BranchRelaxation.cpp b/llvm/lib/CodeGen/BranchRelaxation.cpp
index 4f0dfaf874f..1d76831d04a 100644
--- a/llvm/lib/CodeGen/BranchRelaxation.cpp
+++ b/llvm/lib/CodeGen/BranchRelaxation.cpp
@@ -11,6 +11,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 #include "llvm/Support/Debug.h"
@@ -23,6 +24,7 @@ using namespace llvm;
 
 STATISTIC(NumSplit, "Number of basic blocks split");
 STATISTIC(NumConditionalRelaxed, "Number of conditional branches relaxed");
+STATISTIC(NumUnconditionalRelaxed, "Number of unconditional branches relaxed");
 
 #define BRANCH_RELAX_NAME "Branch relaxation pass"
 
@@ -66,17 +68,22 @@ class BranchRelaxation : public MachineFunctionPass {
   };
 
   SmallVector<BasicBlockInfo, 16> BlockInfo;
+  std::unique_ptr<RegScavenger> RS;
 
   MachineFunction *MF;
   const TargetInstrInfo *TII;
 
   bool relaxBranchInstructions();
   void scanFunction();
+
+  MachineBasicBlock *createNewBlockAfter(MachineBasicBlock &BB);
+
   MachineBasicBlock *splitBlockBeforeInstr(MachineInstr &MI);
   void adjustBlockOffsets(MachineBasicBlock &MBB);
   bool isBlockInRange(const MachineInstr &MI, const MachineBasicBlock &BB) const;
 
   bool fixupConditionalBranch(MachineInstr &MI);
+  bool fixupUnconditionalBranch(MachineInstr &MI);
   uint64_t computeBlockSize(const MachineBasicBlock &MBB) const;
   unsigned getInstrOffset(const MachineInstr &MI) const;
   void dumpBBs();
@@ -182,6 +189,19 @@ void BranchRelaxation::adjustBlockOffsets(MachineBasicBlock &Start) {
   }
 }
 
+/// Insert a new empty basic block and insert it after \BB
+MachineBasicBlock *BranchRelaxation::createNewBlockAfter(MachineBasicBlock &BB) {
+  // Create a new MBB for the code after the OrigBB.
+  MachineBasicBlock *NewBB =
+    MF->CreateMachineBasicBlock(BB.getBasicBlock());
+  MF->insert(++BB.getIterator(), NewBB);
+
+  // Insert an entry into BlockInfo to align it properly with the block numbers.
+  BlockInfo.insert(BlockInfo.begin() + NewBB->getNumber(), BasicBlockInfo());
+
+  return NewBB;
+}
+
 /// Split the basic block containing MI into two blocks, which are joined by
 /// an unconditional branch. Update data structures and renumber blocks to
 /// account for this change and returns the newly created block.
@@ -333,16 +353,55 @@ bool BranchRelaxation::fixupConditionalBranch(MachineInstr &MI) {
   return true;
 }
 
+bool BranchRelaxation::fixupUnconditionalBranch(MachineInstr &MI) {
+  MachineBasicBlock *MBB = MI.getParent();
+
+  unsigned OldBrSize = TII->getInstSizeInBytes(MI);
+  MachineBasicBlock *DestBB = TII->getBranchDestBlock(MI);
+
+  int64_t DestOffset = BlockInfo[DestBB->getNumber()].Offset;
+  int64_t SrcOffset = getInstrOffset(MI);
+
+  assert(!TII->isBranchOffsetInRange(MI.getOpcode(), DestOffset - SrcOffset));
+
+  BlockInfo[MBB->getNumber()].Size -= OldBrSize;
+
+  MachineBasicBlock *BranchBB = MBB;
+
+  // If this was an expanded conditional branch, there is already a single
+  // unconditional branch in a block.
+  if (!MBB->empty()) {
+    BranchBB = createNewBlockAfter(*MBB);
+
+    // Add live outs.
+    for (const MachineBasicBlock *Succ : MBB->successors()) {
+      for (const MachineBasicBlock::RegisterMaskPair &LiveIn : Succ->liveins())
+        BranchBB->addLiveIn(LiveIn);
+    }
+
+    BranchBB->addSuccessor(DestBB);
+    MBB->replaceSuccessor(DestBB, BranchBB);
+  }
+
+  DebugLoc DL = MI.getDebugLoc();
+  MI.eraseFromParent();
+
+  // insertUnconditionalBranch may have inserted a new block.
+  BlockInfo[MBB->getNumber()].Size += TII->insertIndirectBranch(
+    *BranchBB, *DestBB, DL, DestOffset - SrcOffset, RS.get());
+
+  computeBlockSize(*BranchBB);
+  adjustBlockOffsets(*MBB);
+  return true;
+}
+
 bool BranchRelaxation::relaxBranchInstructions() {
   bool Changed = false;
+
   // Relaxing branches involves creating new basic blocks, so re-eval
   // end() for termination.
   for (MachineFunction::iterator I = MF->begin(); I != MF->end(); ++I) {
     MachineBasicBlock &MBB = *I;
 
-    MachineBasicBlock::iterator J = MBB.getFirstTerminator();
-    if (J == MBB.end())
-      continue;
-
     MachineBasicBlock::iterator Next;
     for (MachineBasicBlock::iterator J = MBB.getFirstTerminator();
@@ -377,6 +436,21 @@ bool BranchRelaxation::relaxBranchInstructions() {
         Next = MBB.getFirstTerminator();
       }
     }
+
+    if (MI.isUnconditionalBranch()) {
+      // Unconditional branch destination might be unanalyzable, assume these
+      // are OK.
+      if (MachineBasicBlock *DestBB = TII->getBranchDestBlock(MI)) {
+        if (!isBlockInRange(MI, *DestBB)) {
+          fixupUnconditionalBranch(MI);
+          ++NumUnconditionalRelaxed;
+          Changed = true;
+        }
+      }
+
+      // Unconditional branch is the last terminator.
+      break;
+    }
   }
 }
@@ -388,7 +462,12 @@ bool BranchRelaxation::runOnMachineFunction(MachineFunction &mf) {
 
   DEBUG(dbgs() << "***** BranchRelaxation *****\n");
 
-  TII = MF->getSubtarget().getInstrInfo();
+  const TargetSubtargetInfo &ST = MF->getSubtarget();
+  TII = ST.getInstrInfo();
+
+  const TargetRegisterInfo *TRI = ST.getRegisterInfo();
+  if (TRI->trackLivenessAfterRegAlloc(*MF))
+    RS.reset(new RegScavenger());
 
   // Renumber all of the machine basic blocks in the function, guaranteeing that
   // the numbers agree with the position of the block in the function.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index ef7321402da..ef20047377f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -120,6 +120,21 @@ void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) {
   emitStartOfRuntimeMetadata(M);
 }
 
+bool AMDGPUAsmPrinter::isBlockOnlyReachableByFallthrough(
+  const MachineBasicBlock *MBB) const {
+  if (!AsmPrinter::isBlockOnlyReachableByFallthrough(MBB))
+    return false;
+
+  if (MBB->empty())
+    return true;
+
+  // If this is a block implementing a long branch, an expression relative to
+  // the start of the block is needed.
+  // XXX - Is there a smarter way to check this?
+  return (MBB->back().getOpcode() != AMDGPU::S_SETPC_B64);
+}
+
+
 void AMDGPUAsmPrinter::EmitFunctionBodyStart() {
   const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>();
   SIProgramInfo KernelInfo;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
index f4e62ca0df2..3964d43a738 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@@ -131,6 +131,9 @@ public:
 
   void EmitStartOfAsmFile(Module &M) override;
 
+  bool isBlockOnlyReachableByFallthrough(
+    const MachineBasicBlock *MBB) const override;
+
   bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
                        unsigned AsmVariant, const char *ExtraCode,
                        raw_ostream &O) override;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
index 757a9fc1835..5697a5f09ca 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -47,6 +47,27 @@ static MCSymbolRefExpr::VariantKind getVariantKind(unsigned MOFlags) {
   }
 }
 
+const MCExpr *AMDGPUMCInstLower::getLongBranchBlockExpr(
+  const MachineBasicBlock &SrcBB,
+  const MachineOperand &MO) const {
+  const MCExpr *DestBBSym
+    = MCSymbolRefExpr::create(MO.getMBB()->getSymbol(), Ctx);
+  const MCExpr *SrcBBSym = MCSymbolRefExpr::create(SrcBB.getSymbol(), Ctx);
+
+  assert(SrcBB.front().getOpcode() == AMDGPU::S_GETPC_B64 &&
+         ST.getInstrInfo()->get(AMDGPU::S_GETPC_B64).Size == 4);
+
+  // s_getpc_b64 returns the address of next instruction.
+  const MCConstantExpr *One = MCConstantExpr::create(4, Ctx);
+  SrcBBSym = MCBinaryExpr::createAdd(SrcBBSym, One, Ctx);
+
+  if (MO.getTargetFlags() == AMDGPU::TF_LONG_BRANCH_FORWARD)
+    return MCBinaryExpr::createSub(DestBBSym, SrcBBSym, Ctx);
+
+  assert(MO.getTargetFlags() == AMDGPU::TF_LONG_BRANCH_BACKWARD);
+  return MCBinaryExpr::createSub(SrcBBSym, DestBBSym, Ctx);
+}
+
 void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
 
   int MCOpcode = ST.getInstrInfo()->pseudoToMCOpcode(MI->getOpcode());
@@ -71,8 +92,14 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
       MCOp = MCOperand::createReg(AMDGPU::getMCReg(MO.getReg(), ST));
       break;
     case MachineOperand::MO_MachineBasicBlock:
-      MCOp = MCOperand::createExpr(MCSymbolRefExpr::create(
-                                   MO.getMBB()->getSymbol(), Ctx));
+      if (MO.getTargetFlags() != 0) {
+        MCOp = MCOperand::createExpr(
+          getLongBranchBlockExpr(*MI->getParent(), MO));
+      } else {
+        MCOp = MCOperand::createExpr(
+          MCSymbolRefExpr::create(MO.getMBB()->getSymbol(), Ctx));
+      }
+
       break;
     case MachineOperand::MO_GlobalAddress: {
       const GlobalValue *GV = MO.getGlobal();
@@ -93,6 +120,10 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
       MCOp = MCOperand::createExpr(Expr);
       break;
     }
+    case MachineOperand::MO_MCSymbol:
+      MCOp = MCOperand::createExpr(
+        MCSymbolRefExpr::create(MO.getMCSymbol(), Ctx));
+      break;
     }
     OutMI.addOperand(MCOp);
   }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h
index 2ea58e496cb..8cedda6a706 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h
@@ -14,8 +14,11 @@ namespace llvm {
 
 class AMDGPUSubtarget;
 class AsmPrinter;
+class MachineBasicBlock;
 class MachineInstr;
+class MachineOperand;
 class MCContext;
+class MCExpr;
 class MCInst;
 
 class AMDGPUMCInstLower {
@@ -23,6 +26,9 @@ class AMDGPUMCInstLower {
   const AMDGPUSubtarget &ST;
   const AsmPrinter &AP;
 
+  const MCExpr *getLongBranchBlockExpr(const MachineBasicBlock &SrcBB,
+                                       const MachineOperand &MO) const;
+
 public:
   AMDGPUMCInstLower(MCContext &ctx, const AMDGPUSubtarget &ST,
                     const AsmPrinter &AP);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index a6232ad8128..bc436d7ae42 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -602,6 +602,7 @@ void GCNPassConfig::addPreEmitPass() {
   addPass(createSIShrinkInstructionsPass());
   addPass(&SIInsertSkipsPassID);
   addPass(createSIDebuggerInsertNopsPass());
+  addPass(&BranchRelaxationPassID);
 }
 
 TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
index 887b073acd5..51c30d35508 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
@@ -38,12 +38,15 @@ unsigned AMDGPUELFObjectWriter::getRelocType(MCContext &Ctx,
                                              const MCValue &Target,
                                              const MCFixup &Fixup,
                                              bool IsPCRel) const {
-  // SCRATCH_RSRC_DWORD[01] is a special global variable that represents
-  // the scratch buffer.
-  if (Target.getSymA()->getSymbol().getName() == "SCRATCH_RSRC_DWORD0")
-    return ELF::R_AMDGPU_ABS32_LO;
-  if (Target.getSymA()->getSymbol().getName() == "SCRATCH_RSRC_DWORD1")
-    return ELF::R_AMDGPU_ABS32_HI;
+  if (const auto *SymA = Target.getSymA()) {
+    // SCRATCH_RSRC_DWORD[01] is a special global variable that represents
+    // the scratch buffer.
+    if (SymA->getSymbol().getName() == "SCRATCH_RSRC_DWORD0")
+      return ELF::R_AMDGPU_ABS32_LO;
+
+    if (SymA->getSymbol().getName() == "SCRATCH_RSRC_DWORD1")
+      return ELF::R_AMDGPU_ABS32_HI;
+  }
 
   switch (Target.getAccessVariant()) {
   default:
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index da725dae47c..ce41d82bbbf 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -28,6 +28,13 @@
 
 using namespace llvm;
 
+// Must be at least 4 to be able to branch over minimum unconditional branch
+// code. This is only for making it possible to write reasonably small tests for
+// long branches.
+static cl::opt<unsigned>
+BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
+                 cl::desc("Restrict range of branch instructions (DEBUG)"));
+
 SIInstrInfo::SIInstrInfo(const SISubtarget &ST)
   : AMDGPUInstrInfo(ST), RI(), ST(ST) {}
 
@@ -1045,6 +1052,128 @@ bool SIInstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx0,
   return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
 }
 
+bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
+                                        int64_t BrOffset) const {
+  // BranchRelaxation should never have to check s_setpc_b64 because its dest
+  // block is unanalyzable.
+  assert(BranchOp != AMDGPU::S_SETPC_B64);
+
+  // Convert to dwords.
+  BrOffset /= 4;
+
+  // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
+  // from the next instruction.
+  BrOffset -= 1;
+
+  return isIntN(BranchOffsetBits, BrOffset);
+}
+
+MachineBasicBlock *SIInstrInfo::getBranchDestBlock(
+  const MachineInstr &MI) const {
+  if (MI.getOpcode() == AMDGPU::S_SETPC_B64) {
+    // This would be a difficult analysis to perform, but can always be legal so
+    // there's no need to analyze it.
+    return nullptr;
+  }
+
+  return MI.getOperand(0).getMBB();
+}
+
+unsigned SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
+                                           MachineBasicBlock &DestBB,
+                                           const DebugLoc &DL,
+                                           int64_t BrOffset,
+                                           RegScavenger *RS) const {
+  assert(RS && "RegScavenger required for long branching");
+  assert(MBB.empty() &&
+         "new block should be inserted for expanding unconditional branch");
+  assert(MBB.pred_size() == 1);
+
+  MachineFunction *MF = MBB.getParent();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+
+  // FIXME: Virtual register workaround for RegScavenger not working with empty
+  // blocks.
+  unsigned PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+
+  auto I = MBB.end();
+
+  // We need to compute the offset relative to the instruction immediately after
+  // s_getpc_b64. Insert pc arithmetic code before last terminator.
+  MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);
+
+  // TODO: Handle > 32-bit block address.
+  if (BrOffset >= 0) {
+    BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
+      .addReg(PCReg, RegState::Define, AMDGPU::sub0)
+      .addReg(PCReg, 0, AMDGPU::sub0)
+      .addMBB(&DestBB, AMDGPU::TF_LONG_BRANCH_FORWARD);
+    BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
+      .addReg(PCReg, RegState::Define, AMDGPU::sub1)
+      .addReg(PCReg, 0, AMDGPU::sub1)
+      .addImm(0);
+  } else {
+    // Backwards branch.
+    BuildMI(MBB, I, DL, get(AMDGPU::S_SUB_U32))
+      .addReg(PCReg, RegState::Define, AMDGPU::sub0)
+      .addReg(PCReg, 0, AMDGPU::sub0)
+      .addMBB(&DestBB, AMDGPU::TF_LONG_BRANCH_BACKWARD);
+    BuildMI(MBB, I, DL, get(AMDGPU::S_SUBB_U32))
+      .addReg(PCReg, RegState::Define, AMDGPU::sub1)
+      .addReg(PCReg, 0, AMDGPU::sub1)
+      .addImm(0);
+  }
+
+  // Insert the indirect branch after the other terminator.
+  BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
+    .addReg(PCReg);
+
+  // FIXME: If spilling is necessary, this will fail because this scavenger has
+  // no emergency stack slots. It is non-trivial to spill in this situation,
+  // because the restore code needs to be specially placed after the
+  // jump. BranchRelaxation then needs to be made aware of the newly inserted
+  // block.
+  //
+  // If a spill is needed for the pc register pair, we need to insert a spill
+  // restore block right before the destination block, and insert a short branch
+  // into the old destination block's fallthrough predecessor.
+  // e.g.:
+  //
+  // s_cbranch_scc0 skip_long_branch:
+  //
+  // long_branch_bb:
+  //   spill s[8:9]
+  //   s_getpc_b64 s[8:9]
+  //   s_add_u32 s8, s8, restore_bb
+  //   s_addc_u32 s9, s9, 0
+  //   s_setpc_b64 s[8:9]
+  //
+  // skip_long_branch:
+  //   foo;
+  //
+  // .....
+  //
+  // dest_bb_fallthrough_predecessor:
+  //   bar;
+  //   s_branch dest_bb
+  //
+  // restore_bb:
+  //   restore s[8:9]
+  //   fallthrough dest_bb
+  ///
+  // dest_bb:
+  //   buzz;
+
+  RS->enterBasicBlockEnd(MBB);
+  unsigned Scav = RS->scavengeRegister(&AMDGPU::SReg_64RegClass,
+                                       MachineBasicBlock::iterator(GetPC), 0);
+  MRI.replaceRegWith(PCReg, Scav);
+  MRI.clearVirtRegs();
+  RS->setRegUsed(Scav);
+
+  return 4 + 8 + 4 + 4;
+}
+
 unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
   switch (Cond) {
   case SIInstrInfo::SCC_TRUE:
@@ -1083,15 +1212,12 @@ SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
   }
 }
 
-bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
-                                MachineBasicBlock *&FBB,
-                                SmallVectorImpl<MachineOperand> &Cond,
-                                bool AllowModify) const {
-  MachineBasicBlock::iterator I = MBB.getFirstTerminator();
-
-  if (I == MBB.end())
-    return false;
-
+bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB,
+                                    MachineBasicBlock::iterator I,
+                                    MachineBasicBlock *&TBB,
+                                    MachineBasicBlock *&FBB,
+                                    SmallVectorImpl<MachineOperand> &Cond,
+                                    bool AllowModify) const {
   if (I->getOpcode() == AMDGPU::S_BRANCH) {
     // Unconditional Branch
     TBB = I->getOperand(0).getMBB();
@@ -1122,6 +1248,44 @@ bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
   return true;
 }
 
+bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+                                MachineBasicBlock *&FBB,
+                                SmallVectorImpl<MachineOperand> &Cond,
+                                bool AllowModify) const {
+  MachineBasicBlock::iterator I = MBB.getFirstTerminator();
+  if (I == MBB.end())
+    return false;
+
+  if (I->getOpcode() != AMDGPU::SI_MASK_BRANCH)
+    return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
+
+  ++I;
+
+  // TODO: Should be able to treat as fallthrough?
+  if (I == MBB.end())
+    return true;
+
+  if (analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify))
+    return true;
+
+  MachineBasicBlock *MaskBrDest = I->getOperand(0).getMBB();
+
+  // Specifically handle the case where the conditional branch is to the same
+  // destination as the mask branch. e.g.
+  //
+  // si_mask_branch BB8
+  // s_cbranch_execz BB8
+  // s_cbranch BB9
+  //
+  // This is required to understand divergent loops which may need the branches
+  // to be relaxed.
+  if (TBB != MaskBrDest || Cond.empty())
+    return true;
+
+  auto Pred = Cond[0].getImm();
+  return (Pred != EXECZ && Pred != EXECNZ);
+}
+
 unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB,
                                    int *BytesRemoved) const {
   MachineBasicBlock::iterator I = MBB.getFirstTerminator();
@@ -1130,6 +1294,11 @@ unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB,
   unsigned RemovedSize = 0;
   while (I != MBB.end()) {
     MachineBasicBlock::iterator Next = std::next(I);
+    if (I->getOpcode() == AMDGPU::SI_MASK_BRANCH) {
+      I = Next;
+      continue;
+    }
+
     RemovedSize += getInstSizeInBytes(*I);
     I->eraseFromParent();
     ++Count;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 11fe8c90396..b08801cd2c3 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -158,6 +158,24 @@ public:
   bool findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
                              unsigned &SrcOpIdx2) const override;
 
+  bool isBranchOffsetInRange(unsigned BranchOpc,
+                             int64_t BrOffset) const override;
+
+  MachineBasicBlock *getBranchDestBlock(const MachineInstr &MI) const override;
+
+  unsigned insertIndirectBranch(MachineBasicBlock &MBB,
+                                MachineBasicBlock &NewDestBB,
+                                const DebugLoc &DL,
+                                int64_t BrOffset,
+                                RegScavenger *RS = nullptr) const override;
+
+  bool analyzeBranchImpl(MachineBasicBlock &MBB,
+                         MachineBasicBlock::iterator I,
+                         MachineBasicBlock *&TBB,
+                         MachineBasicBlock *&FBB,
+                         SmallVectorImpl<MachineOperand> &Cond,
+                         bool AllowModify) const;
+
   bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
                      MachineBasicBlock *&FBB,
                      SmallVectorImpl<MachineOperand> &Cond,
@@ -618,6 +636,12 @@ namespace AMDGPU {
   const uint64_t RSRC_ELEMENT_SIZE_SHIFT = (32 + 19);
   const uint64_t RSRC_INDEX_STRIDE_SHIFT = (32 + 21);
   const uint64_t RSRC_TID_ENABLE = UINT64_C(1) << (32 + 23);
+
+  // For MachineOperands.
+  enum TargetFlags {
+    TF_LONG_BRANCH_FORWARD = 1 << 0,
+    TF_LONG_BRANCH_BACKWARD = 1 << 1
+  };
 } // End namespace AMDGPU
 
 namespace SI {
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index d31002b999a..404ee4260aa 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -25,6 +25,7 @@ class SOP1_Pseudo <string opName, dag outs, dag ins,
   let SALU = 1;
   let SOP1 = 1;
   let SchedRW = [WriteSALU];
+  let Size = 4;
   let UseNamedOperandTable = 1;
 
   string Mnemonic = opName;
@@ -41,6 +42,7 @@ class SOP1_Real<bits<8> op, SOP1_Pseudo ps> :
 
   let isPseudo = 0;
   let isCodeGenOnly = 0;
+  let Size = 4;
 
   // copy relevant pseudo op flags
   let SubtargetPredicate = ps.SubtargetPredicate;

