author     Matt Arsenault <Matthew.Arsenault@amd.com>   2016-10-06 16:20:41 +0000
committer  Matt Arsenault <Matthew.Arsenault@amd.com>   2016-10-06 16:20:41 +0000
commit     6bc43d8627ca44465e7ce261a0828b70d3460e13
tree       255944724c060d2eef50f0ccf04df7546aca245c /llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
parent     d391d6f1c32d7316cb7fa8cfa4e039f94133ccbe
BranchRelaxation: Support expanding unconditional branches
AMDGPU needs to expand out-of-range unconditional branches
into a new block that reaches the destination with an indirect branch.
llvm-svn: 283464
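
The constraint behind the patch is the reach of the scalar branch encoding: the target is a signed 16-bit immediate counted in dwords, relative to the instruction after the branch (PC += signext(SIMM16 * 4) + 4). The sketch below restates the range check the patch adds as SIInstrInfo::isBranchOffsetInRange; the standalone helper name, default argument, and test values are illustrative only, not part of the patch.

    #include <cassert>
    #include <cstdint>

    // Standalone restatement of the range check added as
    // SIInstrInfo::isBranchOffsetInRange: the byte offset is converted to
    // dwords and adjusted by one instruction (the offset is measured from the
    // instruction after the branch) before testing the signed field width.
    static bool branchOffsetInRange(int64_t BrOffsetBytes, unsigned OffsetBits = 16) {
      int64_t Dwords = BrOffsetBytes / 4 - 1;         // relative to the next instruction
      int64_t Limit = int64_t(1) << (OffsetBits - 1); // signed N-bit range, like isIntN()
      return Dwords >= -Limit && Dwords < Limit;
    }

    int main() {
      assert(branchOffsetInRange(4));                       // branch to the next instruction
      assert(branchOffsetInRange((int64_t(1) << 15) * 4));  // just inside the forward reach
      assert(!branchOffsetInRange((int64_t(1) << 17) * 4)); // needs the long-branch expansion
      return 0;
    }

With the default 16-bit field this gives a reach of roughly +/-128 KiB; the new amdgpu-s-branch-bits option narrows that range artificially so the long-branch path can be exercised by reasonably small tests.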
Diffstat (limited to 'llvm/lib/Target/AMDGPU/SIInstrInfo.cpp')
-rw-r--r--   llvm/lib/Target/AMDGPU/SIInstrInfo.cpp   187
1 file changed, 178 insertions(+), 9 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index da725dae47c..ce41d82bbbf 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -28,6 +28,13 @@
 using namespace llvm;
 
+// Must be at least 4 to be able to branch over minimum unconditional branch
+// code. This is only for making it possible to write reasonably small tests for
+// long branches.
+static cl::opt<unsigned>
+BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
+                 cl::desc("Restrict range of branch instructions (DEBUG)"));
+
 SIInstrInfo::SIInstrInfo(const SISubtarget &ST)
   : AMDGPUInstrInfo(ST), RI(), ST(ST) {}
 
@@ -1045,6 +1052,128 @@ bool SIInstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx0,
   return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
 }
 
+bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
+                                        int64_t BrOffset) const {
+  // BranchRelaxation should never have to check s_setpc_b64 because its dest
+  // block is unanalyzable.
+  assert(BranchOp != AMDGPU::S_SETPC_B64);
+
+  // Convert to dwords.
+  BrOffset /= 4;
+
+  // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
+  // from the next instruction.
+  BrOffset -= 1;
+
+  return isIntN(BranchOffsetBits, BrOffset);
+}
+
+MachineBasicBlock *SIInstrInfo::getBranchDestBlock(
+  const MachineInstr &MI) const {
+  if (MI.getOpcode() == AMDGPU::S_SETPC_B64) {
+    // This would be a difficult analysis to perform, but can always be legal so
+    // there's no need to analyze it.
+    return nullptr;
+  }
+
+  return MI.getOperand(0).getMBB();
+}
+
+unsigned SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
+                                           MachineBasicBlock &DestBB,
+                                           const DebugLoc &DL,
+                                           int64_t BrOffset,
+                                           RegScavenger *RS) const {
+  assert(RS && "RegScavenger required for long branching");
+  assert(MBB.empty() &&
+         "new block should be inserted for expanding unconditional branch");
+  assert(MBB.pred_size() == 1);
+
+  MachineFunction *MF = MBB.getParent();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+
+  // FIXME: Virtual register workaround for RegScavenger not working with empty
+  // blocks.
+  unsigned PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+
+  auto I = MBB.end();
+
+  // We need to compute the offset relative to the instruction immediately after
+  // s_getpc_b64. Insert pc arithmetic code before last terminator.
+  MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);
+
+  // TODO: Handle > 32-bit block address.
+  if (BrOffset >= 0) {
+    BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
+      .addReg(PCReg, RegState::Define, AMDGPU::sub0)
+      .addReg(PCReg, 0, AMDGPU::sub0)
+      .addMBB(&DestBB, AMDGPU::TF_LONG_BRANCH_FORWARD);
+    BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
+      .addReg(PCReg, RegState::Define, AMDGPU::sub1)
+      .addReg(PCReg, 0, AMDGPU::sub1)
+      .addImm(0);
+  } else {
+    // Backwards branch.
+    BuildMI(MBB, I, DL, get(AMDGPU::S_SUB_U32))
+      .addReg(PCReg, RegState::Define, AMDGPU::sub0)
+      .addReg(PCReg, 0, AMDGPU::sub0)
+      .addMBB(&DestBB, AMDGPU::TF_LONG_BRANCH_BACKWARD);
+    BuildMI(MBB, I, DL, get(AMDGPU::S_SUBB_U32))
+      .addReg(PCReg, RegState::Define, AMDGPU::sub1)
+      .addReg(PCReg, 0, AMDGPU::sub1)
+      .addImm(0);
+  }
+
+  // Insert the indirect branch after the other terminator.
+  BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
+    .addReg(PCReg);
+
+  // FIXME: If spilling is necessary, this will fail because this scavenger has
+  // no emergency stack slots. It is non-trivial to spill in this situation,
+  // because the restore code needs to be specially placed after the
+  // jump. BranchRelaxation then needs to be made aware of the newly inserted
+  // block.
+  //
+  // If a spill is needed for the pc register pair, we need to insert a spill
+  // restore block right before the destination block, and insert a short branch
+  // into the old destination block's fallthrough predecessor.
+  // e.g.:
+  //
+  // s_cbranch_scc0 skip_long_branch:
+  //
+  // long_branch_bb:
+  //   spill s[8:9]
+  //   s_getpc_b64 s[8:9]
+  //   s_add_u32 s8, s8, restore_bb
+  //   s_addc_u32 s9, s9, 0
+  //   s_setpc_b64 s[8:9]
+  //
+  // skip_long_branch:
+  //   foo;
+  //
+  // .....
+  //
+  // dest_bb_fallthrough_predecessor:
+  //   bar;
+  //   s_branch dest_bb
+  //
+  // restore_bb:
+  //   restore s[8:9]
+  //   fallthrough dest_bb
+  //
+  // dest_bb:
+  //   buzz;
+
+  RS->enterBasicBlockEnd(MBB);
+  unsigned Scav = RS->scavengeRegister(&AMDGPU::SReg_64RegClass,
+                                       MachineBasicBlock::iterator(GetPC), 0);
+  MRI.replaceRegWith(PCReg, Scav);
+  MRI.clearVirtRegs();
+  RS->setRegUsed(Scav);
+
+  return 4 + 8 + 4 + 4;
+}
+
 unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
   switch (Cond) {
   case SIInstrInfo::SCC_TRUE:
@@ -1083,15 +1212,12 @@ SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
   }
 }
 
-bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
-                                MachineBasicBlock *&FBB,
-                                SmallVectorImpl<MachineOperand> &Cond,
-                                bool AllowModify) const {
-  MachineBasicBlock::iterator I = MBB.getFirstTerminator();
-
-  if (I == MBB.end())
-    return false;
-
+bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB,
+                                    MachineBasicBlock::iterator I,
+                                    MachineBasicBlock *&TBB,
+                                    MachineBasicBlock *&FBB,
+                                    SmallVectorImpl<MachineOperand> &Cond,
+                                    bool AllowModify) const {
   if (I->getOpcode() == AMDGPU::S_BRANCH) {
     // Unconditional Branch
     TBB = I->getOperand(0).getMBB();
@@ -1122,6 +1248,44 @@ bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
   return true;
 }
 
+bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+                                MachineBasicBlock *&FBB,
+                                SmallVectorImpl<MachineOperand> &Cond,
+                                bool AllowModify) const {
+  MachineBasicBlock::iterator I = MBB.getFirstTerminator();
+  if (I == MBB.end())
+    return false;
+
+  if (I->getOpcode() != AMDGPU::SI_MASK_BRANCH)
+    return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
+
+  ++I;
+
+  // TODO: Should be able to treat as fallthrough?
+  if (I == MBB.end())
+    return true;
+
+  if (analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify))
+    return true;
+
+  MachineBasicBlock *MaskBrDest = I->getOperand(0).getMBB();
+
+  // Specifically handle the case where the conditional branch is to the same
+  // destination as the mask branch. e.g.
+  //
+  // si_mask_branch BB8
+  // s_cbranch_execz BB8
+  // s_cbranch BB9
+  //
+  // This is required to understand divergent loops which may need the branches
+  // to be relaxed.
+  if (TBB != MaskBrDest || Cond.empty())
+    return true;
+
+  auto Pred = Cond[0].getImm();
+  return (Pred != EXECZ && Pred != EXECNZ);
+}
+
 unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB,
                                    int *BytesRemoved) const {
   MachineBasicBlock::iterator I = MBB.getFirstTerminator();
@@ -1130,6 +1294,11 @@ unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB,
   unsigned RemovedSize = 0;
   while (I != MBB.end()) {
     MachineBasicBlock::iterator Next = std::next(I);
+    if (I->getOpcode() == AMDGPU::SI_MASK_BRANCH) {
+      I = Next;
+      continue;
+    }
+
     RemovedSize += getInstSizeInBytes(*I);
     I->eraseFromParent();
     ++Count;
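
For readers skimming the diff, the value returned at the end of insertIndirectBranch ("return 4 + 8 + 4 + 4;") is the byte size of the sequence the function just emitted, which BranchRelaxation needs for its offset bookkeeping. The summary below restates that sequence and its sizes for the forward case (the backward case uses s_sub_u32/s_subb_u32 instead); the struct and names are illustrative only, not LLVM API.

    // Illustrative restatement of the long-branch sequence emitted by
    // insertIndirectBranch() above and of the size it reports to
    // BranchRelaxation. These names only summarize what the patch emits.
    struct EmittedInst {
      const char *Asm;      // instruction as it appears in the new block
      unsigned SizeInBytes; // encoded size
    };

    constexpr EmittedInst LongBranchSeq[] = {
      {"s_getpc_b64 s[N:N+1]",          4}, // PC of the instruction after s_getpc_b64
      {"s_add_u32   sN, sN, <dest_lo>", 8}, // 4-byte opcode + 4-byte literal (block address fixup)
      {"s_addc_u32  sN+1, sN+1, 0",     4}, // carry into the high half of the PC pair
      {"s_setpc_b64 s[N:N+1]",          4}, // indirect branch to the destination block
    };

    constexpr unsigned seqSizeInBytes() {
      unsigned Total = 0;
      for (const EmittedInst &I : LongBranchSeq)
        Total += I.SizeInBytes;
      return Total;
    }

    // Matches "return 4 + 8 + 4 + 4;" in insertIndirectBranch.
    static_assert(seqSizeInBytes() == 20, "long-branch expansion is 20 bytes");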