summary | refs | log | tree | commit | diff | stats
path: root/llvm/lib
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib')
-rw-r--r--  llvm/lib/CodeGen/CMakeLists.txt       |   1
-rw-r--r--  llvm/lib/CodeGen/MachineLoopUtils.cpp | 132
-rw-r--r--  llvm/lib/CodeGen/MachinePipeliner.cpp |   5
-rw-r--r--  llvm/lib/CodeGen/ModuloSchedule.cpp   | 262
4 files changed, 396 insertions, 4 deletions
diff --git a/llvm/lib/CodeGen/CMakeLists.txt b/llvm/lib/CodeGen/CMakeLists.txt
index 3cf0c60108e..50b469d6d93 100644
--- a/llvm/lib/CodeGen/CMakeLists.txt
+++ b/llvm/lib/CodeGen/CMakeLists.txt
@@ -80,6 +80,7 @@ add_llvm_library(LLVMCodeGen
MachineInstr.cpp
MachineLICM.cpp
MachineLoopInfo.cpp
+ MachineLoopUtils.cpp
MachineModuleInfo.cpp
MachineModuleInfoImpls.cpp
MachineOperand.cpp
diff --git a/llvm/lib/CodeGen/MachineLoopUtils.cpp b/llvm/lib/CodeGen/MachineLoopUtils.cpp
new file mode 100644
index 00000000000..e074b76082f
--- /dev/null
+++ b/llvm/lib/CodeGen/MachineLoopUtils.cpp
@@ -0,0 +1,132 @@
+//=- MachineLoopUtils.cpp - Functions for manipulating loops ----------------=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineLoopUtils.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+using namespace llvm;
+
+namespace {
+// MI's parent and BB are clones of each other. Find the equivalent copy of MI
+// in BB.
+MachineInstr &findEquivalentInstruction(MachineInstr &MI,
+ MachineBasicBlock *BB) {
+ MachineBasicBlock *PB = MI.getParent();
+ unsigned Offset = std::distance(PB->instr_begin(), MachineBasicBlock::instr_iterator(MI));
+ return *std::next(BB->instr_begin(), Offset);
+}
+} // namespace
+
+MachineBasicBlock *llvm::PeelSingleBlockLoop(LoopPeelDirection Direction,
+ MachineBasicBlock *Loop,
+ MachineRegisterInfo &MRI,
+ const TargetInstrInfo *TII) {
+ MachineFunction &MF = *Loop->getParent();
+ MachineBasicBlock *Preheader = *Loop->pred_begin();
+ if (Preheader == Loop)
+ Preheader = *std::next(Loop->pred_begin());
+ MachineBasicBlock *Exit = *Loop->succ_begin();
+ if (Exit == Loop)
+ Exit = *std::next(Loop->succ_begin());
+
+ MachineBasicBlock *NewBB = MF.CreateMachineBasicBlock(Loop->getBasicBlock());
+ if (Direction == LPD_Front)
+ MF.insert(Loop->getIterator(), NewBB);
+ else
+ MF.insert(std::next(Loop->getIterator()), NewBB);
+
+ // FIXME: Add DenseMapInfo trait for Register so we can use it as a key.
+ DenseMap<unsigned, Register> Remaps;
+ auto InsertPt = NewBB->end();
+ for (MachineInstr &MI : *Loop) {
+ MachineInstr *NewMI = MF.CloneMachineInstr(&MI);
+ NewBB->insert(InsertPt, NewMI);
+ for (MachineOperand &MO : NewMI->defs()) {
+ Register OrigR = MO.getReg();
+ if (OrigR.isPhysical())
+ continue;
+ Register &R = Remaps[OrigR];
+ R = MRI.createVirtualRegister(MRI.getRegClass(OrigR));
+ MO.setReg(R);
+
+ if (Direction == LPD_Back) {
+ // Replace all uses outside the original loop with the new register.
+ // FIXME: is the use_iterator stable enough to mutate register uses
+ // while iterating?
+ SmallVector<MachineOperand *, 4> Uses;
+ for (auto &Use : MRI.use_operands(OrigR))
+ if (Use.getParent()->getParent() != Loop)
+ Uses.push_back(&Use);
+ for (auto *Use : Uses) {
+ MRI.constrainRegClass(R, MRI.getRegClass(Use->getReg()));
+ Use->setReg(R);
+ }
+ }
+ }
+ }
+
+ for (auto I = NewBB->getFirstNonPHI(); I != NewBB->end(); ++I)
+ for (MachineOperand &MO : I->uses())
+ if (MO.isReg() && Remaps.count(MO.getReg()))
+ MO.setReg(Remaps[MO.getReg()]);
+
+ for (auto I = NewBB->begin(); I->isPHI(); ++I) {
+ MachineInstr &MI = *I;
+ unsigned LoopRegIdx = 3, InitRegIdx = 1;
+ if (MI.getOperand(2).getMBB() != Preheader)
+ std::swap(LoopRegIdx, InitRegIdx);
+ MachineInstr &OrigPhi = findEquivalentInstruction(MI, Loop);
+ assert(OrigPhi.isPHI());
+ if (Direction == LPD_Front) {
+ // When peeling front, we are only left with the initial value from the
+ // preheader.
+ Register R = MI.getOperand(LoopRegIdx).getReg();
+ if (Remaps.count(R))
+ R = Remaps[R];
+ OrigPhi.getOperand(InitRegIdx).setReg(R);
+ MI.RemoveOperand(LoopRegIdx + 1);
+ MI.RemoveOperand(LoopRegIdx + 0);
+ } else {
+ // When peeling back, the initial value is the loop-carried value from
+ // the original loop.
+ Register LoopReg = OrigPhi.getOperand(LoopRegIdx).getReg();
+ MI.getOperand(LoopRegIdx).setReg(LoopReg);
+ MI.RemoveOperand(InitRegIdx + 1);
+ MI.RemoveOperand(InitRegIdx + 0);
+ }
+ }
+
+ DebugLoc DL;
+ if (Direction == LPD_Front) {
+ Preheader->replaceSuccessor(Loop, NewBB);
+ NewBB->addSuccessor(Loop);
+ Loop->replacePhiUsesWith(Preheader, NewBB);
+ if (TII->removeBranch(*Preheader) > 0)
+ TII->insertBranch(*Preheader, NewBB, nullptr, {}, DL);
+ TII->removeBranch(*NewBB);
+ TII->insertBranch(*NewBB, Loop, nullptr, {}, DL);
+ } else {
+ Loop->replaceSuccessor(Exit, NewBB);
+ Exit->replacePhiUsesWith(Loop, NewBB);
+ NewBB->addSuccessor(Exit);
+
+ MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
+ SmallVector<MachineOperand, 4> Cond;
+ bool CanAnalyzeBr = !TII->analyzeBranch(*Loop, TBB, FBB, Cond);
+ (void)CanAnalyzeBr;
+ assert(CanAnalyzeBr && "Must be able to analyze the loop branch!");
+ TII->removeBranch(*Loop);
+ TII->insertBranch(*Loop, TBB == Exit ? NewBB : TBB,
+ FBB == Exit ? NewBB : FBB, Cond, DL);
+ if (TII->removeBranch(*NewBB) > 0)
+ TII->insertBranch(*NewBB, Exit, nullptr, {}, DL);
+ }
+
+ return NewBB;
+}
diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp
index 9591211fd9e..89c9f6093a9 100644
--- a/llvm/lib/CodeGen/MachinePipeliner.cpp
+++ b/llvm/lib/CodeGen/MachinePipeliner.cpp
@@ -557,10 +557,7 @@ void SwingSchedulerDAG::schedule() {
// The experimental code generator can't work if there are InstChanges.
if (ExperimentalCodeGen && NewInstrChanges.empty()) {
PeelingModuloScheduleExpander MSE(MF, MS, &LIS);
- // Experimental code generation isn't complete yet, but it can partially
- // validate the code it generates against the original
- // ModuloScheduleExpander.
- MSE.validateAgainstModuloScheduleExpander();
+ MSE.expand();
} else {
ModuloScheduleExpander MSE(MF, MS, LIS, std::move(NewInstrChanges));
MSE.expand();
diff --git a/llvm/lib/CodeGen/ModuloSchedule.cpp b/llvm/lib/CodeGen/ModuloSchedule.cpp
index a68153cf3b6..30aa81487c8 100644
--- a/llvm/lib/CodeGen/ModuloSchedule.cpp
+++ b/llvm/lib/CodeGen/ModuloSchedule.cpp
@@ -10,6 +10,7 @@
#include "llvm/ADT/StringExtras.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopUtils.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/MC/MCContext.h"
@@ -1564,6 +1565,266 @@ private:
};
} // namespace
+MachineBasicBlock *
+PeelingModuloScheduleExpander::peelKernel(LoopPeelDirection LPD) {
+ MachineBasicBlock *NewBB = PeelSingleBlockLoop(LPD, BB, MRI, TII);
+ if (LPD == LPD_Front)
+ PeeledFront.push_back(NewBB);
+ else
+ PeeledBack.push_front(NewBB);
+ for (auto I = BB->begin(), NI = NewBB->begin(); !I->isTerminator();
+ ++I, ++NI) {
+ CanonicalMIs[&*I] = &*I;
+ CanonicalMIs[&*NI] = &*I;
+ BlockMIs[{NewBB, &*I}] = &*NI;
+ BlockMIs[{BB, &*I}] = &*I;
+ }
+ return NewBB;
+}
+
+void PeelingModuloScheduleExpander::peelPrologAndEpilogs() {
+ BitVector LS(Schedule.getNumStages(), true);
+ BitVector AS(Schedule.getNumStages(), true);
+ LiveStages[BB] = LS;
+ AvailableStages[BB] = AS;
+
+ // Peel out the prologs.
+ LS.reset();
+ for (int I = 0; I < Schedule.getNumStages() - 1; ++I) {
+ LS[I] = 1;
+ Prologs.push_back(peelKernel(LPD_Front));
+ LiveStages[Prologs.back()] = LS;
+ AvailableStages[Prologs.back()] = LS;
+ }
+
+ // Create a block that will end up as the new loop exiting block (dominated by
+ // all prologs and epilogs). It will only contain PHIs, in the same order as
+ // BB's PHIs. This gives us a poor-man's LCSSA with the inductive property
+ // that the exiting block is a (sub) clone of BB. This in turn gives us the
+ // property that any value deffed in BB but used outside of BB is used by a
+ // PHI in the exiting block.
+ MachineBasicBlock *ExitingBB = CreateLCSSAExitingBlock();
+
+ // Push out the epilogs, again in reverse order.
+  // We can't assume anything about the minimum loop trip count at this point,
+ // so emit a fairly complex epilog:
+ // K[0, 1, 2] // Kernel runs stages 0, 1, 2
+ // E0[2] <- P1 // Epilog runs stage 2 only, so the state after is [0].
+ // E1[1, 2] <- P0 // Epilog 1 moves the last item from stage 0 to stage 2.
+ //
+ // This creates a single-successor single-predecessor sequence of blocks for
+ // each epilog, which are kept this way for simplicity at this stage and
+ // cleaned up by the optimizer later.
+ for (int I = 1; I <= Schedule.getNumStages() - 1; ++I) {
+ Epilogs.push_back(nullptr);
+ for (int J = Schedule.getNumStages() - 1; J >= I; --J) {
+ LS.reset();
+ LS[J] = 1;
+ Epilogs.back() = peelKernel(LPD_Back);
+ LiveStages[Epilogs.back()] = LS;
+ AvailableStages[Epilogs.back()] = AS;
+ }
+ }
+
+ // Now we've defined all the prolog and epilog blocks as a fallthrough
+ // sequence, add the edges that will be followed if the loop trip count is
+ // lower than the number of stages (connecting prologs directly with epilogs).
+ auto PI = Prologs.begin();
+ auto EI = Epilogs.begin();
+ assert(Prologs.size() == Epilogs.size());
+ for (; PI != Prologs.end(); ++PI, ++EI) {
+ MachineBasicBlock *Pred = *(*EI)->pred_begin();
+ (*PI)->addSuccessor(*EI);
+ for (MachineInstr &MI : (*EI)->phis()) {
+ Register Reg = MI.getOperand(1).getReg();
+ MachineInstr *Use = MRI.getUniqueVRegDef(Reg);
+ if (Use && Use->getParent() == Pred)
+ Reg = getEquivalentRegisterIn(Reg, *PI);
+ MI.addOperand(MachineOperand::CreateReg(Reg, /*isDef=*/false));
+ MI.addOperand(MachineOperand::CreateMBB(*PI));
+ }
+ }
+
+ // Create a list of all blocks in order.
+ SmallVector<MachineBasicBlock *, 8> Blocks;
+ llvm::copy(PeeledFront, std::back_inserter(Blocks));
+ Blocks.push_back(BB);
+ llvm::copy(PeeledBack, std::back_inserter(Blocks));
+
+ // Iterate in reverse order over all instructions, remapping as we go.
+ for (MachineBasicBlock *B : reverse(Blocks)) {
+ for (auto I = B->getFirstInstrTerminator()->getReverseIterator();
+ I != std::next(B->getFirstNonPHI()->getReverseIterator());) {
+ MachineInstr *MI = &*I++;
+ rewriteUsesOf(MI);
+ }
+ }
+ // Now all remapping has been done, we're free to optimize the generated code.
+ for (MachineBasicBlock *B : reverse(Blocks))
+ EliminateDeadPhis(B, MRI, LIS);
+ EliminateDeadPhis(ExitingBB, MRI, LIS);
+}
+
+MachineBasicBlock *PeelingModuloScheduleExpander::CreateLCSSAExitingBlock() {
+ MachineFunction &MF = *BB->getParent();
+ MachineBasicBlock *Exit = *BB->succ_begin();
+ if (Exit == BB)
+ Exit = *std::next(BB->succ_begin());
+
+ MachineBasicBlock *NewBB = MF.CreateMachineBasicBlock(BB->getBasicBlock());
+ MF.insert(std::next(BB->getIterator()), NewBB);
+
+ // Clone all phis in BB into NewBB and rewrite.
+ for (MachineInstr &MI : BB->phis()) {
+ auto RC = MRI.getRegClass(MI.getOperand(0).getReg());
+ Register OldR = MI.getOperand(3).getReg();
+ Register R = MRI.createVirtualRegister(RC);
+ SmallVector<MachineInstr *, 4> Uses;
+ for (MachineInstr &Use : MRI.use_instructions(OldR))
+ if (Use.getParent() != BB)
+ Uses.push_back(&Use);
+ for (MachineInstr *Use : Uses)
+ Use->substituteRegister(OldR, R, /*SubIdx=*/0,
+ *MRI.getTargetRegisterInfo());
+ MachineInstr *NI = BuildMI(NewBB, DebugLoc(), TII->get(TargetOpcode::PHI), R)
+ .addReg(OldR)
+ .addMBB(BB);
+ BlockMIs[{NewBB, &MI}] = NI;
+ CanonicalMIs[NI] = &MI;
+ }
+ BB->replaceSuccessor(Exit, NewBB);
+ Exit->replacePhiUsesWith(BB, NewBB);
+ NewBB->addSuccessor(Exit);
+
+ MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
+ SmallVector<MachineOperand, 4> Cond;
+ bool CanAnalyzeBr = !TII->analyzeBranch(*BB, TBB, FBB, Cond);
+ (void)CanAnalyzeBr;
+ assert(CanAnalyzeBr && "Must be able to analyze the loop branch!");
+ TII->removeBranch(*BB);
+ TII->insertBranch(*BB, TBB == Exit ? NewBB : TBB, FBB == Exit ? NewBB : FBB,
+ Cond, DebugLoc());
+ TII->insertUnconditionalBranch(*NewBB, Exit, DebugLoc());
+ return NewBB;
+}
+
+Register
+PeelingModuloScheduleExpander::getEquivalentRegisterIn(Register Reg,
+ MachineBasicBlock *BB) {
+ MachineInstr *MI = MRI.getUniqueVRegDef(Reg);
+ unsigned OpIdx = MI->findRegisterDefOperandIdx(Reg);
+ return BlockMIs[{BB, CanonicalMIs[MI]}]->getOperand(OpIdx).getReg();
+}
+
+void PeelingModuloScheduleExpander::rewriteUsesOf(MachineInstr *MI) {
+ if (MI->isPHI()) {
+ // This is an illegal PHI. The loop-carried (desired) value is operand 3,
+ // and it is produced by this block.
+ Register PhiR = MI->getOperand(0).getReg();
+ Register R = MI->getOperand(3).getReg();
+ int RMIStage = getStage(MRI.getUniqueVRegDef(R));
+ if (RMIStage != -1 && !AvailableStages[MI->getParent()].test(RMIStage))
+ R = MI->getOperand(1).getReg();
+ MRI.setRegClass(R, MRI.getRegClass(PhiR));
+ MRI.replaceRegWith(PhiR, R);
+ if (LIS)
+ LIS->RemoveMachineInstrFromMaps(*MI);
+ MI->eraseFromParent();
+ return;
+ }
+
+ int Stage = getStage(MI);
+ if (Stage == -1 || LiveStages.count(MI->getParent()) == 0 ||
+ LiveStages[MI->getParent()].test(Stage))
+ // Instruction is live, no rewriting to do.
+ return;
+
+ for (MachineOperand &DefMO : MI->defs()) {
+ SmallVector<std::pair<MachineInstr *, Register>, 4> Subs;
+ for (MachineInstr &UseMI : MRI.use_instructions(DefMO.getReg())) {
+ // Only PHIs can use values from this block by construction.
+ // Match with the equivalent PHI in B.
+ assert(UseMI.isPHI());
+ Register Reg = getEquivalentRegisterIn(UseMI.getOperand(0).getReg(),
+ MI->getParent());
+ Subs.emplace_back(&UseMI, Reg);
+ }
+ for (auto &Sub : Subs)
+ Sub.first->substituteRegister(DefMO.getReg(), Sub.second, /*SubIdx=*/0,
+ *MRI.getTargetRegisterInfo());
+ }
+ if (LIS)
+ LIS->RemoveMachineInstrFromMaps(*MI);
+ MI->eraseFromParent();
+}
+
+void PeelingModuloScheduleExpander::fixupBranches() {
+ std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo> Info =
+ TII->analyzeLoopForPipelining(BB);
+ assert(Info);
+
+ // Work outwards from the kernel.
+ bool KernelDisposed = false;
+ int TC = Schedule.getNumStages() - 1;
+ for (auto PI = Prologs.rbegin(), EI = Epilogs.rbegin(); PI != Prologs.rend();
+ ++PI, ++EI, --TC) {
+ MachineBasicBlock *Prolog = *PI;
+ MachineBasicBlock *Fallthrough = *Prolog->succ_begin();
+ MachineBasicBlock *Epilog = *EI;
+ SmallVector<MachineOperand, 4> Cond;
+ Optional<bool> StaticallyGreater =
+ Info->createTripCountGreaterCondition(TC, *Prolog, Cond);
+ if (!StaticallyGreater.hasValue()) {
+ LLVM_DEBUG(dbgs() << "Dynamic: TC > " << TC << "\n");
+ // Dynamically branch based on Cond.
+ TII->removeBranch(*Prolog);
+ TII->insertBranch(*Prolog, Epilog, Fallthrough, Cond, DebugLoc());
+ } else if (*StaticallyGreater == false) {
+ LLVM_DEBUG(dbgs() << "Static-false: TC > " << TC << "\n");
+ // Prolog never falls through; branch to epilog and orphan interior
+ // blocks. Leave it to unreachable-block-elim to clean up.
+ Prolog->removeSuccessor(Fallthrough);
+ for (MachineInstr &P : Fallthrough->phis()) {
+ P.RemoveOperand(2);
+ P.RemoveOperand(1);
+ }
+ TII->removeBranch(*Prolog);
+ TII->insertUnconditionalBranch(*Prolog, Epilog, DebugLoc());
+ KernelDisposed = true;
+ } else {
+ LLVM_DEBUG(dbgs() << "Static-true: TC > " << TC << "\n");
+ // Prolog always falls through; remove incoming values in epilog.
+ Prolog->removeSuccessor(Epilog);
+ for (MachineInstr &P : Epilog->phis()) {
+ P.RemoveOperand(4);
+ P.RemoveOperand(3);
+ }
+ }
+ }
+
+ if (!KernelDisposed) {
+ Info->adjustTripCount(-(Schedule.getNumStages() - 1));
+ Info->setPreheader(Prologs.back());
+ } else {
+ Info->disposed();
+ }
+}
+
+void PeelingModuloScheduleExpander::rewriteKernel() {
+ KernelRewriter KR(*Schedule.getLoop(), Schedule);
+ KR.rewrite();
+}
+
+void PeelingModuloScheduleExpander::expand() {
+ BB = Schedule.getLoop()->getTopBlock();
+ Preheader = Schedule.getLoop()->getLoopPreheader();
+ LLVM_DEBUG(Schedule.dump());
+
+ rewriteKernel();
+ peelPrologAndEpilogs();
+ fixupBranches();
+}
+
void PeelingModuloScheduleExpander::validateAgainstModuloScheduleExpander() {
BB = Schedule.getLoop()->getTopBlock();
Preheader = Schedule.getLoop()->getLoopPreheader();
@@ -1593,6 +1854,7 @@ void PeelingModuloScheduleExpander::validateAgainstModuloScheduleExpander() {
// Now run the new expansion algorithm.
KernelRewriter KR(*Schedule.getLoop(), Schedule);
KR.rewrite();
+ peelPrologAndEpilogs();
// Collect all illegal phis that the new algorithm created. We'll give these
// to KernelOperandInfo.
OpenPOWER on IntegriCloud