Diffstat (limited to 'llvm/lib/Target')
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp   |  15
-rw-r--r--  llvm/lib/Target/AMDGPU/CMakeLists.txt            |   1
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNILPSched.cpp           | 364
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp |  51
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNIterativeScheduler.h   |   4
5 files changed, 431 insertions, 4 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 451961c351e..6984f4e7161 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -219,6 +219,16 @@ static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) {
GCNIterativeScheduler::SCHEDULE_MINREGFORCED);
}
+static ScheduleDAGInstrs *
+createIterativeILPMachineScheduler(MachineSchedContext *C) {
+ auto DAG = new GCNIterativeScheduler(C,
+ GCNIterativeScheduler::SCHEDULE_ILP);
+ DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
+ DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
+ DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
+ return DAG;
+}
+
static MachineSchedRegistry
R600SchedRegistry("r600", "Run R600's custom scheduler",
createR600MachineScheduler);
@@ -242,6 +252,11 @@ GCNMinRegSchedRegistry("gcn-minreg",
"Run GCN iterative scheduler for minimal register usage (experimental)",
createMinRegScheduler);
+static MachineSchedRegistry
+GCNILPSchedRegistry("gcn-ilp",
+ "Run GCN iterative scheduler for ILP scheduling (experimental)",
+ createIterativeILPMachineScheduler);
+
static StringRef computeDataLayout(const Triple &TT) {
if (TT.getArch() == Triple::r600) {
// 32-bit pointers.
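For reference, once the registry entry above is compiled in, the "gcn-ilp" strategy should be selectable by name through llc's generic -misched option (the option that consults MachineSchedRegistry), e.g. llc -march=amdgcn -misched=gcn-ilp kernel.ll. This is a sketch of expected usage rather than something taken from the patch itself, and the input file name is invented.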
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index baefbd3ae05..3a850303041 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -95,6 +95,7 @@ add_llvm_target(AMDGPUCodeGen
SIRegisterInfo.cpp
SIShrinkInstructions.cpp
SIWholeQuadMode.cpp
+ GCNILPSched.cpp
)
add_subdirectory(AsmParser)
diff --git a/llvm/lib/Target/AMDGPU/GCNILPSched.cpp b/llvm/lib/Target/AMDGPU/GCNILPSched.cpp
new file mode 100644
index 00000000000..ba8211b189c
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/GCNILPSched.cpp
@@ -0,0 +1,364 @@
+//===---------------------------- GCNILPSched.cpp - -----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// A bottom-up list scheduler aimed at instruction-level parallelism,
+/// driven by the GCN iterative scheduler (SCHEDULE_ILP strategy).
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/ScheduleDAG.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "machine-scheduler"
+
+namespace {
+
+class GCNILPScheduler {
+ struct Candidate : ilist_node<Candidate> {
+ SUnit *SU;
+
+ Candidate(SUnit *SU_)
+ : SU(SU_) {}
+ };
+
+ SpecificBumpPtrAllocator<Candidate> Alloc;
+ typedef simple_ilist<Candidate> Queue;
+ Queue PendingQueue;
+ Queue AvailQueue;
+ unsigned CurQueueId = 0;
+
+ std::vector<unsigned> SUNumbers;
+
+ /// CurCycle - The current scheduler state corresponds to this cycle.
+ unsigned CurCycle = 0;
+
+ unsigned getNodePriority(const SUnit *SU) const;
+
+ const SUnit *pickBest(const SUnit *left, const SUnit *right);
+ Candidate* pickCandidate();
+
+ void releasePending();
+ void advanceToCycle(unsigned NextCycle);
+ void releasePredecessors(const SUnit* SU);
+
+public:
+ std::vector<const SUnit*> schedule(ArrayRef<const SUnit*> TopRoots,
+ const ScheduleDAG &DAG);
+};
+} // namespace
+
+/// CalcNodeSethiUllmanNumber - Compute the Sethi-Ullman number for a node.
+/// A smaller number means a higher priority.
+static unsigned
+CalcNodeSethiUllmanNumber(const SUnit *SU, std::vector<unsigned> &SUNumbers) {
+ unsigned &SethiUllmanNumber = SUNumbers[SU->NodeNum];
+ if (SethiUllmanNumber != 0)
+ return SethiUllmanNumber;
+
+ unsigned Extra = 0;
+ for (const SDep &Pred : SU->Preds) {
+ if (Pred.isCtrl()) continue; // ignore chain preds
+ SUnit *PredSU = Pred.getSUnit();
+ unsigned PredSethiUllman = CalcNodeSethiUllmanNumber(PredSU, SUNumbers);
+ if (PredSethiUllman > SethiUllmanNumber) {
+ SethiUllmanNumber = PredSethiUllman;
+ Extra = 0;
+ }
+ else if (PredSethiUllman == SethiUllmanNumber)
+ ++Extra;
+ }
+
+ SethiUllmanNumber += Extra;
+
+ if (SethiUllmanNumber == 0)
+ SethiUllmanNumber = 1;
+
+ return SethiUllmanNumber;
+}
+
+// Lower priority means schedule further down. For bottom-up scheduling, lower
+// priority SUs are scheduled before higher priority SUs.
+unsigned GCNILPScheduler::getNodePriority(const SUnit *SU) const {
+ assert(SU->NodeNum < SUNumbers.size());
+ if (SU->NumSuccs == 0 && SU->NumPreds != 0)
+ // If SU does not have a register use, i.e. it doesn't produce a value
+ // that would be consumed (e.g. a store), then it terminates a chain of
+ // computation. Give it a large Sethi-Ullman number so it is scheduled
+ // right before its predecessors and does not lengthen their live
+ // ranges.
+ return 0xffff;
+
+ if (SU->NumPreds == 0 && SU->NumSuccs != 0)
+ // If SU does not have a register def, schedule it close to its uses
+ // because it does not lengthen any live ranges.
+ return 0;
+
+ return SUNumbers[SU->NodeNum];
+}
+
+/// closestSucc - Returns the scheduled cycle of the successor which is
+/// closest to the current cycle.
+static unsigned closestSucc(const SUnit *SU) {
+ unsigned MaxHeight = 0;
+ for (const SDep &Succ : SU->Succs) {
+ if (Succ.isCtrl()) continue; // ignore chain succs
+ unsigned Height = Succ.getSUnit()->getHeight();
+ // If there are a bunch of CopyToRegs stacked up, they should be considered
+ // to be at the same position.
+ if (Height > MaxHeight)
+ MaxHeight = Height;
+ }
+ return MaxHeight;
+}
+
+/// calcMaxScratches - Returns a cost estimate of the worst-case requirement
+/// for scratch registers, i.e. the number of data dependencies.
+static unsigned calcMaxScratches(const SUnit *SU) {
+ unsigned Scratches = 0;
+ for (const SDep &Pred : SU->Preds) {
+ if (Pred.isCtrl()) continue; // ignore chain preds
+ Scratches++;
+ }
+ return Scratches;
+}
+
+// Return -1 if left has higher priority, 1 if right has higher priority.
+// Return 0 if latency-based priority is equivalent.
+static int BUCompareLatency(const SUnit *left, const SUnit *right) {
+ // Scheduling an instruction that uses a VReg whose postincrement has not yet
+ // been scheduled will induce a copy. Model this as an extra cycle of latency.
+ int LHeight = (int)left->getHeight();
+ int RHeight = (int)right->getHeight();
+
+ // If either node is scheduling for latency, sort them by height/depth
+ // and latency.
+
+ // If neither instruction stalls (!LStall && !RStall) and HazardRecognizer
+ // is enabled, grouping instructions by cycle, then its height is already
+ // covered so only its depth matters. We also reach this point if both stall
+ // but have the same height.
+ if (LHeight != RHeight)
+ return LHeight > RHeight ? 1 : -1;
+
+ int LDepth = left->getDepth();
+ int RDepth = right->getDepth();
+ if (LDepth != RDepth) {
+ DEBUG(dbgs() << " Comparing latency of SU (" << left->NodeNum
+ << ") depth " << LDepth << " vs SU (" << right->NodeNum
+ << ") depth " << RDepth << "\n");
+ return LDepth < RDepth ? 1 : -1;
+ }
+ if (left->Latency != right->Latency)
+ return left->Latency > right->Latency ? 1 : -1;
+
+ return 0;
+}
+
+const SUnit *GCNILPScheduler::pickBest(const SUnit *left, const SUnit *right)
+{
+ // TODO: add register pressure lowering checks
+
+ bool const DisableSchedCriticalPath = false;
+ int MaxReorderWindow = 6;
+ if (!DisableSchedCriticalPath) {
+ int spread = (int)left->getDepth() - (int)right->getDepth();
+ if (std::abs(spread) > MaxReorderWindow) {
+ DEBUG(dbgs() << "Depth of SU(" << left->NodeNum << "): "
+ << left->getDepth() << " != SU(" << right->NodeNum << "): "
+ << right->getDepth() << "\n");
+ return left->getDepth() < right->getDepth() ? right : left;
+ }
+ }
+
+ bool const DisableSchedHeight = false;
+ if (!DisableSchedHeight && left->getHeight() != right->getHeight()) {
+ int spread = (int)left->getHeight() - (int)right->getHeight();
+ if (std::abs(spread) > MaxReorderWindow)
+ return left->getHeight() > right->getHeight() ? right : left;
+ }
+
+ // Prioritize by Sethi-Ullman number and push CopyToReg nodes down.
+ unsigned LPriority = getNodePriority(left);
+ unsigned RPriority = getNodePriority(right);
+
+ if (LPriority != RPriority)
+ return LPriority > RPriority ? right : left;
+
+ // Try to schedule def + use closer together when the Sethi-Ullman numbers
+ // are the same.
+ // e.g.
+ // t1 = op t2, c1
+ // t3 = op t4, c2
+ //
+ // and the following instructions are both ready.
+ // t2 = op c3
+ // t4 = op c4
+ //
+ // Then schedule t2 = op first.
+ // i.e.
+ // t4 = op c4
+ // t2 = op c3
+ // t1 = op t2, c1
+ // t3 = op t4, c2
+ //
+ // This creates more short live intervals.
+ unsigned LDist = closestSucc(left);
+ unsigned RDist = closestSucc(right);
+ if (LDist != RDist)
+ return LDist < RDist ? right : left;
+
+ // How many registers become live when the node is scheduled.
+ unsigned LScratch = calcMaxScratches(left);
+ unsigned RScratch = calcMaxScratches(right);
+ if (LScratch != RScratch)
+ return LScratch > RScratch ? right : left;
+
+ bool const DisableSchedCycles = false;
+ if (!DisableSchedCycles) {
+ int result = BUCompareLatency(left, right);
+ if (result != 0)
+ return result > 0 ? right : left;
+ return left;
+ } else {
+ if (left->getHeight() != right->getHeight())
+ return (left->getHeight() > right->getHeight()) ? right : left;
+
+ if (left->getDepth() != right->getDepth())
+ return (left->getDepth() < right->getDepth()) ? right : left;
+ }
+
+ assert(left->NodeQueueId && right->NodeQueueId &&
+ "NodeQueueId cannot be zero");
+ return (left->NodeQueueId > right->NodeQueueId) ? right : left;
+}
+
+GCNILPScheduler::Candidate* GCNILPScheduler::pickCandidate() {
+ if (AvailQueue.empty())
+ return nullptr;
+ auto Best = AvailQueue.begin();
+ for (auto I = std::next(AvailQueue.begin()), E = AvailQueue.end(); I != E; ++I) {
+ auto NewBestSU = pickBest(Best->SU, I->SU);
+ if (NewBestSU != Best->SU) {
+ assert(NewBestSU == I->SU);
+ Best = I;
+ }
+ }
+ return &*Best;
+}
+
+void GCNILPScheduler::releasePending() {
+ // Check to see if any of the pending instructions are ready to issue. If
+ // so, add them to the available queue.
+ for(auto I = PendingQueue.begin(), E = PendingQueue.end(); I != E;) {
+ auto &C = *I++;
+ if (C.SU->getHeight() <= CurCycle) {
+ PendingQueue.remove(C);
+ AvailQueue.push_back(C);
+ C.SU->NodeQueueId = CurQueueId++;
+ }
+ }
+}
+
+/// Move the scheduler state forward by the specified number of Cycles.
+void GCNILPScheduler::advanceToCycle(unsigned NextCycle) {
+ if (NextCycle <= CurCycle)
+ return;
+ CurCycle = NextCycle;
+ releasePending();
+}
+
+void GCNILPScheduler::releasePredecessors(const SUnit* SU) {
+ for (const auto &PredEdge : SU->Preds) {
+ auto PredSU = PredEdge.getSUnit();
+ if (PredEdge.isWeak())
+ continue;
+ assert(PredSU->isBoundaryNode() || PredSU->NumSuccsLeft > 0);
+
+ PredSU->setHeightToAtLeast(SU->getHeight() + PredEdge.getLatency());
+
+ if (!PredSU->isBoundaryNode() && --PredSU->NumSuccsLeft == 0)
+ PendingQueue.push_front(*new (Alloc.Allocate()) Candidate(PredSU));
+ }
+}
+
+std::vector<const SUnit*>
+GCNILPScheduler::schedule(ArrayRef<const SUnit*> BotRoots,
+ const ScheduleDAG &DAG) {
+ auto &SUnits = const_cast<ScheduleDAG&>(DAG).SUnits;
+
+ std::vector<SUnit> SUSavedCopy;
+ SUSavedCopy.resize(SUnits.size());
+
+ // We cannot save only the fields we touch because some of them are
+ // private, so save the units verbatim. This assumes SUnit has value
+ // semantics.
+ for (const SUnit &SU : SUnits)
+ SUSavedCopy[SU.NodeNum] = SU;
+
+ SUNumbers.assign(SUnits.size(), 0);
+ for (const SUnit &SU : SUnits)
+ CalcNodeSethiUllmanNumber(&SU, SUNumbers);
+
+ for (auto SU : BotRoots) {
+ AvailQueue.push_back(
+ *new (Alloc.Allocate()) Candidate(const_cast<SUnit*>(SU)));
+ }
+ releasePredecessors(&DAG.ExitSU);
+
+ std::vector<const SUnit*> Schedule;
+ Schedule.reserve(SUnits.size());
+ while (true) {
+ if (AvailQueue.empty() && !PendingQueue.empty()) {
+ auto EarliestSU = std::min_element(
+ PendingQueue.begin(), PendingQueue.end(),
+ [=](const Candidate& C1, const Candidate& C2) {
+ return C1.SU->getHeight() < C2.SU->getHeight();
+ })->SU;
+ advanceToCycle(std::max(CurCycle + 1, EarliestSU->getHeight()));
+ }
+ if (AvailQueue.empty())
+ break;
+
+ DEBUG(
+ dbgs() << "\n=== Picking candidate\n"
+ "Ready queue:";
+ for (auto &C : AvailQueue)
+ dbgs() << ' ' << C.SU->NodeNum;
+ dbgs() << '\n';
+ );
+
+ auto C = pickCandidate();
+ assert(C);
+ AvailQueue.remove(*C);
+ auto SU = C->SU;
+ DEBUG(dbgs() << "Selected "; SU->dump(&DAG));
+
+ advanceToCycle(SU->getHeight());
+
+ releasePredecessors(SU);
+ Schedule.push_back(SU);
+ SU->isScheduled = true;
+ }
+ assert(SUnits.size() == Schedule.size());
+
+ std::reverse(Schedule.begin(), Schedule.end());
+
+ // restore units
+ for (auto &SU : SUnits)
+ SU = SUSavedCopy[SU.NodeNum];
+
+ return Schedule;
+}
+
+namespace llvm {
+std::vector<const SUnit*> makeGCNILPScheduler(ArrayRef<const SUnit*> BotRoots,
+ const ScheduleDAG &DAG) {
+ GCNILPScheduler S;
+ return S.schedule(BotRoots, DAG);
+}
+}
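To make the Sethi-Ullman numbering above concrete, the following stand-alone sketch applies the same recursion as CalcNodeSethiUllmanNumber to a small invented expression DAG. It is illustration only, not part of the patch: the Node struct, the DAG layout, and calcSethiUllman are made up here; only the numbering logic mirrors the code above.

// Stand-alone illustration (hypothetical, not part of the patch) of the
// CalcNodeSethiUllmanNumber recursion, applied to a toy DAG:
//   t0 = load a; t1 = load b; t2 = add t0, t1; t3 = load c; t4 = mul t2, t3
#include <cstdio>
#include <vector>

struct Node {
  std::vector<unsigned> Preds; // data predecessors (operand producers)
};

static unsigned calcSethiUllman(unsigned N, const std::vector<Node> &DAG,
                                std::vector<unsigned> &Numbers) {
  unsigned &SU = Numbers[N];
  if (SU != 0)
    return SU; // already computed (memoized, like SUNumbers in the patch)

  unsigned Extra = 0;
  for (unsigned P : DAG[N].Preds) {
    unsigned PredSU = calcSethiUllman(P, DAG, Numbers);
    if (PredSU > SU) {        // new maximum among the operand chains
      SU = PredSU;
      Extra = 0;
    } else if (PredSU == SU)  // another operand chain ties the maximum
      ++Extra;
  }
  SU += Extra; // each extra chain at the maximum keeps one more value live
  if (SU == 0)
    SU = 1;    // leaves (no data predecessors) get 1
  return SU;
}

int main() {
  std::vector<Node> DAG(5);
  DAG[2].Preds = {0, 1}; // add consumes the two loads
  DAG[4].Preds = {2, 3}; // mul consumes the add result and the third load
  std::vector<unsigned> Numbers(DAG.size(), 0);
  for (unsigned I = 0; I < DAG.size(); ++I)
    std::printf("node %u -> %u\n", I, calcSethiUllman(I, DAG, Numbers));
  // Prints 1 1 2 1 2: the add ties two equally "expensive" operand chains,
  // so its number is one higher than either; the mul just inherits the max.
  return 0;
}

pickBest falls back to these numbers after its depth and height window checks; an operand tie at the maximum means one more value has to stay live at the same time, which is the register-pressure intuition behind the heuristic.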
diff --git a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
index 9e743850f9a..942063d5f93 100644
--- a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
@@ -39,7 +39,9 @@ namespace llvm {
std::vector<const SUnit *> makeMinRegSchedule(ArrayRef<const SUnit *> TopRoots,
const ScheduleDAG &DAG);
-} // end namespace llvm
+ std::vector<const SUnit*> makeGCNILPScheduler(ArrayRef<const SUnit*> BotRoots,
+ const ScheduleDAG &DAG);
+}
// shim accessors for different order containers
static inline MachineInstr *getMachineInstr(MachineInstr *MI) {
@@ -141,6 +143,7 @@ class GCNIterativeScheduler::BuildDAG {
GCNIterativeScheduler &Sch;
SmallVector<SUnit *, 8> TopRoots;
+ SmallVector<SUnit*, 8> BotRoots;
public:
BuildDAG(const Region &R, GCNIterativeScheduler &_Sch)
: Sch(_Sch) {
@@ -151,8 +154,6 @@ public:
Sch.buildSchedGraph(Sch.AA, nullptr, nullptr, nullptr,
/*TrackLaneMask*/true);
Sch.Topo.InitDAGTopologicalSorting();
-
- SmallVector<SUnit *, 8> BotRoots;
Sch.findRootsAndBiasEdges(TopRoots, BotRoots);
}
@@ -164,6 +165,9 @@ public:
ArrayRef<const SUnit *> getTopRoots() const {
return TopRoots;
}
+ ArrayRef<SUnit*> getBottomRoots() const {
+ return BotRoots;
+ }
};
class GCNIterativeScheduler::OverrideLegacyStrategy {
@@ -323,6 +327,7 @@ void GCNIterativeScheduler::finalizeSchedule() { // overriden
case SCHEDULE_MINREGONLY: scheduleMinReg(); break;
case SCHEDULE_MINREGFORCED: scheduleMinReg(true); break;
case SCHEDULE_LEGACYMAXOCCUPANCY: scheduleLegacyMaxOccupancy(); break;
+ case SCHEDULE_ILP: scheduleILP(false); break;
}
}
@@ -553,3 +558,43 @@ void GCNIterativeScheduler::scheduleMinReg(bool force) {
MaxPressure = RP;
}
}
+
+///////////////////////////////////////////////////////////////////////////////
+// ILP scheduler port
+
+void GCNIterativeScheduler::scheduleILP(
+ bool TryMaximizeOccupancy) {
+ const auto &ST = MF.getSubtarget<SISubtarget>();
+ auto TgtOcc = std::min(ST.getOccupancyWithLocalMemSize(MF),
+ ST.getWavesPerEU(*MF.getFunction()).second);
+
+ sortRegionsByPressure(TgtOcc);
+ auto Occ = Regions.front()->MaxPressure.getOccupancy(ST);
+
+ if (TryMaximizeOccupancy && Occ < TgtOcc)
+ Occ = tryMaximizeOccupancy(TgtOcc);
+
+ TgtOcc = std::min(Occ, TgtOcc);
+ DEBUG(dbgs() << "Scheduling using ILP scheduler, "
+ "target occupancy = " << TgtOcc << '\n');
+
+ for (auto R : Regions) {
+ BuildDAG DAG(*R, *this);
+ const auto ILPSchedule = makeGCNILPScheduler(DAG.getBottomRoots(), *this);
+
+ const auto RP = getSchedulePressure(*R, ILPSchedule);
+ DEBUG(printSchedRP(dbgs(), R->MaxPressure, RP));
+
+ if (RP.getOccupancy(ST) < TgtOcc) {
+ DEBUG(dbgs() << "Didn't fit into target occupancy O" << TgtOcc);
+ if (R->BestSchedule.get() &&
+ R->BestSchedule->MaxPressure.getOccupancy(ST) >= TgtOcc) {
+ DEBUG(dbgs() << ", scheduling minimal register\n");
+ scheduleBest(*R);
+ }
+ } else {
+ scheduleRegion(*R, ILPSchedule, RP);
+ DEBUG(printSchedResult(dbgs(), R, RP));
+ }
+ }
+}
diff --git a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.h b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.h
index d002fe66827..14ef5147f32 100644
--- a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.h
+++ b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.h
@@ -32,7 +32,8 @@ public:
enum StrategyKind {
SCHEDULE_MINREGONLY,
SCHEDULE_MINREGFORCED,
- SCHEDULE_LEGACYMAXOCCUPANCY
+ SCHEDULE_LEGACYMAXOCCUPANCY,
+ SCHEDULE_ILP
};
GCNIterativeScheduler(MachineSchedContext *C,
@@ -108,6 +109,7 @@ protected:
void scheduleLegacyMaxOccupancy(bool TryMaximizeOccupancy = true);
void scheduleMinReg(bool force = false);
+ void scheduleILP(bool TryMaximizeOccupancy = true);
void printRegions(raw_ostream &OS) const;
void printSchedResult(raw_ostream &OS,