MI-Sched: handle latency of in-order operations with the new machine model.

The per-operand machine model allows the target to define "unbuffered" processor resources. This change is a quick, cheap way to model stalls caused by the latency of operations that use such resources. This only applies when the processor's micro-op buffer size is non-zero (Out-of-Order). We can't precisely model in-order stalls during out-of-order execution, but this is an easy and effective heuristic. It benefits cortex-a9 scheduling when using the new machine model, which is not yet on by default. MI-Sched for armv7 was evaluated on Swift (and only not enabled because of a performance bug related to predication). However, we never evaluated Cortex-A9 performance on MI-Sched in its current form. This change adds MI-Sched functionality to reach performance goals on A9. The only remaining change is to allow MI-Sched to run as a PostRA pass. I evaluated performance using a set of options to estimate the performance impact once MI sched is default on armv7: -mcpu=cortex-a9 -disable-post-ra -misched-bench -scheditins=false For a simple saxpy loop I see a 1.7x speedup. Here are the llvm-testsuite results: (min run time over 2 runs, filtering tiny changes) Speedups: | Benchmarks/BenchmarkGame/recursive | 52.39% | | Benchmarks/VersaBench/beamformer | 20.80% | | Benchmarks/Misc/pi | 19.97% | | Benchmarks/Misc/mandel-2 | 19.95% | | SPEC/CFP2000/188.ammp | 18.72% | | Benchmarks/McCat/08-main/main | 18.58% | | Benchmarks/Misc-C++/Large/sphereflake | 18.46% | | Benchmarks/Olden/power | 17.11% | | Benchmarks/Misc-C++/mandel-text | 16.47% | | Benchmarks/Misc/oourafft | 15.94% | | Benchmarks/Misc/flops-7 | 14.99% | | Benchmarks/FreeBench/distray | 14.26% | | SPEC/CFP2006/470.lbm | 14.00% | | mediabench/mpeg2/mpeg2dec/mpeg2decode | 12.28% | | Benchmarks/SmallPT/smallpt | 10.36% | | Benchmarks/Misc-C++/Large/ray | 8.97% | | Benchmarks/Misc/fp-convert | 8.75% | | Benchmarks/Olden/perimeter | 7.10% | | Benchmarks/Bullet/bullet | 7.03% | | Benchmarks/Misc/mandel | 6.75% | | Benchmarks/Olden/voronoi | 6.26% | | Benchmarks/Misc/flops-8 | 5.77% | | Benchmarks/Misc/matmul_f64_4x4 | 5.19% | | Benchmarks/MiBench/security-rijndael | 5.15% | | Benchmarks/Misc/flops-6 | 5.10% | | Benchmarks/Olden/tsp | 4.46% | | Benchmarks/MiBench/consumer-lame | 4.28% | | Benchmarks/Misc/flops-5 | 4.27% | | Benchmarks/mafft/pairlocalalign | 4.19% | | Benchmarks/Misc/himenobmtxpa | 4.07% | | Benchmarks/Misc/lowercase | 4.06% | | SPEC/CFP2006/433.milc | 3.99% | | Benchmarks/tramp3d-v4 | 3.79% | | Benchmarks/FreeBench/pifft | 3.66% | | Benchmarks/Ptrdist/ks | 3.21% | | Benchmarks/Adobe-C++/loop_unroll | 3.12% | | SPEC/CINT2000/175.vpr | 3.12% | | Benchmarks/nbench | 2.98% | | SPEC/CFP2000/183.equake | 2.91% | | Benchmarks/Misc/perlin | 2.85% | | Benchmarks/Misc/flops-1 | 2.82% | | Benchmarks/Misc-C++-EH/spirit | 2.80% | | Benchmarks/Misc/flops-2 | 2.77% | | Benchmarks/NPB-serial/is | 2.42% | | Benchmarks/ASC_Sequoia/CrystalMk | 2.33% | | Benchmarks/BenchmarkGame/n-body | 2.28% | | Benchmarks/SciMark2-C/scimark2 | 2.27% | | Benchmarks/Olden/bh | 2.03% | | skidmarks10/skidmarks | 1.81% | | Benchmarks/Misc/flops | 1.72% | Slowdowns: | Benchmarks/llubenchmark/llu | -14.14% | | Benchmarks/Polybench/stencils/seidel-2d | -5.67% | | Benchmarks/Adobe-C++/functionobjects | -5.25% | | Benchmarks/Misc-C++/oopack_v1p8 | -5.00% | | Benchmarks/Shootout/hash | -2.35% | | Benchmarks/Prolangs-C++/ocean | -2.01% | | Benchmarks/Polybench/medley/floyd-warshall | -1.98% | | Polybench/linear-algebra/kernels/3mm | -1.95% | | Benchmarks/McCat/09-vor/vor | -1.68% | llvm-svn: 196516
author: Andrew Trick <atrick@apple.com> 2013-12-05 17:55:58 +0000
committer: Andrew Trick <atrick@apple.com> 2013-12-05 17:55:58 +0000
commit: 880e573d98a404f9c1e4460df9527e97cdc6723c (patch)
tree: 684869ec8ffcce73f3587cfc02c070b4add303dd /llvm/lib/CodeGen
parent: 093bdd1735d3bcf23957392c4d92ded0074b8b37 (diff)
download: bcm5719-llvm-880e573d98a404f9c1e4460df9527e97cdc6723c.tar.gz
bcm5719-llvm-880e573d98a404f9c1e4460df9527e97cdc6723c.zip
2 files changed, 52 insertions, 5 deletions
diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
index f8c4a893c12..3296149afa8 100644
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -1330,7 +1330,7 @@ public:
   /// Represent the type of SchedCandidate found within a single queue.
   /// pickNodeBidirectional depends on these listed by decreasing priority.
   enum CandReason {
-    NoCand, PhysRegCopy, RegExcess, RegCritical, Cluster, Weak, RegMax,
+    NoCand, PhysRegCopy, RegExcess, RegCritical, Stall, Cluster, Weak, RegMax,
     ResourceReduce, ResourceDemand, BotHeightReduce, BotPathReduce,
     TopDepthReduce, TopPathReduce, NextDefUse, NodeOrder};
 
@@ -1583,6 +1583,10 @@ public:
                       MaxExecutedResCount);
     }
 
+    /// Get the difference between the given SUnit's ready time and the current
+    /// cycle.
+    unsigned getLatencyStallCycles(SUnit *SU);
+
     bool checkHazard(SUnit *SU);
 
     unsigned findMaxLatency(ArrayRef<SUnit*> ReadySUs);
@@ -1869,6 +1873,23 @@ void GenericScheduler::registerRoots() {
   }
 }
 
+/// Compute the stall cycles based on this SUnit's ready time. Heuristics treat
+/// these "soft stalls" differently than the hard stall cycles based on CPU
+/// resources and computed by checkHazard(). A fully in-order model
+/// (MicroOpBufferSize==0) will not make use of this since instructions are not
+/// available for scheduling until they are ready. However, a weaker in-order
+/// model may use this for heuristics. For example, if a processor has in-order
+/// behavior when reading certain resources, this may come into play.
+unsigned GenericScheduler::SchedBoundary::getLatencyStallCycles(SUnit *SU) {
+  if (!SU->isUnbuffered)
+    return 0;
+
+  unsigned ReadyCycle = (isTop() ? SU->TopReadyCycle : SU->BotReadyCycle);
+  if (ReadyCycle > CurrCycle)
+    return ReadyCycle - CurrCycle;
+  return 0;
+}
+
 /// Does this SU have a hazard within the current instruction group.
 ///
 /// The scheduler supports two modes of hazard recognition. The first is the
@@ -1948,9 +1969,9 @@ getOtherResourceCount(unsigned &OtherCritIdx) {
 /// inside and outside the zone.
 void GenericScheduler::SchedBoundary::setPolicy(CandPolicy &Policy,
                                                    SchedBoundary &OtherZone) {
-  // Now that potential stalls have been considered, apply preemptive heuristics
-  // based on the the total latency and resources inside and outside this
-  // zone.
+  // Apply preemptive heuristics based on the the total latency and resources
+  // inside and outside this zone. Potential stalls should be considered before
+  // following this policy.
 
   // Compute remaining latency. We need this both to determine whether the
   // overall schedule has become latency-limited and whether the instructions
@@ -2141,7 +2162,11 @@ void GenericScheduler::SchedBoundary::bumpNode(SUnit *SU) {
     break;
   default:
     // We don't currently model the OOO reorder buffer, so consider all
-    // scheduled MOps to be "retired".
+    // scheduled MOps to be "retired". We do loosely model in-order resource
+    // latency. If this instruction uses an in-order resource, account for any
+    // likely stall cycles.
+    if (SU->isUnbuffered && ReadyCycle > NextCycle)
+      NextCycle = ReadyCycle;
     break;
   }
   RetiredMOps += IncMOps;
@@ -2514,6 +2539,11 @@ void GenericScheduler::tryCandidate(SchedCandidate &Cand,
       && tryLatency(TryCand, Cand, Zone))
     return;
 
+  // Prioritize instructions that read unbuffered resources by stall cycles.
+  if (tryLess(Zone.getLatencyStallCycles(TryCand.SU),
+              Zone.getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
+    return;
+
   // Keep clustered nodes together to encourage downstream peephole
   // optimizations which may reduce resource requirements.
   //
@@ -2577,6 +2607,7 @@ const char *GenericScheduler::getReasonStr(
   case PhysRegCopy:    return "PREG-COPY";
   case RegExcess:      return "REG-EXCESS";
   case RegCritical:    return "REG-CRIT  ";
+  case Stall:          return "STALL     ";
   case Cluster:        return "CLUSTER   ";
   case Weak:           return "WEAK      ";
   case RegMax:         return "REG-MAX   ";
diff --git a/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp b/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
index 7d9160ab1f4..eeae6ec03d8 100644
--- a/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
+++ b/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
@@ -687,6 +687,22 @@ void ScheduleDAGInstrs::initSUnits() {
 
     // Assign the Latency field of SU using target-provided information.
     SU->Latency = SchedModel.computeInstrLatency(SU->getInstr());
+
+    // If this SUnit uses an unbuffered resource, mark it as such.
+    // These resources are used for in-order execution pipelines within an
+    // out-of-order core and are identified by BufferSize=1. BufferSize=0 is
+    // used for dispatch/issue groups and is not considered here.
+    if (SchedModel.hasInstrSchedModel()) {
+      const MCSchedClassDesc *SC = getSchedClass(SU);
+      for (TargetSchedModel::ProcResIter
+             PI = SchedModel.getWriteProcResBegin(SC),
+             PE = SchedModel.getWriteProcResEnd(SC); PI != PE; ++PI) {
+        if (SchedModel.getProcResource(PI->ProcResourceIdx)->BufferSize == 1) {
+          SU->isUnbuffered = true;
+          break;
+        }
+      }
+    }
   }
 }
author	Andrew Trick <atrick@apple.com>	2013-12-05 17:55:58 +0000
committer	Andrew Trick <atrick@apple.com>	2013-12-05 17:55:58 +0000
commit	880e573d98a404f9c1e4460df9527e97cdc6723c (patch)
tree	684869ec8ffcce73f3587cfc02c070b4add303dd /llvm/lib/CodeGen
parent	093bdd1735d3bcf23957392c4d92ded0074b8b37 (diff)
download	bcm5719-llvm-880e573d98a404f9c1e4460df9527e97cdc6723c.tar.gz bcm5719-llvm-880e573d98a404f9c1e4460df9527e97cdc6723c.zip