[MCA] Improved cost computation for loop carried dependencies in the bottleneck analysis.

This patch introduces a cut-off threshold for dependency edge frequences with the goal of simplifying the critical sequence computation. This patch also removes the cost normalization for loop carried dependencies. We didn't really need to artificially amplify the cost of loop-carried dependencies since it is already computed as the integral over time of the delay (in cycle). In the absence of backend stalls there is no need for computing a critical sequence. With this patch we early exit from the critical sequence computation if no bottleneck was reported during the simulation. llvm-svn: 372337
author: Andrea Di Biagio <Andrea_DiBiagio@sn.scee.net> 2019-09-19 16:05:11 +0000
committer: Andrea Di Biagio <Andrea_DiBiagio@sn.scee.net> 2019-09-19 16:05:11 +0000
commit: e0900f285bb532790ed494df901f87c5c8b904da (patch)
tree: af3d795ab05e1ce2b7ebbb14f24c1c406892e845
parent: 7cb60fb00f545dae7e161d8b7c24d674d7d008f9 (diff)
download: bcm5719-llvm-e0900f285bb532790ed494df901f87c5c8b904da.tar.gz
bcm5719-llvm-e0900f285bb532790ed494df901f87c5c8b904da.zip
3 files changed, 43 insertions, 17 deletions
diff --git a/llvm/test/tools/llvm-mca/X86/SkylakeClient/bottleneck-analysis.s b/llvm/test/tools/llvm-mca/X86/SkylakeClient/bottleneck-analysis.s
index 1f5e2dbf469..98c5376ff82 100644
--- a/llvm/test/tools/llvm-mca/X86/SkylakeClient/bottleneck-analysis.s
+++ b/llvm/test/tools/llvm-mca/X86/SkylakeClient/bottleneck-analysis.s
@@ -67,18 +67,14 @@
 # CHECK-NEXT:  |      11.   vmovaps	%xmm7, %xmm4
 # CHECK-NEXT:  +----> 12.   vfmadd213ps	%xmm1, %xmm14, %xmm2      ## REGISTER dependency:  %xmm1
 # CHECK-NEXT:  |      13.   vmovaps	%xmm6, %xmm1
-# CHECK-NEXT:  |      14.   vfmadd213ps	%xmm2, %xmm15, %xmm3
-# CHECK-NEXT:  |      15.   vpermilps	$170, %xmm3, %xmm0
+# CHECK-NEXT:  +----> 14.   vfmadd213ps	%xmm2, %xmm15, %xmm3      ## REGISTER dependency:  %xmm2
+# CHECK-NEXT:  +----> 15.   vpermilps	$170, %xmm3, %xmm0                ## REGISTER dependency:  %xmm3
 # CHECK-NEXT:  |      16.   vmovups	%xmm3, (%rdx,%rax)
 # CHECK-NEXT:  |      17.   vpermilps	$255, %xmm3, %xmm2
 # CHECK-NEXT:  |      18.   addq	$16, %rax
 # CHECK-NEXT:  |      19.   decl	%ecx
-# CHECK-NEXT:  |      20.   vmovaps	%xmm0, %xmm3
-# CHECK-NEXT:  |      21.   jne	.LBB0_4
-# CHECK-NEXT:  |
-# CHECK-NEXT:  |    < loop carried >
-# CHECK-NEXT:  |
-# CHECK-NEXT:  +----> 2.    vmulps	-24(%rsp), %xmm7, %xmm8           ## RESOURCE interference:  SKLPort1 [ probability: 45% ]
+# CHECK-NEXT:  +----> 20.   vmovaps	%xmm0, %xmm3                      ## REGISTER dependency:  %xmm0
+# CHECK-NEXT:         21.   jne	.LBB0_4
 
 # CHECK:      Instruction Info:
 # CHECK-NEXT: [1]: #uOps
diff --git a/llvm/tools/llvm-mca/Views/BottleneckAnalysis.cpp b/llvm/tools/llvm-mca/Views/BottleneckAnalysis.cpp
index 560c6c6e8a3..feff0cd6d52 100644
--- a/llvm/tools/llvm-mca/Views/BottleneckAnalysis.cpp
+++ b/llvm/tools/llvm-mca/Views/BottleneckAnalysis.cpp
@@ -165,10 +165,33 @@ void DependencyGraph::dumpDependencyEdge(raw_ostream &OS,
            "Unsupported dependency type!");
     OS << " - RESOURCE MASK: " << DE.ResourceOrRegID;
   }
-  OS << " - CYCLES: " << DE.Cost << '\n';
+  OS << " - COST: " << DE.Cost << '\n';
 }
 #endif // NDEBUG
 
+void DependencyGraph::pruneEdges(unsigned Iterations) {
+  for (DGNode &N : Nodes) {
+    unsigned NumPruned = 0;
+    const unsigned Size = N.OutgoingEdges.size();
+    // Use a cut-off threshold to prune edges with a low frequency.
+    for (unsigned I = 0, E = Size; I < E; ++I) {
+      DependencyEdge &Edge = N.OutgoingEdges[I];
+      if (Edge.Frequency == Iterations)
+        continue;
+      double Factor = (double)Edge.Frequency / Iterations;
+      if (0.10 < Factor)
+        continue;
+      Nodes[Edge.ToIID].NumPredecessors--;
+      std::swap(Edge, N.OutgoingEdges[E - 1]);
+      --E;
+      ++NumPruned;
+    }
+
+    if (NumPruned)
+      N.OutgoingEdges.resize(Size - NumPruned);
+  }
+}
+
 void DependencyGraph::initializeRootSet(
     SmallVectorImpl<unsigned> &RootSet) const {
   for (unsigned I = 0, E = Nodes.size(); I < E; ++I) {
@@ -179,7 +202,7 @@ void DependencyGraph::initializeRootSet(
 }
 
 void DependencyGraph::propagateThroughEdges(
-    SmallVectorImpl<unsigned> &RootSet) {
+    SmallVectorImpl<unsigned> &RootSet, unsigned Iterations) {
   SmallVector<unsigned, 8> ToVisit;
 
   // A critical sequence is computed as the longest path from a node of the
@@ -189,6 +212,10 @@ void DependencyGraph::propagateThroughEdges(
   // Each node of the graph starts with an initial default cost of zero.  The
   // cost of a node is a measure of criticality: the higher the cost, the bigger
   // is the performance impact.
+  // For register and memory dependencies, the cost is a function of the write
+  // latency as well as the actual delay (in cycles) caused to users.
+  // For processor resource dependencies, the cost is a function of the resource
+  // pressure. Resource interferences with low frequency values are ignored.
   //
   // This algorithm is very similar to a (reverse) Dijkstra.  Every iteration of
   // the inner loop selects (i.e. visits) a node N from a set of `unvisited
@@ -277,6 +304,10 @@ static void printInstruction(formatted_raw_ostream &FOS,
 }
 
 void BottleneckAnalysis::printCriticalSequence(raw_ostream &OS) const {
+  // Early exit if no bottlenecks were found during the simulation.
+  if (!SeenStallCycles || !BPI.PressureIncreaseCycles)
+    return;
+
   SmallVector<const DependencyEdge *, 16> Seq;
   DG.getCriticalSequence(Seq);
   if (Seq.empty())
@@ -432,7 +463,6 @@ void BottleneckAnalysis::addRegisterDep(unsigned From, unsigned To,
   bool IsLoopCarried = From >= To;
   unsigned SourceSize = Source.size();
   if (IsLoopCarried) {
-    Cost *= Iterations / 2;
     DG.addRegisterDep(From, To + SourceSize, RegID, Cost);
     DG.addRegisterDep(From + SourceSize, To + (SourceSize * 2), RegID, Cost);
     return;
@@ -445,7 +475,6 @@ void BottleneckAnalysis::addMemoryDep(unsigned From, unsigned To,
   bool IsLoopCarried = From >= To;
   unsigned SourceSize = Source.size();
   if (IsLoopCarried) {
-    Cost *= Iterations / 2;
     DG.addMemoryDep(From, To + SourceSize, Cost);
     DG.addMemoryDep(From + SourceSize, To + (SourceSize * 2), Cost);
     return;
@@ -458,7 +487,6 @@ void BottleneckAnalysis::addResourceDep(unsigned From, unsigned To,
   bool IsLoopCarried = From >= To;
   unsigned SourceSize = Source.size();
   if (IsLoopCarried) {
-    Cost *= Iterations / 2;
     DG.addResourceDep(From, To + SourceSize, Mask, Cost);
     DG.addResourceDep(From + SourceSize, To + (SourceSize * 2), Mask, Cost);
     return;
@@ -514,7 +542,7 @@ void BottleneckAnalysis::onEvent(const HWInstructionEvent &Event) {
 
   // Check if this is the last simulated instruction.
   if (IID == ((Iterations * Source.size()) - 1))
-    DG.finalizeGraph();
+    DG.finalizeGraph(Iterations);
 }
 
 void BottleneckAnalysis::onEvent(const HWPressureEvent &Event) {
diff --git a/llvm/tools/llvm-mca/Views/BottleneckAnalysis.h b/llvm/tools/llvm-mca/Views/BottleneckAnalysis.h
index 7564b1a4820..9e3bd5978f0 100644
--- a/llvm/tools/llvm-mca/Views/BottleneckAnalysis.h
+++ b/llvm/tools/llvm-mca/Views/BottleneckAnalysis.h
@@ -236,8 +236,9 @@ class DependencyGraph {
   void addDependency(unsigned From, unsigned To,
                      DependencyEdge::Dependency &&DE);
 
+  void pruneEdges(unsigned Iterations);
   void initializeRootSet(SmallVectorImpl<unsigned> &RootSet) const;
-  void propagateThroughEdges(SmallVectorImpl<unsigned> &RootSet);
+  void propagateThroughEdges(SmallVectorImpl<unsigned> &RootSet, unsigned Iterations);
 
 #ifndef NDEBUG
   void dumpDependencyEdge(raw_ostream &OS, const DependencyEdge &DE,
@@ -263,10 +264,11 @@ public:
 
   // Called by the bottleneck analysis at the end of simulation to propagate
   // costs through the edges of the graph, and compute a critical path.
-  void finalizeGraph() {
+  void finalizeGraph(unsigned Iterations) {
     SmallVector<unsigned, 16> RootSet;
+    pruneEdges(Iterations);
     initializeRootSet(RootSet);
-    propagateThroughEdges(RootSet);
+    propagateThroughEdges(RootSet, Iterations);
   }
 
   // Returns a sequence of edges representing the critical sequence based on the
author	Andrea Di Biagio <Andrea_DiBiagio@sn.scee.net>	2019-09-19 16:05:11 +0000
committer	Andrea Di Biagio <Andrea_DiBiagio@sn.scee.net>	2019-09-19 16:05:11 +0000
commit	e0900f285bb532790ed494df901f87c5c8b904da (patch)
tree	af3d795ab05e1ce2b7ebbb14f24c1c406892e845
parent	7cb60fb00f545dae7e161d8b7c24d674d7d008f9 (diff)
download	bcm5719-llvm-e0900f285bb532790ed494df901f87c5c8b904da.tar.gz bcm5719-llvm-e0900f285bb532790ed494df901f87c5c8b904da.zip