summaryrefslogtreecommitdiffstats
path: root/llvm/tools/llvm-mca/Views
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/tools/llvm-mca/Views')
-rw-r--r--llvm/tools/llvm-mca/Views/BottleneckAnalysis.cpp262
-rw-r--r--llvm/tools/llvm-mca/Views/BottleneckAnalysis.h132
2 files changed, 355 insertions, 39 deletions
diff --git a/llvm/tools/llvm-mca/Views/BottleneckAnalysis.cpp b/llvm/tools/llvm-mca/Views/BottleneckAnalysis.cpp
index 8c825271e4f..560c6c6e8a3 100644
--- a/llvm/tools/llvm-mca/Views/BottleneckAnalysis.cpp
+++ b/llvm/tools/llvm-mca/Views/BottleneckAnalysis.cpp
@@ -16,6 +16,7 @@
#include "llvm/MC/MCInst.h"
#include "llvm/MCA/Support.h"
#include "llvm/Support/Format.h"
+#include "llvm/Support/FormattedStream.h"
namespace llvm {
namespace mca {
@@ -161,12 +162,219 @@ void DependencyGraph::dumpDependencyEdge(raw_ostream &OS,
OS << " - MEMORY";
} else {
assert(DE.Type == DependencyEdge::DT_RESOURCE &&
- "Unexpected unsupported dependency type!");
+ "Unsupported dependency type!");
OS << " - RESOURCE MASK: " << DE.ResourceOrRegID;
}
OS << " - CYCLES: " << DE.Cost << '\n';
}
+#endif // NDEBUG
+
+void DependencyGraph::initializeRootSet(
+ SmallVectorImpl<unsigned> &RootSet) const {
+ for (unsigned I = 0, E = Nodes.size(); I < E; ++I) {
+ const DGNode &N = Nodes[I];
+ if (N.NumPredecessors == 0 && !N.OutgoingEdges.empty())
+ RootSet.emplace_back(I);
+ }
+}
+
+void DependencyGraph::propagateThroughEdges(
+ SmallVectorImpl<unsigned> &RootSet) {
+ SmallVector<unsigned, 8> ToVisit;
+
+ // A critical sequence is computed as the longest path from a node of the
+ // RootSet to a leaf node (i.e. a node with no successors). The RootSet is
+ // composed of nodes with at least one successor, and no predecessors.
+ //
+ // Each node of the graph starts with an initial default cost of zero. The
+ // cost of a node is a measure of criticality: the higher the cost, the bigger
+ // is the performance impact.
+ //
+ // This algorithm is very similar to a (reverse) Dijkstra. Every iteration of
+ // the inner loop selects (i.e. visits) a node N from a set of `unvisited
+ // nodes`, and then propagates the cost of N to all its neighbors.
+ //
+ // The `unvisited nodes` set initially contains all the nodes from the
+ // RootSet. A node N is added to the `unvisited nodes` if all its
+ // predecessors have been visited already.
+ //
+ // For simplicity, every node tracks the number of unvisited incoming edges in
+ // field `NumVisitedPredecessors`. When the value of that field drops to
+ // zero, then the corresponding node is added to a `ToVisit` set.
+ //
+ // At the end of every iteration of the outer loop, set `ToVisit` becomes our
+ // new `unvisited nodes` set.
+ //
+ // The algorithm terminates when the set of unvisited nodes (i.e. our RootSet)
+ // is empty. This algorithm works under the assumption that the graph is
+ // acyclic.
+ do {
+ for (unsigned IID : RootSet) {
+ const DGNode &N = Nodes[IID];
+ for (const DependencyEdge &DepEdge : N.OutgoingEdges) {
+ unsigned ToIID = DepEdge.ToIID;
+ DGNode &To = Nodes[ToIID];
+ uint64_t Cost = N.Cost + DepEdge.Dep.Cost;
+ // Check if this is the most expensive incoming edge seen so far. In
+ // case, update the total cost of the destination node (ToIID), as well
+ // its field `CriticalPredecessor`.
+ if (Cost > To.Cost) {
+ To.CriticalPredecessor = DepEdge;
+ To.Cost = Cost;
+ To.Depth = N.Depth + 1;
+ }
+ To.NumVisitedPredecessors++;
+ if (To.NumVisitedPredecessors == To.NumPredecessors)
+ ToVisit.emplace_back(ToIID);
+ }
+ }
+
+ std::swap(RootSet, ToVisit);
+ ToVisit.clear();
+ } while (!RootSet.empty());
+}
+
+void DependencyGraph::getCriticalSequence(
+ SmallVectorImpl<const DependencyEdge *> &Seq) const {
+ // At this stage, nodes of the graph have been already visited, and costs have
+ // been propagated through the edges (see method `propagateThroughEdges()`).
+
+ // Identify the node N with the highest cost in the graph. By construction,
+ // that node is the last instruction of our critical sequence.
+ // Field N.Depth would tell us the total length of the sequence.
+ //
+ // To obtain the sequence of critical edges, we simply follow the chain of critical
+ // predecessors starting from node N (field DGNode::CriticalPredecessor).
+ const auto It = std::max_element(
+ Nodes.begin(), Nodes.end(),
+ [](const DGNode &Lhs, const DGNode &Rhs) { return Lhs.Cost < Rhs.Cost; });
+ unsigned IID = std::distance(Nodes.begin(), It);
+ Seq.resize(Nodes[IID].Depth);
+ for (unsigned I = Seq.size(), E = 0; I > E; --I) {
+ const DGNode &N = Nodes[IID];
+ Seq[I - 1] = &N.CriticalPredecessor;
+ IID = N.CriticalPredecessor.FromIID;
+ }
+}
+
+static void printInstruction(formatted_raw_ostream &FOS,
+ const MCSubtargetInfo &STI, MCInstPrinter &MCIP,
+ const MCInst &MCI,
+ bool UseDifferentColor = false) {
+ std::string Instruction;
+ raw_string_ostream InstrStream(Instruction);
+
+ FOS.PadToColumn(14);
+
+ MCIP.printInst(&MCI, InstrStream, "", STI);
+ InstrStream.flush();
+
+ if (UseDifferentColor)
+ FOS.changeColor(raw_ostream::CYAN, true, false);
+ FOS << StringRef(Instruction).ltrim();
+ if (UseDifferentColor)
+ FOS.resetColor();
+}
+
+void BottleneckAnalysis::printCriticalSequence(raw_ostream &OS) const {
+ SmallVector<const DependencyEdge *, 16> Seq;
+ DG.getCriticalSequence(Seq);
+ if (Seq.empty())
+ return;
+
+ OS << "\nCritical sequence based on the simulation:\n\n";
+
+ const DependencyEdge &FirstEdge = *Seq[0];
+ unsigned FromIID = FirstEdge.FromIID % Source.size();
+ unsigned ToIID = FirstEdge.ToIID % Source.size();
+ bool IsLoopCarried = FromIID >= ToIID;
+
+ formatted_raw_ostream FOS(OS);
+ FOS.PadToColumn(14);
+ FOS << "Instruction";
+ FOS.PadToColumn(58);
+ FOS << "Dependency Information";
+
+ bool HasColors = FOS.has_colors();
+ unsigned CurrentIID = 0;
+ if (IsLoopCarried) {
+ FOS << "\n +----< " << FromIID << ".";
+ printInstruction(FOS, STI, MCIP, Source[FromIID], HasColors);
+ FOS << "\n |\n | < loop carried > \n |";
+ } else {
+ while (CurrentIID < FromIID) {
+ FOS << "\n " << CurrentIID << ".";
+ printInstruction(FOS, STI, MCIP, Source[CurrentIID]);
+ CurrentIID++;
+ }
+
+ FOS << "\n +----< " << CurrentIID << ".";
+ printInstruction(FOS, STI, MCIP, Source[CurrentIID], HasColors);
+ CurrentIID++;
+ }
+
+ for (const DependencyEdge *&DE : Seq) {
+ ToIID = DE->ToIID % Source.size();
+ unsigned LastIID = CurrentIID > ToIID ? Source.size() : ToIID;
+
+ while (CurrentIID < LastIID) {
+ FOS << "\n | " << CurrentIID << ".";
+ printInstruction(FOS, STI, MCIP, Source[CurrentIID]);
+ CurrentIID++;
+ }
+
+ if (CurrentIID == ToIID) {
+ FOS << "\n +----> " << ToIID << ".";
+ printInstruction(FOS, STI, MCIP, Source[CurrentIID], HasColors);
+ } else {
+ FOS << "\n |\n | < loop carried > \n |"
+ << "\n +----> " << ToIID << ".";
+ printInstruction(FOS, STI, MCIP, Source[ToIID], HasColors);
+ }
+ FOS.PadToColumn(58);
+
+ const DependencyEdge::Dependency &Dep = DE->Dep;
+ if (HasColors)
+ FOS.changeColor(raw_ostream::SAVEDCOLOR, true, false);
+
+ if (Dep.Type == DependencyEdge::DT_REGISTER) {
+ FOS << "## REGISTER dependency: ";
+ if (HasColors)
+ FOS.changeColor(raw_ostream::MAGENTA, true, false);
+ MCIP.printRegName(FOS, Dep.ResourceOrRegID);
+ } else if (Dep.Type == DependencyEdge::DT_MEMORY) {
+ FOS << "## MEMORY dependency.";
+ } else {
+ assert(Dep.Type == DependencyEdge::DT_RESOURCE &&
+ "Unsupported dependency type!");
+ FOS << "## RESOURCE interference: ";
+ if (HasColors)
+ FOS.changeColor(raw_ostream::MAGENTA, true, false);
+ FOS << Tracker.resolveResourceName(Dep.ResourceOrRegID);
+ if (HasColors) {
+ FOS.resetColor();
+ FOS.changeColor(raw_ostream::SAVEDCOLOR, true, false);
+ }
+ FOS << " [ probability: " << ((DE->Frequency * 100) / Iterations)
+ << "% ]";
+ }
+ if (HasColors)
+ FOS.resetColor();
+ ++CurrentIID;
+ }
+
+ while (CurrentIID < Source.size()) {
+ FOS << "\n " << CurrentIID << ".";
+ printInstruction(FOS, STI, MCIP, Source[CurrentIID]);
+ CurrentIID++;
+ }
+
+ FOS << '\n';
+ FOS.flush();
+}
+
+#ifndef NDEBUG
void DependencyGraph::dump(raw_ostream &OS, MCInstPrinter &MCIP) const {
OS << "\nREG DEPS\n";
for (const DGNode &Node : Nodes)
@@ -200,10 +408,11 @@ void DependencyGraph::addDependency(unsigned From, unsigned To,
if (It != Vec.end()) {
It->Dep.Cost += Dep.Cost;
+ It->Frequency++;
return;
}
- DependencyEdge DE = {Dep, From, To};
+ DependencyEdge DE = {Dep, From, To, 1};
Vec.emplace_back(DE);
NodeTo.NumPredecessors++;
}
@@ -211,7 +420,7 @@ void DependencyGraph::addDependency(unsigned From, unsigned To,
BottleneckAnalysis::BottleneckAnalysis(const MCSubtargetInfo &sti,
MCInstPrinter &Printer,
ArrayRef<MCInst> S, unsigned NumIter)
- : STI(sti), Tracker(STI.getSchedModel()), DG(S.size() * 3),
+ : STI(sti), MCIP(Printer), Tracker(STI.getSchedModel()), DG(S.size() * 3),
Source(S), Iterations(NumIter), TotalCycles(0),
PressureIncreasedBecauseOfResources(false),
PressureIncreasedBecauseOfRegisterDependencies(false),
@@ -274,38 +483,38 @@ void BottleneckAnalysis::onEvent(const HWInstructionEvent &Event) {
const Instruction &IS = *Event.IR.getInstruction();
unsigned To = IID % Source.size();
- unsigned Cycles = Tracker.getResourcePressureCycles(IID);
- if (Cycles) {
- uint64_t ResourceMask = IS.getCriticalResourceMask();
- SmallVector<std::pair<unsigned, unsigned>, 4> Users;
- while (ResourceMask) {
- uint64_t Current = ResourceMask & (-ResourceMask);
- Tracker.getResourceUsers(Current, Users);
- for (const std::pair<unsigned, unsigned> &U : Users) {
- unsigned Cost = std::min(U.second, Cycles);
- addResourceDep(U.first % Source.size(), To, Current, Cost);
- }
- Users.clear();
- ResourceMask ^= Current;
- }
+ unsigned Cycles = 2 * Tracker.getResourcePressureCycles(IID);
+ uint64_t ResourceMask = IS.getCriticalResourceMask();
+ SmallVector<std::pair<unsigned, unsigned>, 4> Users;
+ while (ResourceMask) {
+ uint64_t Current = ResourceMask & (-ResourceMask);
+ Tracker.getResourceUsers(Current, Users);
+ for (const std::pair<unsigned, unsigned> &U : Users)
+ addResourceDep(U.first % Source.size(), To, Current, U.second + Cycles);
+ Users.clear();
+ ResourceMask ^= Current;
}
- Cycles = Tracker.getRegisterPressureCycles(IID);
- if (Cycles) {
- const CriticalDependency &RegDep = IS.getCriticalRegDep();
+ const CriticalDependency &RegDep = IS.getCriticalRegDep();
+ if (RegDep.Cycles) {
+ Cycles = RegDep.Cycles + 2 * Tracker.getRegisterPressureCycles(IID);
unsigned From = RegDep.IID % Source.size();
addRegisterDep(From, To, RegDep.RegID, Cycles);
}
- Cycles = Tracker.getMemoryPressureCycles(IID);
- if (Cycles) {
- const CriticalDependency &MemDep = IS.getCriticalMemDep();
+ const CriticalDependency &MemDep = IS.getCriticalMemDep();
+ if (MemDep.Cycles) {
+ Cycles = MemDep.Cycles + 2 * Tracker.getMemoryPressureCycles(IID);
unsigned From = MemDep.IID % Source.size();
addMemoryDep(From, To, Cycles);
}
Tracker.handleInstructionIssuedEvent(
static_cast<const HWInstructionIssuedEvent &>(Event));
+
+ // Check if this is the last simulated instruction.
+ if (IID == ((Iterations * Source.size()) - 1))
+ DG.finalizeGraph();
}
void BottleneckAnalysis::onEvent(const HWPressureEvent &Event) {
@@ -356,7 +565,7 @@ void BottleneckAnalysis::onCycleEnd() {
void BottleneckAnalysis::printBottleneckHints(raw_ostream &OS) const {
if (!SeenStallCycles || !BPI.PressureIncreaseCycles) {
- OS << "\nNo resource or data dependency bottlenecks discovered.\n";
+ OS << "\n\nNo resource or data dependency bottlenecks discovered.\n";
return;
}
@@ -370,7 +579,7 @@ void BottleneckAnalysis::printBottleneckHints(raw_ostream &OS) const {
double MemDepPressurePerCycle =
(double)BPI.MemoryDependencyCycles * 100 / TotalCycles;
- OS << "\nCycles with backend pressure increase [ "
+ OS << "\n\nCycles with backend pressure increase [ "
<< format("%.2f", floor((PressurePerCycle * 100) + 0.5) / 100) << "% ]";
OS << "\nThroughput Bottlenecks: "
@@ -399,7 +608,7 @@ void BottleneckAnalysis::printBottleneckHints(raw_ostream &OS) const {
<< "% ]";
OS << "\n - Memory Dependencies [ "
<< format("%.2f", floor((MemDepPressurePerCycle * 100) + 0.5) / 100)
- << "% ]\n\n";
+ << "% ]\n";
}
void BottleneckAnalysis::printView(raw_ostream &OS) const {
@@ -408,6 +617,7 @@ void BottleneckAnalysis::printView(raw_ostream &OS) const {
printBottleneckHints(TempStream);
TempStream.flush();
OS << Buffer;
+ printCriticalSequence(OS);
}
} // namespace mca.
diff --git a/llvm/tools/llvm-mca/Views/BottleneckAnalysis.h b/llvm/tools/llvm-mca/Views/BottleneckAnalysis.h
index f8302496cef..7564b1a4820 100644
--- a/llvm/tools/llvm-mca/Views/BottleneckAnalysis.h
+++ b/llvm/tools/llvm-mca/Views/BottleneckAnalysis.h
@@ -10,18 +10,71 @@
/// This file implements the bottleneck analysis view.
///
/// This view internally observes backend pressure increase events in order to
-/// identify potential sources of bottlenecks.
+/// identify problematic data dependencies and processor resource interferences.
///
-/// Example of bottleneck analysis report:
+/// Example of bottleneck analysis report for a dot-product on X86 btver2:
///
-/// Cycles with backend pressure increase [ 33.40% ]
-/// Throughput Bottlenecks:
-/// Resource Pressure [ 0.52% ]
-/// - JLAGU [ 0.52% ]
-/// Data Dependencies: [ 32.88% ]
-/// - Register Dependencies [ 32.88% ]
-/// - Memory Dependencies [ 0.00% ]
+/// Cycles with backend pressure increase [ 40.76% ]
+/// Throughput Bottlenecks:
+/// Resource Pressure [ 39.34% ]
+/// - JFPA [ 39.34% ]
+/// - JFPU0 [ 39.34% ]
+/// Data Dependencies: [ 1.42% ]
+/// - Register Dependencies [ 1.42% ]
+/// - Memory Dependencies [ 0.00% ]
///
+/// According to the example, backend pressure increased during the 40.76% of
+/// the simulated cycles. In particular, the major cause of backend pressure
+/// increases was the contention on floating point adder JFPA accessible from
+/// pipeline resource JFPU0.
+///
+/// At the end of each cycle, if pressure on the simulated out-of-order buffers
+/// has increased, a backend pressure event is reported.
+/// In particular, this occurs when there is a delta between the number of uOps
+/// dispatched and the number of uOps issued to the underlying pipelines.
+///
+/// The bottleneck analysis view is also responsible for identifying and printing
+/// the most "critical" sequence of dependent instructions according to the
+/// simulated run.
+///
+/// Below is the critical sequence computed for the dot-product example on
+/// btver2:
+///
+/// Instruction Dependency Information
+/// +----< 2. vhaddps %xmm3, %xmm3, %xmm4
+/// |
+/// | < loop carried >
+/// |
+/// | 0. vmulps %xmm0, %xmm0, %xmm2
+/// +----> 1. vhaddps %xmm2, %xmm2, %xmm3 ## RESOURCE interference: JFPA [ probability: 73% ]
+/// +----> 2. vhaddps %xmm3, %xmm3, %xmm4 ## REGISTER dependency: %xmm3
+/// |
+/// | < loop carried >
+/// |
+/// +----> 1. vhaddps %xmm2, %xmm2, %xmm3 ## RESOURCE interference: JFPA [ probability: 73% ]
+///
+///
+/// The algorithm that computes the critical sequence is very similar to a
+/// critical path analysis.
+///
+/// A dependency graph is used internally to track dependencies between nodes.
+/// Nodes of the graph represent instructions from the input assembly sequence,
+/// and edges of the graph represent data dependencies or processor resource
+/// interferences.
+///
+/// Edges are dynamically 'discovered' by observing instruction state transitions
+/// and backend pressure increase events. Edges are internally ranked based on
+/// their "criticality". A dependency is considered to be critical if it takes a
+/// long time to execute, and if it contributes to backend pressure increases.
+/// Criticality is internally measured in terms of cycles; it is computed for
+/// every edge in the graph as a function of the edge latency and the number of
+/// backend pressure increase cycles contributed by that edge.
+///
+/// At the end of simulation, costs are propagated to nodes through the edges of
+/// the graph, and the most expensive path connecting the root-set (a
+/// set of nodes with no predecessors) to a leaf node is reported as critical
+/// sequence.
+//
//===----------------------------------------------------------------------===//
#ifndef LLVM_TOOLS_LLVM_MCA_BOTTLENECK_ANALYSIS_H
@@ -108,6 +161,13 @@ public:
return Info.ResourcePressureCycles;
}
+ const char *resolveResourceName(uint64_t ResourceMask) const {
+ unsigned Index = getResourceStateIndex(ResourceMask);
+ unsigned ProcResID = ResIdx2ProcResID[Index];
+ const MCProcResourceDesc &PRDesc = *SM.getProcResource(ProcResID);
+ return PRDesc.Name;
+ }
+
void onInstructionDispatched(unsigned IID);
void onInstructionExecuted(unsigned IID);
@@ -115,14 +175,13 @@ public:
void handleInstructionIssuedEvent(const HWInstructionIssuedEvent &Event);
};
-// An edge of a dependency graph.
-// Vertices of the graph are instructions identified by their ID.
+// A dependency edge.
struct DependencyEdge {
enum DependencyType { DT_INVALID, DT_REGISTER, DT_MEMORY, DT_RESOURCE };
// Dependency edge descriptor.
//
- // It describe the dependency reason, as well as the edge cost in cycles.
+ // It specifies the dependency type, as well as the edge cost in cycles.
struct Dependency {
DependencyType Type;
uint64_t ResourceOrRegID;
@@ -130,14 +189,43 @@ struct DependencyEdge {
};
Dependency Dep;
- // Pair of vertices connected by this edge.
unsigned FromIID;
unsigned ToIID;
+
+ // Used by the bottleneck analysis to compute the interference
+ // probability for processor resources.
+ unsigned Frequency;
};
+// A dependency graph used by the bottleneck analysis to describe data
+// dependencies and processor resource interferences between instructions.
+//
+// There is a node (an instance of struct DGNode) for every instruction in the
+// input assembly sequence. Edges of the graph represent dependencies between
+// instructions.
+//
+// Each edge of the graph is associated with a cost value which is used
+// internally to rank dependency based on their impact on the runtime
+// performance (see field DependencyEdge::Dependency::Cost). In general, the
+// higher the cost of an edge, the higher the impact on performance.
+//
+// The cost of a dependency is a function of both the latency and the number of
+// cycles where the dependency has been seen as critical (i.e. contributing to
+// back-pressure increases).
+//
+// Loop carried dependencies are carefully expanded by the bottleneck analysis
+// to guarantee that the graph stays acyclic. To this end, extra nodes are
+// pre-allocated at construction time to describe instructions from "past and
+// future" iterations. The graph is kept acyclic mainly because it simplifies the
+// complexity of the algorithm that computes the critical sequence.
class DependencyGraph {
struct DGNode {
unsigned NumPredecessors;
+ unsigned NumVisitedPredecessors;
+ uint64_t Cost;
+ unsigned Depth;
+
+ DependencyEdge CriticalPredecessor;
SmallVector<DependencyEdge, 8> OutgoingEdges;
};
SmallVector<DGNode, 16> Nodes;
@@ -148,6 +236,9 @@ class DependencyGraph {
void addDependency(unsigned From, unsigned To,
DependencyEdge::Dependency &&DE);
+ void initializeRootSet(SmallVectorImpl<unsigned> &RootSet) const;
+ void propagateThroughEdges(SmallVectorImpl<unsigned> &RootSet);
+
#ifndef NDEBUG
void dumpDependencyEdge(raw_ostream &OS, const DependencyEdge &DE,
MCInstPrinter &MCIP) const;
@@ -170,6 +261,19 @@ public:
addDependency(From, To, {DependencyEdge::DT_RESOURCE, Mask, Cost});
}
+ // Called by the bottleneck analysis at the end of simulation to propagate
+ // costs through the edges of the graph, and compute a critical path.
+ void finalizeGraph() {
+ SmallVector<unsigned, 16> RootSet;
+ initializeRootSet(RootSet);
+ propagateThroughEdges(RootSet);
+ }
+
+ // Returns a sequence of edges representing the critical sequence based on the
+ // simulated run. It assumes that the graph has already been finalized (i.e.
+ // method `finalizeGraph()` has already been called on this graph).
+ void getCriticalSequence(SmallVectorImpl<const DependencyEdge *> &Seq) const;
+
#ifndef NDEBUG
void dump(raw_ostream &OS, MCInstPrinter &MCIP) const;
#endif
@@ -178,6 +282,7 @@ public:
/// A view that collects and prints a few performance numbers.
class BottleneckAnalysis : public View {
const MCSubtargetInfo &STI;
+ MCInstPrinter &MCIP;
PressureTracker Tracker;
DependencyGraph DG;
@@ -212,6 +317,7 @@ class BottleneckAnalysis : public View {
// Prints a bottleneck message to OS.
void printBottleneckHints(raw_ostream &OS) const;
+ void printCriticalSequence(raw_ostream &OS) const;
public:
BottleneckAnalysis(const MCSubtargetInfo &STI, MCInstPrinter &MCIP,
OpenPOWER on IntegriCloud