diff options
Diffstat (limited to 'llvm/tools/llvm-mca/Views/BottleneckAnalysis.h')
| -rw-r--r-- | llvm/tools/llvm-mca/Views/BottleneckAnalysis.h | 132 |
1 files changed, 119 insertions, 13 deletions
diff --git a/llvm/tools/llvm-mca/Views/BottleneckAnalysis.h b/llvm/tools/llvm-mca/Views/BottleneckAnalysis.h index f8302496cef..7564b1a4820 100644 --- a/llvm/tools/llvm-mca/Views/BottleneckAnalysis.h +++ b/llvm/tools/llvm-mca/Views/BottleneckAnalysis.h @@ -10,18 +10,71 @@ /// This file implements the bottleneck analysis view. /// /// This view internally observes backend pressure increase events in order to -/// identify potential sources of bottlenecks. +/// identify problematic data dependencies and processor resource interferences. /// -/// Example of bottleneck analysis report: +/// Example of bottleneck analysis report for a dot-product on X86 btver2: /// -/// Cycles with backend pressure increase [ 33.40% ] -/// Throughput Bottlenecks: -/// Resource Pressure [ 0.52% ] -/// - JLAGU [ 0.52% ] -/// Data Dependencies: [ 32.88% ] -/// - Register Dependencies [ 32.88% ] -/// - Memory Dependencies [ 0.00% ] +/// Cycles with backend pressure increase [ 40.76% ] +/// Throughput Bottlenecks: +/// Resource Pressure [ 39.34% ] +/// - JFPA [ 39.34% ] +/// - JFPU0 [ 39.34% ] +/// Data Dependencies: [ 1.42% ] +/// - Register Dependencies [ 1.42% ] +/// - Memory Dependencies [ 0.00% ] /// +/// According to the example, backend pressure increased during the 40.76% of +/// the simulated cycles. In particular, the major cause of backend pressure +/// increases was the contention on floating point adder JFPA accessible from +/// pipeline resource JFPU0. +/// +/// At the end of each cycle, if pressure on the simulated out-of-order buffers +/// has increased, a backend pressure event is reported. +/// In particular, this occurs when there is a delta between the number of uOps +/// dispatched and the number of uOps issued to the underlying pipelines. +/// +/// The bottleneck analysis view is also responsible for identifying and printing +/// the most "critical" sequence of dependent instructions according to the +/// simulated run. +/// +/// Below is the critical sequence computed for the dot-product example on +/// btver2: +/// +/// Instruction Dependency Information +/// +----< 2. vhaddps %xmm3, %xmm3, %xmm4 +/// | +/// | < loop carried > +/// | +/// | 0. vmulps %xmm0, %xmm0, %xmm2 +/// +----> 1. vhaddps %xmm2, %xmm2, %xmm3 ## RESOURCE interference: JFPA [ probability: 73% ] +/// +----> 2. vhaddps %xmm3, %xmm3, %xmm4 ## REGISTER dependency: %xmm3 +/// | +/// | < loop carried > +/// | +/// +----> 1. vhaddps %xmm2, %xmm2, %xmm3 ## RESOURCE interference: JFPA [ probability: 73% ] +/// +/// +/// The algorithm that computes the critical sequence is very similar to a +/// critical path analysis. +/// +/// A dependency graph is used internally to track dependencies between nodes. +/// Nodes of the graph represent instructions from the input assembly sequence, +/// and edges of the graph represent data dependencies or processor resource +/// interferences. +/// +/// Edges are dynamically 'discovered' by observing instruction state transitions +/// and backend pressure increase events. Edges are internally ranked based on +/// their "criticality". A dependency is considered to be critical if it takes a +/// long time to execute, and if it contributes to backend pressure increases. +/// Criticality is internally measured in terms of cycles; it is computed for +/// every edge in the graph as a function of the edge latency and the number of +/// backend pressure increase cycles contributed by that edge. +/// +/// At the end of simulation, costs are propagated to nodes through the edges of +/// the graph, and the most expensive path connecting the root-set (a +/// set of nodes with no predecessors) to a leaf node is reported as critical +/// sequence. +// //===----------------------------------------------------------------------===// #ifndef LLVM_TOOLS_LLVM_MCA_BOTTLENECK_ANALYSIS_H @@ -108,6 +161,13 @@ public: return Info.ResourcePressureCycles; } + const char *resolveResourceName(uint64_t ResourceMask) const { + unsigned Index = getResourceStateIndex(ResourceMask); + unsigned ProcResID = ResIdx2ProcResID[Index]; + const MCProcResourceDesc &PRDesc = *SM.getProcResource(ProcResID); + return PRDesc.Name; + } + void onInstructionDispatched(unsigned IID); void onInstructionExecuted(unsigned IID); @@ -115,14 +175,13 @@ public: void handleInstructionIssuedEvent(const HWInstructionIssuedEvent &Event); }; -// An edge of a dependency graph. -// Vertices of the graph are instructions identified by their ID. +// A dependency edge. struct DependencyEdge { enum DependencyType { DT_INVALID, DT_REGISTER, DT_MEMORY, DT_RESOURCE }; // Dependency edge descriptor. // - // It describe the dependency reason, as well as the edge cost in cycles. + // It specifies the dependency type, as well as the edge cost in cycles. struct Dependency { DependencyType Type; uint64_t ResourceOrRegID; @@ -130,14 +189,43 @@ struct DependencyEdge { }; Dependency Dep; - // Pair of vertices connected by this edge. unsigned FromIID; unsigned ToIID; + + // Used by the bottleneck analysis to compute the interference + // probability for processor resources. + unsigned Frequency; }; +// A dependency graph used by the bottleneck analysis to describe data +// dependencies and processor resource interferences between instructions. +// +// There is a node (an instance of struct DGNode) for every instruction in the +// input assembly sequence. Edges of the graph represent dependencies between +// instructions. +// +// Each edge of the graph is associated with a cost value which is used +// internally to rank dependency based on their impact on the runtime +// performance (see field DependencyEdge::Dependency::Cost). In general, the +// higher the cost of an edge, the higher the impact on performance. +// +// The cost of a dependency is a function of both the latency and the number of +// cycles where the dependency has been seen as critical (i.e. contributing to +// back-pressure increases). +// +// Loop carried dependencies are carefully expanded by the bottleneck analysis +// to guarantee that the graph stays acyclic. To this end, extra nodes are +// pre-allocated at construction time to describe instructions from "past and +// future" iterations. The graph is kept acyclic mainly because it simplifies the +// complexity of the algorithm that computes the critical sequence. class DependencyGraph { struct DGNode { unsigned NumPredecessors; + unsigned NumVisitedPredecessors; + uint64_t Cost; + unsigned Depth; + + DependencyEdge CriticalPredecessor; SmallVector<DependencyEdge, 8> OutgoingEdges; }; SmallVector<DGNode, 16> Nodes; @@ -148,6 +236,9 @@ class DependencyGraph { void addDependency(unsigned From, unsigned To, DependencyEdge::Dependency &&DE); + void initializeRootSet(SmallVectorImpl<unsigned> &RootSet) const; + void propagateThroughEdges(SmallVectorImpl<unsigned> &RootSet); + #ifndef NDEBUG void dumpDependencyEdge(raw_ostream &OS, const DependencyEdge &DE, MCInstPrinter &MCIP) const; @@ -170,6 +261,19 @@ public: addDependency(From, To, {DependencyEdge::DT_RESOURCE, Mask, Cost}); } + // Called by the bottleneck analysis at the end of simulation to propagate + // costs through the edges of the graph, and compute a critical path. + void finalizeGraph() { + SmallVector<unsigned, 16> RootSet; + initializeRootSet(RootSet); + propagateThroughEdges(RootSet); + } + + // Returns a sequence of edges representing the critical sequence based on the + // simulated run. It assumes that the graph has already been finalized (i.e. + // method `finalizeGraph()` has already been called on this graph). + void getCriticalSequence(SmallVectorImpl<const DependencyEdge *> &Seq) const; + #ifndef NDEBUG void dump(raw_ostream &OS, MCInstPrinter &MCIP) const; #endif @@ -178,6 +282,7 @@ public: /// A view that collects and prints a few performance numbers. class BottleneckAnalysis : public View { const MCSubtargetInfo &STI; + MCInstPrinter &MCIP; PressureTracker Tracker; DependencyGraph DG; @@ -212,6 +317,7 @@ class BottleneckAnalysis : public View { // Prints a bottleneck message to OS. void printBottleneckHints(raw_ostream &OS) const; + void printCriticalSequence(raw_ostream &OS) const; public: BottleneckAnalysis(const MCSubtargetInfo &STI, MCInstPrinter &MCIP, |

