3 files changed, 99 insertions, 14 deletions
diff --git a/llvm/tools/llvm-mca/SummaryView.cpp b/llvm/tools/llvm-mca/SummaryView.cpp
index 511727bc750..9b6e1d9b183 100644
--- a/llvm/tools/llvm-mca/SummaryView.cpp
+++ b/llvm/tools/llvm-mca/SummaryView.cpp
@@ -14,6 +14,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "SummaryView.h"
+#include "Support.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/Support/Format.h"
 
 namespace mca {
@@ -22,19 +24,83 @@ namespace mca {
 
 using namespace llvm;
 
+void SummaryView::onInstructionEvent(const HWInstructionEvent &Event) {
+  // We are only interested in the "instruction dispatched" events generated by
+  // the dispatch stage for instructions that are part of iteration #0.
+  if (Event.Type != HWInstructionEvent::Dispatched)
+    return;
+
+  if (Event.IR.getSourceIndex() >= Source.size())
+    return;
+
+  // Update the cumulative number of resource cycles based on the processor
+  // resource usage information available from the instruction descriptor.
+  // We need to compute the cumulative number of resource cycles for every
+  // processor resource which is consumed by an instruction of the block.
+  const Instruction &Inst = *Event.IR.getInstruction();
+  const InstrDesc &Desc = Inst.getDesc();
+  NumMicroOps += Desc.NumMicroOps;
+  for (const std::pair<uint64_t, const ResourceUsage> &RU : Desc.Resources) {
+    if (!RU.second.size())
+      continue;
+
+    assert(RU.second.NumUnits && "Expected at least one unit used!");
+
+    // DenseMap::operator[] value-initializes the counter to zero the first
+    // time a resource mask is seen, so a single lookup covers both the first
+    // use of a resource and every later accumulation. The map is keyed by
+    // the resource mask (RU.first).
+    ProcResourceUsage[RU.first] += RU.second.size();
+  }
+}
+
+double SummaryView::getBlockRThroughput() const {
+  assert(NumMicroOps && "Expected at least one micro opcode!");
+
+  // One mask slot per processor resource kind; the masks are the keys used
+  // to query ProcResourceUsage below.
+  const unsigned NumResourceKinds = SM.getNumProcResourceKinds();
+  SmallVector<uint64_t, 8> Masks(NumResourceKinds);
+  computeProcResourceMasks(SM, Masks);
+
+  // The dispatch stage bounds the result from below: DispatchWidth is an upper
+  // bound on the number of micro opcodes in a single dispatch group.
+  double RThroughput = static_cast<double>(NumMicroOps) / DispatchWidth;
+
+  // Each consumed resource adds a bound of (resource cycles / #units); the
+  // result is the largest bound found.
+  for (unsigned Idx = 0; Idx != NumResourceKinds; ++Idx) {
+    const auto UsageIt = ProcResourceUsage.find_as(Masks[Idx]);
+    if (UsageIt == ProcResourceUsage.end())
+      continue; // This resource is not consumed by the block.
+
+    const unsigned NumUnits = SM.getProcResource(Idx)->NumUnits;
+    const double Cycles = static_cast<double>(UsageIt->second);
+    RThroughput = std::max(RThroughput, Cycles / NumUnits);
+  }
+
+  // The block reciprocal throughput is the MAX of:
+  //  -  (#uOps / DispatchWidth)
+  //  -  (resource cycles / #units) for every consumed processor resource.
+  return RThroughput;
+}
+
 void SummaryView::printView(raw_ostream &OS) const {
   unsigned Iterations = Source.getNumIterations();
   unsigned Instructions = Source.size();
   unsigned TotalInstructions = Instructions * Iterations;
   double IPC = (double)TotalInstructions / TotalCycles;
+  const double RThroughput = getBlockRThroughput();
 
   std::string Buffer;
   raw_string_ostream TempStream(Buffer);
-  TempStream << "Iterations:     " << Iterations;
-  TempStream << "\nInstructions:   " << TotalInstructions;
-  TempStream << "\nTotal Cycles:   " << TotalCycles;
-  TempStream << "\nDispatch Width: " << DispatchWidth;
-  TempStream << "\nIPC:            " << format("%.2f", IPC) << '\n';
+  TempStream << "Iterations:        " << Iterations
+             << "\nInstructions:      " << TotalInstructions
+             << "\nTotal Cycles:      " << TotalCycles
+             << "\nDispatch Width:    " << DispatchWidth
+             << "\nIPC:               " << format("%.2f", IPC)
+             << "\nBlock RThroughput: " << format("%.1f", RThroughput)
+             << '\n';
   TempStream.flush();
   OS << Buffer;
 }
diff --git a/llvm/tools/llvm-mca/SummaryView.h b/llvm/tools/llvm-mca/SummaryView.h
index 0484057fb10..fe8a5e20f9d 100644
--- a/llvm/tools/llvm-mca/SummaryView.h
+++ b/llvm/tools/llvm-mca/SummaryView.h
@@ -14,12 +14,12 @@
 /// performance throughput. Below is an example of summary view:
 ///
 ///
-/// Iterations:     300
-/// Instructions:   900
-/// Total Cycles:   610
-/// Dispatch Width: 2
-/// IPC:            1.48
-///
+/// Iterations:        300
+/// Instructions:      900
+/// Total Cycles:      610
+/// Dispatch Width:    2
+/// IPC:               1.48
+/// Block RThroughput: 2.0
 ///
 /// The summary view collects a few performance numbers. The two main
 /// performance indicators are 'Total Cycles' and IPC (Instructions Per Cycle).
@@ -31,22 +31,41 @@
 
 #include "SourceMgr.h"
 #include "View.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/MC/MCSchedule.h"
 #include "llvm/Support/raw_ostream.h"
 
 namespace mca {
 
 /// A view that collects and prints a few performance numbers.
 class SummaryView : public View {
+  const llvm::MCSchedModel &SM;
   const SourceMgr &Source;
   const unsigned DispatchWidth;
   unsigned TotalCycles;
+  // Total number of micro opcodes in one iteration of the instruction block.
+  unsigned NumMicroOps;
+  // Cumulative number of resource cycles consumed by the instruction block,
+  // per processor resource. Keyed by resource mask; updated on every
+  // "instruction dispatched" event by onInstructionEvent().
+  llvm::DenseMap<uint64_t, unsigned> ProcResourceUsage;
+
+  // Compute the reciprocal throughput for the analyzed code block.
+  // The block reciprocal throughput is computed as the MAX of:
+  //   - NumMicroOps / DispatchWidth
+  //   - Total Resource Cycles / #Units   (for every resource consumed).
+  double getBlockRThroughput() const;
 
 public:
-  SummaryView(const SourceMgr &S, unsigned Width)
-      : Source(S), DispatchWidth(Width), TotalCycles(0) {}
+  SummaryView(const llvm::MCSchedModel &Model, const SourceMgr &S,
+              unsigned Width)
+      : SM(Model), Source(S), DispatchWidth(Width), TotalCycles(0),
+        NumMicroOps(0) {}
 
   void onCycleEnd() override { ++TotalCycles; }
 
+  void onInstructionEvent(const HWInstructionEvent &Event) override;
+
   void printView(llvm::raw_ostream &OS) const override;
 };
 } // namespace mca
diff --git a/llvm/tools/llvm-mca/llvm-mca.cpp b/llvm/tools/llvm-mca/llvm-mca.cpp
index 1e93a7fdcdb..925584b0193 100644
--- a/llvm/tools/llvm-mca/llvm-mca.cpp
+++ b/llvm/tools/llvm-mca/llvm-mca.cpp
@@ -495,7 +495,7 @@ int main(int argc, char **argv) {
                    LoadQueueSize, StoreQueueSize, AssumeNoAlias);
     mca::BackendPrinter Printer(B);
 
-    Printer.addView(llvm::make_unique<mca::SummaryView>(S, Width));
+    Printer.addView(llvm::make_unique<mca::SummaryView>(SM, S, Width));
     if (PrintInstructionInfoView)
       Printer.addView(
           llvm::make_unique<mca::InstructionInfoView>(*STI, *MCII, S, *IP));