[llvm-mca] Print the "Block RThroughput" in the SummaryView.

This patch implements the "block reciprocal throughput" computation in the SummaryView. The block reciprocal throughput is computed as the MAX of: (a) NumMicroOps / DispatchWidth, and (b) Resource Cycles / #Units (for every resource consumed). The block throughput is bounded from above by the hardware dispatch throughput. That is because the DispatchWidth is an upper bound on how many opcodes can be part of a single dispatch group. The block throughput is also limited by the amount of hardware parallelism. The number of available resource units affects how the resource pressure is distributed, and also how many blocks can be delivered every cycle. llvm-svn: 333095
author: Andrea Di Biagio <Andrea_DiBiagio@sn.scee.net> 2018-05-23 15:59:27 +0000
committer: Andrea Di Biagio <Andrea_DiBiagio@sn.scee.net> 2018-05-23 15:59:27 +0000
commit: 3fc20c9c7f013cd7ad274a8c4d7f99fde98c65a7 (patch)
tree: 41ae64000d1d49ec080e8ecbf4dd14e74ac8071e /llvm/tools
parent: feb3146d4b3e7306be6611532f1219859bbef54e (diff)
download: bcm5719-llvm-3fc20c9c7f013cd7ad274a8c4d7f99fde98c65a7.tar.gz
bcm5719-llvm-3fc20c9c7f013cd7ad274a8c4d7f99fde98c65a7.zip
3 files changed, 99 insertions, 14 deletions
diff --git a/llvm/tools/llvm-mca/SummaryView.cpp b/llvm/tools/llvm-mca/SummaryView.cpp
index 511727bc750..9b6e1d9b183 100644
--- a/llvm/tools/llvm-mca/SummaryView.cpp
+++ b/llvm/tools/llvm-mca/SummaryView.cpp
@@ -14,6 +14,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "SummaryView.h"
+#include "Support.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/Support/Format.h"
 
 namespace mca {
@@ -22,19 +24,83 @@ namespace mca {
 
 using namespace llvm;
 
+void SummaryView::onInstructionEvent(const HWInstructionEvent &Event) {
+  // We are only interested in the "instruction dispatched" events generated by
+  // the dispatch stage for instructions that are part of iteration #0.
+  if (Event.Type != HWInstructionEvent::Dispatched)
+    return;
+
+  if (Event.IR.getSourceIndex() >= Source.size())
+    return;
+
+  // Update the cumulative number of resource cycles based on the processor
+  // resource usage information available from the instruction descriptor. We
+  // need to compute the cumulative number of resource cycles for every
+  // processor resource which is consumed by an instruction of the block.
+  const Instruction &Inst = *Event.IR.getInstruction();
+  const InstrDesc &Desc = Inst.getDesc();
+  NumMicroOps += Desc.NumMicroOps;
+  for (const std::pair<uint64_t, const ResourceUsage> &RU : Desc.Resources) {
+    if (!RU.second.size())
+      continue;
+
+    assert(RU.second.NumUnits && "Expected at least one unit used!");
+
+    // Accumulate the cycles under the resource mask. DenseMap::operator[]
+    // value-initializes the counter to zero the first time a mask is seen,
+    // so a single statement covers both the "first use" and the "already
+    // tracked" cases without a separate find().
+    ProcResourceUsage[RU.first] += RU.second.size();
+  }
+}
+
+double SummaryView::getBlockRThroughput() const {
+  assert(NumMicroOps && "Expected at least one micro opcode!");
+
+  SmallVector<uint64_t, 8> Masks(SM.getNumProcResourceKinds());
+  computeProcResourceMasks(SM, Masks);
+
+  // The block throughput is bounded from above by the hardware dispatch
+  // throughput. That is because the DispatchWidth is an upper bound on the
+  // number of opcodes that can be part of a single dispatch group.
+  double Max = static_cast<double>(NumMicroOps) / DispatchWidth;
+
+  // The block throughput is also limited by the amount of hardware parallelism.
+  // The number of available resource units affects how the resource pressure
+  // is distributed, as well as how many blocks can be executed every cycle.
+  for (unsigned I = 0, E = SM.getNumProcResourceKinds(); I < E; ++I) {
+    uint64_t Mask = Masks[I];
+    const auto It = ProcResourceUsage.find_as(Mask);
+    if (It != ProcResourceUsage.end()) {
+      const MCProcResourceDesc &MCDesc = *SM.getProcResource(I);
+      unsigned NumUnits = MCDesc.NumUnits;
+      double Throughput = static_cast<double>(It->second) / NumUnits;
+      Max = std::max(Max, Throughput);
+    }
+  }
+
+  // The block reciprocal throughput is computed as the MAX of:
+  //  -  (#uOps / DispatchWidth)
+  //  -  (resource cycles / #units) for every consumed processor resource.
+  return Max;
+}
+
 void SummaryView::printView(raw_ostream &OS) const {
   unsigned Iterations = Source.getNumIterations();
   unsigned Instructions = Source.size();
   unsigned TotalInstructions = Instructions * Iterations;
   double IPC = (double)TotalInstructions / TotalCycles;
+  double BlockRThroughput = getBlockRThroughput();
 
   std::string Buffer;
   raw_string_ostream TempStream(Buffer);
-  TempStream << "Iterations:     " << Iterations;
-  TempStream << "\nInstructions:   " << TotalInstructions;
-  TempStream << "\nTotal Cycles:   " << TotalCycles;
-  TempStream << "\nDispatch Width: " << DispatchWidth;
-  TempStream << "\nIPC:            " << format("%.2f", IPC) << '\n';
+  TempStream << "Iterations:        " << Iterations;
+  TempStream << "\nInstructions:      " << TotalInstructions;
+  TempStream << "\nTotal Cycles:      " << TotalCycles;
+  TempStream << "\nDispatch Width:    " << DispatchWidth;
+  TempStream << "\nIPC:               " << format("%.2f", IPC);
+  TempStream << "\nBlock RThroughput: " << format("%.1f", BlockRThroughput)
+             << '\n';
   TempStream.flush();
   OS << Buffer;
 }
diff --git a/llvm/tools/llvm-mca/SummaryView.h b/llvm/tools/llvm-mca/SummaryView.h
index 0484057fb10..fe8a5e20f9d 100644
--- a/llvm/tools/llvm-mca/SummaryView.h
+++ b/llvm/tools/llvm-mca/SummaryView.h
@@ -14,12 +14,12 @@
 /// performance throughput. Below is an example of summary view:
 ///
 ///
-/// Iterations:     300
-/// Instructions:   900
-/// Total Cycles:   610
-/// Dispatch Width: 2
-/// IPC:            1.48
-///
+/// Iterations:        300
+/// Instructions:      900
+/// Total Cycles:      610
+/// Dispatch Width:    2
+/// IPC:               1.48
+/// Block RThroughput: 2.0
 ///
 /// The summary view collects a few performance numbers. The two main
 /// performance indicators are 'Total Cycles' and IPC (Instructions Per Cycle).
@@ -31,22 +31,41 @@
 
 #include "SourceMgr.h"
 #include "View.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/MC/MCSchedule.h"
 #include "llvm/Support/raw_ostream.h"
 
 namespace mca {
 
 /// A view that collects and prints a few performance numbers.
 class SummaryView : public View {
+  const llvm::MCSchedModel &SM;
   const SourceMgr &Source;
   const unsigned DispatchWidth;
   unsigned TotalCycles;
+  // The total number of micro opcodes contributed by a block of instructions.
+  unsigned NumMicroOps;
+  // For each processor resource, this map stores the cumulative number of
+  // resource cycles consumed by a block of instructions. The resource mask ID
+  // is used as the key value to access elements of this map.
+  llvm::DenseMap<uint64_t, unsigned> ProcResourceUsage;
+
+  // Compute the block reciprocal throughput for the analyzed code block.
+  // The block reciprocal throughput is computed as the MAX of:
+  //   - NumMicroOps / DispatchWidth
+  //   - Total Resource Cycles / #Units   (for every resource consumed).
+  double getBlockRThroughput() const;
 
 public:
-  SummaryView(const SourceMgr &S, unsigned Width)
-      : Source(S), DispatchWidth(Width), TotalCycles(0) {}
+  SummaryView(const llvm::MCSchedModel &Model, const SourceMgr &S,
+              unsigned Width)
+      : SM(Model), Source(S), DispatchWidth(Width), TotalCycles(0),
+        NumMicroOps(0) {}
 
   void onCycleEnd() override { ++TotalCycles; }
 
+  void onInstructionEvent(const HWInstructionEvent &Event) override;
+
   void printView(llvm::raw_ostream &OS) const override;
 };
 } // namespace mca
diff --git a/llvm/tools/llvm-mca/llvm-mca.cpp b/llvm/tools/llvm-mca/llvm-mca.cpp
index 1e93a7fdcdb..925584b0193 100644
--- a/llvm/tools/llvm-mca/llvm-mca.cpp
+++ b/llvm/tools/llvm-mca/llvm-mca.cpp
@@ -495,7 +495,7 @@ int main(int argc, char **argv) {
                    LoadQueueSize, StoreQueueSize, AssumeNoAlias);
     mca::BackendPrinter Printer(B);
 
-    Printer.addView(llvm::make_unique<mca::SummaryView>(S, Width));
+    Printer.addView(llvm::make_unique<mca::SummaryView>(SM, S, Width));
     if (PrintInstructionInfoView)
       Printer.addView(
           llvm::make_unique<mca::InstructionInfoView>(*STI, *MCII, S, *IP));
author	Andrea Di Biagio <Andrea_DiBiagio@sn.scee.net>	2018-05-23 15:59:27 +0000
committer	Andrea Di Biagio <Andrea_DiBiagio@sn.scee.net>	2018-05-23 15:59:27 +0000
commit	3fc20c9c7f013cd7ad274a8c4d7f99fde98c65a7 (patch)
tree	41ae64000d1d49ec080e8ecbf4dd14e74ac8071e /llvm/tools
parent	feb3146d4b3e7306be6611532f1219859bbef54e (diff)
download	bcm5719-llvm-3fc20c9c7f013cd7ad274a8c4d7f99fde98c65a7.tar.gz bcm5719-llvm-3fc20c9c7f013cd7ad274a8c4d7f99fde98c65a7.zip