path: root/llvm/tools/llvm-mca/SummaryView.cpp
//===--------------------- SummaryView.cpp -------------------*- C++ -*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
/// \file
///
/// This file implements the functionality used by the SummaryView to print
/// the report information.
///
//===----------------------------------------------------------------------===//

#include "SummaryView.h"
#include "Support.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/Format.h"

namespace mca {

#define DEBUG_TYPE "llvm-mca"

using namespace llvm;

void SummaryView::onInstructionEvent(const HWInstructionEvent &Event) {
  // We are only interested in the "instruction dispatched" events generated by
  // the dispatch stage for instructions that are part of iteration #0.
  if (Event.Type != HWInstructionEvent::Dispatched)
    return;

  if (Event.IR.getSourceIndex() >= Source.size())
    return;

  // Update the cumulative number of resource cycles based on the processor
  // resource usage information available from the instruction descriptor. We
  // need to compute the cumulative number of resource cycles for every
  // processor resource which is consumed by an instruction of the block.
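  //
  // For example (hypothetical masks and cycle counts): an instruction that
  // consumes 3 cycles of the resource identified by mask M0, and 1 cycle of
  // the resource identified by mask M1, contributes:
  //
  //   ProcResourceUsage[M0] += 3;   // resource cycles, keyed by resource mask
  //   ProcResourceUsage[M1] += 1;
  //
  // After all the instructions of iteration #0 have been dispatched, the map
  // holds the cumulative number of resource cycles consumed by one iteration
  // of the block.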
  const Instruction &Inst = *Event.IR.getInstruction();
  const InstrDesc &Desc = Inst.getDesc();
  NumMicroOps += Desc.NumMicroOps;
  for (const std::pair<uint64_t, const ResourceUsage> &RU : Desc.Resources) {
    if (!RU.second.size())
      continue;

    assert(RU.second.NumUnits && "Expected at least one unit used!");
    ProcResourceUsage[RU.first] += RU.second.size();
  }
}

double SummaryView::getBlockRThroughput() const {
  assert(NumMicroOps && "Expected at least one micro opcode!");

  SmallVector<uint64_t, 8> Masks(SM.getNumProcResourceKinds());
  computeProcResourceMasks(SM, Masks);

  // The block throughput is bounded from above by the hardware dispatch
  // throughput. That is because the DispatchWidth is an upper bound on the
  // number of opcodes that can be part of a single dispatch group.
  double Max = static_cast<double>(NumMicroOps) / DispatchWidth;

  // The block throughput is also limited by the amount of hardware parallelism.
  // The number of available resource units affects the resource pressure
  // distribution, as well as how many blocks can be executed every cycle.
  for (unsigned I = 0, E = SM.getNumProcResourceKinds(); I < E; ++I) {
    uint64_t Mask = Masks[I];
    const auto It = ProcResourceUsage.find_as(Mask);
    if (It != ProcResourceUsage.end()) {
      const MCProcResourceDesc &MCDesc = *SM.getProcResource(I);
      unsigned NumUnits = MCDesc.NumUnits;
      double Throughput = static_cast<double>(It->second) / NumUnits;
      Max = std::max(Max, Throughput);
    }
  }

  // The block reciprocal throughput is computed as the MAX of:
  //  -  (#uOps / DispatchWidth)
  //  -  (resource cycles / #units) for every consumed processor resource.
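  //
  // Worked example (hypothetical numbers, not computed by this code): for a
  // block of 6 uOps, a DispatchWidth of 4, 8 cycles consumed on a resource
  // with 2 units, and 3 cycles consumed on a resource with a single unit:
  //
  //   dispatch bound   = 6 / 4 = 1.50
  //   resource bound A = 8 / 2 = 4.00
  //   resource bound B = 3 / 1 = 3.00
  //
  //   Block RThroughput = max(1.50, 4.00, 3.00) = 4.00 cycles per iteration.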
  return Max;
}

void SummaryView::printView(raw_ostream &OS) const {
  unsigned Iterations = Source.getNumIterations();
  unsigned Instructions = Source.size();
  unsigned TotalInstructions = Instructions * Iterations;
  double IPC = (double)TotalInstructions / TotalCycles;
  double BlockRThroughput = getBlockRThroughput();

  std::string Buffer;
  raw_string_ostream TempStream(Buffer);
  TempStream << "Iterations:        " << Iterations;
  TempStream << "\nInstructions:      " << TotalInstructions;
  TempStream << "\nTotal Cycles:      " << TotalCycles;
  TempStream << "\nDispatch Width:    " << DispatchWidth;
  TempStream << "\nIPC:               " << format("%.2f", IPC);
  TempStream << "\nBlock RThroughput: " << format("%.1f", BlockRThroughput)
             << '\n';
  TempStream.flush();
  OS << Buffer;
}
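
// An example of the report produced by printView (illustrative values only;
// the actual numbers depend on the input block and on the simulated target):
//
//   Iterations:        100
//   Instructions:      300
//   Total Cycles:      160
//   Dispatch Width:    2
//   IPC:               1.88
//   Block RThroughput: 1.5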
} // namespace mca.