diff options
Diffstat (limited to 'llvm/tools')
| -rw-r--r-- | llvm/tools/llvm-mca/SummaryView.cpp | 76 | ||||
| -rw-r--r-- | llvm/tools/llvm-mca/SummaryView.h | 35 | ||||
| -rw-r--r-- | llvm/tools/llvm-mca/llvm-mca.cpp | 2 |
3 files changed, 99 insertions, 14 deletions
diff --git a/llvm/tools/llvm-mca/SummaryView.cpp b/llvm/tools/llvm-mca/SummaryView.cpp index 511727bc750..9b6e1d9b183 100644 --- a/llvm/tools/llvm-mca/SummaryView.cpp +++ b/llvm/tools/llvm-mca/SummaryView.cpp @@ -14,6 +14,8 @@ //===----------------------------------------------------------------------===// #include "SummaryView.h" +#include "Support.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/Support/Format.h" namespace mca { @@ -22,19 +24,83 @@ namespace mca { using namespace llvm; +void SummaryView::onInstructionEvent(const HWInstructionEvent &Event) { + // We are only interested in the "instruction dispatched" events generated by + // the dispatch stage for instructions that are part of iteration #0. + if (Event.Type != HWInstructionEvent::Dispatched) + return; + + if (Event.IR.getSourceIndex() >= Source.size()) + return; + + // Update the cumulative number of resource cycles based on the processor + // resource usage information available from the instruction descriptor. We need to + // compute the cumulative number of resource cycles for every processor + // resource which is consumed by an instruction of the block. + const Instruction &Inst = *Event.IR.getInstruction(); + const InstrDesc &Desc = Inst.getDesc(); + NumMicroOps += Desc.NumMicroOps; + for (const std::pair<uint64_t, const ResourceUsage> &RU : Desc.Resources) { + if (!RU.second.size()) + continue; + + assert(RU.second.NumUnits && "Expected more than one unit used!"); + if (ProcResourceUsage.find(RU.first) == ProcResourceUsage.end()) { + ProcResourceUsage[RU.first] = RU.second.size(); + continue; + } + + ProcResourceUsage[RU.first] += RU.second.size(); + } +} + +double SummaryView::getBlockRThroughput() const { + assert(NumMicroOps && "Expected at least one micro opcode!"); + + SmallVector<uint64_t, 8> Masks(SM.getNumProcResourceKinds()); + computeProcResourceMasks(SM, Masks); + + // The block throughput is bounded from above by the hardware dispatch + // throughput. 
That is because the DispatchWidth is an upper bound on the + // number of micro opcodes that can be part of a single dispatch group. + double Max = static_cast<double>(NumMicroOps) / DispatchWidth; + + // The block throughput is also limited by the amount of hardware parallelism. + // The number of available resource units affects the resource pressure + // distribution, as well as how many blocks can be executed every cycle. + for (unsigned I = 0, E = SM.getNumProcResourceKinds(); I < E; ++I) { + uint64_t Mask = Masks[I]; + const auto It = ProcResourceUsage.find_as(Mask); + if (It != ProcResourceUsage.end()) { + const MCProcResourceDesc &MCDesc = *SM.getProcResource(I); + unsigned NumUnits = MCDesc.NumUnits; + double Throughput = static_cast<double>(It->second) / NumUnits; + Max = std::max(Max, Throughput); + } + } + + // The block reciprocal throughput is computed as the MAX of: + // - (#uOps / DispatchWidth) + // - (resource cycles / #units) for every consumed processor resource. + return Max; +} + void SummaryView::printView(raw_ostream &OS) const { unsigned Iterations = Source.getNumIterations(); unsigned Instructions = Source.size(); unsigned TotalInstructions = Instructions * Iterations; double IPC = (double)TotalInstructions / TotalCycles; + double BlockRThroughput = getBlockRThroughput(); std::string Buffer; raw_string_ostream TempStream(Buffer); - TempStream << "Iterations: " << Iterations; - TempStream << "\nInstructions: " << TotalInstructions; - TempStream << "\nTotal Cycles: " << TotalCycles; - TempStream << "\nDispatch Width: " << DispatchWidth; - TempStream << "\nIPC: " << format("%.2f", IPC) << '\n'; + TempStream << "Iterations: " << Iterations; + TempStream << "\nInstructions: " << TotalInstructions; + TempStream << "\nTotal Cycles: " << TotalCycles; + TempStream << "\nDispatch Width: " << DispatchWidth; + TempStream << "\nIPC: " << format("%.2f", IPC); + TempStream << "\nBlock RThroughput: " << format("%.1f", BlockRThroughput) + << '\n'; 
TempStream.flush(); OS << Buffer; } diff --git a/llvm/tools/llvm-mca/SummaryView.h b/llvm/tools/llvm-mca/SummaryView.h index 0484057fb10..fe8a5e20f9d 100644 --- a/llvm/tools/llvm-mca/SummaryView.h +++ b/llvm/tools/llvm-mca/SummaryView.h @@ -14,12 +14,12 @@ /// performance throughput. Below is an example of summary view: /// /// -/// Iterations: 300 -/// Instructions: 900 -/// Total Cycles: 610 -/// Dispatch Width: 2 -/// IPC: 1.48 -/// +/// Iterations: 300 +/// Instructions: 900 +/// Total Cycles: 610 +/// Dispatch Width: 2 +/// IPC: 1.48 +/// Block RThroughput: 2.0 /// /// The summary view collects a few performance numbers. The two main /// performance indicators are 'Total Cycles' and IPC (Instructions Per Cycle). @@ -31,22 +31,41 @@ #include "SourceMgr.h" #include "View.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/MC/MCSchedule.h" #include "llvm/Support/raw_ostream.h" namespace mca { /// A view that collects and prints a few performance numbers. class SummaryView : public View { + const llvm::MCSchedModel &SM; const SourceMgr &Source; const unsigned DispatchWidth; unsigned TotalCycles; + // The total number of micro opcodes contributed by a block of instructions. + unsigned NumMicroOps; + // For each processor resource, this map stores the cumulative number of + // resource cycles consumed by a block of instructions. The resource mask ID + // is used as the key value to access elements of this map. + llvm::DenseMap<uint64_t, unsigned> ProcResourceUsage; + + // Compute the reciprocal throughput for the analyzed code block. + // The reciprocal block throughput is computed as the MAX between: + // - NumMicroOps / DispatchWidth + // - Total Resource Cycles / #Units (for every resource consumed). 
+ double getBlockRThroughput() const; public: - SummaryView(const SourceMgr &S, unsigned Width) - : Source(S), DispatchWidth(Width), TotalCycles(0) {} + SummaryView(const llvm::MCSchedModel &Model, const SourceMgr &S, + unsigned Width) + : SM(Model), Source(S), DispatchWidth(Width), TotalCycles(0), + NumMicroOps(0) {} void onCycleEnd() override { ++TotalCycles; } + void onInstructionEvent(const HWInstructionEvent &Event) override; + void printView(llvm::raw_ostream &OS) const override; }; } // namespace mca diff --git a/llvm/tools/llvm-mca/llvm-mca.cpp b/llvm/tools/llvm-mca/llvm-mca.cpp index 1e93a7fdcdb..925584b0193 100644 --- a/llvm/tools/llvm-mca/llvm-mca.cpp +++ b/llvm/tools/llvm-mca/llvm-mca.cpp @@ -495,7 +495,7 @@ int main(int argc, char **argv) { LoadQueueSize, StoreQueueSize, AssumeNoAlias); mca::BackendPrinter Printer(B); - Printer.addView(llvm::make_unique<mca::SummaryView>(S, Width)); + Printer.addView(llvm::make_unique<mca::SummaryView>(SM, S, Width)); if (PrintInstructionInfoView) Printer.addView( llvm::make_unique<mca::InstructionInfoView>(*STI, *MCII, S, *IP)); |

