diff options
Diffstat (limited to 'llvm/tools')
-rw-r--r-- | llvm/tools/llvm-mca/Views/SchedulerStatistics.cpp | 64 | ||||
-rw-r--r-- | llvm/tools/llvm-mca/Views/SchedulerStatistics.h | 12 | ||||
-rw-r--r-- | llvm/tools/llvm-mca/include/HardwareUnits/LSUnit.h | 52 | ||||
-rw-r--r-- | llvm/tools/llvm-mca/lib/Context.cpp | 4 | ||||
-rw-r--r-- | llvm/tools/llvm-mca/lib/HardwareUnits/LSUnit.cpp | 17 | ||||
-rw-r--r-- | llvm/tools/llvm-mca/llvm-mca.cpp | 4 |
6 files changed, 137 insertions, 16 deletions
diff --git a/llvm/tools/llvm-mca/Views/SchedulerStatistics.cpp b/llvm/tools/llvm-mca/Views/SchedulerStatistics.cpp index edd6056c1e8..670f90127f1 100644 --- a/llvm/tools/llvm-mca/Views/SchedulerStatistics.cpp +++ b/llvm/tools/llvm-mca/Views/SchedulerStatistics.cpp @@ -19,29 +19,83 @@ namespace llvm { namespace mca { +SchedulerStatistics::SchedulerStatistics(const llvm::MCSubtargetInfo &STI) + : SM(STI.getSchedModel()), LQResourceID(0), SQResourceID(0), NumIssued(0), + NumCycles(0), MostRecentLoadDispatched(~0U), + MostRecentStoreDispatched(~0U), + IssuedPerCycle(STI.getSchedModel().NumProcResourceKinds, 0), + Usage(STI.getSchedModel().NumProcResourceKinds, {0, 0, 0}) { + if (SM.hasExtraProcessorInfo()) { + const MCExtraProcessorInfo &EPI = SM.getExtraProcessorInfo(); + LQResourceID = EPI.LoadQueueID; + SQResourceID = EPI.StoreQueueID; + } +} + +// FIXME: This implementation works under the assumption that load/store queue +// entries are reserved at 'instruction dispatched' stage, and released at +// 'instruction executed' stage. This currently matches the behavior of LSUnit. +// +// The current design minimizes the number of events generated by the +// Dispatch/Execute stages, at the cost of doing extra bookkeeping in method +// `onEvent`. However, it introduces a subtle dependency between this view and +// how the LSUnit works. +// +// In future we should add a new "memory queue" event type, so that we stop +// making assumptions on how LSUnit internally works (See PR39828). 
void SchedulerStatistics::onEvent(const HWInstructionEvent &Event) { if (Event.Type == HWInstructionEvent::Issued) ++NumIssued; + else if (Event.Type == HWInstructionEvent::Dispatched) { + const Instruction &Inst = *Event.IR.getInstruction(); + const unsigned Index = Event.IR.getSourceIndex(); + if (LQResourceID && Inst.getDesc().MayLoad && + MostRecentLoadDispatched != Index) { + Usage[LQResourceID].SlotsInUse++; + MostRecentLoadDispatched = Index; + } + if (SQResourceID && Inst.getDesc().MayStore && + MostRecentStoreDispatched != Index) { + Usage[SQResourceID].SlotsInUse++; + MostRecentStoreDispatched = Index; + } + } else if (Event.Type == HWInstructionEvent::Executed) { + const Instruction &Inst = *Event.IR.getInstruction(); + if (LQResourceID && Inst.getDesc().MayLoad) { + assert(Usage[LQResourceID].SlotsInUse); + Usage[LQResourceID].SlotsInUse--; + } + if (SQResourceID && Inst.getDesc().MayStore) { + assert(Usage[SQResourceID].SlotsInUse); + Usage[SQResourceID].SlotsInUse--; + } + } } void SchedulerStatistics::onReservedBuffers(const InstRef & /* unused */, ArrayRef<unsigned> Buffers) { for (const unsigned Buffer : Buffers) { - BufferUsage &BU = Usage[Buffer]; - BU.SlotsInUse++; - BU.MaxUsedSlots = std::max(BU.MaxUsedSlots, BU.SlotsInUse); + if (Buffer == LQResourceID || Buffer == SQResourceID) + continue; + Usage[Buffer].SlotsInUse++; } } void SchedulerStatistics::onReleasedBuffers(const InstRef & /* unused */, ArrayRef<unsigned> Buffers) { - for (const unsigned Buffer : Buffers) + for (const unsigned Buffer : Buffers) { + if (Buffer == LQResourceID || Buffer == SQResourceID) + continue; Usage[Buffer].SlotsInUse--; + } } void SchedulerStatistics::updateHistograms() { - for (BufferUsage &BU : Usage) + for (BufferUsage &BU : Usage) { BU.CumulativeNumUsedSlots += BU.SlotsInUse; + BU.MaxUsedSlots = std::max(BU.MaxUsedSlots, BU.SlotsInUse); + } + IssuedPerCycle[NumIssued]++; NumIssued = 0; } diff --git a/llvm/tools/llvm-mca/Views/SchedulerStatistics.h 
b/llvm/tools/llvm-mca/Views/SchedulerStatistics.h index 56dd3af1912..d99a395a726 100644 --- a/llvm/tools/llvm-mca/Views/SchedulerStatistics.h +++ b/llvm/tools/llvm-mca/Views/SchedulerStatistics.h @@ -47,9 +47,15 @@ namespace mca { class SchedulerStatistics final : public View { const llvm::MCSchedModel &SM; + unsigned LQResourceID; + unsigned SQResourceID; + unsigned NumIssued; unsigned NumCycles; + unsigned MostRecentLoadDispatched; + unsigned MostRecentStoreDispatched; + // Tracks the usage of a scheduler's queue. struct BufferUsage { unsigned SlotsInUse; @@ -65,11 +71,7 @@ class SchedulerStatistics final : public View { void printSchedulerUsage(llvm::raw_ostream &OS) const; public: - SchedulerStatistics(const llvm::MCSubtargetInfo &STI) - : SM(STI.getSchedModel()), NumIssued(0), NumCycles(0), - IssuedPerCycle(STI.getSchedModel().NumProcResourceKinds, 0), - Usage(STI.getSchedModel().NumProcResourceKinds, {0, 0, 0}) {} - + SchedulerStatistics(const llvm::MCSubtargetInfo &STI); void onEvent(const HWInstructionEvent &Event) override; void onCycleBegin() override { NumCycles++; } void onCycleEnd() override { updateHistograms(); } diff --git a/llvm/tools/llvm-mca/include/HardwareUnits/LSUnit.h b/llvm/tools/llvm-mca/include/HardwareUnits/LSUnit.h index bfe3b01c4de..f8c0722b540 100644 --- a/llvm/tools/llvm-mca/include/HardwareUnits/LSUnit.h +++ b/llvm/tools/llvm-mca/include/HardwareUnits/LSUnit.h @@ -18,6 +18,7 @@ #include "HardwareUnits/HardwareUnit.h" #include "llvm/ADT/SmallSet.h" +#include "llvm/MC/MCSchedule.h" namespace llvm { namespace mca { @@ -99,6 +100,44 @@ class LSUnit : public HardwareUnit { // If true, loads will never alias with stores. This is the default. bool NoAlias; + // When a `MayLoad` instruction is dispatched to the schedulers for execution, + // the LSUnit reserves an entry in the `LoadQueue` for it. + // + // LoadQueue keeps track of all the loads that are in-flight. 
A load + // instruction is eventually removed from the LoadQueue when it reaches + // completion stage. That means, a load leaves the queue when it is 'executed', + // and its value can be forwarded on the data path to outside units. + // + // This class doesn't know about the latency of a load instruction. So, it + // conservatively/pessimistically assumes that the latency of a load opcode + // matches the instruction latency. + // + // FIXME: In the absence of cache misses (i.e. L1I/L1D/iTLB/dTLB hits/misses), + // and load/store conflicts, the latency of a load is determined by the depth + // of the load pipeline. So, we could use field `LoadLatency` in the + // MCSchedModel to model that latency. + // Field `LoadLatency` often matches the so-called 'load-to-use' latency from + // L1D, and it usually already accounts for any extra latency due to data + // forwarding. + // When doing throughput analysis, `LoadLatency` is likely to + // be a better predictor of load latency than instruction latency. This is + // particularly true when simulating code with temporal/spatial locality of + // memory accesses. + // Using `LoadLatency` (instead of the instruction latency) is also expected + // to improve the load queue allocation for long latency instructions with + // folded memory operands (See PR39829). + // + // FIXME: On some processors, load/store operations are split into multiple + // uOps. For example, X86 AMD Jaguar natively supports 128-bit data types, but + // not 256-bit data types. So, a 256-bit load is effectively split into two + // 128-bit loads, and each split load consumes one 'LoadQueue' entry. For + // simplicity, this class optimistically assumes that a load instruction only + // consumes one entry in the LoadQueue. Similarly, store instructions only + // consume a single entry in the StoreQueue. 
+ // In future, we should reassess the quality of this design, and consider + // alternative approaches that let instructions specify the number of + // load/store queue entries which they consume at dispatch stage (See + // PR39830). SmallSet<unsigned, 16> LoadQueue; SmallSet<unsigned, 16> StoreQueue; @@ -122,8 +161,8 @@ class LSUnit : public HardwareUnit { bool isLQFull() const { return LQ_Size != 0 && LoadQueue.size() == LQ_Size; } public: - LSUnit(unsigned LQ = 0, unsigned SQ = 0, bool AssumeNoAlias = false) - : LQ_Size(LQ), SQ_Size(SQ), NoAlias(AssumeNoAlias) {} + LSUnit(const MCSchedModel &SM, unsigned LQ = 0, unsigned SQ = 0, + bool AssumeNoAlias = false); #ifndef NDEBUG void dump() const; @@ -149,6 +188,15 @@ public: // 5. A load has to wait until an older load barrier is fully executed. // 6. A store has to wait until an older store barrier is fully executed. virtual bool isReady(const InstRef &IR) const; + + // Load and store instructions are tracked by their corresponding queues from + // dispatch until the "instruction executed" event. + // Only when a load instruction reaches the 'Executed' stage, its value + // becomes available to the users. At that point, the load no longer needs to + // be tracked by the load queue. + // FIXME: For simplicity, we optimistically assume a similar behavior for + // store instructions. In practice, store operations don't tend to leave the + // store queue until they reach the 'Retired' stage (See PR39830). void onInstructionExecuted(const InstRef &IR); }; diff --git a/llvm/tools/llvm-mca/lib/Context.cpp b/llvm/tools/llvm-mca/lib/Context.cpp index 6774a57d29b..d472ae3313a 100644 --- a/llvm/tools/llvm-mca/lib/Context.cpp +++ b/llvm/tools/llvm-mca/lib/Context.cpp @@ -35,8 +35,8 @@ Context::createDefaultPipeline(const PipelineOptions &Opts, InstrBuilder &IB, // Create the hardware units defining the backend. 
auto RCU = llvm::make_unique<RetireControlUnit>(SM); auto PRF = llvm::make_unique<RegisterFile>(SM, MRI, Opts.RegisterFileSize); - auto LSU = llvm::make_unique<LSUnit>(Opts.LoadQueueSize, Opts.StoreQueueSize, - Opts.AssumeNoAlias); + auto LSU = llvm::make_unique<LSUnit>(SM, Opts.LoadQueueSize, + Opts.StoreQueueSize, Opts.AssumeNoAlias); auto HWS = llvm::make_unique<Scheduler>(SM, LSU.get()); // Create the pipeline stages. diff --git a/llvm/tools/llvm-mca/lib/HardwareUnits/LSUnit.cpp b/llvm/tools/llvm-mca/lib/HardwareUnits/LSUnit.cpp index ae020c68432..ed8269167fe 100644 --- a/llvm/tools/llvm-mca/lib/HardwareUnits/LSUnit.cpp +++ b/llvm/tools/llvm-mca/lib/HardwareUnits/LSUnit.cpp @@ -22,6 +22,23 @@ namespace llvm { namespace mca { +LSUnit::LSUnit(const MCSchedModel &SM, unsigned LQ, unsigned SQ, + bool AssumeNoAlias) + : LQ_Size(LQ), SQ_Size(SQ), NoAlias(AssumeNoAlias) { + if (SM.hasExtraProcessorInfo()) { + const MCExtraProcessorInfo &EPI = SM.getExtraProcessorInfo(); + if (!LQ_Size && EPI.LoadQueueID) { + const MCProcResourceDesc &LdQDesc = *SM.getProcResource(EPI.LoadQueueID); + LQ_Size = LdQDesc.BufferSize; + } + + if (!SQ_Size && EPI.StoreQueueID) { + const MCProcResourceDesc &StQDesc = *SM.getProcResource(EPI.StoreQueueID); + SQ_Size = StQDesc.BufferSize; + } + } +} + #ifndef NDEBUG void LSUnit::dump() const { dbgs() << "[LSUnit] LQ_Size = " << LQ_Size << '\n'; diff --git a/llvm/tools/llvm-mca/llvm-mca.cpp b/llvm/tools/llvm-mca/llvm-mca.cpp index 985889677de..a5edbcebc88 100644 --- a/llvm/tools/llvm-mca/llvm-mca.cpp +++ b/llvm/tools/llvm-mca/llvm-mca.cpp @@ -151,12 +151,12 @@ static cl::opt<bool> static cl::opt<unsigned> LoadQueueSize("lqueue", - cl::desc("Size of the load queue (unbound by default)"), + cl::desc("Size of the load queue"), cl::cat(ToolOptions), cl::init(0)); static cl::opt<unsigned> StoreQueueSize("squeue", - cl::desc("Size of the store queue (unbound by default)"), + cl::desc("Size of the store queue"), cl::cat(ToolOptions), cl::init(0)); 
static cl::opt<bool> |