6 files changed, 137 insertions, 16 deletions
diff --git a/llvm/tools/llvm-mca/Views/SchedulerStatistics.cpp b/llvm/tools/llvm-mca/Views/SchedulerStatistics.cpp
index edd6056c1e8..670f90127f1 100644
--- a/llvm/tools/llvm-mca/Views/SchedulerStatistics.cpp
+++ b/llvm/tools/llvm-mca/Views/SchedulerStatistics.cpp
@@ -19,29 +19,83 @@
 namespace llvm {
 namespace mca {
 
+SchedulerStatistics::SchedulerStatistics(const llvm::MCSubtargetInfo &STI)
+    : SM(STI.getSchedModel()), LQResourceID(0), SQResourceID(0), NumIssued(0),
+      NumCycles(0), MostRecentLoadDispatched(~0U),
+      MostRecentStoreDispatched(~0U),
+      IssuedPerCycle(STI.getSchedModel().NumProcResourceKinds, 0),
+      Usage(STI.getSchedModel().NumProcResourceKinds, {0, 0, 0}) {
+  if (SM.hasExtraProcessorInfo()) {
+    const MCExtraProcessorInfo &EPI = SM.getExtraProcessorInfo();
+    LQResourceID = EPI.LoadQueueID;
+    SQResourceID = EPI.StoreQueueID;
+  }
+}
+
+// FIXME: This implementation works under the assumption that load/store queue
+// entries are reserved at 'instruction dispatched' stage, and released at
+// 'instruction executed' stage. This currently matches the behavior of LSUnit.
+//
+// The current design minimizes the number of events generated by the
+// Dispatch/Execute stages, at the cost of doing extra bookkeeping in method
+// `onEvent`. However, it introduces a subtle dependency between this view and
+// how the LSUnit works.
+//
+// In future we should add a new "memory queue" event type, so that we stop
+// making assumptions on how LSUnit internally works (See PR39828).
 void SchedulerStatistics::onEvent(const HWInstructionEvent &Event) {
   if (Event.Type == HWInstructionEvent::Issued)
     ++NumIssued;
+  else if (Event.Type == HWInstructionEvent::Dispatched) {
+    const Instruction &Inst = *Event.IR.getInstruction();
+    const unsigned Index = Event.IR.getSourceIndex();
+    if (LQResourceID && Inst.getDesc().MayLoad &&
+        MostRecentLoadDispatched != Index) {
+      Usage[LQResourceID].SlotsInUse++;
+      MostRecentLoadDispatched = Index;
+    }
+    if (SQResourceID && Inst.getDesc().MayStore &&
+        MostRecentStoreDispatched != Index) {
+      Usage[SQResourceID].SlotsInUse++;
+      MostRecentStoreDispatched = Index;
+    }
+  } else if (Event.Type == HWInstructionEvent::Executed) {
+    const Instruction &Inst = *Event.IR.getInstruction();
+    if (LQResourceID && Inst.getDesc().MayLoad) {
+      assert(Usage[LQResourceID].SlotsInUse);
+      Usage[LQResourceID].SlotsInUse--;
+    }
+    if (SQResourceID && Inst.getDesc().MayStore) {
+      assert(Usage[SQResourceID].SlotsInUse);
+      Usage[SQResourceID].SlotsInUse--;
+    }
+  }
 }
 
 void SchedulerStatistics::onReservedBuffers(const InstRef & /* unused */,
                                             ArrayRef<unsigned> Buffers) {
   for (const unsigned Buffer : Buffers) {
-    BufferUsage &BU = Usage[Buffer];
-    BU.SlotsInUse++;
-    BU.MaxUsedSlots = std::max(BU.MaxUsedSlots, BU.SlotsInUse);
+    if (Buffer == LQResourceID || Buffer == SQResourceID)
+      continue;
+    Usage[Buffer].SlotsInUse++;
   }
 }
 
 void SchedulerStatistics::onReleasedBuffers(const InstRef & /* unused */,
                                             ArrayRef<unsigned> Buffers) {
-  for (const unsigned Buffer : Buffers)
+  for (const unsigned Buffer : Buffers) {
+    if (Buffer == LQResourceID || Buffer == SQResourceID)
+      continue;
     Usage[Buffer].SlotsInUse--;
+  }
 }
 
 void SchedulerStatistics::updateHistograms() {
-  for (BufferUsage &BU : Usage)
+  for (BufferUsage &BU : Usage) {
     BU.CumulativeNumUsedSlots += BU.SlotsInUse;
+    BU.MaxUsedSlots = std::max(BU.MaxUsedSlots, BU.SlotsInUse);
+  }
+
   IssuedPerCycle[NumIssued]++;
   NumIssued = 0;
 }
diff --git a/llvm/tools/llvm-mca/Views/SchedulerStatistics.h b/llvm/tools/llvm-mca/Views/SchedulerStatistics.h
index 56dd3af1912..d99a395a726 100644
--- a/llvm/tools/llvm-mca/Views/SchedulerStatistics.h
+++ b/llvm/tools/llvm-mca/Views/SchedulerStatistics.h
@@ -47,9 +47,15 @@ namespace mca {
 
 class SchedulerStatistics final : public View {
   const llvm::MCSchedModel &SM;
+  unsigned LQResourceID;
+  unsigned SQResourceID;
+
   unsigned NumIssued;
   unsigned NumCycles;
 
+  unsigned MostRecentLoadDispatched;
+  unsigned MostRecentStoreDispatched;
+
   // Tracks the usage of a scheduler's queue.
   struct BufferUsage {
     unsigned SlotsInUse;
@@ -65,11 +71,7 @@ class SchedulerStatistics final : public View {
   void printSchedulerUsage(llvm::raw_ostream &OS) const;
 
 public:
-  SchedulerStatistics(const llvm::MCSubtargetInfo &STI)
-      : SM(STI.getSchedModel()), NumIssued(0), NumCycles(0),
-        IssuedPerCycle(STI.getSchedModel().NumProcResourceKinds, 0),
-        Usage(STI.getSchedModel().NumProcResourceKinds, {0, 0, 0}) {}
-
+  SchedulerStatistics(const llvm::MCSubtargetInfo &STI);
   void onEvent(const HWInstructionEvent &Event) override;
   void onCycleBegin() override { NumCycles++; }
   void onCycleEnd() override { updateHistograms(); }
diff --git a/llvm/tools/llvm-mca/include/HardwareUnits/LSUnit.h b/llvm/tools/llvm-mca/include/HardwareUnits/LSUnit.h
index bfe3b01c4de..f8c0722b540 100644
--- a/llvm/tools/llvm-mca/include/HardwareUnits/LSUnit.h
+++ b/llvm/tools/llvm-mca/include/HardwareUnits/LSUnit.h
@@ -18,6 +18,7 @@
 
 #include "HardwareUnits/HardwareUnit.h"
 #include "llvm/ADT/SmallSet.h"
+#include "llvm/MC/MCSchedule.h"
 
 namespace llvm {
 namespace mca {
@@ -99,6 +100,44 @@ class LSUnit : public HardwareUnit {
   // If true, loads will never alias with stores. This is the default.
   bool NoAlias;
 
+  // When a `MayLoad` instruction is dispatched to the schedulers for execution,
+  // the LSUnit reserves an entry in the `LoadQueue` for it.
+  //
+  // LoadQueue keeps track of all the loads that are in-flight. A load
+  // instruction is eventually removed from the LoadQueue when it reaches
+  // completion stage. That means, a load leaves the queue whe it is 'executed',
+  // and its value can be forwarded on the data path to outside units.
+  //
+  // This class doesn't know about the latency of a load instruction. So, it
+  // conservatively/pessimistically assumes that the latency of a load opcode
+  // matches the instruction latency. 
+  //
+  // FIXME: In the absence of cache misses (i.e. L1I/L1D/iTLB/dTLB hits/misses),
+  // and load/store conflicts, the latency of a load is determined by the depth
+  // of the load pipeline. So, we could use field `LoadLatency` in the
+  // MCSchedModel to model that latency.
+  // Field `LoadLatency` often matches the so-called 'load-to-use' latency from
+  // L1D, and it usually already accounts for any extra latency due to data
+  // forwarding.
+  // When doing throughput analysis, `LoadLatency` is likely to
+  // be a better predictor of load latency than instruction latency. This is
+  // particularly true when simulating code with temporal/spatial locality of
+  // memory accesses.
+  // Using `LoadLatency` (instead of the instruction latency) is also expected
+  // to improve the load queue allocation for long latency instructions with
+  // folded memory operands (See PR39829).
+  //
+  // FIXME: On some processors, load/store operations are split into multiple
+  // uOps. For example, X86 AMD Jaguar natively supports 128-bit data types, but
+  // not 256-bit data types. So, a 256-bit load is effectively split into two
+  // 128-bit loads, and each split load consumes one 'LoadQueue' entry. For
+  // simplicity, this class optimistically assumes that a load instruction only
+  // consumes one entry in the LoadQueue.  Similarly, store instructions only
+  // consume a single entry in the StoreQueue.
+  // In future, we should reassess the quality of this design, and consider
+  // alternative approaches that let instructions specify the number of
+  // load/store queue entries which they consume at dispatch stage (See
+  // PR39830).
   SmallSet<unsigned, 16> LoadQueue;
   SmallSet<unsigned, 16> StoreQueue;
 
@@ -122,8 +161,8 @@ class LSUnit : public HardwareUnit {
   bool isLQFull() const { return LQ_Size != 0 && LoadQueue.size() == LQ_Size; }
 
 public:
-  LSUnit(unsigned LQ = 0, unsigned SQ = 0, bool AssumeNoAlias = false)
-      : LQ_Size(LQ), SQ_Size(SQ), NoAlias(AssumeNoAlias) {}
+  LSUnit(const MCSchedModel &SM, unsigned LQ = 0, unsigned SQ = 0,
+         bool AssumeNoAlias = false);
 
 #ifndef NDEBUG
   void dump() const;
@@ -149,6 +188,15 @@ public:
   // 5. A load has to wait until an older load barrier is fully executed.
   // 6. A store has to wait until an older store barrier is fully executed.
   virtual bool isReady(const InstRef &IR) const;
+
+  // Load and store instructions are tracked by their corresponding queues from
+  // dispatch until the "instruction executed" event.
+  // Only when a load instruction reaches the 'Executed' stage, its value
+  // becomes available to the users. At that point, the load no longer needs to
+  // be tracked by the load queue.
+  // FIXME: For simplicity, we optimistically assume a similar behavior for
+  // store instructions.  In practice, store operation don't tend to leave the
+  // store queue until they reach the 'Retired' stage (See PR39830).
   void onInstructionExecuted(const InstRef &IR);
 };
 
diff --git a/llvm/tools/llvm-mca/lib/Context.cpp b/llvm/tools/llvm-mca/lib/Context.cpp
index 6774a57d29b..d472ae3313a 100644
--- a/llvm/tools/llvm-mca/lib/Context.cpp
+++ b/llvm/tools/llvm-mca/lib/Context.cpp
@@ -35,8 +35,8 @@ Context::createDefaultPipeline(const PipelineOptions &Opts, InstrBuilder &IB,
   // Create the hardware units defining the backend.
   auto RCU = llvm::make_unique<RetireControlUnit>(SM);
   auto PRF = llvm::make_unique<RegisterFile>(SM, MRI, Opts.RegisterFileSize);
-  auto LSU = llvm::make_unique<LSUnit>(Opts.LoadQueueSize, Opts.StoreQueueSize,
-                                       Opts.AssumeNoAlias);
+  auto LSU = llvm::make_unique<LSUnit>(SM, Opts.LoadQueueSize,
+                                       Opts.StoreQueueSize, Opts.AssumeNoAlias);
   auto HWS = llvm::make_unique<Scheduler>(SM, LSU.get());
 
   // Create the pipeline stages.
diff --git a/llvm/tools/llvm-mca/lib/HardwareUnits/LSUnit.cpp b/llvm/tools/llvm-mca/lib/HardwareUnits/LSUnit.cpp
index ae020c68432..ed8269167fe 100644
--- a/llvm/tools/llvm-mca/lib/HardwareUnits/LSUnit.cpp
+++ b/llvm/tools/llvm-mca/lib/HardwareUnits/LSUnit.cpp
@@ -22,6 +22,23 @@
 namespace llvm {
 namespace mca {
 
+LSUnit::LSUnit(const MCSchedModel &SM, unsigned LQ, unsigned SQ,
+               bool AssumeNoAlias)
+    : LQ_Size(LQ), SQ_Size(SQ), NoAlias(AssumeNoAlias) {
+  if (SM.hasExtraProcessorInfo()) {
+    const MCExtraProcessorInfo &EPI = SM.getExtraProcessorInfo();
+    if (!LQ_Size && EPI.LoadQueueID) {
+      const MCProcResourceDesc &LdQDesc = *SM.getProcResource(EPI.LoadQueueID);
+      LQ_Size = LdQDesc.BufferSize;
+    }
+
+    if (!SQ_Size && EPI.StoreQueueID) {
+      const MCProcResourceDesc &StQDesc = *SM.getProcResource(EPI.StoreQueueID);
+      SQ_Size = StQDesc.BufferSize;
+    }
+  }
+}
+
 #ifndef NDEBUG
 void LSUnit::dump() const {
   dbgs() << "[LSUnit] LQ_Size = " << LQ_Size << '\n';
diff --git a/llvm/tools/llvm-mca/llvm-mca.cpp b/llvm/tools/llvm-mca/llvm-mca.cpp
index 985889677de..a5edbcebc88 100644
--- a/llvm/tools/llvm-mca/llvm-mca.cpp
+++ b/llvm/tools/llvm-mca/llvm-mca.cpp
@@ -151,12 +151,12 @@ static cl::opt<bool>
 
 static cl::opt<unsigned>
     LoadQueueSize("lqueue",
-                  cl::desc("Size of the load queue (unbound by default)"),
+                  cl::desc("Size of the load queue"),
                   cl::cat(ToolOptions), cl::init(0));
 
 static cl::opt<unsigned>
     StoreQueueSize("squeue",
-                   cl::desc("Size of the store queue (unbound by default)"),
+                   cl::desc("Size of the store queue"),
                    cl::cat(ToolOptions), cl::init(0));
 
 static cl::opt<bool>