summaryrefslogtreecommitdiffstats
path: root/llvm/tools
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/tools')
-rw-r--r--llvm/tools/llvm-mca/Views/SchedulerStatistics.cpp64
-rw-r--r--llvm/tools/llvm-mca/Views/SchedulerStatistics.h12
-rw-r--r--llvm/tools/llvm-mca/include/HardwareUnits/LSUnit.h52
-rw-r--r--llvm/tools/llvm-mca/lib/Context.cpp4
-rw-r--r--llvm/tools/llvm-mca/lib/HardwareUnits/LSUnit.cpp17
-rw-r--r--llvm/tools/llvm-mca/llvm-mca.cpp4
6 files changed, 137 insertions, 16 deletions
diff --git a/llvm/tools/llvm-mca/Views/SchedulerStatistics.cpp b/llvm/tools/llvm-mca/Views/SchedulerStatistics.cpp
index edd6056c1e8..670f90127f1 100644
--- a/llvm/tools/llvm-mca/Views/SchedulerStatistics.cpp
+++ b/llvm/tools/llvm-mca/Views/SchedulerStatistics.cpp
@@ -19,29 +19,83 @@
namespace llvm {
namespace mca {
+SchedulerStatistics::SchedulerStatistics(const llvm::MCSubtargetInfo &STI)
+ : SM(STI.getSchedModel()), LQResourceID(0), SQResourceID(0), NumIssued(0),
+ NumCycles(0), MostRecentLoadDispatched(~0U),
+ MostRecentStoreDispatched(~0U),
+ IssuedPerCycle(STI.getSchedModel().NumProcResourceKinds, 0),
+ Usage(STI.getSchedModel().NumProcResourceKinds, {0, 0, 0}) {
+ if (SM.hasExtraProcessorInfo()) {
+ const MCExtraProcessorInfo &EPI = SM.getExtraProcessorInfo();
+ LQResourceID = EPI.LoadQueueID;
+ SQResourceID = EPI.StoreQueueID;
+ }
+}
+
+// FIXME: This implementation works under the assumption that load/store queue
+// entries are reserved at 'instruction dispatched' stage, and released at
+// 'instruction executed' stage. This currently matches the behavior of LSUnit.
+//
+// The current design minimizes the number of events generated by the
+// Dispatch/Execute stages, at the cost of doing extra bookkeeping in method
+// `onEvent`. However, it introduces a subtle dependency between this view and
+// how the LSUnit works.
+//
+// In future we should add a new "memory queue" event type, so that we stop
+// making assumptions on how LSUnit internally works (See PR39828).
void SchedulerStatistics::onEvent(const HWInstructionEvent &Event) {
if (Event.Type == HWInstructionEvent::Issued)
++NumIssued;
+ else if (Event.Type == HWInstructionEvent::Dispatched) {
+ const Instruction &Inst = *Event.IR.getInstruction();
+ const unsigned Index = Event.IR.getSourceIndex();
+ if (LQResourceID && Inst.getDesc().MayLoad &&
+ MostRecentLoadDispatched != Index) {
+ Usage[LQResourceID].SlotsInUse++;
+ MostRecentLoadDispatched = Index;
+ }
+ if (SQResourceID && Inst.getDesc().MayStore &&
+ MostRecentStoreDispatched != Index) {
+ Usage[SQResourceID].SlotsInUse++;
+ MostRecentStoreDispatched = Index;
+ }
+ } else if (Event.Type == HWInstructionEvent::Executed) {
+ const Instruction &Inst = *Event.IR.getInstruction();
+ if (LQResourceID && Inst.getDesc().MayLoad) {
+ assert(Usage[LQResourceID].SlotsInUse);
+ Usage[LQResourceID].SlotsInUse--;
+ }
+ if (SQResourceID && Inst.getDesc().MayStore) {
+ assert(Usage[SQResourceID].SlotsInUse);
+ Usage[SQResourceID].SlotsInUse--;
+ }
+ }
}
void SchedulerStatistics::onReservedBuffers(const InstRef & /* unused */,
ArrayRef<unsigned> Buffers) {
for (const unsigned Buffer : Buffers) {
- BufferUsage &BU = Usage[Buffer];
- BU.SlotsInUse++;
- BU.MaxUsedSlots = std::max(BU.MaxUsedSlots, BU.SlotsInUse);
+ if (Buffer == LQResourceID || Buffer == SQResourceID)
+ continue;
+ Usage[Buffer].SlotsInUse++;
}
}
void SchedulerStatistics::onReleasedBuffers(const InstRef & /* unused */,
ArrayRef<unsigned> Buffers) {
- for (const unsigned Buffer : Buffers)
+ for (const unsigned Buffer : Buffers) {
+ if (Buffer == LQResourceID || Buffer == SQResourceID)
+ continue;
Usage[Buffer].SlotsInUse--;
+ }
}
void SchedulerStatistics::updateHistograms() {
- for (BufferUsage &BU : Usage)
+ for (BufferUsage &BU : Usage) {
BU.CumulativeNumUsedSlots += BU.SlotsInUse;
+ BU.MaxUsedSlots = std::max(BU.MaxUsedSlots, BU.SlotsInUse);
+ }
+
IssuedPerCycle[NumIssued]++;
NumIssued = 0;
}
diff --git a/llvm/tools/llvm-mca/Views/SchedulerStatistics.h b/llvm/tools/llvm-mca/Views/SchedulerStatistics.h
index 56dd3af1912..d99a395a726 100644
--- a/llvm/tools/llvm-mca/Views/SchedulerStatistics.h
+++ b/llvm/tools/llvm-mca/Views/SchedulerStatistics.h
@@ -47,9 +47,15 @@ namespace mca {
class SchedulerStatistics final : public View {
const llvm::MCSchedModel &SM;
+ unsigned LQResourceID;
+ unsigned SQResourceID;
+
unsigned NumIssued;
unsigned NumCycles;
+ unsigned MostRecentLoadDispatched;
+ unsigned MostRecentStoreDispatched;
+
// Tracks the usage of a scheduler's queue.
struct BufferUsage {
unsigned SlotsInUse;
@@ -65,11 +71,7 @@ class SchedulerStatistics final : public View {
void printSchedulerUsage(llvm::raw_ostream &OS) const;
public:
- SchedulerStatistics(const llvm::MCSubtargetInfo &STI)
- : SM(STI.getSchedModel()), NumIssued(0), NumCycles(0),
- IssuedPerCycle(STI.getSchedModel().NumProcResourceKinds, 0),
- Usage(STI.getSchedModel().NumProcResourceKinds, {0, 0, 0}) {}
-
+ SchedulerStatistics(const llvm::MCSubtargetInfo &STI);
void onEvent(const HWInstructionEvent &Event) override;
void onCycleBegin() override { NumCycles++; }
void onCycleEnd() override { updateHistograms(); }
diff --git a/llvm/tools/llvm-mca/include/HardwareUnits/LSUnit.h b/llvm/tools/llvm-mca/include/HardwareUnits/LSUnit.h
index bfe3b01c4de..f8c0722b540 100644
--- a/llvm/tools/llvm-mca/include/HardwareUnits/LSUnit.h
+++ b/llvm/tools/llvm-mca/include/HardwareUnits/LSUnit.h
@@ -18,6 +18,7 @@
#include "HardwareUnits/HardwareUnit.h"
#include "llvm/ADT/SmallSet.h"
+#include "llvm/MC/MCSchedule.h"
namespace llvm {
namespace mca {
@@ -99,6 +100,44 @@ class LSUnit : public HardwareUnit {
// If true, loads will never alias with stores. This is the default.
bool NoAlias;
+ // When a `MayLoad` instruction is dispatched to the schedulers for execution,
+ // the LSUnit reserves an entry in the `LoadQueue` for it.
+ //
+ // LoadQueue keeps track of all the loads that are in-flight. A load
+ // instruction is eventually removed from the LoadQueue when it reaches
+ // completion stage. That means, a load leaves the queue whe it is 'executed',
+ // and its value can be forwarded on the data path to outside units.
+ //
+ // This class doesn't know about the latency of a load instruction. So, it
+ // conservatively/pessimistically assumes that the latency of a load opcode
+ // matches the instruction latency.
+ //
+ // FIXME: In the absence of cache misses (i.e. L1I/L1D/iTLB/dTLB hits/misses),
+ // and load/store conflicts, the latency of a load is determined by the depth
+ // of the load pipeline. So, we could use field `LoadLatency` in the
+ // MCSchedModel to model that latency.
+ // Field `LoadLatency` often matches the so-called 'load-to-use' latency from
+ // L1D, and it usually already accounts for any extra latency due to data
+ // forwarding.
+ // When doing throughput analysis, `LoadLatency` is likely to
+ // be a better predictor of load latency than instruction latency. This is
+ // particularly true when simulating code with temporal/spatial locality of
+ // memory accesses.
+ // Using `LoadLatency` (instead of the instruction latency) is also expected
+ // to improve the load queue allocation for long latency instructions with
+ // folded memory operands (See PR39829).
+ //
+ // FIXME: On some processors, load/store operations are split into multiple
+ // uOps. For example, X86 AMD Jaguar natively supports 128-bit data types, but
+ // not 256-bit data types. So, a 256-bit load is effectively split into two
+ // 128-bit loads, and each split load consumes one 'LoadQueue' entry. For
+ // simplicity, this class optimistically assumes that a load instruction only
+ // consumes one entry in the LoadQueue. Similarly, store instructions only
+ // consume a single entry in the StoreQueue.
+ // In future, we should reassess the quality of this design, and consider
+ // alternative approaches that let instructions specify the number of
+ // load/store queue entries which they consume at dispatch stage (See
+ // PR39830).
SmallSet<unsigned, 16> LoadQueue;
SmallSet<unsigned, 16> StoreQueue;
@@ -122,8 +161,8 @@ class LSUnit : public HardwareUnit {
bool isLQFull() const { return LQ_Size != 0 && LoadQueue.size() == LQ_Size; }
public:
- LSUnit(unsigned LQ = 0, unsigned SQ = 0, bool AssumeNoAlias = false)
- : LQ_Size(LQ), SQ_Size(SQ), NoAlias(AssumeNoAlias) {}
+ LSUnit(const MCSchedModel &SM, unsigned LQ = 0, unsigned SQ = 0,
+ bool AssumeNoAlias = false);
#ifndef NDEBUG
void dump() const;
@@ -149,6 +188,15 @@ public:
// 5. A load has to wait until an older load barrier is fully executed.
// 6. A store has to wait until an older store barrier is fully executed.
virtual bool isReady(const InstRef &IR) const;
+
+ // Load and store instructions are tracked by their corresponding queues from
+ // dispatch until the "instruction executed" event.
+ // Only when a load instruction reaches the 'Executed' stage, its value
+ // becomes available to the users. At that point, the load no longer needs to
+ // be tracked by the load queue.
+ // FIXME: For simplicity, we optimistically assume a similar behavior for
+ // store instructions. In practice, store operation don't tend to leave the
+ // store queue until they reach the 'Retired' stage (See PR39830).
void onInstructionExecuted(const InstRef &IR);
};
diff --git a/llvm/tools/llvm-mca/lib/Context.cpp b/llvm/tools/llvm-mca/lib/Context.cpp
index 6774a57d29b..d472ae3313a 100644
--- a/llvm/tools/llvm-mca/lib/Context.cpp
+++ b/llvm/tools/llvm-mca/lib/Context.cpp
@@ -35,8 +35,8 @@ Context::createDefaultPipeline(const PipelineOptions &Opts, InstrBuilder &IB,
// Create the hardware units defining the backend.
auto RCU = llvm::make_unique<RetireControlUnit>(SM);
auto PRF = llvm::make_unique<RegisterFile>(SM, MRI, Opts.RegisterFileSize);
- auto LSU = llvm::make_unique<LSUnit>(Opts.LoadQueueSize, Opts.StoreQueueSize,
- Opts.AssumeNoAlias);
+ auto LSU = llvm::make_unique<LSUnit>(SM, Opts.LoadQueueSize,
+ Opts.StoreQueueSize, Opts.AssumeNoAlias);
auto HWS = llvm::make_unique<Scheduler>(SM, LSU.get());
// Create the pipeline stages.
diff --git a/llvm/tools/llvm-mca/lib/HardwareUnits/LSUnit.cpp b/llvm/tools/llvm-mca/lib/HardwareUnits/LSUnit.cpp
index ae020c68432..ed8269167fe 100644
--- a/llvm/tools/llvm-mca/lib/HardwareUnits/LSUnit.cpp
+++ b/llvm/tools/llvm-mca/lib/HardwareUnits/LSUnit.cpp
@@ -22,6 +22,23 @@
namespace llvm {
namespace mca {
+LSUnit::LSUnit(const MCSchedModel &SM, unsigned LQ, unsigned SQ,
+ bool AssumeNoAlias)
+ : LQ_Size(LQ), SQ_Size(SQ), NoAlias(AssumeNoAlias) {
+ if (SM.hasExtraProcessorInfo()) {
+ const MCExtraProcessorInfo &EPI = SM.getExtraProcessorInfo();
+ if (!LQ_Size && EPI.LoadQueueID) {
+ const MCProcResourceDesc &LdQDesc = *SM.getProcResource(EPI.LoadQueueID);
+ LQ_Size = LdQDesc.BufferSize;
+ }
+
+ if (!SQ_Size && EPI.StoreQueueID) {
+ const MCProcResourceDesc &StQDesc = *SM.getProcResource(EPI.StoreQueueID);
+ SQ_Size = StQDesc.BufferSize;
+ }
+ }
+}
+
#ifndef NDEBUG
void LSUnit::dump() const {
dbgs() << "[LSUnit] LQ_Size = " << LQ_Size << '\n';
diff --git a/llvm/tools/llvm-mca/llvm-mca.cpp b/llvm/tools/llvm-mca/llvm-mca.cpp
index 985889677de..a5edbcebc88 100644
--- a/llvm/tools/llvm-mca/llvm-mca.cpp
+++ b/llvm/tools/llvm-mca/llvm-mca.cpp
@@ -151,12 +151,12 @@ static cl::opt<bool>
static cl::opt<unsigned>
LoadQueueSize("lqueue",
- cl::desc("Size of the load queue (unbound by default)"),
+ cl::desc("Size of the load queue"),
cl::cat(ToolOptions), cl::init(0));
static cl::opt<unsigned>
StoreQueueSize("squeue",
- cl::desc("Size of the store queue (unbound by default)"),
+ cl::desc("Size of the store queue"),
cl::cat(ToolOptions), cl::init(0));
static cl::opt<bool>
OpenPOWER on IntegriCloud