summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--  llvm/include/llvm/MC/MCSchedule.h                          |  2
-rw-r--r--  llvm/include/llvm/Target/TargetSchedule.td                 | 10
-rw-r--r--  llvm/lib/Target/X86/X86ScheduleBdVer2.td                   |  4
-rw-r--r--  llvm/test/tools/llvm-mca/X86/BdVer2/load-throughput.s      | 90
-rw-r--r--  llvm/test/tools/llvm-mca/X86/BdVer2/store-throughput.s     | 88
-rw-r--r--  llvm/tools/llvm-mca/Views/SchedulerStatistics.cpp          | 64
-rw-r--r--  llvm/tools/llvm-mca/Views/SchedulerStatistics.h            | 12
-rw-r--r--  llvm/tools/llvm-mca/include/HardwareUnits/LSUnit.h         | 52
-rw-r--r--  llvm/tools/llvm-mca/lib/Context.cpp                        |  4
-rw-r--r--  llvm/tools/llvm-mca/lib/HardwareUnits/LSUnit.cpp           | 17
-rw-r--r--  llvm/tools/llvm-mca/llvm-mca.cpp                           |  4
-rw-r--r--  llvm/utils/TableGen/CodeGenSchedule.cpp                    | 32
-rw-r--r--  llvm/utils/TableGen/CodeGenSchedule.h                      | 11
-rw-r--r--  llvm/utils/TableGen/SubtargetEmitter.cpp                   | 29
14 files changed, 312 insertions, 107 deletions
diff --git a/llvm/include/llvm/MC/MCSchedule.h b/llvm/include/llvm/MC/MCSchedule.h
index 41305296b00..689ac73cbdd 100644
--- a/llvm/include/llvm/MC/MCSchedule.h
+++ b/llvm/include/llvm/MC/MCSchedule.h
@@ -183,6 +183,8 @@ struct MCExtraProcessorInfo {
unsigned NumRegisterFiles;
const MCRegisterCostEntry *RegisterCostTable;
unsigned NumRegisterCostEntries;
+ unsigned LoadQueueID;
+ unsigned StoreQueueID;
};
/// Machine model for scheduling, bundling, and heuristics.
diff --git a/llvm/include/llvm/Target/TargetSchedule.td b/llvm/include/llvm/Target/TargetSchedule.td
index 3088771833c..808e183f5a5 100644
--- a/llvm/include/llvm/Target/TargetSchedule.td
+++ b/llvm/include/llvm/Target/TargetSchedule.td
@@ -561,3 +561,13 @@ class RetireControlUnit<int bufferSize, int retirePerCycle> {
int MaxRetirePerCycle = retirePerCycle;
SchedMachineModel SchedModel = ?;
}
+
+// Base class for Load/StoreQueue. It is used to identify processor resources
+// which describe load/store queues in the LS unit.
+class MemoryQueue<ProcResource PR> {
+ ProcResource QueueDescriptor = PR;
+ SchedMachineModel SchedModel = ?;
+}
+
+class LoadQueue<ProcResource LDQueue> : MemoryQueue<LDQueue>;
+class StoreQueue<ProcResource STQueue> : MemoryQueue<STQueue>;
diff --git a/llvm/lib/Target/X86/X86ScheduleBdVer2.td b/llvm/lib/Target/X86/X86ScheduleBdVer2.td
index 1a066b30f89..5798e1b2671 100644
--- a/llvm/lib/Target/X86/X86ScheduleBdVer2.td
+++ b/llvm/lib/Target/X86/X86ScheduleBdVer2.td
@@ -136,12 +136,16 @@ def PdLoad : ProcResource<2> {
let BufferSize = 40;
}
+def PdLoadQueue : LoadQueue<PdLoad>;
+
let Super = PdAGLU01 in
def PdStore : ProcResource<1> {
// For Piledriver, the store queue is 24 entries deep.
let BufferSize = 24;
}
+def PdStoreQueue : StoreQueue<PdStore>;
+
//===----------------------------------------------------------------------===//
// Integer Execution Units
//
diff --git a/llvm/test/tools/llvm-mca/X86/BdVer2/load-throughput.s b/llvm/test/tools/llvm-mca/X86/BdVer2/load-throughput.s
index 00b03afbf0d..9c45ce63fa4 100644
--- a/llvm/test/tools/llvm-mca/X86/BdVer2/load-throughput.s
+++ b/llvm/test/tools/llvm-mca/X86/BdVer2/load-throughput.s
@@ -79,16 +79,16 @@ vmovaps (%rbx), %ymm3
# CHECK: Dynamic Dispatch Stall Cycles:
# CHECK-NEXT: RAT - Register unavailable: 0
# CHECK-NEXT: RCU - Retire tokens unavailable: 0
-# CHECK-NEXT: SCHEDQ - Scheduler full: 161 (77.8%)
-# CHECK-NEXT: LQ - Load queue full: 0
+# CHECK-NEXT: SCHEDQ - Scheduler full: 0
+# CHECK-NEXT: LQ - Load queue full: 171 (82.6%)
# CHECK-NEXT: SQ - Store queue full: 0
# CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0
# CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
# CHECK-NEXT: [# dispatched], [# cycles]
-# CHECK-NEXT: 0, 26 (12.6%)
-# CHECK-NEXT: 2, 162 (78.3%)
-# CHECK-NEXT: 4, 19 (9.2%)
+# CHECK-NEXT: 0, 21 (10.1%)
+# CHECK-NEXT: 2, 172 (83.1%)
+# CHECK-NEXT: 4, 14 (6.8%)
# CHECK: Schedulers - number of cycles where we saw N instructions issued:
# CHECK-NEXT: [# issued], [# cycles]
@@ -102,9 +102,9 @@ vmovaps (%rbx), %ymm3
# CHECK-NEXT: [4] Total number of buffer entries.
# CHECK: [1] [2] [3] [4]
-# CHECK-NEXT: PdEX 35 40 40
+# CHECK-NEXT: PdEX 27 30 40
# CHECK-NEXT: PdFPU 0 0 64
-# CHECK-NEXT: PdLoad 35 40 40
+# CHECK-NEXT: PdLoad 36 40 40
# CHECK-NEXT: PdStore 0 0 24
# CHECK: Resources:
@@ -192,16 +192,16 @@ vmovaps (%rbx), %ymm3
# CHECK: Dynamic Dispatch Stall Cycles:
# CHECK-NEXT: RAT - Register unavailable: 0
# CHECK-NEXT: RCU - Retire tokens unavailable: 0
-# CHECK-NEXT: SCHEDQ - Scheduler full: 161 (77.8%)
-# CHECK-NEXT: LQ - Load queue full: 0
+# CHECK-NEXT: SCHEDQ - Scheduler full: 0
+# CHECK-NEXT: LQ - Load queue full: 171 (82.6%)
# CHECK-NEXT: SQ - Store queue full: 0
# CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0
# CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
# CHECK-NEXT: [# dispatched], [# cycles]
-# CHECK-NEXT: 0, 26 (12.6%)
-# CHECK-NEXT: 2, 162 (78.3%)
-# CHECK-NEXT: 4, 19 (9.2%)
+# CHECK-NEXT: 0, 21 (10.1%)
+# CHECK-NEXT: 2, 172 (83.1%)
+# CHECK-NEXT: 4, 14 (6.8%)
# CHECK: Schedulers - number of cycles where we saw N instructions issued:
# CHECK-NEXT: [# issued], [# cycles]
@@ -215,9 +215,9 @@ vmovaps (%rbx), %ymm3
# CHECK-NEXT: [4] Total number of buffer entries.
# CHECK: [1] [2] [3] [4]
-# CHECK-NEXT: PdEX 35 40 40
+# CHECK-NEXT: PdEX 27 30 40
# CHECK-NEXT: PdFPU 0 0 64
-# CHECK-NEXT: PdLoad 35 40 40
+# CHECK-NEXT: PdLoad 36 40 40
# CHECK-NEXT: PdStore 0 0 24
# CHECK: Resources:
@@ -305,16 +305,16 @@ vmovaps (%rbx), %ymm3
# CHECK: Dynamic Dispatch Stall Cycles:
# CHECK-NEXT: RAT - Register unavailable: 0
# CHECK-NEXT: RCU - Retire tokens unavailable: 0
-# CHECK-NEXT: SCHEDQ - Scheduler full: 161 (77.8%)
-# CHECK-NEXT: LQ - Load queue full: 0
+# CHECK-NEXT: SCHEDQ - Scheduler full: 0
+# CHECK-NEXT: LQ - Load queue full: 171 (82.6%)
# CHECK-NEXT: SQ - Store queue full: 0
# CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0
# CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
# CHECK-NEXT: [# dispatched], [# cycles]
-# CHECK-NEXT: 0, 26 (12.6%)
-# CHECK-NEXT: 2, 162 (78.3%)
-# CHECK-NEXT: 4, 19 (9.2%)
+# CHECK-NEXT: 0, 21 (10.1%)
+# CHECK-NEXT: 2, 172 (83.1%)
+# CHECK-NEXT: 4, 14 (6.8%)
# CHECK: Schedulers - number of cycles where we saw N instructions issued:
# CHECK-NEXT: [# issued], [# cycles]
@@ -328,9 +328,9 @@ vmovaps (%rbx), %ymm3
# CHECK-NEXT: [4] Total number of buffer entries.
# CHECK: [1] [2] [3] [4]
-# CHECK-NEXT: PdEX 35 40 40
+# CHECK-NEXT: PdEX 27 30 40
# CHECK-NEXT: PdFPU 0 0 64
-# CHECK-NEXT: PdLoad 35 40 40
+# CHECK-NEXT: PdLoad 36 40 40
# CHECK-NEXT: PdStore 0 0 24
# CHECK: Resources:
@@ -418,16 +418,16 @@ vmovaps (%rbx), %ymm3
# CHECK: Dynamic Dispatch Stall Cycles:
# CHECK-NEXT: RAT - Register unavailable: 0
# CHECK-NEXT: RCU - Retire tokens unavailable: 0
-# CHECK-NEXT: SCHEDQ - Scheduler full: 161 (77.8%)
-# CHECK-NEXT: LQ - Load queue full: 0
+# CHECK-NEXT: SCHEDQ - Scheduler full: 0
+# CHECK-NEXT: LQ - Load queue full: 171 (82.6%)
# CHECK-NEXT: SQ - Store queue full: 0
# CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0
# CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
# CHECK-NEXT: [# dispatched], [# cycles]
-# CHECK-NEXT: 0, 26 (12.6%)
-# CHECK-NEXT: 2, 162 (78.3%)
-# CHECK-NEXT: 4, 19 (9.2%)
+# CHECK-NEXT: 0, 21 (10.1%)
+# CHECK-NEXT: 2, 172 (83.1%)
+# CHECK-NEXT: 4, 14 (6.8%)
# CHECK: Schedulers - number of cycles where we saw N instructions issued:
# CHECK-NEXT: [# issued], [# cycles]
@@ -441,9 +441,9 @@ vmovaps (%rbx), %ymm3
# CHECK-NEXT: [4] Total number of buffer entries.
# CHECK: [1] [2] [3] [4]
-# CHECK-NEXT: PdEX 35 40 40
+# CHECK-NEXT: PdEX 27 30 40
# CHECK-NEXT: PdFPU 0 0 64
-# CHECK-NEXT: PdLoad 35 40 40
+# CHECK-NEXT: PdLoad 36 40 40
# CHECK-NEXT: PdStore 0 0 24
# CHECK: Resources:
@@ -531,16 +531,16 @@ vmovaps (%rbx), %ymm3
# CHECK: Dynamic Dispatch Stall Cycles:
# CHECK-NEXT: RAT - Register unavailable: 0
# CHECK-NEXT: RCU - Retire tokens unavailable: 0
-# CHECK-NEXT: SCHEDQ - Scheduler full: 161 (77.8%)
-# CHECK-NEXT: LQ - Load queue full: 0
+# CHECK-NEXT: SCHEDQ - Scheduler full: 0
+# CHECK-NEXT: LQ - Load queue full: 171 (82.6%)
# CHECK-NEXT: SQ - Store queue full: 0
# CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0
# CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
# CHECK-NEXT: [# dispatched], [# cycles]
-# CHECK-NEXT: 0, 26 (12.6%)
-# CHECK-NEXT: 2, 162 (78.3%)
-# CHECK-NEXT: 4, 19 (9.2%)
+# CHECK-NEXT: 0, 21 (10.1%)
+# CHECK-NEXT: 2, 172 (83.1%)
+# CHECK-NEXT: 4, 14 (6.8%)
# CHECK: Schedulers - number of cycles where we saw N instructions issued:
# CHECK-NEXT: [# issued], [# cycles]
@@ -554,9 +554,9 @@ vmovaps (%rbx), %ymm3
# CHECK-NEXT: [4] Total number of buffer entries.
# CHECK: [1] [2] [3] [4]
-# CHECK-NEXT: PdEX 35 40 40
-# CHECK-NEXT: PdFPU 35 40 64
-# CHECK-NEXT: PdLoad 35 40 40
+# CHECK-NEXT: PdEX 27 30 40
+# CHECK-NEXT: PdFPU 27 30 64
+# CHECK-NEXT: PdLoad 36 40 40
# CHECK-NEXT: PdStore 0 0 24
# CHECK: Resources:
@@ -644,16 +644,16 @@ vmovaps (%rbx), %ymm3
# CHECK: Dynamic Dispatch Stall Cycles:
# CHECK-NEXT: RAT - Register unavailable: 0
# CHECK-NEXT: RCU - Retire tokens unavailable: 0
-# CHECK-NEXT: SCHEDQ - Scheduler full: 161 (77.8%)
-# CHECK-NEXT: LQ - Load queue full: 0
+# CHECK-NEXT: SCHEDQ - Scheduler full: 0
+# CHECK-NEXT: LQ - Load queue full: 171 (82.6%)
# CHECK-NEXT: SQ - Store queue full: 0
# CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0
# CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
# CHECK-NEXT: [# dispatched], [# cycles]
-# CHECK-NEXT: 0, 26 (12.6%)
-# CHECK-NEXT: 2, 162 (78.3%)
-# CHECK-NEXT: 4, 19 (9.2%)
+# CHECK-NEXT: 0, 21 (10.1%)
+# CHECK-NEXT: 2, 172 (83.1%)
+# CHECK-NEXT: 4, 14 (6.8%)
# CHECK: Schedulers - number of cycles where we saw N instructions issued:
# CHECK-NEXT: [# issued], [# cycles]
@@ -667,9 +667,9 @@ vmovaps (%rbx), %ymm3
# CHECK-NEXT: [4] Total number of buffer entries.
# CHECK: [1] [2] [3] [4]
-# CHECK-NEXT: PdEX 35 40 40
-# CHECK-NEXT: PdFPU 35 40 64
-# CHECK-NEXT: PdLoad 35 40 40
+# CHECK-NEXT: PdEX 27 30 40
+# CHECK-NEXT: PdFPU 27 30 64
+# CHECK-NEXT: PdLoad 36 40 40
# CHECK-NEXT: PdStore 0 0 24
# CHECK: Resources:
@@ -781,7 +781,7 @@ vmovaps (%rbx), %ymm3
# CHECK: [1] [2] [3] [4]
# CHECK-NEXT: PdEX 1 2 40
# CHECK-NEXT: PdFPU 1 2 64
-# CHECK-NEXT: PdLoad 1 2 40
+# CHECK-NEXT: PdLoad 11 12 40
# CHECK-NEXT: PdStore 0 0 24
# CHECK: Resources:
diff --git a/llvm/test/tools/llvm-mca/X86/BdVer2/store-throughput.s b/llvm/test/tools/llvm-mca/X86/BdVer2/store-throughput.s
index 76892986055..67f13c3ccdd 100644
--- a/llvm/test/tools/llvm-mca/X86/BdVer2/store-throughput.s
+++ b/llvm/test/tools/llvm-mca/X86/BdVer2/store-throughput.s
@@ -79,16 +79,16 @@ vmovaps %ymm3, (%rbx)
# CHECK: Dynamic Dispatch Stall Cycles:
# CHECK-NEXT: RAT - Register unavailable: 0
# CHECK-NEXT: RCU - Retire tokens unavailable: 0
-# CHECK-NEXT: SCHEDQ - Scheduler full: 369 (91.6%)
+# CHECK-NEXT: SCHEDQ - Scheduler full: 0
# CHECK-NEXT: LQ - Load queue full: 0
-# CHECK-NEXT: SQ - Store queue full: 0
+# CHECK-NEXT: SQ - Store queue full: 370 (91.8%)
# CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0
# CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
# CHECK-NEXT: [# dispatched], [# cycles]
-# CHECK-NEXT: 0, 26 (6.5%)
-# CHECK-NEXT: 1, 369 (91.6%)
-# CHECK-NEXT: 3, 1 (0.2%)
+# CHECK-NEXT: 0, 25 (6.2%)
+# CHECK-NEXT: 1, 370 (91.8%)
+# CHECK-NEXT: 2, 1 (0.2%)
# CHECK-NEXT: 4, 7 (1.7%)
# CHECK: Schedulers - number of cycles where we saw N instructions issued:
@@ -103,10 +103,10 @@ vmovaps %ymm3, (%rbx)
# CHECK-NEXT: [4] Total number of buffer entries.
# CHECK: [1] [2] [3] [4]
-# CHECK-NEXT: PdEX 22 24 40
+# CHECK-NEXT: PdEX 22 23 40
# CHECK-NEXT: PdFPU 0 0 64
# CHECK-NEXT: PdLoad 0 0 40
-# CHECK-NEXT: PdStore 22 24 24
+# CHECK-NEXT: PdStore 23 24 24
# CHECK: Resources:
# CHECK-NEXT: [0.0] - PdAGLU01
@@ -193,16 +193,16 @@ vmovaps %ymm3, (%rbx)
# CHECK: Dynamic Dispatch Stall Cycles:
# CHECK-NEXT: RAT - Register unavailable: 0
# CHECK-NEXT: RCU - Retire tokens unavailable: 0
-# CHECK-NEXT: SCHEDQ - Scheduler full: 369 (91.6%)
+# CHECK-NEXT: SCHEDQ - Scheduler full: 0
# CHECK-NEXT: LQ - Load queue full: 0
-# CHECK-NEXT: SQ - Store queue full: 0
+# CHECK-NEXT: SQ - Store queue full: 370 (91.8%)
# CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0
# CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
# CHECK-NEXT: [# dispatched], [# cycles]
-# CHECK-NEXT: 0, 26 (6.5%)
-# CHECK-NEXT: 1, 369 (91.6%)
-# CHECK-NEXT: 3, 1 (0.2%)
+# CHECK-NEXT: 0, 25 (6.2%)
+# CHECK-NEXT: 1, 370 (91.8%)
+# CHECK-NEXT: 2, 1 (0.2%)
# CHECK-NEXT: 4, 7 (1.7%)
# CHECK: Schedulers - number of cycles where we saw N instructions issued:
@@ -217,10 +217,10 @@ vmovaps %ymm3, (%rbx)
# CHECK-NEXT: [4] Total number of buffer entries.
# CHECK: [1] [2] [3] [4]
-# CHECK-NEXT: PdEX 22 24 40
+# CHECK-NEXT: PdEX 22 23 40
# CHECK-NEXT: PdFPU 0 0 64
# CHECK-NEXT: PdLoad 0 0 40
-# CHECK-NEXT: PdStore 22 24 24
+# CHECK-NEXT: PdStore 23 24 24
# CHECK: Resources:
# CHECK-NEXT: [0.0] - PdAGLU01
@@ -307,16 +307,16 @@ vmovaps %ymm3, (%rbx)
# CHECK: Dynamic Dispatch Stall Cycles:
# CHECK-NEXT: RAT - Register unavailable: 0
# CHECK-NEXT: RCU - Retire tokens unavailable: 0
-# CHECK-NEXT: SCHEDQ - Scheduler full: 369 (91.6%)
+# CHECK-NEXT: SCHEDQ - Scheduler full: 0
# CHECK-NEXT: LQ - Load queue full: 0
-# CHECK-NEXT: SQ - Store queue full: 0
+# CHECK-NEXT: SQ - Store queue full: 370 (91.8%)
# CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0
# CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
# CHECK-NEXT: [# dispatched], [# cycles]
-# CHECK-NEXT: 0, 26 (6.5%)
-# CHECK-NEXT: 1, 369 (91.6%)
-# CHECK-NEXT: 3, 1 (0.2%)
+# CHECK-NEXT: 0, 25 (6.2%)
+# CHECK-NEXT: 1, 370 (91.8%)
+# CHECK-NEXT: 2, 1 (0.2%)
# CHECK-NEXT: 4, 7 (1.7%)
# CHECK: Schedulers - number of cycles where we saw N instructions issued:
@@ -331,10 +331,10 @@ vmovaps %ymm3, (%rbx)
# CHECK-NEXT: [4] Total number of buffer entries.
# CHECK: [1] [2] [3] [4]
-# CHECK-NEXT: PdEX 22 24 40
+# CHECK-NEXT: PdEX 22 23 40
# CHECK-NEXT: PdFPU 0 0 64
# CHECK-NEXT: PdLoad 0 0 40
-# CHECK-NEXT: PdStore 22 24 24
+# CHECK-NEXT: PdStore 23 24 24
# CHECK: Resources:
# CHECK-NEXT: [0.0] - PdAGLU01
@@ -421,16 +421,16 @@ vmovaps %ymm3, (%rbx)
# CHECK: Dynamic Dispatch Stall Cycles:
# CHECK-NEXT: RAT - Register unavailable: 0
# CHECK-NEXT: RCU - Retire tokens unavailable: 0
-# CHECK-NEXT: SCHEDQ - Scheduler full: 369 (91.6%)
+# CHECK-NEXT: SCHEDQ - Scheduler full: 0
# CHECK-NEXT: LQ - Load queue full: 0
-# CHECK-NEXT: SQ - Store queue full: 0
+# CHECK-NEXT: SQ - Store queue full: 370 (91.8%)
# CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0
# CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
# CHECK-NEXT: [# dispatched], [# cycles]
-# CHECK-NEXT: 0, 26 (6.5%)
-# CHECK-NEXT: 1, 369 (91.6%)
-# CHECK-NEXT: 3, 1 (0.2%)
+# CHECK-NEXT: 0, 25 (6.2%)
+# CHECK-NEXT: 1, 370 (91.8%)
+# CHECK-NEXT: 2, 1 (0.2%)
# CHECK-NEXT: 4, 7 (1.7%)
# CHECK: Schedulers - number of cycles where we saw N instructions issued:
@@ -445,10 +445,10 @@ vmovaps %ymm3, (%rbx)
# CHECK-NEXT: [4] Total number of buffer entries.
# CHECK: [1] [2] [3] [4]
-# CHECK-NEXT: PdEX 22 24 40
+# CHECK-NEXT: PdEX 22 23 40
# CHECK-NEXT: PdFPU 0 0 64
# CHECK-NEXT: PdLoad 0 0 40
-# CHECK-NEXT: PdStore 22 24 24
+# CHECK-NEXT: PdStore 23 24 24
# CHECK: Resources:
# CHECK-NEXT: [0.0] - PdAGLU01
@@ -535,16 +535,16 @@ vmovaps %ymm3, (%rbx)
# CHECK: Dynamic Dispatch Stall Cycles:
# CHECK-NEXT: RAT - Register unavailable: 0
# CHECK-NEXT: RCU - Retire tokens unavailable: 0
-# CHECK-NEXT: SCHEDQ - Scheduler full: 745 (92.8%)
+# CHECK-NEXT: SCHEDQ - Scheduler full: 0
# CHECK-NEXT: LQ - Load queue full: 0
-# CHECK-NEXT: SQ - Store queue full: 0
+# CHECK-NEXT: SQ - Store queue full: 747 (93.0%)
# CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0
# CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
# CHECK-NEXT: [# dispatched], [# cycles]
-# CHECK-NEXT: 0, 423 (52.7%)
-# CHECK-NEXT: 1, 373 (46.5%)
-# CHECK-NEXT: 3, 1 (0.1%)
+# CHECK-NEXT: 0, 422 (52.6%)
+# CHECK-NEXT: 1, 374 (46.6%)
+# CHECK-NEXT: 2, 1 (0.1%)
# CHECK-NEXT: 4, 6 (0.7%)
# CHECK: Schedulers - number of cycles where we saw N instructions issued:
@@ -559,8 +559,8 @@ vmovaps %ymm3, (%rbx)
# CHECK-NEXT: [4] Total number of buffer entries.
# CHECK: [1] [2] [3] [4]
-# CHECK-NEXT: PdEX 23 24 40
-# CHECK-NEXT: PdFPU 23 24 64
+# CHECK-NEXT: PdEX 22 23 40
+# CHECK-NEXT: PdFPU 22 23 64
# CHECK-NEXT: PdLoad 0 0 40
# CHECK-NEXT: PdStore 23 24 24
@@ -650,16 +650,16 @@ vmovaps %ymm3, (%rbx)
# CHECK: Dynamic Dispatch Stall Cycles:
# CHECK-NEXT: RAT - Register unavailable: 0
# CHECK-NEXT: RCU - Retire tokens unavailable: 0
-# CHECK-NEXT: SCHEDQ - Scheduler full: 369 (91.6%)
+# CHECK-NEXT: SCHEDQ - Scheduler full: 0
# CHECK-NEXT: LQ - Load queue full: 0
-# CHECK-NEXT: SQ - Store queue full: 0
+# CHECK-NEXT: SQ - Store queue full: 370 (91.8%)
# CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0
# CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
# CHECK-NEXT: [# dispatched], [# cycles]
-# CHECK-NEXT: 0, 26 (6.5%)
-# CHECK-NEXT: 1, 369 (91.6%)
-# CHECK-NEXT: 3, 1 (0.2%)
+# CHECK-NEXT: 0, 25 (6.2%)
+# CHECK-NEXT: 1, 370 (91.8%)
+# CHECK-NEXT: 2, 1 (0.2%)
# CHECK-NEXT: 4, 7 (1.7%)
# CHECK: Schedulers - number of cycles where we saw N instructions issued:
@@ -674,10 +674,10 @@ vmovaps %ymm3, (%rbx)
# CHECK-NEXT: [4] Total number of buffer entries.
# CHECK: [1] [2] [3] [4]
-# CHECK-NEXT: PdEX 22 24 40
-# CHECK-NEXT: PdFPU 22 24 64
+# CHECK-NEXT: PdEX 22 23 40
+# CHECK-NEXT: PdFPU 22 23 64
# CHECK-NEXT: PdLoad 0 0 40
-# CHECK-NEXT: PdStore 22 24 24
+# CHECK-NEXT: PdStore 23 24 24
# CHECK: Resources:
# CHECK-NEXT: [0.0] - PdAGLU01
@@ -789,7 +789,7 @@ vmovaps %ymm3, (%rbx)
# CHECK-NEXT: PdEX 1 1 40
# CHECK-NEXT: PdFPU 1 1 64
# CHECK-NEXT: PdLoad 0 0 40
-# CHECK-NEXT: PdStore 1 1 24
+# CHECK-NEXT: PdStore 2 2 24
# CHECK: Resources:
# CHECK-NEXT: [0.0] - PdAGLU01
diff --git a/llvm/tools/llvm-mca/Views/SchedulerStatistics.cpp b/llvm/tools/llvm-mca/Views/SchedulerStatistics.cpp
index edd6056c1e8..670f90127f1 100644
--- a/llvm/tools/llvm-mca/Views/SchedulerStatistics.cpp
+++ b/llvm/tools/llvm-mca/Views/SchedulerStatistics.cpp
@@ -19,29 +19,83 @@
namespace llvm {
namespace mca {
+SchedulerStatistics::SchedulerStatistics(const llvm::MCSubtargetInfo &STI)
+ : SM(STI.getSchedModel()), LQResourceID(0), SQResourceID(0), NumIssued(0),
+ NumCycles(0), MostRecentLoadDispatched(~0U),
+ MostRecentStoreDispatched(~0U),
+ IssuedPerCycle(STI.getSchedModel().NumProcResourceKinds, 0),
+ Usage(STI.getSchedModel().NumProcResourceKinds, {0, 0, 0}) {
+ if (SM.hasExtraProcessorInfo()) {
+ const MCExtraProcessorInfo &EPI = SM.getExtraProcessorInfo();
+ LQResourceID = EPI.LoadQueueID;
+ SQResourceID = EPI.StoreQueueID;
+ }
+}
+
+// FIXME: This implementation works under the assumption that load/store queue
+// entries are reserved at 'instruction dispatched' stage, and released at
+// 'instruction executed' stage. This currently matches the behavior of LSUnit.
+//
+// The current design minimizes the number of events generated by the
+// Dispatch/Execute stages, at the cost of doing extra bookkeeping in method
+// `onEvent`. However, it introduces a subtle dependency between this view and
+// how the LSUnit works.
+//
+// In future we should add a new "memory queue" event type, so that we stop
+// making assumptions on how LSUnit internally works (See PR39828).
void SchedulerStatistics::onEvent(const HWInstructionEvent &Event) {
if (Event.Type == HWInstructionEvent::Issued)
++NumIssued;
+ else if (Event.Type == HWInstructionEvent::Dispatched) {
+ const Instruction &Inst = *Event.IR.getInstruction();
+ const unsigned Index = Event.IR.getSourceIndex();
+ if (LQResourceID && Inst.getDesc().MayLoad &&
+ MostRecentLoadDispatched != Index) {
+ Usage[LQResourceID].SlotsInUse++;
+ MostRecentLoadDispatched = Index;
+ }
+ if (SQResourceID && Inst.getDesc().MayStore &&
+ MostRecentStoreDispatched != Index) {
+ Usage[SQResourceID].SlotsInUse++;
+ MostRecentStoreDispatched = Index;
+ }
+ } else if (Event.Type == HWInstructionEvent::Executed) {
+ const Instruction &Inst = *Event.IR.getInstruction();
+ if (LQResourceID && Inst.getDesc().MayLoad) {
+ assert(Usage[LQResourceID].SlotsInUse);
+ Usage[LQResourceID].SlotsInUse--;
+ }
+ if (SQResourceID && Inst.getDesc().MayStore) {
+ assert(Usage[SQResourceID].SlotsInUse);
+ Usage[SQResourceID].SlotsInUse--;
+ }
+ }
}
void SchedulerStatistics::onReservedBuffers(const InstRef & /* unused */,
ArrayRef<unsigned> Buffers) {
for (const unsigned Buffer : Buffers) {
- BufferUsage &BU = Usage[Buffer];
- BU.SlotsInUse++;
- BU.MaxUsedSlots = std::max(BU.MaxUsedSlots, BU.SlotsInUse);
+ if (Buffer == LQResourceID || Buffer == SQResourceID)
+ continue;
+ Usage[Buffer].SlotsInUse++;
}
}
void SchedulerStatistics::onReleasedBuffers(const InstRef & /* unused */,
ArrayRef<unsigned> Buffers) {
- for (const unsigned Buffer : Buffers)
+ for (const unsigned Buffer : Buffers) {
+ if (Buffer == LQResourceID || Buffer == SQResourceID)
+ continue;
Usage[Buffer].SlotsInUse--;
+ }
}
void SchedulerStatistics::updateHistograms() {
- for (BufferUsage &BU : Usage)
+ for (BufferUsage &BU : Usage) {
BU.CumulativeNumUsedSlots += BU.SlotsInUse;
+ BU.MaxUsedSlots = std::max(BU.MaxUsedSlots, BU.SlotsInUse);
+ }
+
IssuedPerCycle[NumIssued]++;
NumIssued = 0;
}
diff --git a/llvm/tools/llvm-mca/Views/SchedulerStatistics.h b/llvm/tools/llvm-mca/Views/SchedulerStatistics.h
index 56dd3af1912..d99a395a726 100644
--- a/llvm/tools/llvm-mca/Views/SchedulerStatistics.h
+++ b/llvm/tools/llvm-mca/Views/SchedulerStatistics.h
@@ -47,9 +47,15 @@ namespace mca {
class SchedulerStatistics final : public View {
const llvm::MCSchedModel &SM;
+ unsigned LQResourceID;
+ unsigned SQResourceID;
+
unsigned NumIssued;
unsigned NumCycles;
+ unsigned MostRecentLoadDispatched;
+ unsigned MostRecentStoreDispatched;
+
// Tracks the usage of a scheduler's queue.
struct BufferUsage {
unsigned SlotsInUse;
@@ -65,11 +71,7 @@ class SchedulerStatistics final : public View {
void printSchedulerUsage(llvm::raw_ostream &OS) const;
public:
- SchedulerStatistics(const llvm::MCSubtargetInfo &STI)
- : SM(STI.getSchedModel()), NumIssued(0), NumCycles(0),
- IssuedPerCycle(STI.getSchedModel().NumProcResourceKinds, 0),
- Usage(STI.getSchedModel().NumProcResourceKinds, {0, 0, 0}) {}
-
+ SchedulerStatistics(const llvm::MCSubtargetInfo &STI);
void onEvent(const HWInstructionEvent &Event) override;
void onCycleBegin() override { NumCycles++; }
void onCycleEnd() override { updateHistograms(); }
diff --git a/llvm/tools/llvm-mca/include/HardwareUnits/LSUnit.h b/llvm/tools/llvm-mca/include/HardwareUnits/LSUnit.h
index bfe3b01c4de..f8c0722b540 100644
--- a/llvm/tools/llvm-mca/include/HardwareUnits/LSUnit.h
+++ b/llvm/tools/llvm-mca/include/HardwareUnits/LSUnit.h
@@ -18,6 +18,7 @@
#include "HardwareUnits/HardwareUnit.h"
#include "llvm/ADT/SmallSet.h"
+#include "llvm/MC/MCSchedule.h"
namespace llvm {
namespace mca {
@@ -99,6 +100,44 @@ class LSUnit : public HardwareUnit {
// If true, loads will never alias with stores. This is the default.
bool NoAlias;
+ // When a `MayLoad` instruction is dispatched to the schedulers for execution,
+ // the LSUnit reserves an entry in the `LoadQueue` for it.
+ //
+ // LoadQueue keeps track of all the loads that are in-flight. A load
+ // instruction is eventually removed from the LoadQueue when it reaches
+ // completion stage. That means, a load leaves the queue whe it is 'executed',
+ // and its value can be forwarded on the data path to outside units.
+ //
+ // This class doesn't know about the latency of a load instruction. So, it
+ // conservatively/pessimistically assumes that the latency of a load opcode
+ // matches the instruction latency.
+ //
+ // FIXME: In the absence of cache misses (i.e. L1I/L1D/iTLB/dTLB hits/misses),
+ // and load/store conflicts, the latency of a load is determined by the depth
+ // of the load pipeline. So, we could use field `LoadLatency` in the
+ // MCSchedModel to model that latency.
+ // Field `LoadLatency` often matches the so-called 'load-to-use' latency from
+ // L1D, and it usually already accounts for any extra latency due to data
+ // forwarding.
+ // When doing throughput analysis, `LoadLatency` is likely to
+ // be a better predictor of load latency than instruction latency. This is
+ // particularly true when simulating code with temporal/spatial locality of
+ // memory accesses.
+ // Using `LoadLatency` (instead of the instruction latency) is also expected
+ // to improve the load queue allocation for long latency instructions with
+ // folded memory operands (See PR39829).
+ //
+ // FIXME: On some processors, load/store operations are split into multiple
+ // uOps. For example, X86 AMD Jaguar natively supports 128-bit data types, but
+ // not 256-bit data types. So, a 256-bit load is effectively split into two
+ // 128-bit loads, and each split load consumes one 'LoadQueue' entry. For
+ // simplicity, this class optimistically assumes that a load instruction only
+ // consumes one entry in the LoadQueue. Similarly, store instructions only
+ // consume a single entry in the StoreQueue.
+ // In future, we should reassess the quality of this design, and consider
+ // alternative approaches that let instructions specify the number of
+ // load/store queue entries which they consume at dispatch stage (See
+ // PR39830).
SmallSet<unsigned, 16> LoadQueue;
SmallSet<unsigned, 16> StoreQueue;
@@ -122,8 +161,8 @@ class LSUnit : public HardwareUnit {
bool isLQFull() const { return LQ_Size != 0 && LoadQueue.size() == LQ_Size; }
public:
- LSUnit(unsigned LQ = 0, unsigned SQ = 0, bool AssumeNoAlias = false)
- : LQ_Size(LQ), SQ_Size(SQ), NoAlias(AssumeNoAlias) {}
+ LSUnit(const MCSchedModel &SM, unsigned LQ = 0, unsigned SQ = 0,
+ bool AssumeNoAlias = false);
#ifndef NDEBUG
void dump() const;
@@ -149,6 +188,15 @@ public:
// 5. A load has to wait until an older load barrier is fully executed.
// 6. A store has to wait until an older store barrier is fully executed.
virtual bool isReady(const InstRef &IR) const;
+
+ // Load and store instructions are tracked by their corresponding queues from
+ // dispatch until the "instruction executed" event.
+ // Only when a load instruction reaches the 'Executed' stage, its value
+ // becomes available to the users. At that point, the load no longer needs to
+ // be tracked by the load queue.
+ // FIXME: For simplicity, we optimistically assume a similar behavior for
+ // store instructions. In practice, store operations don't tend to leave the
+ // store queue until they reach the 'Retired' stage (See PR39830).
void onInstructionExecuted(const InstRef &IR);
};
diff --git a/llvm/tools/llvm-mca/lib/Context.cpp b/llvm/tools/llvm-mca/lib/Context.cpp
index 6774a57d29b..d472ae3313a 100644
--- a/llvm/tools/llvm-mca/lib/Context.cpp
+++ b/llvm/tools/llvm-mca/lib/Context.cpp
@@ -35,8 +35,8 @@ Context::createDefaultPipeline(const PipelineOptions &Opts, InstrBuilder &IB,
// Create the hardware units defining the backend.
auto RCU = llvm::make_unique<RetireControlUnit>(SM);
auto PRF = llvm::make_unique<RegisterFile>(SM, MRI, Opts.RegisterFileSize);
- auto LSU = llvm::make_unique<LSUnit>(Opts.LoadQueueSize, Opts.StoreQueueSize,
- Opts.AssumeNoAlias);
+ auto LSU = llvm::make_unique<LSUnit>(SM, Opts.LoadQueueSize,
+ Opts.StoreQueueSize, Opts.AssumeNoAlias);
auto HWS = llvm::make_unique<Scheduler>(SM, LSU.get());
// Create the pipeline stages.
diff --git a/llvm/tools/llvm-mca/lib/HardwareUnits/LSUnit.cpp b/llvm/tools/llvm-mca/lib/HardwareUnits/LSUnit.cpp
index ae020c68432..ed8269167fe 100644
--- a/llvm/tools/llvm-mca/lib/HardwareUnits/LSUnit.cpp
+++ b/llvm/tools/llvm-mca/lib/HardwareUnits/LSUnit.cpp
@@ -22,6 +22,23 @@
namespace llvm {
namespace mca {
+LSUnit::LSUnit(const MCSchedModel &SM, unsigned LQ, unsigned SQ,
+ bool AssumeNoAlias)
+ : LQ_Size(LQ), SQ_Size(SQ), NoAlias(AssumeNoAlias) {
+ if (SM.hasExtraProcessorInfo()) {
+ const MCExtraProcessorInfo &EPI = SM.getExtraProcessorInfo();
+ if (!LQ_Size && EPI.LoadQueueID) {
+ const MCProcResourceDesc &LdQDesc = *SM.getProcResource(EPI.LoadQueueID);
+ LQ_Size = LdQDesc.BufferSize;
+ }
+
+ if (!SQ_Size && EPI.StoreQueueID) {
+ const MCProcResourceDesc &StQDesc = *SM.getProcResource(EPI.StoreQueueID);
+ SQ_Size = StQDesc.BufferSize;
+ }
+ }
+}
+
#ifndef NDEBUG
void LSUnit::dump() const {
dbgs() << "[LSUnit] LQ_Size = " << LQ_Size << '\n';
diff --git a/llvm/tools/llvm-mca/llvm-mca.cpp b/llvm/tools/llvm-mca/llvm-mca.cpp
index 985889677de..a5edbcebc88 100644
--- a/llvm/tools/llvm-mca/llvm-mca.cpp
+++ b/llvm/tools/llvm-mca/llvm-mca.cpp
@@ -151,12 +151,12 @@ static cl::opt<bool>
static cl::opt<unsigned>
LoadQueueSize("lqueue",
- cl::desc("Size of the load queue (unbound by default)"),
+ cl::desc("Size of the load queue"),
cl::cat(ToolOptions), cl::init(0));
static cl::opt<unsigned>
StoreQueueSize("squeue",
- cl::desc("Size of the store queue (unbound by default)"),
+ cl::desc("Size of the store queue"),
cl::cat(ToolOptions), cl::init(0));
static cl::opt<bool>
diff --git a/llvm/utils/TableGen/CodeGenSchedule.cpp b/llvm/utils/TableGen/CodeGenSchedule.cpp
index a9a36a87ef3..6d259cbb33e 100644
--- a/llvm/utils/TableGen/CodeGenSchedule.cpp
+++ b/llvm/utils/TableGen/CodeGenSchedule.cpp
@@ -479,6 +479,35 @@ void CodeGenSchedModels::collectRetireControlUnits() {
}
}
+void CodeGenSchedModels::collectLoadStoreQueueInfo() {
+ RecVec Queues = Records.getAllDerivedDefinitions("MemoryQueue");
+
+ for (Record *Queue : Queues) {
+ CodeGenProcModel &PM = getProcModel(Queue->getValueAsDef("SchedModel"));
+ if (Queue->isSubClassOf("LoadQueue")) {
+ if (PM.LoadQueue) {
+ PrintError(Queue->getLoc(),
+ "Expected a single LoadQueue definition");
+ PrintNote(PM.LoadQueue->getLoc(),
+ "Previous definition of LoadQueue was here");
+ }
+
+ PM.LoadQueue = Queue;
+ }
+
+ if (Queue->isSubClassOf("StoreQueue")) {
+ if (PM.StoreQueue) {
+ PrintError(Queue->getLoc(),
+ "Expected a single StoreQueue definition");
+ PrintNote(PM.StoreQueue->getLoc(),
+ "Previous definition of StoreQueue was here");
+ }
+
+ PM.StoreQueue = Queue;
+ }
+ }
+}
+
/// Collect optional processor information.
void CodeGenSchedModels::collectOptionalProcessorInfo() {
// Find register file definitions for each processor.
@@ -487,6 +516,9 @@ void CodeGenSchedModels::collectOptionalProcessorInfo() {
// Collect processor RetireControlUnit descriptors if available.
collectRetireControlUnits();
+ // Collect information about load/store queues.
+ collectLoadStoreQueueInfo();
+
checkCompleteness();
}
diff --git a/llvm/utils/TableGen/CodeGenSchedule.h b/llvm/utils/TableGen/CodeGenSchedule.h
index 9bde5f4e759..87a051b0c05 100644
--- a/llvm/utils/TableGen/CodeGenSchedule.h
+++ b/llvm/utils/TableGen/CodeGenSchedule.h
@@ -246,10 +246,14 @@ struct CodeGenProcModel {
// Optional Retire Control Unit definition.
Record *RetireControlUnit;
+ // Load/Store queue descriptors.
+ Record *LoadQueue;
+ Record *StoreQueue;
+
CodeGenProcModel(unsigned Idx, std::string Name, Record *MDef,
Record *IDef) :
Index(Idx), ModelName(std::move(Name)), ModelDef(MDef), ItinsDef(IDef),
- RetireControlUnit(nullptr) {}
+ RetireControlUnit(nullptr), LoadQueue(nullptr), StoreQueue(nullptr) {}
bool hasItineraries() const {
return !ItinsDef->getValueAsListOfDefs("IID").empty();
@@ -260,7 +264,8 @@ struct CodeGenProcModel {
}
bool hasExtraProcessorInfo() const {
- return RetireControlUnit || !RegisterFiles.empty();
+ return RetireControlUnit || LoadQueue || StoreQueue ||
+ !RegisterFiles.empty();
}
unsigned getProcResourceIdx(Record *PRDef) const;
@@ -607,6 +612,8 @@ private:
void collectSTIPredicates();
+ void collectLoadStoreQueueInfo();
+
void checkCompleteness();
void inferFromRW(ArrayRef<unsigned> OperWrites, ArrayRef<unsigned> OperReads,
diff --git a/llvm/utils/TableGen/SubtargetEmitter.cpp b/llvm/utils/TableGen/SubtargetEmitter.cpp
index 8c4e1ec8511..731c14bdb9a 100644
--- a/llvm/utils/TableGen/SubtargetEmitter.cpp
+++ b/llvm/utils/TableGen/SubtargetEmitter.cpp
@@ -93,6 +93,8 @@ class SubtargetEmitter {
&ProcItinLists);
unsigned EmitRegisterFileTables(const CodeGenProcModel &ProcModel,
raw_ostream &OS);
+ void EmitLoadStoreQueueInfo(const CodeGenProcModel &ProcModel,
+ raw_ostream &OS);
void EmitExtraProcessorInfo(const CodeGenProcModel &ProcModel,
raw_ostream &OS);
void EmitProcessorProp(raw_ostream &OS, const Record *R, StringRef Name,
@@ -697,6 +699,30 @@ SubtargetEmitter::EmitRegisterFileTables(const CodeGenProcModel &ProcModel,
return CostTblIndex;
}
+void SubtargetEmitter::EmitLoadStoreQueueInfo(const CodeGenProcModel &ProcModel,
+ raw_ostream &OS) {
+ unsigned QueueID = 0;
+ if (ProcModel.LoadQueue) {
+ const Record *Queue = ProcModel.LoadQueue->getValueAsDef("QueueDescriptor");
+ QueueID =
+ 1 + std::distance(ProcModel.ProcResourceDefs.begin(),
+ std::find(ProcModel.ProcResourceDefs.begin(),
+ ProcModel.ProcResourceDefs.end(), Queue));
+ }
+ OS << " " << QueueID << ", // Resource Descriptor for the Load Queue\n";
+
+ QueueID = 0;
+ if (ProcModel.StoreQueue) {
+ const Record *Queue =
+ ProcModel.StoreQueue->getValueAsDef("QueueDescriptor");
+ QueueID =
+ 1 + std::distance(ProcModel.ProcResourceDefs.begin(),
+ std::find(ProcModel.ProcResourceDefs.begin(),
+ ProcModel.ProcResourceDefs.end(), Queue));
+ }
+ OS << " " << QueueID << ", // Resource Descriptor for the Store Queue\n";
+}
+
void SubtargetEmitter::EmitExtraProcessorInfo(const CodeGenProcModel &ProcModel,
raw_ostream &OS) {
// Generate a table of register file descriptors (one entry per each user
@@ -715,6 +741,9 @@ void SubtargetEmitter::EmitExtraProcessorInfo(const CodeGenProcModel &ProcModel,
EmitRegisterFileInfo(ProcModel, ProcModel.RegisterFiles.size(),
NumCostEntries, OS);
+ // Add information about load/store queues.
+ EmitLoadStoreQueueInfo(ProcModel, OS);
+
OS << "};\n";
}
OpenPOWER on IntegriCloud