summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--  llvm/include/llvm/MC/MCSchedule.h                          |  2
-rw-r--r--  llvm/include/llvm/Target/TargetSchedule.td                 | 10
-rw-r--r--  llvm/lib/Target/X86/X86ScheduleBdVer2.td                   |  4
-rw-r--r--  llvm/test/tools/llvm-mca/X86/BdVer2/load-throughput.s      | 90
-rw-r--r--  llvm/test/tools/llvm-mca/X86/BdVer2/store-throughput.s     | 88
-rw-r--r--  llvm/tools/llvm-mca/Views/SchedulerStatistics.cpp          | 64
-rw-r--r--  llvm/tools/llvm-mca/Views/SchedulerStatistics.h            | 12
-rw-r--r--  llvm/tools/llvm-mca/include/HardwareUnits/LSUnit.h         | 52
-rw-r--r--  llvm/tools/llvm-mca/lib/Context.cpp                        |  4
-rw-r--r--  llvm/tools/llvm-mca/lib/HardwareUnits/LSUnit.cpp           | 17
-rw-r--r--  llvm/tools/llvm-mca/llvm-mca.cpp                           |  4
-rw-r--r--  llvm/utils/TableGen/CodeGenSchedule.cpp                    | 32
-rw-r--r--  llvm/utils/TableGen/CodeGenSchedule.h                      | 11
-rw-r--r--  llvm/utils/TableGen/SubtargetEmitter.cpp                   | 29
14 files changed, 312 insertions, 107 deletions
diff --git a/llvm/include/llvm/MC/MCSchedule.h b/llvm/include/llvm/MC/MCSchedule.h
index 41305296b00..689ac73cbdd 100644
--- a/llvm/include/llvm/MC/MCSchedule.h
+++ b/llvm/include/llvm/MC/MCSchedule.h
@@ -183,6 +183,8 @@ struct MCExtraProcessorInfo {
unsigned NumRegisterFiles;
const MCRegisterCostEntry *RegisterCostTable;
unsigned NumRegisterCostEntries;
+ unsigned LoadQueueID;
+ unsigned StoreQueueID;
};
/// Machine model for scheduling, bundling, and heuristics.
diff --git a/llvm/include/llvm/Target/TargetSchedule.td b/llvm/include/llvm/Target/TargetSchedule.td
index 3088771833c..808e183f5a5 100644
--- a/llvm/include/llvm/Target/TargetSchedule.td
+++ b/llvm/include/llvm/Target/TargetSchedule.td
@@ -561,3 +561,13 @@ class RetireControlUnit<int bufferSize, int retirePerCycle> {
int MaxRetirePerCycle = retirePerCycle;
SchedMachineModel SchedModel = ?;
}
+
+// Base class for Load/StoreQueue. It is used to identify processor resources
+// which describe load/store queues in the LS unit.
+class MemoryQueue<ProcResource PR> {
+ ProcResource QueueDescriptor = PR;
+ SchedMachineModel SchedModel = ?;
+}
+
+class LoadQueue<ProcResource LDQueue> : MemoryQueue<LDQueue>;
+class StoreQueue<ProcResource STQueue> : MemoryQueue<STQueue>;
diff --git a/llvm/lib/Target/X86/X86ScheduleBdVer2.td b/llvm/lib/Target/X86/X86ScheduleBdVer2.td
index 1a066b30f89..5798e1b2671 100644
--- a/llvm/lib/Target/X86/X86ScheduleBdVer2.td
+++ b/llvm/lib/Target/X86/X86ScheduleBdVer2.td
@@ -136,12 +136,16 @@ def PdLoad : ProcResource<2> {
let BufferSize = 40;
}
+def PdLoadQueue : LoadQueue<PdLoad>;
+
let Super = PdAGLU01 in
def PdStore : ProcResource<1> {
// For Piledriver, the store queue is 24 entries deep.
let BufferSize = 24;
}
+def PdStoreQueue : StoreQueue<PdStore>;
+
//===----------------------------------------------------------------------===//
// Integer Execution Units
//
diff --git a/llvm/test/tools/llvm-mca/X86/BdVer2/load-throughput.s b/llvm/test/tools/llvm-mca/X86/BdVer2/load-throughput.s
index 00b03afbf0d..9c45ce63fa4 100644
--- a/llvm/test/tools/llvm-mca/X86/BdVer2/load-throughput.s
+++ b/llvm/test/tools/llvm-mca/X86/BdVer2/load-throughput.s
@@ -79,16 +79,16 @@ vmovaps (%rbx), %ymm3
# CHECK: Dynamic Dispatch Stall Cycles:
# CHECK-NEXT: RAT - Register unavailable: 0
# CHECK-NEXT: RCU - Retire tokens unavailable: 0
-# CHECK-NEXT: SCHEDQ - Scheduler full: 161 (77.8%)
-# CHECK-NEXT: LQ - Load queue full: 0
+# CHECK-NEXT: SCHEDQ - Scheduler full: 0
+# CHECK-NEXT: LQ - Load queue full: 171 (82.6%)
# CHECK-NEXT: SQ - Store queue full: 0
# CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0
# CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
# CHECK-NEXT: [# dispatched], [# cycles]
-# CHECK-NEXT: 0, 26 (12.6%)
-# CHECK-NEXT: 2, 162 (78.3%)
-# CHECK-NEXT: 4, 19 (9.2%)
+# CHECK-NEXT: 0, 21 (10.1%)
+# CHECK-NEXT: 2, 172 (83.1%)
+# CHECK-NEXT: 4, 14 (6.8%)
# CHECK: Schedulers - number of cycles where we saw N instructions issued:
# CHECK-NEXT: [# issued], [# cycles]
@@ -102,9 +102,9 @@ vmovaps (%rbx), %ymm3
# CHECK-NEXT: [4] Total number of buffer entries.
# CHECK: [1] [2] [3] [4]
-# CHECK-NEXT: PdEX 35 40 40
+# CHECK-NEXT: PdEX 27 30 40
# CHECK-NEXT: PdFPU 0 0 64
-# CHECK-NEXT: PdLoad 35 40 40
+# CHECK-NEXT: PdLoad 36 40 40
# CHECK-NEXT: PdStore 0 0 24
# CHECK: Resources:
@@ -192,16 +192,16 @@ vmovaps (%rbx), %ymm3
# CHECK: Dynamic Dispatch Stall Cycles:
# CHECK-NEXT: RAT - Register unavailable: 0
# CHECK-NEXT: RCU - Retire tokens unavailable: 0
-# CHECK-NEXT: SCHEDQ - Scheduler full: 161 (77.8%)
-# CHECK-NEXT: LQ - Load queue full: 0
+# CHECK-NEXT: SCHEDQ - Scheduler full: 0
+# CHECK-NEXT: LQ - Load queue full: 171 (82.6%)
# CHECK-NEXT: SQ - Store queue full: 0
# CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0
# CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
# CHECK-NEXT: [# dispatched], [# cycles]
-# CHECK-NEXT: 0, 26 (12.6%)
-# CHECK-NEXT: 2, 162 (78.3%)
-# CHECK-NEXT: 4, 19 (9.2%)
+# CHECK-NEXT: 0, 21 (10.1%)
+# CHECK-NEXT: 2, 172 (83.1%)
+# CHECK-NEXT: 4, 14 (6.8%)
# CHECK: Schedulers - number of cycles where we saw N instructions issued:
# CHECK-NEXT: [# issued], [# cycles]
@@ -215,9 +215,9 @@ vmovaps (%rbx), %ymm3
# CHECK-NEXT: [4] Total number of buffer entries.
# CHECK: [1] [2] [3] [4]
-# CHECK-NEXT: PdEX 35 40 40
+# CHECK-NEXT: PdEX 27 30 40
# CHECK-NEXT: PdFPU 0 0 64
-# CHECK-NEXT: PdLoad 35 40 40
+# CHECK-NEXT: PdLoad 36 40 40
# CHECK-NEXT: PdStore 0 0 24
# CHECK: Resources:
@@ -305,16 +305,16 @@ vmovaps (%rbx), %ymm3
# CHECK: Dynamic Dispatch Stall Cycles:
# CHECK-NEXT: RAT - Register unavailable: 0
# CHECK-NEXT: RCU - Retire tokens unavailable: 0
-# CHECK-NEXT: SCHEDQ - Scheduler full: 161 (77.8%)
-# CHECK-NEXT: LQ - Load queue full: 0
+# CHECK-NEXT: SCHEDQ - Scheduler full: 0
+# CHECK-NEXT: LQ - Load queue full: 171 (82.6%)
# CHECK-NEXT: SQ - Store queue full: 0
# CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0
# CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
# CHECK-NEXT: [# dispatched], [# cycles]
-# CHECK-NEXT: 0, 26 (12.6%)
-# CHECK-NEXT: 2, 162 (78.3%)
-# CHECK-NEXT: 4, 19 (9.2%)
+# CHECK-NEXT: 0, 21 (10.1%)
+# CHECK-NEXT: 2, 172 (83.1%)
+# CHECK-NEXT: 4, 14 (6.8%)
# CHECK: Schedulers - number of cycles where we saw N instructions issued:
# CHECK-NEXT: [# issued], [# cycles]
@@ -328,9 +328,9 @@ vmovaps (%rbx), %ymm3
# CHECK-NEXT: [4] Total number of buffer entries.
# CHECK: [1] [2] [3] [4]
-# CHECK-NEXT: PdEX 35 40 40
+# CHECK-NEXT: PdEX 27 30 40
# CHECK-NEXT: PdFPU 0 0 64
-# CHECK-NEXT: PdLoad 35 40 40
+# CHECK-NEXT: PdLoad 36 40 40
# CHECK-NEXT: PdStore 0 0 24
# CHECK: Resources:
@@ -418,16 +418,16 @@ vmovaps (%rbx), %ymm3
# CHECK: Dynamic Dispatch Stall Cycles:
# CHECK-NEXT: RAT - Register unavailable: 0
# CHECK-NEXT: RCU - Retire tokens unavailable: 0
-# CHECK-NEXT: SCHEDQ - Scheduler full: 161 (77.8%)
-# CHECK-NEXT: LQ - Load queue full: 0
+# CHECK-NEXT: SCHEDQ - Scheduler full: 0
+# CHECK-NEXT: LQ - Load queue full: 171 (82.6%)
# CHECK-NEXT: SQ - Store queue full: 0
# CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0
# CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
# CHECK-NEXT: [# dispatched], [# cycles]
-# CHECK-NEXT: 0, 26 (12.6%)
-# CHECK-NEXT: 2, 162 (78.3%)
-# CHECK-NEXT: 4, 19 (9.2%)
+# CHECK-NEXT: 0, 21 (10.1%)
+# CHECK-NEXT: 2, 172 (83.1%)
+# CHECK-NEXT: 4, 14 (6.8%)
# CHECK: Schedulers - number of cycles where we saw N instructions issued:
# CHECK-NEXT: [# issued], [# cycles]
@@ -441,9 +441,9 @@ vmovaps (%rbx), %ymm3
# CHECK-NEXT: [4] Total number of buffer entries.
# CHECK: [1] [2] [3] [4]
-# CHECK-NEXT: PdEX 35 40 40
+# CHECK-NEXT: PdEX 27 30 40
# CHECK-NEXT: PdFPU 0 0 64
-# CHECK-NEXT: PdLoad 35 40 40
+# CHECK-NEXT: PdLoad 36 40 40
# CHECK-NEXT: PdStore 0 0 24
# CHECK: Resources:
@@ -531,16 +531,16 @@ vmovaps (%rbx), %ymm3
# CHECK: Dynamic Dispatch Stall Cycles:
# CHECK-NEXT: RAT - Register unavailable: 0
# CHECK-NEXT: RCU - Retire tokens unavailable: 0
-# CHECK-NEXT: SCHEDQ - Scheduler full: 161 (77.8%)
-# CHECK-NEXT: LQ - Load queue full: 0
+# CHECK-NEXT: SCHEDQ - Scheduler full: 0
+# CHECK-NEXT: LQ - Load queue full: 171 (82.6%)
# CHECK-NEXT: SQ - Store queue full: 0
# CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0
# CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
# CHECK-NEXT: [# dispatched], [# cycles]
-# CHECK-NEXT: 0, 26 (12.6%)
-# CHECK-NEXT: 2, 162 (78.3%)
-# CHECK-NEXT: 4, 19 (9.2%)
+# CHECK-NEXT: 0, 21 (10.1%)
+# CHECK-NEXT: 2, 172 (83.1%)
+# CHECK-NEXT: 4, 14 (6.8%)
# CHECK: Schedulers - number of cycles where we saw N instructions issued:
# CHECK-NEXT: [# issued], [# cycles]
@@ -554,9 +554,9 @@ vmovaps (%rbx), %ymm3
# CHECK-NEXT: [4] Total number of buffer entries.
# CHECK: [1] [2] [3] [4]
-# CHECK-NEXT: PdEX 35 40 40
-# CHECK-NEXT: PdFPU 35 40 64
-# CHECK-NEXT: PdLoad 35 40 40
+# CHECK-NEXT: PdEX 27 30 40
+# CHECK-NEXT: PdFPU 27 30 64
+# CHECK-NEXT: PdLoad 36 40 40
# CHECK-NEXT: PdStore 0 0 24
# CHECK: Resources:
@@ -644,16 +644,16 @@ vmovaps (%rbx), %ymm3
# CHECK: Dynamic Dispatch Stall Cycles:
# CHECK-NEXT: RAT - Register unavailable: 0
# CHECK-NEXT: RCU - Retire tokens unavailable: 0
-# CHECK-NEXT: SCHEDQ - Scheduler full: 161 (77.8%)
-# CHECK-NEXT: LQ - Load queue full: 0
+# CHECK-NEXT: SCHEDQ - Scheduler full: 0
+# CHECK-NEXT: LQ - Load queue full: 171 (82.6%)
# CHECK-NEXT: SQ - Store queue full: 0
# CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0
# CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
# CHECK-NEXT: [# dispatched], [# cycles]
-# CHECK-NEXT: 0, 26 (12.6%)
-# CHECK-NEXT: 2, 162 (78.3%)
-# CHECK-NEXT: 4, 19 (9.2%)
+# CHECK-NEXT: 0, 21 (10.1%)
+# CHECK-NEXT: 2, 172 (83.1%)
+# CHECK-NEXT: 4, 14 (6.8%)
# CHECK: Schedulers - number of cycles where we saw N instructions issued:
# CHECK-NEXT: [# issued], [# cycles]
@@ -667,9 +667,9 @@ vmovaps (%rbx), %ymm3
# CHECK-NEXT: [4] Total number of buffer entries.
# CHECK: [1] [2] [3] [4]
-# CHECK-NEXT: PdEX 35 40 40
-# CHECK-NEXT: PdFPU 35 40 64
-# CHECK-NEXT: PdLoad 35 40 40
+# CHECK-NEXT: PdEX 27 30 40
+# CHECK-NEXT: PdFPU 27 30 64
+# CHECK-NEXT: PdLoad 36 40 40
# CHECK-NEXT: PdStore 0 0 24
# CHECK: Resources:
@@ -781,7 +781,7 @@ vmovaps (%rbx), %ymm3
# CHECK: [1] [2] [3] [4]
# CHECK-NEXT: PdEX 1 2 40
# CHECK-NEXT: PdFPU 1 2 64
-# CHECK-NEXT: PdLoad 1 2 40
+# CHECK-NEXT: PdLoad 11 12 40
# CHECK-NEXT: PdStore 0 0 24
# CHECK: Resources:
diff --git a/llvm/test/tools/llvm-mca/X86/BdVer2/store-throughput.s b/llvm/test/tools/llvm-mca/X86/BdVer2/store-throughput.s
index 76892986055..67f13c3ccdd 100644
--- a/llvm/test/tools/llvm-mca/X86/BdVer2/store-throughput.s
+++ b/llvm/test/tools/llvm-mca/X86/BdVer2/store-throughput.s
@@ -79,16 +79,16 @@ vmovaps %ymm3, (%rbx)
# CHECK: Dynamic Dispatch Stall Cycles:
# CHECK-NEXT: RAT - Register unavailable: 0
# CHECK-NEXT: RCU - Retire tokens unavailable: 0
-# CHECK-NEXT: SCHEDQ - Scheduler full: 369 (91.6%)
+# CHECK-NEXT: SCHEDQ - Scheduler full: 0
# CHECK-NEXT: LQ - Load queue full: 0
-# CHECK-NEXT: SQ - Store queue full: 0
+# CHECK-NEXT: SQ - Store queue full: 370 (91.8%)
# CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0
# CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
# CHECK-NEXT: [# dispatched], [# cycles]
-# CHECK-NEXT: 0, 26 (6.5%)
-# CHECK-NEXT: 1, 369 (91.6%)
-# CHECK-NEXT: 3, 1 (0.2%)
+# CHECK-NEXT: 0, 25 (6.2%)
+# CHECK-NEXT: 1, 370 (91.8%)
+# CHECK-NEXT: 2, 1 (0.2%)
# CHECK-NEXT: 4, 7 (1.7%)
# CHECK: Schedulers - number of cycles where we saw N instructions issued:
@@ -103,10 +103,10 @@ vmovaps %ymm3, (%rbx)
# CHECK-NEXT: [4] Total number of buffer entries.
# CHECK: [1] [2] [3] [4]
-# CHECK-NEXT: PdEX 22 24 40
+# CHECK-NEXT: PdEX 22 23 40
# CHECK-NEXT: PdFPU 0 0 64
# CHECK-NEXT: PdLoad 0 0 40
-# CHECK-NEXT: PdStore 22 24 24
+# CHECK-NEXT: PdStore 23 24 24
# CHECK: Resources:
# CHECK-NEXT: [0.0] - PdAGLU01
@@ -193,16 +193,16 @@ vmovaps %ymm3, (%rbx)
# CHECK: Dynamic Dispatch Stall Cycles:
# CHECK-NEXT: RAT - Register unavailable: 0
# CHECK-NEXT: RCU - Retire tokens unavailable: 0
-# CHECK-NEXT: SCHEDQ - Scheduler full: 369 (91.6%)
+# CHECK-NEXT: SCHEDQ - Scheduler full: 0
# CHECK-NEXT: LQ - Load queue full: 0
-# CHECK-NEXT: SQ - Store queue full: 0
+# CHECK-NEXT: SQ - Store queue full: 370 (91.8%)
# CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0
# CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
# CHECK-NEXT: [# dispatched], [# cycles]
-# CHECK-NEXT: 0, 26 (6.5%)
-# CHECK-NEXT: 1, 369 (91.6%)
-# CHECK-NEXT: 3, 1 (0.2%)
+# CHECK-NEXT: 0, 25 (6.2%)
+# CHECK-NEXT: 1, 370 (91.8%)
+# CHECK-NEXT: 2, 1 (0.2%)
# CHECK-NEXT: 4, 7 (1.7%)
# CHECK: Schedulers - number of cycles where we saw N instructions issued:
@@ -217,10 +217,10 @@ vmovaps %ymm3, (%rbx)
# CHECK-NEXT: [4] Total number of buffer entries.
# CHECK: [1] [2] [3] [4]
-# CHECK-NEXT: PdEX 22 24 40
+# CHECK-NEXT: PdEX 22 23 40
# CHECK-NEXT: PdFPU 0 0 64
# CHECK-NEXT: PdLoad 0 0 40
-# CHECK-NEXT: PdStore 22 24 24
+# CHECK-NEXT: PdStore 23 24 24
# CHECK: Resources:
# CHECK-NEXT: [0.0] - PdAGLU01
@@ -307,16 +307,16 @@ vmovaps %ymm3, (%rbx)
# CHECK: Dynamic Dispatch Stall Cycles:
# CHECK-NEXT: RAT - Register unavailable: 0
# CHECK-NEXT: RCU - Retire tokens unavailable: 0
-# CHECK-NEXT: SCHEDQ - Scheduler full: 369 (91.6%)
+# CHECK-NEXT: SCHEDQ - Scheduler full: 0
# CHECK-NEXT: LQ - Load queue full: 0
-# CHECK-NEXT: SQ - Store queue full: 0
+# CHECK-NEXT: SQ - Store queue full: 370 (91.8%)
# CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0
# CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
# CHECK-NEXT: [# dispatched], [# cycles]
-# CHECK-NEXT: 0, 26 (6.5%)
-# CHECK-NEXT: 1, 369 (91.6%)
-# CHECK-NEXT: 3, 1 (0.2%)
+# CHECK-NEXT: 0, 25 (6.2%)
+# CHECK-NEXT: 1, 370 (91.8%)
+# CHECK-NEXT: 2, 1 (0.2%)
# CHECK-NEXT: 4, 7 (1.7%)
# CHECK: Schedulers - number of cycles where we saw N instructions issued:
@@ -331,10 +331,10 @@ vmovaps %ymm3, (%rbx)
# CHECK-NEXT: [4] Total number of buffer entries.
# CHECK: [1] [2] [3] [4]
-# CHECK-NEXT: PdEX 22 24 40
+# CHECK-NEXT: PdEX 22 23 40
# CHECK-NEXT: PdFPU 0 0 64
# CHECK-NEXT: PdLoad 0 0 40
-# CHECK-NEXT: PdStore 22 24 24
+# CHECK-NEXT: PdStore 23 24 24
# CHECK: Resources:
# CHECK-NEXT: [0.0] - PdAGLU01
@@ -421,16 +421,16 @@ vmovaps %ymm3, (%rbx)
# CHECK: Dynamic Dispatch Stall Cycles:
# CHECK-NEXT: RAT - Register unavailable: 0
# CHECK-NEXT: RCU - Retire tokens unavailable: 0
-# CHECK-NEXT: SCHEDQ - Scheduler full: 369 (91.6%)
+# CHECK-NEXT: SCHEDQ - Scheduler full: 0
# CHECK-NEXT: LQ - Load queue full: 0
-# CHECK-NEXT: SQ - Store queue full: 0
+# CHECK-NEXT: SQ - Store queue full: 370 (91.8%)
# CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0
# CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
# CHECK-NEXT: [# dispatched], [# cycles]
-# CHECK-NEXT: 0, 26 (6.5%)
-# CHECK-NEXT: 1, 369 (91.6%)
-# CHECK-NEXT: 3, 1 (0.2%)
+# CHECK-NEXT: 0, 25 (6.2%)
+# CHECK-NEXT: 1, 370 (91.8%)
+# CHECK-NEXT: 2, 1 (0.2%)
# CHECK-NEXT: 4, 7 (1.7%)
# CHECK: Schedulers - number of cycles where we saw N instructions issued:
@@ -445,10 +445,10 @@ vmovaps %ymm3, (%rbx)
# CHECK-NEXT: [4] Total number of buffer entries.
# CHECK: [1] [2] [3] [4]
-# CHECK-NEXT: PdEX 22 24 40
+# CHECK-NEXT: PdEX 22 23 40
# CHECK-NEXT: PdFPU 0 0 64
# CHECK-NEXT: PdLoad 0 0 40
-# CHECK-NEXT: PdStore 22 24 24
+# CHECK-NEXT: PdStore 23 24 24
# CHECK: Resources:
# CHECK-NEXT: [0.0] - PdAGLU01
@@ -535,16 +535,16 @@ vmovaps %ymm3, (%rbx)
# CHECK: Dynamic Dispatch Stall Cycles:
# CHECK-NEXT: RAT - Register unavailable: 0
# CHECK-NEXT: RCU - Retire tokens unavailable: 0
-# CHECK-NEXT: SCHEDQ - Scheduler full: 745 (92.8%)
+# CHECK-NEXT: SCHEDQ - Scheduler full: 0
# CHECK-NEXT: LQ - Load queue full: 0
-# CHECK-NEXT: SQ - Store queue full: 0
+# CHECK-NEXT: SQ - Store queue full: 747 (93.0%)
# CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0
# CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
# CHECK-NEXT: [# dispatched], [# cycles]
-# CHECK-NEXT: 0, 423 (52.7%)
-# CHECK-NEXT: 1, 373 (46.5%)
-# CHECK-NEXT: 3, 1 (0.1%)
+# CHECK-NEXT: 0, 422 (52.6%)
+# CHECK-NEXT: 1, 374 (46.6%)
+# CHECK-NEXT: 2, 1 (0.1%)
# CHECK-NEXT: 4, 6 (0.7%)
# CHECK: Schedulers - number of cycles where we saw N instructions issued:
@@ -559,8 +559,8 @@ vmovaps %ymm3, (%rbx)
# CHECK-NEXT: [4] Total number of buffer entries.
# CHECK: [1] [2] [3] [4]
-# CHECK-NEXT: PdEX 23 24 40
-# CHECK-NEXT: PdFPU 23 24 64
+# CHECK-NEXT: PdEX 22 23 40
+# CHECK-NEXT: PdFPU 22 23 64
# CHECK-NEXT: PdLoad 0 0 40
# CHECK-NEXT: PdStore 23 24 24
@@ -650,16 +650,16 @@ vmovaps %ymm3, (%rbx)
# CHECK: Dynamic Dispatch Stall Cycles:
# CHECK-NEXT: RAT - Register unavailable: 0
# CHECK-NEXT: RCU - Retire tokens unavailable: 0
-# CHECK-NEXT: SCHEDQ - Scheduler full: 369 (91.6%)
+# CHECK-NEXT: SCHEDQ - Scheduler full: 0
# CHECK-NEXT: LQ - Load queue full: 0
-# CHECK-NEXT: SQ - Store queue full: 0
+# CHECK-NEXT: SQ - Store queue full: 370 (91.8%)
# CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0
# CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
# CHECK-NEXT: [# dispatched], [# cycles]
-# CHECK-NEXT: 0, 26 (6.5%)
-# CHECK-NEXT: 1, 369 (91.6%)
-# CHECK-NEXT: 3, 1 (0.2%)
+# CHECK-NEXT: 0, 25 (6.2%)
+# CHECK-NEXT: 1, 370 (91.8%)
+# CHECK-NEXT: 2, 1 (0.2%)
# CHECK-NEXT: 4, 7 (1.7%)
# CHECK: Schedulers - number of cycles where we saw N instructions issued:
@@ -674,10 +674,10 @@ vmovaps %ymm3, (%rbx)
# CHECK-NEXT: [4] Total number of buffer entries.
# CHECK: [1] [2] [3] [4]
-# CHECK-NEXT: PdEX 22 24 40
-# CHECK-NEXT: PdFPU 22 24 64
+# CHECK-NEXT: PdEX 22 23 40
+# CHECK-NEXT: PdFPU 22 23 64
# CHECK-NEXT: PdLoad 0 0 40
-# CHECK-NEXT: PdStore 22 24 24
+# CHECK-NEXT: PdStore 23 24 24
# CHECK: Resources:
# CHECK-NEXT: [0.0] - PdAGLU01
@@ -789,7 +789,7 @@ vmovaps %ymm3, (%rbx)
# CHECK-NEXT: PdEX 1 1 40
# CHECK-NEXT: PdFPU 1 1 64
# CHECK-NEXT: PdLoad 0 0 40
-# CHECK-NEXT: PdStore 1 1 24
+# CHECK-NEXT: PdStore 2 2 24
# CHECK: Resources:
# CHECK-NEXT: [0.0] - PdAGLU01
diff --git a/llvm/tools/llvm-mca/Views/SchedulerStatistics.cpp b/llvm/tools/llvm-mca/Views/SchedulerStatistics.cpp
index edd6056c1e8..670f90127f1 100644
--- a/llvm/tools/llvm-mca/Views/SchedulerStatistics.cpp
+++ b/llvm/tools/llvm-mca/Views/SchedulerStatistics.cpp
@@ -19,29 +19,83 @@
namespace llvm {
namespace mca {
+SchedulerStatistics::SchedulerStatistics(const llvm::MCSubtargetInfo &STI)
+ : SM(STI.getSchedModel()), LQResourceID(0), SQResourceID(0), NumIssued(0),
+ NumCycles(0), MostRecentLoadDispatched(~0U),
+ MostRecentStoreDispatched(~0U),
+ IssuedPerCycle(STI.getSchedModel().NumProcResourceKinds, 0),
+ Usage(STI.getSchedModel().NumProcResourceKinds, {0, 0, 0}) {
+ if (SM.hasExtraProcessorInfo()) {
+ const MCExtraProcessorInfo &EPI = SM.getExtraProcessorInfo();
+ LQResourceID = EPI.LoadQueueID;
+ SQResourceID = EPI.StoreQueueID;
+ }
+}
+
+// FIXME: This implementation works under the assumption that load/store queue
+// entries are reserved at 'instruction dispatched' stage, and released at
+// 'instruction executed' stage. This currently matches the behavior of LSUnit.
+//
+// The current design minimizes the number of events generated by the
+// Dispatch/Execute stages, at the cost of doing extra bookkeeping in method
+// `onEvent`. However, it introduces a subtle dependency between this view and
+// how the LSUnit works.
+//
+// In future we should add a new "memory queue" event type, so that we stop
+// making assumptions on how LSUnit internally works (See PR39828).
void SchedulerStatistics::onEvent(const HWInstructionEvent &Event) {
if (Event.Type == HWInstructionEvent::Issued)
++NumIssued;
+ else if (Event.Type == HWInstructionEvent::Dispatched) {
+ const Instruction &Inst = *Event.IR.getInstruction();
+ const unsigned Index = Event.IR.getSourceIndex();
+ if (LQResourceID && Inst.getDesc().MayLoad &&
+ MostRecentLoadDispatched != Index) {
+ Usage[LQResourceID].SlotsInUse++;
+ MostRecentLoadDispatched = Index;
+ }
+ if (SQResourceID && Inst.getDesc().MayStore &&
+ MostRecentStoreDispatched != Index) {
+ Usage[SQResourceID].SlotsInUse++;
+ MostRecentStoreDispatched = Index;
+ }
+ } else if (Event.Type == HWInstructionEvent::Executed) {
+ const Instruction &Inst = *Event.IR.getInstruction();
+ if (LQResourceID && Inst.getDesc().MayLoad) {
+ assert(Usage[LQResourceID].SlotsInUse);
+ Usage[LQResourceID].SlotsInUse--;
+ }
+ if (SQResourceID && Inst.getDesc().MayStore) {
+ assert(Usage[SQResourceID].SlotsInUse);
+ Usage[SQResourceID].SlotsInUse--;
+ }
+ }
}
void SchedulerStatistics::onReservedBuffers(const InstRef & /* unused */,
ArrayRef<unsigned> Buffers) {
for (const unsigned Buffer : Buffers) {
- BufferUsage &BU = Usage[Buffer];
- BU.SlotsInUse++;
- BU.MaxUsedSlots = std::max(BU.MaxUsedSlots, BU.SlotsInUse);
+ if (Buffer == LQResourceID || Buffer == SQResourceID)
+ continue;
+ Usage[Buffer].SlotsInUse++;
}
}
void SchedulerStatistics::onReleasedBuffers(const InstRef & /* unused */,
ArrayRef<unsigned> Buffers) {
- for (const unsigned Buffer : Buffers)
+ for (const unsigned Buffer : Buffers) {
+ if (Buffer == LQResourceID || Buffer == SQResourceID)
+ continue;
Usage[Buffer].SlotsInUse--;
+ }
}
void SchedulerStatistics::updateHistograms() {
- for (BufferUsage &BU : Usage)
+ for (BufferUsage &BU : Usage) {
BU.CumulativeNumUsedSlots += BU.SlotsInUse;
+ BU.MaxUsedSlots = std::max(BU.MaxUsedSlots, BU.SlotsInUse);
+ }
+
IssuedPerCycle[NumIssued]++;
NumIssued = 0;
}
diff --git a/llvm/tools/llvm-mca/Views/SchedulerStatistics.h b/llvm/tools/llvm-mca/Views/SchedulerStatistics.h
index 56dd3af1912..d99a395a726 100644
--- a/llvm/tools/llvm-mca/Views/SchedulerStatistics.h
+++ b/llvm/tools/llvm-mca/Views/SchedulerStatistics.h
@@ -47,9 +47,15 @@ namespace mca {
class SchedulerStatistics final : public View {
const llvm::MCSchedModel &SM;
+ unsigned LQResourceID;
+ unsigned SQResourceID;
+
unsigned NumIssued;
unsigned NumCycles;
+ unsigned MostRecentLoadDispatched;
+ unsigned MostRecentStoreDispatched;
+
// Tracks the usage of a scheduler's queue.
struct BufferUsage {
unsigned SlotsInUse;
@@ -65,11 +71,7 @@ class SchedulerStatistics final : public View {
void printSchedulerUsage(llvm::raw_ostream &OS) const;
public:
- SchedulerStatistics(const llvm::MCSubtargetInfo &STI)
- : SM(STI.getSchedModel()), NumIssued(0), NumCycles(0),
- IssuedPerCycle(STI.getSchedModel().NumProcResourceKinds, 0),
- Usage(STI.getSchedModel().NumProcResourceKinds, {0, 0, 0}) {}
-
+ SchedulerStatistics(const llvm::MCSubtargetInfo &STI);
void onEvent(const HWInstructionEvent &Event) override;
void onCycleBegin() override { NumCycles++; }
void onCycleEnd() override { updateHistograms(); }
diff --git a/llvm/tools/llvm-mca/include/HardwareUnits/LSUnit.h b/llvm/tools/llvm-mca/include/HardwareUnits/LSUnit.h
index bfe3b01c4de..f8c0722b540 100644
--- a/llvm/tools/llvm-mca/include/HardwareUnits/LSUnit.h
+++ b/llvm/tools/llvm-mca/include/HardwareUnits/LSUnit.h
@@ -18,6 +18,7 @@
#include "HardwareUnits/HardwareUnit.h"
#include "llvm/ADT/SmallSet.h"
+#include "llvm/MC/MCSchedule.h"
namespace llvm {
namespace mca {
@@ -99,6 +100,44 @@ class LSUnit : public HardwareUnit {
// If true, loads will never alias with stores. This is the default.
bool NoAlias;
+ // When a `MayLoad` instruction is dispatched to the schedulers for execution,
+ // the LSUnit reserves an entry in the `LoadQueue` for it.
+ //
+ // LoadQueue keeps track of all the loads that are in-flight. A load
+ // instruction is eventually removed from the LoadQueue when it reaches
+ // completion stage. That means, a load leaves the queue whe it is 'executed',
+ // and its value can be forwarded on the data path to outside units.
+ //
+ // This class doesn't know about the latency of a load instruction. So, it
+ // conservatively/pessimistically assumes that the latency of a load opcode
+ // matches the instruction latency.
+ //
+ // FIXME: In the absence of cache misses (i.e. L1I/L1D/iTLB/dTLB hits/misses),
+ // and load/store conflicts, the latency of a load is determined by the depth
+ // of the load pipeline. So, we could use field `LoadLatency` in the
+ // MCSchedModel to model that latency.
+ // Field `LoadLatency` often matches the so-called 'load-to-use' latency from
+ // L1D, and it usually already accounts for any extra latency due to data
+ // forwarding.
+ // When doing throughput analysis, `LoadLatency` is likely to
+ // be a better predictor of load latency than instruction latency. This is
+ // particularly true when simulating code with temporal/spatial locality of
+ // memory accesses.
+ // Using `LoadLatency` (instead of the instruction latency) is also expected
+ // to improve the load queue allocation for long latency instructions with
+ // folded memory operands (See PR39829).
+ //
+ // FIXME: On some processors, load/store operations are split into multiple
+ // uOps. For example, X86 AMD Jaguar natively supports 128-bit data types, but
+ // not 256-bit data types. So, a 256-bit load is effectively split into two
+ // 128-bit loads, and each split load consumes one 'LoadQueue' entry. For
+ // simplicity, this class optimistically assumes that a load instruction only
+ // consumes one entry in the LoadQueue. Similarly, store instructions only
+ // consume a single entry in the StoreQueue.
+ // In future, we should reassess the quality of this design, and consider
+ // alternative approaches that let instructions specify the number of
+ // load/store queue entries which they consume at dispatch stage (See
+ // PR39830).
SmallSet<unsigned, 16> LoadQueue;
SmallSet<unsigned, 16> StoreQueue;
@@ -122,8 +161,8 @@ class LSUnit : public HardwareUnit {
bool isLQFull() const { return LQ_Size != 0 && LoadQueue.size() == LQ_Size; }
public:
- LSUnit(unsigned LQ = 0, unsigned SQ = 0, bool AssumeNoAlias = false)
- : LQ_Size(LQ), SQ_Size(SQ), NoAlias(AssumeNoAlias) {}
+ LSUnit(const MCSchedModel &SM, unsigned LQ = 0, unsigned SQ = 0,
+ bool AssumeNoAlias = false);
#ifndef NDEBUG
void dump() const;
@@ -149,6 +188,15 @@ public:
// 5. A load has to wait until an older load barrier is fully executed.
// 6. A store has to wait until an older store barrier is fully executed.
virtual bool isReady(const InstRef &IR) const;
+
+ // Load and store instructions are tracked by their corresponding queues from
+ // dispatch until the "instruction executed" event.
+ // Only when a load instruction reaches the 'Executed' stage, its value
+ // becomes available to the users. At that point, the load no longer needs to
+ // be tracked by the load queue.
+ // FIXME: For simplicity, we optimistically assume a similar behavior for
+ // store instructions. In practice, store operations don't tend to leave the
+ // store queue until they reach the 'Retired' stage (See PR39830).
void onInstructionExecuted(const InstRef &IR);
};
diff --git a/llvm/tools/llvm-mca/lib/Context.cpp b/llvm/tools/llvm-mca/lib/Context.cpp
index 6774a57d29b..d472ae3313a 100644
--- a/llvm/tools/llvm-mca/lib/Context.cpp
+++ b/llvm/tools/llvm-mca/lib/Context.cpp
@@ -35,8 +35,8 @@ Context::createDefaultPipeline(const PipelineOptions &Opts, InstrBuilder &IB,
// Create the hardware units defining the backend.
auto RCU = llvm::make_unique<RetireControlUnit>(SM);
auto PRF = llvm::make_unique<RegisterFile>(SM, MRI, Opts.RegisterFileSize);
- auto LSU = llvm::make_unique<LSUnit>(Opts.LoadQueueSize, Opts.StoreQueueSize,
- Opts.AssumeNoAlias);
+ auto LSU = llvm::make_unique<LSUnit>(SM, Opts.LoadQueueSize,
+ Opts.StoreQueueSize, Opts.AssumeNoAlias);
auto HWS = llvm::make_unique<Scheduler>(SM, LSU.get());
// Create the pipeline stages.
diff --git a/llvm/tools/llvm-mca/lib/HardwareUnits/LSUnit.cpp b/llvm/tools/llvm-mca/lib/HardwareUnits/LSUnit.cpp
index ae020c68432..ed8269167fe 100644
--- a/llvm/tools/llvm-mca/lib/HardwareUnits/LSUnit.cpp
+++ b/llvm/tools/llvm-mca/lib/HardwareUnits/LSUnit.cpp
@@ -22,6 +22,23 @@
namespace llvm {
namespace mca {
+LSUnit::LSUnit(const MCSchedModel &SM, unsigned LQ, unsigned SQ,
+ bool AssumeNoAlias)
+ : LQ_Size(LQ), SQ_Size(SQ), NoAlias(AssumeNoAlias) {
+ if (SM.hasExtraProcessorInfo()) {
+ const MCExtraProcessorInfo &EPI = SM.getExtraProcessorInfo();
+ if (!LQ_Size && EPI.LoadQueueID) {
+ const MCProcResourceDesc &LdQDesc = *SM.getProcResource(EPI.LoadQueueID);
+ LQ_Size = LdQDesc.BufferSize;
+ }
+
+ if (!SQ_Size && EPI.StoreQueueID) {
+ const MCProcResourceDesc &StQDesc = *SM.getProcResource(EPI.StoreQueueID);
+ SQ_Size = StQDesc.BufferSize;
+ }
+ }
+}
+
#ifndef NDEBUG
void LSUnit::dump() const {
dbgs() << "[LSUnit] LQ_Size = " << LQ_Size << '\n';
diff --git a/llvm/tools/llvm-mca/llvm-mca.cpp b/llvm/tools/llvm-mca/llvm-mca.cpp
index 985889677de..a5edbcebc88 100644
--- a/llvm/tools/llvm-mca/llvm-mca.cpp
+++ b/llvm/tools/llvm-mca/llvm-mca.cpp
@@ -151,12 +151,12 @@ static cl::opt<bool>
static cl::opt<unsigned>
LoadQueueSize("lqueue",
- cl::desc("Size of the load queue (unbound by default)"),
+ cl::desc("Size of the load queue"),
cl::cat(ToolOptions), cl::init(0));
static cl::opt<unsigned>
StoreQueueSize("squeue",
- cl::desc("Size of the store queue (unbound by default)"),
+ cl::desc("Size of the store queue"),
cl::cat(ToolOptions), cl::init(0));
static cl::opt<bool>
diff --git a/llvm/utils/TableGen/CodeGenSchedule.cpp b/llvm/utils/TableGen/CodeGenSchedule.cpp
index a9a36a87ef3..6d259cbb33e 100644
--- a/llvm/utils/TableGen/CodeGenSchedule.cpp
+++ b/llvm/utils/TableGen/CodeGenSchedule.cpp
@@ -479,6 +479,35 @@ void CodeGenSchedModels::collectRetireControlUnits() {
}
}
+void CodeGenSchedModels::collectLoadStoreQueueInfo() {
+ RecVec Queues = Records.getAllDerivedDefinitions("MemoryQueue");
+
+ for (Record *Queue : Queues) {
+ CodeGenProcModel &PM = getProcModel(Queue->getValueAsDef("SchedModel"));
+ if (Queue->isSubClassOf("LoadQueue")) {
+ if (PM.LoadQueue) {
+ PrintError(Queue->getLoc(),
+ "Expected a single LoadQueue definition");
+ PrintNote(PM.LoadQueue->getLoc(),
+ "Previous definition of LoadQueue was here");
+ }
+
+ PM.LoadQueue = Queue;
+ }
+
+ if (Queue->isSubClassOf("StoreQueue")) {
+ if (PM.StoreQueue) {
+ PrintError(Queue->getLoc(),
+ "Expected a single StoreQueue definition");
+ PrintNote(PM.StoreQueue->getLoc(),
+ "Previous definition of StoreQueue was here");
+ }
+
+ PM.StoreQueue = Queue;
+ }
+ }
+}
+
/// Collect optional processor information.
void CodeGenSchedModels::collectOptionalProcessorInfo() {
// Find register file definitions for each processor.
@@ -487,6 +516,9 @@ void CodeGenSchedModels::collectOptionalProcessorInfo() {
// Collect processor RetireControlUnit descriptors if available.
collectRetireControlUnits();
+ // Collect information about load/store queues.
+ collectLoadStoreQueueInfo();
+
checkCompleteness();
}
diff --git a/llvm/utils/TableGen/CodeGenSchedule.h b/llvm/utils/TableGen/CodeGenSchedule.h
index 9bde5f4e759..87a051b0c05 100644
--- a/llvm/utils/TableGen/CodeGenSchedule.h
+++ b/llvm/utils/TableGen/CodeGenSchedule.h
@@ -246,10 +246,14 @@ struct CodeGenProcModel {
// Optional Retire Control Unit definition.
Record *RetireControlUnit;
+ // Load/Store queue descriptors.
+ Record *LoadQueue;
+ Record *StoreQueue;
+
CodeGenProcModel(unsigned Idx, std::string Name, Record *MDef,
Record *IDef) :
Index(Idx), ModelName(std::move(Name)), ModelDef(MDef), ItinsDef(IDef),
- RetireControlUnit(nullptr) {}
+ RetireControlUnit(nullptr), LoadQueue(nullptr), StoreQueue(nullptr) {}
bool hasItineraries() const {
return !ItinsDef->getValueAsListOfDefs("IID").empty();
@@ -260,7 +264,8 @@ struct CodeGenProcModel {
}
bool hasExtraProcessorInfo() const {
- return RetireControlUnit || !RegisterFiles.empty();
+ return RetireControlUnit || LoadQueue || StoreQueue ||
+ !RegisterFiles.empty();
}
unsigned getProcResourceIdx(Record *PRDef) const;
@@ -607,6 +612,8 @@ private:
void collectSTIPredicates();
+ void collectLoadStoreQueueInfo();
+
void checkCompleteness();
void inferFromRW(ArrayRef<unsigned> OperWrites, ArrayRef<unsigned> OperReads,
diff --git a/llvm/utils/TableGen/SubtargetEmitter.cpp b/llvm/utils/TableGen/SubtargetEmitter.cpp
index 8c4e1ec8511..731c14bdb9a 100644
--- a/llvm/utils/TableGen/SubtargetEmitter.cpp
+++ b/llvm/utils/TableGen/SubtargetEmitter.cpp
@@ -93,6 +93,8 @@ class SubtargetEmitter {
&ProcItinLists);
unsigned EmitRegisterFileTables(const CodeGenProcModel &ProcModel,
raw_ostream &OS);
+ void EmitLoadStoreQueueInfo(const CodeGenProcModel &ProcModel,
+ raw_ostream &OS);
void EmitExtraProcessorInfo(const CodeGenProcModel &ProcModel,
raw_ostream &OS);
void EmitProcessorProp(raw_ostream &OS, const Record *R, StringRef Name,
@@ -697,6 +699,30 @@ SubtargetEmitter::EmitRegisterFileTables(const CodeGenProcModel &ProcModel,
return CostTblIndex;
}
+void SubtargetEmitter::EmitLoadStoreQueueInfo(const CodeGenProcModel &ProcModel,
+ raw_ostream &OS) {
+ unsigned QueueID = 0;
+ if (ProcModel.LoadQueue) {
+ const Record *Queue = ProcModel.LoadQueue->getValueAsDef("QueueDescriptor");
+ QueueID =
+ 1 + std::distance(ProcModel.ProcResourceDefs.begin(),
+ std::find(ProcModel.ProcResourceDefs.begin(),
+ ProcModel.ProcResourceDefs.end(), Queue));
+ }
+ OS << " " << QueueID << ", // Resource Descriptor for the Load Queue\n";
+
+ QueueID = 0;
+ if (ProcModel.StoreQueue) {
+ const Record *Queue =
+ ProcModel.StoreQueue->getValueAsDef("QueueDescriptor");
+ QueueID =
+ 1 + std::distance(ProcModel.ProcResourceDefs.begin(),
+ std::find(ProcModel.ProcResourceDefs.begin(),
+ ProcModel.ProcResourceDefs.end(), Queue));
+ }
+ OS << " " << QueueID << ", // Resource Descriptor for the Store Queue\n";
+}
+
void SubtargetEmitter::EmitExtraProcessorInfo(const CodeGenProcModel &ProcModel,
raw_ostream &OS) {
// Generate a table of register file descriptors (one entry per each user
@@ -715,6 +741,9 @@ void SubtargetEmitter::EmitExtraProcessorInfo(const CodeGenProcModel &ProcModel,
EmitRegisterFileInfo(ProcModel, ProcModel.RegisterFiles.size(),
NumCostEntries, OS);
+ // Add information about load/store queues.
+ EmitLoadStoreQueueInfo(ProcModel, OS);
+
OS << "};\n";
}
OpenPOWER on IntegriCloud