summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- llvm/include/llvm/MC/MCSchedule.h 6
-rw-r--r-- llvm/include/llvm/MC/MCSubtargetInfo.h 10
-rw-r--r-- llvm/include/llvm/MCA/Instruction.h 4
-rw-r--r-- llvm/lib/CodeGen/TargetSubtargetInfo.cpp 18
-rw-r--r-- llvm/lib/MC/MCSchedule.cpp 16
-rw-r--r-- llvm/lib/MCA/InstrBuilder.cpp 1
-rw-r--r-- llvm/lib/Target/X86/X86InstrMMX.td 2
-rw-r--r-- llvm/lib/Target/X86/X86InstrSSE.td 8
-rw-r--r-- llvm/lib/Target/X86/X86SchedBroadwell.td 2
-rw-r--r-- llvm/lib/Target/X86/X86SchedHaswell.td 2
-rw-r--r-- llvm/lib/Target/X86/X86SchedSandyBridge.td 2
-rw-r--r-- llvm/lib/Target/X86/X86SchedSkylakeClient.td 2
-rw-r--r-- llvm/lib/Target/X86/X86SchedSkylakeServer.td 2
-rw-r--r-- llvm/lib/Target/X86/X86Schedule.td 6
-rw-r--r-- llvm/lib/Target/X86/X86ScheduleAtom.td 2
-rw-r--r-- llvm/lib/Target/X86/X86ScheduleBdVer2.td 2
-rw-r--r-- llvm/lib/Target/X86/X86ScheduleBtVer2.td 7
-rw-r--r-- llvm/lib/Target/X86/X86ScheduleSLM.td 2
-rw-r--r-- llvm/lib/Target/X86/X86ScheduleZnver1.td 2
-rw-r--r-- llvm/test/CodeGen/X86/mmx-schedule.ll 2
-rw-r--r-- llvm/test/CodeGen/X86/sse41-schedule.ll 4
-rw-r--r-- llvm/test/tools/llvm-mca/X86/BtVer2/int-to-fpu-forwarding-1.s 24
-rw-r--r-- llvm/test/tools/llvm-mca/X86/BtVer2/int-to-fpu-forwarding-3.s 34
-rw-r--r-- llvm/tools/llvm-mca/Views/InstructionInfoView.cpp 3
24 files changed, 123 insertions, 40 deletions
diff --git a/llvm/include/llvm/MC/MCSchedule.h b/llvm/include/llvm/MC/MCSchedule.h
index 25a3a9cdb8f..df3248ee6e8 100644
--- a/llvm/include/llvm/MC/MCSchedule.h
+++ b/llvm/include/llvm/MC/MCSchedule.h
@@ -14,6 +14,7 @@
#ifndef LLVM_MC_MCSCHEDULE_H
#define LLVM_MC_MCSCHEDULE_H
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/Optional.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/Support/DataTypes.h"
@@ -369,6 +370,11 @@ struct MCSchedModel {
getReciprocalThroughput(const MCSubtargetInfo &STI, const MCInstrInfo &MCII,
const MCInst &Inst) const;
+ /// Returns the maximum forwarding delay (in cycles) for register reads that
+ /// depend on writes of the write resource identified by WriteResourceIdx.
+ static unsigned getForwardingDelayCycles(ArrayRef<MCReadAdvanceEntry> Entries,
+ unsigned WriteResourceIdx = 0);
+
/// Returns the default initialized model.
static const MCSchedModel &GetDefaultSchedModel() { return Default; }
static const MCSchedModel Default;
diff --git a/llvm/include/llvm/MC/MCSubtargetInfo.h b/llvm/include/llvm/MC/MCSubtargetInfo.h
index 03eea1e8dce..2ad72c3c325 100644
--- a/llvm/include/llvm/MC/MCSubtargetInfo.h
+++ b/llvm/include/llvm/MC/MCSubtargetInfo.h
@@ -152,6 +152,16 @@ public:
return 0;
}
+ /// Return the set of ReadAdvance entries declared by the given scheduling
+ /// class descriptor.
+ ArrayRef<MCReadAdvanceEntry>
+ getReadAdvanceEntries(const MCSchedClassDesc &SC) const {
+ if (!SC.NumReadAdvanceEntries)
+ return ArrayRef<MCReadAdvanceEntry>();
+ return ArrayRef<MCReadAdvanceEntry>(&ReadAdvanceTable[SC.ReadAdvanceIdx],
+ SC.NumReadAdvanceEntries);
+ }
+
/// Get scheduling itinerary of a CPU.
InstrItineraryData getInstrItineraryForCPU(StringRef CPU) const;
diff --git a/llvm/include/llvm/MCA/Instruction.h b/llvm/include/llvm/MCA/Instruction.h
index 27d135e3510..3effa2b7654 100644
--- a/llvm/include/llvm/MCA/Instruction.h
+++ b/llvm/include/llvm/MCA/Instruction.h
@@ -332,6 +332,10 @@ struct InstrDesc {
unsigned MaxLatency;
// Number of MicroOps for this instruction.
unsigned NumMicroOps;
+ // SchedClassID used to construct this InstrDesc.
+ // This information is currently used by views to do fast queries on the
+ // subtarget when computing the reciprocal throughput.
+ unsigned SchedClassID;
bool MayLoad;
bool MayStore;
diff --git a/llvm/lib/CodeGen/TargetSubtargetInfo.cpp b/llvm/lib/CodeGen/TargetSubtargetInfo.cpp
index c9f90b88cac..e34f9a1579d 100644
--- a/llvm/lib/CodeGen/TargetSubtargetInfo.cpp
+++ b/llvm/lib/CodeGen/TargetSubtargetInfo.cpp
@@ -88,6 +88,12 @@ std::string TargetSubtargetInfo::getSchedInfoStr(const MachineInstr &MI) const {
TargetSchedModel TSchedModel;
TSchedModel.init(this);
unsigned Latency = TSchedModel.computeInstrLatency(&MI);
+
+ // Add extra latency due to forwarding delays.
+ const MCSchedClassDesc &SCDesc = *TSchedModel.resolveSchedClass(&MI);
+ Latency +=
+ MCSchedModel::getForwardingDelayCycles(getReadAdvanceEntries(SCDesc));
+
double RThroughput = TSchedModel.computeReciprocalThroughput(&MI);
return createSchedInfoStr(Latency, RThroughput);
}
@@ -99,9 +105,17 @@ std::string TargetSubtargetInfo::getSchedInfoStr(MCInst const &MCI) const {
TargetSchedModel TSchedModel;
TSchedModel.init(this);
unsigned Latency;
- if (TSchedModel.hasInstrSchedModel())
+ if (TSchedModel.hasInstrSchedModel()) {
Latency = TSchedModel.computeInstrLatency(MCI);
- else if (TSchedModel.hasInstrItineraries()) {
+ // Add extra latency due to forwarding delays.
+ const MCSchedModel &SM = *TSchedModel.getMCSchedModel();
+ unsigned SClassID = getInstrInfo()->get(MCI.getOpcode()).getSchedClass();
+ while (SM.getSchedClassDesc(SClassID)->isVariant())
+ SClassID = resolveVariantSchedClass(SClassID, &MCI, SM.ProcID);
+ const MCSchedClassDesc &SCDesc = *SM.getSchedClassDesc(SClassID);
+ Latency +=
+ MCSchedModel::getForwardingDelayCycles(getReadAdvanceEntries(SCDesc));
+ } else if (TSchedModel.hasInstrItineraries()) {
auto *ItinData = TSchedModel.getInstrItineraries();
Latency = ItinData->getStageLatency(
getInstrInfo()->get(MCI.getOpcode()).getSchedClass());
diff --git a/llvm/lib/MC/MCSchedule.cpp b/llvm/lib/MC/MCSchedule.cpp
index 6797a47c75a..1fc5ec5e975 100644
--- a/llvm/lib/MC/MCSchedule.cpp
+++ b/llvm/lib/MC/MCSchedule.cpp
@@ -149,3 +149,19 @@ MCSchedModel::getReciprocalThroughput(unsigned SchedClass,
// that it can execute at the maximum default issue width.
return 1.0 / DefaultIssueWidth;
}
+
+unsigned
+MCSchedModel::getForwardingDelayCycles(ArrayRef<MCReadAdvanceEntry> Entries,
+ unsigned WriteResourceID) {
+ if (Entries.empty())
+ return 0;
+
+ int DelayCycles = 0;
+ for (const MCReadAdvanceEntry &E : Entries) {
+ if (E.WriteResourceID != WriteResourceID)
+ continue;
+ DelayCycles = std::min(DelayCycles, E.Cycles);
+ }
+
+ return std::abs(DelayCycles);
+}
diff --git a/llvm/lib/MCA/InstrBuilder.cpp b/llvm/lib/MCA/InstrBuilder.cpp
index 4b0ec329f9e..1e08f898523 100644
--- a/llvm/lib/MCA/InstrBuilder.cpp
+++ b/llvm/lib/MCA/InstrBuilder.cpp
@@ -532,6 +532,7 @@ InstrBuilder::createInstrDescImpl(const MCInst &MCI) {
// Create a new empty descriptor.
std::unique_ptr<InstrDesc> ID = llvm::make_unique<InstrDesc>();
ID->NumMicroOps = SCDesc.NumMicroOps;
+ ID->SchedClassID = SchedClassID;
if (MCDesc.isCall() && FirstCallInst) {
// We don't correctly model calls.
diff --git a/llvm/lib/Target/X86/X86InstrMMX.td b/llvm/lib/Target/X86/X86InstrMMX.td
index c00c4f4ca09..8e2a45b1bed 100644
--- a/llvm/lib/Target/X86/X86InstrMMX.td
+++ b/llvm/lib/Target/X86/X86InstrMMX.td
@@ -543,7 +543,7 @@ let Predicates = [HasMMX, HasSSE1] in {
"pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(set VR64:$dst, (int_x86_mmx_pinsr_w VR64:$src1,
GR32orGR64:$src2, imm:$src3))]>,
- Sched<[WriteVecInsert]>;
+ Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
def MMX_PINSRWrm : MMXIi8<0xC4, MRMSrcMem,
(outs VR64:$dst),
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 807af7f4808..e6427b1764a 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -4122,7 +4122,7 @@ multiclass sse2_pinsrw<bit Is2Addr = 1> {
"vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR128:$dst,
(X86pinsrw VR128:$src1, GR32orGR64:$src2, imm:$src3))]>,
- Sched<[WriteVecInsert]>;
+ Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
def rm : Ii8<0xC4, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1,
i16mem:$src2, u8imm:$src3),
@@ -5577,7 +5577,7 @@ multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> {
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set VR128:$dst,
(X86pinsrb VR128:$src1, GR32orGR64:$src2, imm:$src3))]>,
- Sched<[WriteVecInsert]>;
+ Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i8mem:$src2, u8imm:$src3),
!if(Is2Addr,
@@ -5603,7 +5603,7 @@ multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> {
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set VR128:$dst,
(v4i32 (insertelt VR128:$src1, GR32:$src2, imm:$src3)))]>,
- Sched<[WriteVecInsert]>;
+ Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i32mem:$src2, u8imm:$src3),
!if(Is2Addr,
@@ -5629,7 +5629,7 @@ multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> {
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set VR128:$dst,
(v2i64 (insertelt VR128:$src1, GR64:$src2, imm:$src3)))]>,
- Sched<[WriteVecInsert]>;
+ Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i64mem:$src2, u8imm:$src3),
!if(Is2Addr,
diff --git a/llvm/lib/Target/X86/X86SchedBroadwell.td b/llvm/lib/Target/X86/X86SchedBroadwell.td
index 03feff138e6..821948d2d85 100644
--- a/llvm/lib/Target/X86/X86SchedBroadwell.td
+++ b/llvm/lib/Target/X86/X86SchedBroadwell.td
@@ -81,6 +81,8 @@ def : ReadAdvance<ReadAfterVecLd, 5>;
def : ReadAdvance<ReadAfterVecXLd, 5>;
def : ReadAdvance<ReadAfterVecYLd, 6>;
+def : ReadAdvance<ReadInt2Fpu, 0>;
+
// Many SchedWrites are defined in pairs with and without a folded load.
// Instructions with folded loads are usually micro-fused, so they only appear
// as two micro-ops when queued in the reservation station.
diff --git a/llvm/lib/Target/X86/X86SchedHaswell.td b/llvm/lib/Target/X86/X86SchedHaswell.td
index 155d77224a6..dc040e0e7d3 100644
--- a/llvm/lib/Target/X86/X86SchedHaswell.td
+++ b/llvm/lib/Target/X86/X86SchedHaswell.td
@@ -86,6 +86,8 @@ def : ReadAdvance<ReadAfterVecLd, 5>;
def : ReadAdvance<ReadAfterVecXLd, 6>;
def : ReadAdvance<ReadAfterVecYLd, 7>;
+def : ReadAdvance<ReadInt2Fpu, 0>;
+
// Many SchedWrites are defined in pairs with and without a folded load.
// Instructions with folded loads are usually micro-fused, so they only appear
// as two micro-ops when queued in the reservation station.
diff --git a/llvm/lib/Target/X86/X86SchedSandyBridge.td b/llvm/lib/Target/X86/X86SchedSandyBridge.td
index 4f47acde7cf..503b905842b 100644
--- a/llvm/lib/Target/X86/X86SchedSandyBridge.td
+++ b/llvm/lib/Target/X86/X86SchedSandyBridge.td
@@ -76,6 +76,8 @@ def : ReadAdvance<ReadAfterVecLd, 5>;
def : ReadAdvance<ReadAfterVecXLd, 6>;
def : ReadAdvance<ReadAfterVecYLd, 7>;
+def : ReadAdvance<ReadInt2Fpu, 0>;
+
// Many SchedWrites are defined in pairs with and without a folded load.
// Instructions with folded loads are usually micro-fused, so they only appear
// as two micro-ops when queued in the reservation station.
diff --git a/llvm/lib/Target/X86/X86SchedSkylakeClient.td b/llvm/lib/Target/X86/X86SchedSkylakeClient.td
index 7d84d814d94..71045897376 100644
--- a/llvm/lib/Target/X86/X86SchedSkylakeClient.td
+++ b/llvm/lib/Target/X86/X86SchedSkylakeClient.td
@@ -80,6 +80,8 @@ def : ReadAdvance<ReadAfterVecLd, 5>;
def : ReadAdvance<ReadAfterVecXLd, 6>;
def : ReadAdvance<ReadAfterVecYLd, 7>;
+def : ReadAdvance<ReadInt2Fpu, 0>;
+
// Many SchedWrites are defined in pairs with and without a folded load.
// Instructions with folded loads are usually micro-fused, so they only appear
// as two micro-ops when queued in the reservation station.
diff --git a/llvm/lib/Target/X86/X86SchedSkylakeServer.td b/llvm/lib/Target/X86/X86SchedSkylakeServer.td
index 0d452f9beb3..8bd80078891 100644
--- a/llvm/lib/Target/X86/X86SchedSkylakeServer.td
+++ b/llvm/lib/Target/X86/X86SchedSkylakeServer.td
@@ -80,6 +80,8 @@ def : ReadAdvance<ReadAfterVecLd, 5>;
def : ReadAdvance<ReadAfterVecXLd, 6>;
def : ReadAdvance<ReadAfterVecYLd, 7>;
+def : ReadAdvance<ReadInt2Fpu, 0>;
+
// Many SchedWrites are defined in pairs with and without a folded load.
// Instructions with folded loads are usually micro-fused, so they only appear
// as two micro-ops when queued in the reservation station.
diff --git a/llvm/lib/Target/X86/X86Schedule.td b/llvm/lib/Target/X86/X86Schedule.td
index 886c3aef3f5..f50cb621046 100644
--- a/llvm/lib/Target/X86/X86Schedule.td
+++ b/llvm/lib/Target/X86/X86Schedule.td
@@ -17,6 +17,12 @@ def ReadAfterVecLd : SchedRead;
def ReadAfterVecXLd : SchedRead;
def ReadAfterVecYLd : SchedRead;
+// Instructions that move data between general purpose registers and vector
+// registers may be subject to extra latency due to data bypass delays.
+// This SchedRead describes a bypass delay caused by data being moved from the
+// integer unit to the floating point unit.
+def ReadInt2Fpu : SchedRead;
+
// Instructions with both a load and a store folded are modeled as a folded
// load + WriteRMW.
def WriteRMW : SchedWrite;
diff --git a/llvm/lib/Target/X86/X86ScheduleAtom.td b/llvm/lib/Target/X86/X86ScheduleAtom.td
index 779692b7da6..bf50aeee1df 100644
--- a/llvm/lib/Target/X86/X86ScheduleAtom.td
+++ b/llvm/lib/Target/X86/X86ScheduleAtom.td
@@ -46,6 +46,8 @@ def : ReadAdvance<ReadAfterVecLd, 3>;
def : ReadAdvance<ReadAfterVecXLd, 3>;
def : ReadAdvance<ReadAfterVecYLd, 3>;
+def : ReadAdvance<ReadInt2Fpu, 0>;
+
// Many SchedWrites are defined in pairs with and without a folded load.
// Instructions with folded loads are usually micro-fused, so they only appear
// as two micro-ops when dispatched by the schedulers.
diff --git a/llvm/lib/Target/X86/X86ScheduleBdVer2.td b/llvm/lib/Target/X86/X86ScheduleBdVer2.td
index ca14ed478f0..90ca79915fa 100644
--- a/llvm/lib/Target/X86/X86ScheduleBdVer2.td
+++ b/llvm/lib/Target/X86/X86ScheduleBdVer2.td
@@ -250,6 +250,8 @@ def : ReadAdvance<ReadAfterVecLd, 5>;
def : ReadAdvance<ReadAfterVecXLd, 5>;
def : ReadAdvance<ReadAfterVecYLd, 5>;
+def : ReadAdvance<ReadInt2Fpu, 0>;
+
// A folded store needs a cycle on the PdStore for the store data.
def : WriteRes<WriteRMW, [PdStore]>;
diff --git a/llvm/lib/Target/X86/X86ScheduleBtVer2.td b/llvm/lib/Target/X86/X86ScheduleBtVer2.td
index 8d8de3e8e15..3a2ed733f56 100644
--- a/llvm/lib/Target/X86/X86ScheduleBtVer2.td
+++ b/llvm/lib/Target/X86/X86ScheduleBtVer2.td
@@ -108,6 +108,11 @@ def : ReadAdvance<ReadAfterVecLd, 5>;
def : ReadAdvance<ReadAfterVecXLd, 5>;
def : ReadAdvance<ReadAfterVecYLd, 5>;
+/// "Additional 6 cycle transfer operation which moves a floating point
+/// operation input value from the integer unit to the floating point unit."
+/// Reference: AMDfam16h SOG (Appendix A "Instruction Latencies", Section A.2).
+def : ReadAdvance<ReadInt2Fpu, -6>;
+
// Many SchedWrites are defined in pairs with and without a folded load.
// Instructions with folded loads are usually micro-fused, so they only appear
// as two micro-ops when dispatched by the schedulers.
@@ -540,7 +545,7 @@ defm : X86WriteResPairUnsupported<WriteVarShuffle256>;
// Vector insert/extract operations.
////////////////////////////////////////////////////////////////////////////////
-defm : X86WriteRes<WriteVecInsert, [JFPU01, JVALU], 7, [1,1], 2>;
+defm : X86WriteRes<WriteVecInsert, [JFPU01, JVALU], 1, [1,1], 2>;
defm : X86WriteRes<WriteVecInsertLd, [JFPU01, JVALU, JLAGU], 4, [1,1,1], 1>;
defm : X86WriteRes<WriteVecExtract, [JFPU0, JFPA, JALU0], 3, [1,1,1], 1>;
defm : X86WriteRes<WriteVecExtractSt, [JFPU1, JSTC, JSAGU], 3, [1,1,1], 1>;
diff --git a/llvm/lib/Target/X86/X86ScheduleSLM.td b/llvm/lib/Target/X86/X86ScheduleSLM.td
index 5dca0ff7019..fc150fca545 100644
--- a/llvm/lib/Target/X86/X86ScheduleSLM.td
+++ b/llvm/lib/Target/X86/X86ScheduleSLM.td
@@ -52,6 +52,8 @@ def : ReadAdvance<ReadAfterVecLd, 3>;
def : ReadAdvance<ReadAfterVecXLd, 3>;
def : ReadAdvance<ReadAfterVecYLd, 3>;
+def : ReadAdvance<ReadInt2Fpu, 0>;
+
// Many SchedWrites are defined in pairs with and without a folded load.
// Instructions with folded loads are usually micro-fused, so they only appear
// as two micro-ops when queued in the reservation station.
diff --git a/llvm/lib/Target/X86/X86ScheduleZnver1.td b/llvm/lib/Target/X86/X86ScheduleZnver1.td
index 0407afc4203..1a75281cf0c 100644
--- a/llvm/lib/Target/X86/X86ScheduleZnver1.td
+++ b/llvm/lib/Target/X86/X86ScheduleZnver1.td
@@ -94,6 +94,8 @@ def : ReadAdvance<ReadAfterVecLd, 8>;
def : ReadAdvance<ReadAfterVecXLd, 8>;
def : ReadAdvance<ReadAfterVecYLd, 8>;
+def : ReadAdvance<ReadInt2Fpu, 0>;
+
// The Integer PRF for Zen is 168 entries, and it holds the architectural and
// speculative version of the 64-bit integer registers.
// Reference: "Software Optimization Guide for AMD Family 17h Processors"
diff --git a/llvm/test/CodeGen/X86/mmx-schedule.ll b/llvm/test/CodeGen/X86/mmx-schedule.ll
index 51dc5e102ff..d423b9a2a90 100644
--- a/llvm/test/CodeGen/X86/mmx-schedule.ll
+++ b/llvm/test/CodeGen/X86/mmx-schedule.ll
@@ -3887,8 +3887,8 @@ define i64 @test_pinsrw(x86_mmx %a0, i32 %a1, i16* %a2) optsize {
;
; BTVER2-LABEL: test_pinsrw:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: pinsrw $0, %edi, %mm0 # sched: [7:0.50]
; BTVER2-NEXT: movswl (%rsi), %eax # sched: [4:1.00]
+; BTVER2-NEXT: pinsrw $0, %edi, %mm0 # sched: [7:0.50]
; BTVER2-NEXT: pinsrw $1, %eax, %mm0 # sched: [7:0.50]
; BTVER2-NEXT: movq %mm0, %rax # sched: [4:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
diff --git a/llvm/test/CodeGen/X86/sse41-schedule.ll b/llvm/test/CodeGen/X86/sse41-schedule.ll
index ea606463fc1..4870434a8ae 100644
--- a/llvm/test/CodeGen/X86/sse41-schedule.ll
+++ b/llvm/test/CodeGen/X86/sse41-schedule.ll
@@ -2679,15 +2679,15 @@ define <2 x i64> @test_pinsrq(<2 x i64> %a0, <2 x i64> %a1, i64 %a2, i64 *%a3) {
;
; BTVER2-SSE-LABEL: test_pinsrq:
; BTVER2-SSE: # %bb.0:
-; BTVER2-SSE-NEXT: pinsrq $1, %rdi, %xmm0 # sched: [7:0.50]
; BTVER2-SSE-NEXT: pinsrq $1, (%rsi), %xmm1 # sched: [4:1.00]
+; BTVER2-SSE-NEXT: pinsrq $1, %rdi, %xmm0 # sched: [7:0.50]
; BTVER2-SSE-NEXT: paddq %xmm1, %xmm0 # sched: [1:0.50]
; BTVER2-SSE-NEXT: retq # sched: [4:1.00]
;
; BTVER2-LABEL: test_pinsrq:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0 # sched: [7:0.50]
; BTVER2-NEXT: vpinsrq $1, (%rsi), %xmm1, %xmm1 # sched: [4:1.00]
+; BTVER2-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0 # sched: [7:0.50]
; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
diff --git a/llvm/test/tools/llvm-mca/X86/BtVer2/int-to-fpu-forwarding-1.s b/llvm/test/tools/llvm-mca/X86/BtVer2/int-to-fpu-forwarding-1.s
index 9e44702ae79..398a52a8479 100644
--- a/llvm/test/tools/llvm-mca/X86/BtVer2/int-to-fpu-forwarding-1.s
+++ b/llvm/test/tools/llvm-mca/X86/BtVer2/int-to-fpu-forwarding-1.s
@@ -27,12 +27,12 @@ vpinsrq $1, %rax, %xmm0, %xmm0
# CHECK: Iterations: 500
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 7003
+# CHECK-NEXT: Total Cycles: 1003
# CHECK-NEXT: Total uOps: 2000
# CHECK: Dispatch Width: 2
-# CHECK-NEXT: uOps Per Cycle: 0.29
-# CHECK-NEXT: IPC: 0.14
+# CHECK-NEXT: uOps Per Cycle: 1.99
+# CHECK-NEXT: IPC: 1.00
# CHECK-NEXT: Block RThroughput: 2.0
# CHECK: Instruction Info:
@@ -76,12 +76,12 @@ vpinsrq $1, %rax, %xmm0, %xmm0
# CHECK: Iterations: 500
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 7003
+# CHECK-NEXT: Total Cycles: 1003
# CHECK-NEXT: Total uOps: 2000
# CHECK: Dispatch Width: 2
-# CHECK-NEXT: uOps Per Cycle: 0.29
-# CHECK-NEXT: IPC: 0.14
+# CHECK-NEXT: uOps Per Cycle: 1.99
+# CHECK-NEXT: IPC: 1.00
# CHECK-NEXT: Block RThroughput: 2.0
# CHECK: Instruction Info:
@@ -125,12 +125,12 @@ vpinsrq $1, %rax, %xmm0, %xmm0
# CHECK: Iterations: 500
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 7003
+# CHECK-NEXT: Total Cycles: 1003
# CHECK-NEXT: Total uOps: 2000
# CHECK: Dispatch Width: 2
-# CHECK-NEXT: uOps Per Cycle: 0.29
-# CHECK-NEXT: IPC: 0.14
+# CHECK-NEXT: uOps Per Cycle: 1.99
+# CHECK-NEXT: IPC: 1.00
# CHECK-NEXT: Block RThroughput: 2.0
# CHECK: Instruction Info:
@@ -174,12 +174,12 @@ vpinsrq $1, %rax, %xmm0, %xmm0
# CHECK: Iterations: 500
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 7003
+# CHECK-NEXT: Total Cycles: 1003
# CHECK-NEXT: Total uOps: 2000
# CHECK: Dispatch Width: 2
-# CHECK-NEXT: uOps Per Cycle: 0.29
-# CHECK-NEXT: IPC: 0.14
+# CHECK-NEXT: uOps Per Cycle: 1.99
+# CHECK-NEXT: IPC: 1.00
# CHECK-NEXT: Block RThroughput: 2.0
# CHECK: Instruction Info:
diff --git a/llvm/test/tools/llvm-mca/X86/BtVer2/int-to-fpu-forwarding-3.s b/llvm/test/tools/llvm-mca/X86/BtVer2/int-to-fpu-forwarding-3.s
index 4e130be8597..00c13f9ef59 100644
--- a/llvm/test/tools/llvm-mca/X86/BtVer2/int-to-fpu-forwarding-3.s
+++ b/llvm/test/tools/llvm-mca/X86/BtVer2/int-to-fpu-forwarding-3.s
@@ -9,12 +9,12 @@ vpinsrb $1, %eax, %xmm0, %xmm0
# CHECK: Iterations: 500
# CHECK-NEXT: Instructions: 1500
-# CHECK-NEXT: Total Cycles: 7004
+# CHECK-NEXT: Total Cycles: 1509
# CHECK-NEXT: Total uOps: 2500
# CHECK: Dispatch Width: 2
-# CHECK-NEXT: uOps Per Cycle: 0.36
-# CHECK-NEXT: IPC: 0.21
+# CHECK-NEXT: uOps Per Cycle: 1.66
+# CHECK-NEXT: IPC: 0.99
# CHECK-NEXT: Block RThroughput: 2.5
# CHECK: Instruction Info:
@@ -57,18 +57,18 @@ vpinsrb $1, %eax, %xmm0, %xmm0
# CHECK-NEXT: - - - - - 1.00 - - - - - 1.00 - - vpinsrb $1, %eax, %xmm0, %xmm0
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 0123456789
-# CHECK-NEXT: Index 0123456789 0123456789 012345
+# CHECK-NEXT: 01234567
+# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeER . . . . . . . . . addl %eax, %eax
-# CHECK-NEXT: [0,1] .DeeeeeeeER . . . . . . . vpinsrb $0, %eax, %xmm0, %xmm0
-# CHECK-NEXT: [0,2] . D======eeeeeeeER . . . . . . vpinsrb $1, %eax, %xmm0, %xmm0
-# CHECK-NEXT: [1,0] . DeE-----------R . . . . . . addl %eax, %eax
-# CHECK-NEXT: [1,1] . D===========eeeeeeeER. . . . . vpinsrb $0, %eax, %xmm0, %xmm0
-# CHECK-NEXT: [1,2] . D=================eeeeeeeER . . . vpinsrb $1, %eax, %xmm0, %xmm0
-# CHECK-NEXT: [2,0] . .DeE----------------------R . . . addl %eax, %eax
-# CHECK-NEXT: [2,1] . . D======================eeeeeeeER . . vpinsrb $0, %eax, %xmm0, %xmm0
-# CHECK-NEXT: [2,2] . . D============================eeeeeeeER vpinsrb $1, %eax, %xmm0, %xmm0
+# CHECK: [0,0] DeER . . . . addl %eax, %eax
+# CHECK-NEXT: [0,1] .D======eER . . vpinsrb $0, %eax, %xmm0, %xmm0
+# CHECK-NEXT: [0,2] . D======eER . . vpinsrb $1, %eax, %xmm0, %xmm0
+# CHECK-NEXT: [1,0] . DeE-----R . . addl %eax, %eax
+# CHECK-NEXT: [1,1] . D======eER . . vpinsrb $0, %eax, %xmm0, %xmm0
+# CHECK-NEXT: [1,2] . D======eER. . vpinsrb $1, %eax, %xmm0, %xmm0
+# CHECK-NEXT: [2,0] . .DeE-----R. . addl %eax, %eax
+# CHECK-NEXT: [2,1] . . D======eER. vpinsrb $0, %eax, %xmm0, %xmm0
+# CHECK-NEXT: [2,2] . . D======eER vpinsrb $1, %eax, %xmm0, %xmm0
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -77,6 +77,6 @@ vpinsrb $1, %eax, %xmm0, %xmm0
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 3 1.0 1.0 11.0 addl %eax, %eax
-# CHECK-NEXT: 1. 3 12.0 0.0 0.0 vpinsrb $0, %eax, %xmm0, %xmm0
-# CHECK-NEXT: 2. 3 18.0 0.0 0.0 vpinsrb $1, %eax, %xmm0, %xmm0
+# CHECK-NEXT: 0. 3 1.0 1.0 3.3 addl %eax, %eax
+# CHECK-NEXT: 1. 3 7.0 0.0 0.0 vpinsrb $0, %eax, %xmm0, %xmm0
+# CHECK-NEXT: 2. 3 7.0 0.0 0.0 vpinsrb $1, %eax, %xmm0, %xmm0
diff --git a/llvm/tools/llvm-mca/Views/InstructionInfoView.cpp b/llvm/tools/llvm-mca/Views/InstructionInfoView.cpp
index 60b8b1f5141..1fbffa3e5b6 100644
--- a/llvm/tools/llvm-mca/Views/InstructionInfoView.cpp
+++ b/llvm/tools/llvm-mca/Views/InstructionInfoView.cpp
@@ -43,6 +43,9 @@ void InstructionInfoView::printView(raw_ostream &OS) const {
const MCSchedClassDesc &SCDesc = *SM.getSchedClassDesc(SchedClassID);
unsigned NumMicroOpcodes = SCDesc.NumMicroOps;
unsigned Latency = MCSchedModel::computeInstrLatency(STI, SCDesc);
+ // Add extra latency due to delays in the forwarding data paths.
+ Latency += MCSchedModel::getForwardingDelayCycles(
+ STI.getReadAdvanceEntries(SCDesc));
Optional<double> RThroughput =
MCSchedModel::getReciprocalThroughput(STI, SCDesc);
OpenPOWER on IntegriCloud