Diffstat (limited to 'llvm/lib')
-rw-r--r--  llvm/lib/CodeGen/TargetSubtargetInfo.cpp      | 18
-rw-r--r--  llvm/lib/MC/MCSchedule.cpp                    | 16
-rw-r--r--  llvm/lib/MCA/InstrBuilder.cpp                 |  1
-rw-r--r--  llvm/lib/Target/X86/X86InstrMMX.td            |  2
-rw-r--r--  llvm/lib/Target/X86/X86InstrSSE.td            |  8
-rw-r--r--  llvm/lib/Target/X86/X86SchedBroadwell.td      |  2
-rw-r--r--  llvm/lib/Target/X86/X86SchedHaswell.td        |  2
-rw-r--r--  llvm/lib/Target/X86/X86SchedSandyBridge.td    |  2
-rw-r--r--  llvm/lib/Target/X86/X86SchedSkylakeClient.td  |  2
-rw-r--r--  llvm/lib/Target/X86/X86SchedSkylakeServer.td  |  2
-rw-r--r--  llvm/lib/Target/X86/X86Schedule.td            |  6
-rw-r--r--  llvm/lib/Target/X86/X86ScheduleAtom.td        |  2
-rw-r--r--  llvm/lib/Target/X86/X86ScheduleBdVer2.td      |  2
-rw-r--r--  llvm/lib/Target/X86/X86ScheduleBtVer2.td      |  7
-rw-r--r--  llvm/lib/Target/X86/X86ScheduleSLM.td         |  2
-rw-r--r--  llvm/lib/Target/X86/X86ScheduleZnver1.td      |  2
16 files changed, 68 insertions, 8 deletions
diff --git a/llvm/lib/CodeGen/TargetSubtargetInfo.cpp b/llvm/lib/CodeGen/TargetSubtargetInfo.cpp
index c9f90b88cac..e34f9a1579d 100644
--- a/llvm/lib/CodeGen/TargetSubtargetInfo.cpp
+++ b/llvm/lib/CodeGen/TargetSubtargetInfo.cpp
@@ -88,6 +88,12 @@ std::string TargetSubtargetInfo::getSchedInfoStr(const MachineInstr &MI) const {
   TargetSchedModel TSchedModel;
   TSchedModel.init(this);
   unsigned Latency = TSchedModel.computeInstrLatency(&MI);
+
+  // Add extra latency due to forwarding delays.
+  const MCSchedClassDesc &SCDesc = *TSchedModel.resolveSchedClass(&MI);
+  Latency +=
+      MCSchedModel::getForwardingDelayCycles(getReadAdvanceEntries(SCDesc));
+
   double RThroughput = TSchedModel.computeReciprocalThroughput(&MI);
   return createSchedInfoStr(Latency, RThroughput);
 }
@@ -99,9 +105,17 @@ std::string TargetSubtargetInfo::getSchedInfoStr(MCInst const &MCI) const {
   TargetSchedModel TSchedModel;
   TSchedModel.init(this);
   unsigned Latency;
-  if (TSchedModel.hasInstrSchedModel())
+  if (TSchedModel.hasInstrSchedModel()) {
     Latency = TSchedModel.computeInstrLatency(MCI);
-  else if (TSchedModel.hasInstrItineraries()) {
+    // Add extra latency due to forwarding delays.
+    const MCSchedModel &SM = *TSchedModel.getMCSchedModel();
+    unsigned SClassID = getInstrInfo()->get(MCI.getOpcode()).getSchedClass();
+    while (SM.getSchedClassDesc(SClassID)->isVariant())
+      SClassID = resolveVariantSchedClass(SClassID, &MCI, SM.ProcID);
+    const MCSchedClassDesc &SCDesc = *SM.getSchedClassDesc(SClassID);
+    Latency +=
+        MCSchedModel::getForwardingDelayCycles(getReadAdvanceEntries(SCDesc));
+  } else if (TSchedModel.hasInstrItineraries()) {
     auto *ItinData = TSchedModel.getInstrItineraries();
     Latency = ItinData->getStageLatency(
         getInstrInfo()->get(MCI.getOpcode()).getSchedClass());
diff --git a/llvm/lib/MC/MCSchedule.cpp b/llvm/lib/MC/MCSchedule.cpp
index 6797a47c75a..1fc5ec5e975 100644
--- a/llvm/lib/MC/MCSchedule.cpp
+++ b/llvm/lib/MC/MCSchedule.cpp
@@ -149,3 +149,19 @@ MCSchedModel::getReciprocalThroughput(unsigned SchedClass,
   // that it can execute at the maximum default issue width.
   return 1.0 / DefaultIssueWidth;
 }
+
+unsigned
+MCSchedModel::getForwardingDelayCycles(ArrayRef<MCReadAdvanceEntry> Entries,
+                                       unsigned WriteResourceID) {
+  if (Entries.empty())
+    return 0;
+
+  int DelayCycles = 0;
+  for (const MCReadAdvanceEntry &E : Entries) {
+    if (E.WriteResourceID != WriteResourceID)
+      continue;
+    DelayCycles = std::min(DelayCycles, E.Cycles);
+  }
+
+  return std::abs(DelayCycles);
+}
diff --git a/llvm/lib/MCA/InstrBuilder.cpp b/llvm/lib/MCA/InstrBuilder.cpp
index 4b0ec329f9e..1e08f898523 100644
--- a/llvm/lib/MCA/InstrBuilder.cpp
+++ b/llvm/lib/MCA/InstrBuilder.cpp
@@ -532,6 +532,7 @@ InstrBuilder::createInstrDescImpl(const MCInst &MCI) {
   // Create a new empty descriptor.
   std::unique_ptr<InstrDesc> ID = llvm::make_unique<InstrDesc>();
   ID->NumMicroOps = SCDesc.NumMicroOps;
+  ID->SchedClassID = SchedClassID;
 
   if (MCDesc.isCall() && FirstCallInst) {
     // We don't correctly model calls.
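For readers tracing the new MCSchedule.cpp helper above, here is a self-contained sketch of the same computation, with a hypothetical mock entry type standing in for llvm::MCReadAdvanceEntry (names and values are illustrative, not from the patch). The helper scans the ReadAdvance entries matching a given write resource and returns the absolute value of the most negative cycle count, i.e. the worst-case forwarding delay; a model whose entries are all non-negative reports no delay.

// Standalone illustration of the getForwardingDelayCycles logic (mock types;
// the real entries come from MCSubtargetInfo::getReadAdvanceEntries).
#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <vector>

struct MockReadAdvanceEntry {
  unsigned UseIdx;          // operand index of the read
  unsigned WriteResourceID; // write this read depends on (0 = any write)
  int Cycles;               // >0: operand read early; <0: bypass penalty
};

unsigned forwardingDelayCycles(const std::vector<MockReadAdvanceEntry> &Entries,
                               unsigned WriteResourceID = 0) {
  if (Entries.empty())
    return 0;
  int DelayCycles = 0;
  for (const MockReadAdvanceEntry &E : Entries) {
    if (E.WriteResourceID != WriteResourceID)
      continue;
    // Track the most negative ReadAdvance seen so far.
    DelayCycles = std::min(DelayCycles, E.Cycles);
  }
  return std::abs(DelayCycles);
}

int main() {
  // BtVer2-style VPINSRW: ReadDefault (0) plus ReadInt2Fpu (-6) -> 6 cycles.
  assert(forwardingDelayCycles({{0, 0, 0}, {1, 0, -6}}) == 6);
  // A model with only positive (early-read) entries reports no delay.
  assert(forwardingDelayCycles({{0, 0, 5}}) == 0);
  return 0;
}

Starting the accumulator at 0 is what makes positive entries harmless: std::min can only move the value downward, so only bypass penalties survive into the returned delay.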
diff --git a/llvm/lib/Target/X86/X86InstrMMX.td b/llvm/lib/Target/X86/X86InstrMMX.td
index c00c4f4ca09..8e2a45b1bed 100644
--- a/llvm/lib/Target/X86/X86InstrMMX.td
+++ b/llvm/lib/Target/X86/X86InstrMMX.td
@@ -543,7 +543,7 @@ let Predicates = [HasMMX, HasSSE1] in {
                     "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                     [(set VR64:$dst, (int_x86_mmx_pinsr_w VR64:$src1,
                                       GR32orGR64:$src2, imm:$src3))]>,
-                    Sched<[WriteVecInsert]>;
+                    Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
 
   def MMX_PINSRWrm : MMXIi8<0xC4, MRMSrcMem,
                             (outs VR64:$dst),
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 807af7f4808..e6427b1764a 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -4122,7 +4122,7 @@ multiclass sse2_pinsrw<bit Is2Addr = 1> {
                 "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
       [(set VR128:$dst,
         (X86pinsrw VR128:$src1, GR32orGR64:$src2, imm:$src3))]>,
-      Sched<[WriteVecInsert]>;
+      Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
   def rm : Ii8<0xC4, MRMSrcMem,
                (outs VR128:$dst), (ins VR128:$src1, i16mem:$src2, u8imm:$src3),
@@ -5577,7 +5577,7 @@ multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> {
                 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
       [(set VR128:$dst,
         (X86pinsrb VR128:$src1, GR32orGR64:$src2, imm:$src3))]>,
-      Sched<[WriteVecInsert]>;
+      Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
   def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
                    (ins VR128:$src1, i8mem:$src2, u8imm:$src3),
                    !if(Is2Addr,
@@ -5603,7 +5603,7 @@ multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> {
                 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
       [(set VR128:$dst,
         (v4i32 (insertelt VR128:$src1, GR32:$src2, imm:$src3)))]>,
-      Sched<[WriteVecInsert]>;
+      Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
   def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
                    (ins VR128:$src1, i32mem:$src2, u8imm:$src3),
                    !if(Is2Addr,
@@ -5629,7 +5629,7 @@ multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> {
                 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
       [(set VR128:$dst,
         (v2i64 (insertelt VR128:$src1, GR64:$src2, imm:$src3)))]>,
-      Sched<[WriteVecInsert]>;
+      Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
   def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
                    (ins VR128:$src1, i64mem:$src2, u8imm:$src3),
                    !if(Is2Addr,
diff --git a/llvm/lib/Target/X86/X86SchedBroadwell.td b/llvm/lib/Target/X86/X86SchedBroadwell.td
index 03feff138e6..821948d2d85 100644
--- a/llvm/lib/Target/X86/X86SchedBroadwell.td
+++ b/llvm/lib/Target/X86/X86SchedBroadwell.td
@@ -81,6 +81,8 @@ def : ReadAdvance<ReadAfterVecLd, 5>;
 def : ReadAdvance<ReadAfterVecXLd, 5>;
 def : ReadAdvance<ReadAfterVecYLd, 6>;
 
+def : ReadAdvance<ReadInt2Fpu, 0>;
+
 // Many SchedWrites are defined in pairs with and without a folded load.
 // Instructions with folded loads are usually micro-fused, so they only appear
 // as two micro-ops when queued in the reservation station.
diff --git a/llvm/lib/Target/X86/X86SchedHaswell.td b/llvm/lib/Target/X86/X86SchedHaswell.td
index 155d77224a6..dc040e0e7d3 100644
--- a/llvm/lib/Target/X86/X86SchedHaswell.td
+++ b/llvm/lib/Target/X86/X86SchedHaswell.td
@@ -86,6 +86,8 @@ def : ReadAdvance<ReadAfterVecLd, 5>;
 def : ReadAdvance<ReadAfterVecXLd, 6>;
 def : ReadAdvance<ReadAfterVecYLd, 7>;
 
+def : ReadAdvance<ReadInt2Fpu, 0>;
+
 // Many SchedWrites are defined in pairs with and without a folded load.
 // Instructions with folded loads are usually micro-fused, so they only appear
 // as two micro-ops when queued in the reservation station.
diff --git a/llvm/lib/Target/X86/X86SchedSandyBridge.td b/llvm/lib/Target/X86/X86SchedSandyBridge.td
index 4f47acde7cf..503b905842b 100644
--- a/llvm/lib/Target/X86/X86SchedSandyBridge.td
+++ b/llvm/lib/Target/X86/X86SchedSandyBridge.td
@@ -76,6 +76,8 @@ def : ReadAdvance<ReadAfterVecLd, 5>;
 def : ReadAdvance<ReadAfterVecXLd, 6>;
 def : ReadAdvance<ReadAfterVecYLd, 7>;
 
+def : ReadAdvance<ReadInt2Fpu, 0>;
+
 // Many SchedWrites are defined in pairs with and without a folded load.
 // Instructions with folded loads are usually micro-fused, so they only appear
 // as two micro-ops when queued in the reservation station.
diff --git a/llvm/lib/Target/X86/X86SchedSkylakeClient.td b/llvm/lib/Target/X86/X86SchedSkylakeClient.td
index 7d84d814d94..71045897376 100644
--- a/llvm/lib/Target/X86/X86SchedSkylakeClient.td
+++ b/llvm/lib/Target/X86/X86SchedSkylakeClient.td
@@ -80,6 +80,8 @@ def : ReadAdvance<ReadAfterVecLd, 5>;
 def : ReadAdvance<ReadAfterVecXLd, 6>;
 def : ReadAdvance<ReadAfterVecYLd, 7>;
 
+def : ReadAdvance<ReadInt2Fpu, 0>;
+
 // Many SchedWrites are defined in pairs with and without a folded load.
 // Instructions with folded loads are usually micro-fused, so they only appear
 // as two micro-ops when queued in the reservation station.
diff --git a/llvm/lib/Target/X86/X86SchedSkylakeServer.td b/llvm/lib/Target/X86/X86SchedSkylakeServer.td
index 0d452f9beb3..8bd80078891 100644
--- a/llvm/lib/Target/X86/X86SchedSkylakeServer.td
+++ b/llvm/lib/Target/X86/X86SchedSkylakeServer.td
@@ -80,6 +80,8 @@ def : ReadAdvance<ReadAfterVecLd, 5>;
 def : ReadAdvance<ReadAfterVecXLd, 6>;
 def : ReadAdvance<ReadAfterVecYLd, 7>;
 
+def : ReadAdvance<ReadInt2Fpu, 0>;
+
 // Many SchedWrites are defined in pairs with and without a folded load.
 // Instructions with folded loads are usually micro-fused, so they only appear
 // as two micro-ops when queued in the reservation station.
diff --git a/llvm/lib/Target/X86/X86Schedule.td b/llvm/lib/Target/X86/X86Schedule.td
index 886c3aef3f5..f50cb621046 100644
--- a/llvm/lib/Target/X86/X86Schedule.td
+++ b/llvm/lib/Target/X86/X86Schedule.td
@@ -17,6 +17,12 @@ def ReadAfterVecLd : SchedRead;
 def ReadAfterVecXLd : SchedRead;
 def ReadAfterVecYLd : SchedRead;
 
+// Instructions that move data between general purpose registers and vector
+// registers may be subject to extra latency due to data bypass delays.
+// This SchedRead describes a bypass delay caused by data being moved from the
+// integer unit to the floating point unit.
+def ReadInt2Fpu : SchedRead;
+
 // Instructions with both a load and a store folded are modeled as a folded
 // load + WriteRMW.
 def WriteRMW : SchedWrite;
diff --git a/llvm/lib/Target/X86/X86ScheduleAtom.td b/llvm/lib/Target/X86/X86ScheduleAtom.td
index 779692b7da6..bf50aeee1df 100644
--- a/llvm/lib/Target/X86/X86ScheduleAtom.td
+++ b/llvm/lib/Target/X86/X86ScheduleAtom.td
@@ -46,6 +46,8 @@ def : ReadAdvance<ReadAfterVecLd, 3>;
 def : ReadAdvance<ReadAfterVecXLd, 3>;
 def : ReadAdvance<ReadAfterVecYLd, 3>;
 
+def : ReadAdvance<ReadInt2Fpu, 0>;
+
 // Many SchedWrites are defined in pairs with and without a folded load.
 // Instructions with folded loads are usually micro-fused, so they only appear
 // as two micro-ops when dispatched by the schedulers.
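The ReadInt2Fpu comment added to X86Schedule.td above leans on a ReadAdvance property that is easy to miss: the advance cycles are subtracted from the producer's write latency when computing when an operand becomes ready, so positive values model forwarding (the consumer reads early), while negative values push the read later. The sketch below illustrates that accounting under simplified, assumed semantics (including the clamp at zero); the authoritative logic lives in TargetSchedModel::computeOperandLatency.

// Simplified operand-latency accounting under a ReadAdvance (illustrative;
// not the exact LLVM implementation).
#include <algorithm>
#include <cassert>

int effectiveOperandLatency(int WriteLatency, int ReadAdvanceCycles) {
  // Assumed clamp: an operand cannot become ready before the producer issues.
  return std::max(0, WriteLatency - ReadAdvanceCycles);
}

int main() {
  // ReadAdvance<ReadInt2Fpu, 0>: no bypass delay is modeled.
  assert(effectiveOperandLatency(1, 0) == 1);
  // ReadAdvance<ReadInt2Fpu, -6> (BtVer2, below): 6 extra cycles.
  assert(effectiveOperandLatency(1, -6) == 7);
  // A positive advance hides load latency, e.g. ReadAfterVecLd.
  assert(effectiveOperandLatency(5, 5) == 0);
  return 0;
}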
diff --git a/llvm/lib/Target/X86/X86ScheduleBdVer2.td b/llvm/lib/Target/X86/X86ScheduleBdVer2.td
index ca14ed478f0..90ca79915fa 100644
--- a/llvm/lib/Target/X86/X86ScheduleBdVer2.td
+++ b/llvm/lib/Target/X86/X86ScheduleBdVer2.td
@@ -250,6 +250,8 @@ def : ReadAdvance<ReadAfterVecLd, 5>;
 def : ReadAdvance<ReadAfterVecXLd, 5>;
 def : ReadAdvance<ReadAfterVecYLd, 5>;
 
+def : ReadAdvance<ReadInt2Fpu, 0>;
+
 // A folded store needs a cycle on the PdStore for the store data.
 def : WriteRes<WriteRMW, [PdStore]>;
 
diff --git a/llvm/lib/Target/X86/X86ScheduleBtVer2.td b/llvm/lib/Target/X86/X86ScheduleBtVer2.td
index 8d8de3e8e15..3a2ed733f56 100644
--- a/llvm/lib/Target/X86/X86ScheduleBtVer2.td
+++ b/llvm/lib/Target/X86/X86ScheduleBtVer2.td
@@ -108,6 +108,11 @@ def : ReadAdvance<ReadAfterVecLd, 5>;
 def : ReadAdvance<ReadAfterVecXLd, 5>;
 def : ReadAdvance<ReadAfterVecYLd, 5>;
 
+// "Additional 6 cycle transfer operation which moves a floating point
+// operation input value from the integer unit to the floating point unit.
+// Reference: AMDfam16h SOG (Appendix A "Instruction Latencies", Section A.2).
+def : ReadAdvance<ReadInt2Fpu, -6>;
+
 // Many SchedWrites are defined in pairs with and without a folded load.
 // Instructions with folded loads are usually micro-fused, so they only appear
 // as two micro-ops when dispatched by the schedulers.
@@ -540,7 +545,7 @@ defm : X86WriteResPairUnsupported<WriteVarShuffle256>;
 // Vector insert/extract operations.
 ////////////////////////////////////////////////////////////////////////////////
 
-defm : X86WriteRes<WriteVecInsert, [JFPU01, JVALU], 7, [1,1], 2>;
+defm : X86WriteRes<WriteVecInsert, [JFPU01, JVALU], 1, [1,1], 2>;
 defm : X86WriteRes<WriteVecInsertLd, [JFPU01, JVALU, JLAGU], 4, [1,1,1], 1>;
 defm : X86WriteRes<WriteVecExtract, [JFPU0, JFPA, JALU0], 3, [1,1,1], 1>;
 defm : X86WriteRes<WriteVecExtractSt, [JFPU1, JSTC, JSAGU], 3, [1,1,1], 1>;
diff --git a/llvm/lib/Target/X86/X86ScheduleSLM.td b/llvm/lib/Target/X86/X86ScheduleSLM.td
index 5dca0ff7019..fc150fca545 100644
--- a/llvm/lib/Target/X86/X86ScheduleSLM.td
+++ b/llvm/lib/Target/X86/X86ScheduleSLM.td
@@ -52,6 +52,8 @@ def : ReadAdvance<ReadAfterVecLd, 3>;
 def : ReadAdvance<ReadAfterVecXLd, 3>;
 def : ReadAdvance<ReadAfterVecYLd, 3>;
 
+def : ReadAdvance<ReadInt2Fpu, 0>;
+
 // Many SchedWrites are defined in pairs with and without a folded load.
 // Instructions with folded loads are usually micro-fused, so they only appear
 // as two micro-ops when queued in the reservation station.
diff --git a/llvm/lib/Target/X86/X86ScheduleZnver1.td b/llvm/lib/Target/X86/X86ScheduleZnver1.td
index 0407afc4203..1a75281cf0c 100644
--- a/llvm/lib/Target/X86/X86ScheduleZnver1.td
+++ b/llvm/lib/Target/X86/X86ScheduleZnver1.td
@@ -94,6 +94,8 @@ def : ReadAdvance<ReadAfterVecLd, 8>;
 def : ReadAdvance<ReadAfterVecXLd, 8>;
 def : ReadAdvance<ReadAfterVecYLd, 8>;
 
+def : ReadAdvance<ReadInt2Fpu, 0>;
+
 // The Integer PRF for Zen is 168 entries, and it holds the architectural and
 // speculative version of the 64-bit integer registers.
 // Reference: "Software Optimization Guide for AMD Family 17h Processors"
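Putting the BtVer2 numbers together (a worked example derived from the values in this patch): WriteVecInsert's flat 7-cycle latency becomes 1 cycle on the FPU, and ReadAdvance<ReadInt2Fpu, -6> re-attributes the remaining 6 cycles to the GPR-to-FPU bypass. A GPR-sourced insert such as VPINSRW therefore still takes 7 cycles end to end, and the latency reported by getSchedInfoStr stays at 7 because getForwardingDelayCycles adds the delay back in; the gain is that an operand already produced inside the FPU no longer pays the 6-cycle penalty the old flat latency charged unconditionally.

// BtVer2 after this patch: base write latency plus forwarding delay should
// reproduce the old flat WriteVecInsert latency (illustrative arithmetic).
#include <cassert>
#include <cstdlib>

int main() {
  const int NewWriteVecInsertLat = 1; // was 7 before this patch
  const int ReadInt2FpuAdvance = -6;  // ReadAdvance<ReadInt2Fpu, -6>
  const int ReportedLatency =
      NewWriteVecInsertLat + std::abs(ReadInt2FpuAdvance);
  assert(ReportedLatency == 7);       // matches the previous model
  return 0;
}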