diff options
24 files changed, 123 insertions, 40 deletions
diff --git a/llvm/include/llvm/MC/MCSchedule.h b/llvm/include/llvm/MC/MCSchedule.h index 25a3a9cdb8f..df3248ee6e8 100644 --- a/llvm/include/llvm/MC/MCSchedule.h +++ b/llvm/include/llvm/MC/MCSchedule.h @@ -14,6 +14,7 @@ #ifndef LLVM_MC_MCSCHEDULE_H #define LLVM_MC_MCSCHEDULE_H +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/Optional.h" #include "llvm/Config/llvm-config.h" #include "llvm/Support/DataTypes.h" @@ -369,6 +370,11 @@ struct MCSchedModel { getReciprocalThroughput(const MCSubtargetInfo &STI, const MCInstrInfo &MCII, const MCInst &Inst) const; + /// Returns the maximum forwarding delay for register reads dependent on + /// writes of scheduling class WriteResourceIdx. + static unsigned getForwardingDelayCycles(ArrayRef<MCReadAdvanceEntry> Entries, + unsigned WriteResourceIdx = 0); + /// Returns the default initialized model. static const MCSchedModel &GetDefaultSchedModel() { return Default; } static const MCSchedModel Default; diff --git a/llvm/include/llvm/MC/MCSubtargetInfo.h b/llvm/include/llvm/MC/MCSubtargetInfo.h index 03eea1e8dce..2ad72c3c325 100644 --- a/llvm/include/llvm/MC/MCSubtargetInfo.h +++ b/llvm/include/llvm/MC/MCSubtargetInfo.h @@ -152,6 +152,16 @@ public: return 0; } + /// Return the set of ReadAdvance entries declared by the scheduling class + /// descriptor in input. + ArrayRef<MCReadAdvanceEntry> + getReadAdvanceEntries(const MCSchedClassDesc &SC) const { + if (!SC.NumReadAdvanceEntries) + return ArrayRef<MCReadAdvanceEntry>(); + return ArrayRef<MCReadAdvanceEntry>(&ReadAdvanceTable[SC.ReadAdvanceIdx], + SC.NumReadAdvanceEntries); + } + /// Get scheduling itinerary of a CPU. InstrItineraryData getInstrItineraryForCPU(StringRef CPU) const; diff --git a/llvm/include/llvm/MCA/Instruction.h b/llvm/include/llvm/MCA/Instruction.h index 27d135e3510..3effa2b7654 100644 --- a/llvm/include/llvm/MCA/Instruction.h +++ b/llvm/include/llvm/MCA/Instruction.h @@ -332,6 +332,10 @@ struct InstrDesc { unsigned MaxLatency; // Number of MicroOps for this instruction. unsigned NumMicroOps; + // SchedClassID used to construct this InstrDesc. + // This information is currently used by views to do fast queries on the + // subtarget when computing the reciprocal throughput. + unsigned SchedClassID; bool MayLoad; bool MayStore; diff --git a/llvm/lib/CodeGen/TargetSubtargetInfo.cpp b/llvm/lib/CodeGen/TargetSubtargetInfo.cpp index c9f90b88cac..e34f9a1579d 100644 --- a/llvm/lib/CodeGen/TargetSubtargetInfo.cpp +++ b/llvm/lib/CodeGen/TargetSubtargetInfo.cpp @@ -88,6 +88,12 @@ std::string TargetSubtargetInfo::getSchedInfoStr(const MachineInstr &MI) const { TargetSchedModel TSchedModel; TSchedModel.init(this); unsigned Latency = TSchedModel.computeInstrLatency(&MI); + + // Add extra latency due to forwarding delays. + const MCSchedClassDesc &SCDesc = *TSchedModel.resolveSchedClass(&MI); + Latency += + MCSchedModel::getForwardingDelayCycles(getReadAdvanceEntries(SCDesc)); + double RThroughput = TSchedModel.computeReciprocalThroughput(&MI); return createSchedInfoStr(Latency, RThroughput); } @@ -99,9 +105,17 @@ std::string TargetSubtargetInfo::getSchedInfoStr(MCInst const &MCI) const { TargetSchedModel TSchedModel; TSchedModel.init(this); unsigned Latency; - if (TSchedModel.hasInstrSchedModel()) + if (TSchedModel.hasInstrSchedModel()) { Latency = TSchedModel.computeInstrLatency(MCI); - else if (TSchedModel.hasInstrItineraries()) { + // Add extra latency due to forwarding delays. + const MCSchedModel &SM = *TSchedModel.getMCSchedModel(); + unsigned SClassID = getInstrInfo()->get(MCI.getOpcode()).getSchedClass(); + while (SM.getSchedClassDesc(SClassID)->isVariant()) + SClassID = resolveVariantSchedClass(SClassID, &MCI, SM.ProcID); + const MCSchedClassDesc &SCDesc = *SM.getSchedClassDesc(SClassID); + Latency += + MCSchedModel::getForwardingDelayCycles(getReadAdvanceEntries(SCDesc)); + } else if (TSchedModel.hasInstrItineraries()) { auto *ItinData = TSchedModel.getInstrItineraries(); Latency = ItinData->getStageLatency( getInstrInfo()->get(MCI.getOpcode()).getSchedClass()); diff --git a/llvm/lib/MC/MCSchedule.cpp b/llvm/lib/MC/MCSchedule.cpp index 6797a47c75a..1fc5ec5e975 100644 --- a/llvm/lib/MC/MCSchedule.cpp +++ b/llvm/lib/MC/MCSchedule.cpp @@ -149,3 +149,19 @@ MCSchedModel::getReciprocalThroughput(unsigned SchedClass, // that it can execute at the maximum default issue width. return 1.0 / DefaultIssueWidth; } + +unsigned +MCSchedModel::getForwardingDelayCycles(ArrayRef<MCReadAdvanceEntry> Entries, + unsigned WriteResourceID) { + if (Entries.empty()) + return 0; + + int DelayCycles = 0; + for (const MCReadAdvanceEntry &E : Entries) { + if (E.WriteResourceID != WriteResourceID) + continue; + DelayCycles = std::min(DelayCycles, E.Cycles); + } + + return std::abs(DelayCycles); +} diff --git a/llvm/lib/MCA/InstrBuilder.cpp b/llvm/lib/MCA/InstrBuilder.cpp index 4b0ec329f9e..1e08f898523 100644 --- a/llvm/lib/MCA/InstrBuilder.cpp +++ b/llvm/lib/MCA/InstrBuilder.cpp @@ -532,6 +532,7 @@ InstrBuilder::createInstrDescImpl(const MCInst &MCI) { // Create a new empty descriptor. std::unique_ptr<InstrDesc> ID = llvm::make_unique<InstrDesc>(); ID->NumMicroOps = SCDesc.NumMicroOps; + ID->SchedClassID = SchedClassID; if (MCDesc.isCall() && FirstCallInst) { // We don't correctly model calls. diff --git a/llvm/lib/Target/X86/X86InstrMMX.td b/llvm/lib/Target/X86/X86InstrMMX.td index c00c4f4ca09..8e2a45b1bed 100644 --- a/llvm/lib/Target/X86/X86InstrMMX.td +++ b/llvm/lib/Target/X86/X86InstrMMX.td @@ -543,7 +543,7 @@ let Predicates = [HasMMX, HasSSE1] in { "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set VR64:$dst, (int_x86_mmx_pinsr_w VR64:$src1, GR32orGR64:$src2, imm:$src3))]>, - Sched<[WriteVecInsert]>; + Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>; def MMX_PINSRWrm : MMXIi8<0xC4, MRMSrcMem, (outs VR64:$dst), diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td index 807af7f4808..e6427b1764a 100644 --- a/llvm/lib/Target/X86/X86InstrSSE.td +++ b/llvm/lib/Target/X86/X86InstrSSE.td @@ -4122,7 +4122,7 @@ multiclass sse2_pinsrw<bit Is2Addr = 1> { "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR128:$dst, (X86pinsrw VR128:$src1, GR32orGR64:$src2, imm:$src3))]>, - Sched<[WriteVecInsert]>; + Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>; def rm : Ii8<0xC4, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i16mem:$src2, u8imm:$src3), @@ -5577,7 +5577,7 @@ multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> { "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), [(set VR128:$dst, (X86pinsrb VR128:$src1, GR32orGR64:$src2, imm:$src3))]>, - Sched<[WriteVecInsert]>; + Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>; def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i8mem:$src2, u8imm:$src3), !if(Is2Addr, @@ -5603,7 +5603,7 @@ multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> { "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), [(set VR128:$dst, (v4i32 (insertelt VR128:$src1, GR32:$src2, imm:$src3)))]>, - Sched<[WriteVecInsert]>; + Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>; def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i32mem:$src2, u8imm:$src3), !if(Is2Addr, @@ -5629,7 +5629,7 @@ multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> { "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), [(set VR128:$dst, (v2i64 (insertelt VR128:$src1, GR64:$src2, imm:$src3)))]>, - Sched<[WriteVecInsert]>; + Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>; def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i64mem:$src2, u8imm:$src3), !if(Is2Addr, diff --git a/llvm/lib/Target/X86/X86SchedBroadwell.td b/llvm/lib/Target/X86/X86SchedBroadwell.td index 03feff138e6..821948d2d85 100644 --- a/llvm/lib/Target/X86/X86SchedBroadwell.td +++ b/llvm/lib/Target/X86/X86SchedBroadwell.td @@ -81,6 +81,8 @@ def : ReadAdvance<ReadAfterVecLd, 5>; def : ReadAdvance<ReadAfterVecXLd, 5>; def : ReadAdvance<ReadAfterVecYLd, 6>; +def : ReadAdvance<ReadInt2Fpu, 0>; + // Many SchedWrites are defined in pairs with and without a folded load. // Instructions with folded loads are usually micro-fused, so they only appear // as two micro-ops when queued in the reservation station. diff --git a/llvm/lib/Target/X86/X86SchedHaswell.td b/llvm/lib/Target/X86/X86SchedHaswell.td index 155d77224a6..dc040e0e7d3 100644 --- a/llvm/lib/Target/X86/X86SchedHaswell.td +++ b/llvm/lib/Target/X86/X86SchedHaswell.td @@ -86,6 +86,8 @@ def : ReadAdvance<ReadAfterVecLd, 5>; def : ReadAdvance<ReadAfterVecXLd, 6>; def : ReadAdvance<ReadAfterVecYLd, 7>; +def : ReadAdvance<ReadInt2Fpu, 0>; + // Many SchedWrites are defined in pairs with and without a folded load. // Instructions with folded loads are usually micro-fused, so they only appear // as two micro-ops when queued in the reservation station. diff --git a/llvm/lib/Target/X86/X86SchedSandyBridge.td b/llvm/lib/Target/X86/X86SchedSandyBridge.td index 4f47acde7cf..503b905842b 100644 --- a/llvm/lib/Target/X86/X86SchedSandyBridge.td +++ b/llvm/lib/Target/X86/X86SchedSandyBridge.td @@ -76,6 +76,8 @@ def : ReadAdvance<ReadAfterVecLd, 5>; def : ReadAdvance<ReadAfterVecXLd, 6>; def : ReadAdvance<ReadAfterVecYLd, 7>; +def : ReadAdvance<ReadInt2Fpu, 0>; + // Many SchedWrites are defined in pairs with and without a folded load. // Instructions with folded loads are usually micro-fused, so they only appear // as two micro-ops when queued in the reservation station. diff --git a/llvm/lib/Target/X86/X86SchedSkylakeClient.td b/llvm/lib/Target/X86/X86SchedSkylakeClient.td index 7d84d814d94..71045897376 100644 --- a/llvm/lib/Target/X86/X86SchedSkylakeClient.td +++ b/llvm/lib/Target/X86/X86SchedSkylakeClient.td @@ -80,6 +80,8 @@ def : ReadAdvance<ReadAfterVecLd, 5>; def : ReadAdvance<ReadAfterVecXLd, 6>; def : ReadAdvance<ReadAfterVecYLd, 7>; +def : ReadAdvance<ReadInt2Fpu, 0>; + // Many SchedWrites are defined in pairs with and without a folded load. // Instructions with folded loads are usually micro-fused, so they only appear // as two micro-ops when queued in the reservation station. diff --git a/llvm/lib/Target/X86/X86SchedSkylakeServer.td b/llvm/lib/Target/X86/X86SchedSkylakeServer.td index 0d452f9beb3..8bd80078891 100644 --- a/llvm/lib/Target/X86/X86SchedSkylakeServer.td +++ b/llvm/lib/Target/X86/X86SchedSkylakeServer.td @@ -80,6 +80,8 @@ def : ReadAdvance<ReadAfterVecLd, 5>; def : ReadAdvance<ReadAfterVecXLd, 6>; def : ReadAdvance<ReadAfterVecYLd, 7>; +def : ReadAdvance<ReadInt2Fpu, 0>; + // Many SchedWrites are defined in pairs with and without a folded load. // Instructions with folded loads are usually micro-fused, so they only appear // as two micro-ops when queued in the reservation station. diff --git a/llvm/lib/Target/X86/X86Schedule.td b/llvm/lib/Target/X86/X86Schedule.td index 886c3aef3f5..f50cb621046 100644 --- a/llvm/lib/Target/X86/X86Schedule.td +++ b/llvm/lib/Target/X86/X86Schedule.td @@ -17,6 +17,12 @@ def ReadAfterVecLd : SchedRead; def ReadAfterVecXLd : SchedRead; def ReadAfterVecYLd : SchedRead; +// Instructions that move data between general purpose registers and vector +// registers may be subject to extra latency due to data bypass delays. +// This SchedRead describes a bypass delay caused by data being moved from the +// integer unit to the floating point unit. +def ReadInt2Fpu : SchedRead; + // Instructions with both a load and a store folded are modeled as a folded // load + WriteRMW. def WriteRMW : SchedWrite; diff --git a/llvm/lib/Target/X86/X86ScheduleAtom.td b/llvm/lib/Target/X86/X86ScheduleAtom.td index 779692b7da6..bf50aeee1df 100644 --- a/llvm/lib/Target/X86/X86ScheduleAtom.td +++ b/llvm/lib/Target/X86/X86ScheduleAtom.td @@ -46,6 +46,8 @@ def : ReadAdvance<ReadAfterVecLd, 3>; def : ReadAdvance<ReadAfterVecXLd, 3>; def : ReadAdvance<ReadAfterVecYLd, 3>; +def : ReadAdvance<ReadInt2Fpu, 0>; + // Many SchedWrites are defined in pairs with and without a folded load. // Instructions with folded loads are usually micro-fused, so they only appear // as two micro-ops when dispatched by the schedulers. diff --git a/llvm/lib/Target/X86/X86ScheduleBdVer2.td b/llvm/lib/Target/X86/X86ScheduleBdVer2.td index ca14ed478f0..90ca79915fa 100644 --- a/llvm/lib/Target/X86/X86ScheduleBdVer2.td +++ b/llvm/lib/Target/X86/X86ScheduleBdVer2.td @@ -250,6 +250,8 @@ def : ReadAdvance<ReadAfterVecLd, 5>; def : ReadAdvance<ReadAfterVecXLd, 5>; def : ReadAdvance<ReadAfterVecYLd, 5>; +def : ReadAdvance<ReadInt2Fpu, 0>; + // A folded store needs a cycle on the PdStore for the store data. def : WriteRes<WriteRMW, [PdStore]>; diff --git a/llvm/lib/Target/X86/X86ScheduleBtVer2.td b/llvm/lib/Target/X86/X86ScheduleBtVer2.td index 8d8de3e8e15..3a2ed733f56 100644 --- a/llvm/lib/Target/X86/X86ScheduleBtVer2.td +++ b/llvm/lib/Target/X86/X86ScheduleBtVer2.td @@ -108,6 +108,11 @@ def : ReadAdvance<ReadAfterVecLd, 5>; def : ReadAdvance<ReadAfterVecXLd, 5>; def : ReadAdvance<ReadAfterVecYLd, 5>; +/// "Additional 6 cycle transfer operation which moves a floating point +/// operation input value from the integer unit to the floating point unit. +/// Reference: AMDfam16h SOG (Appendix A "Instruction Latencies", Section A.2). +def : ReadAdvance<ReadInt2Fpu, -6>; + // Many SchedWrites are defined in pairs with and without a folded load. // Instructions with folded loads are usually micro-fused, so they only appear // as two micro-ops when dispatched by the schedulers. @@ -540,7 +545,7 @@ defm : X86WriteResPairUnsupported<WriteVarShuffle256>; // Vector insert/extract operations. //////////////////////////////////////////////////////////////////////////////// -defm : X86WriteRes<WriteVecInsert, [JFPU01, JVALU], 7, [1,1], 2>; +defm : X86WriteRes<WriteVecInsert, [JFPU01, JVALU], 1, [1,1], 2>; defm : X86WriteRes<WriteVecInsertLd, [JFPU01, JVALU, JLAGU], 4, [1,1,1], 1>; defm : X86WriteRes<WriteVecExtract, [JFPU0, JFPA, JALU0], 3, [1,1,1], 1>; defm : X86WriteRes<WriteVecExtractSt, [JFPU1, JSTC, JSAGU], 3, [1,1,1], 1>; diff --git a/llvm/lib/Target/X86/X86ScheduleSLM.td b/llvm/lib/Target/X86/X86ScheduleSLM.td index 5dca0ff7019..fc150fca545 100644 --- a/llvm/lib/Target/X86/X86ScheduleSLM.td +++ b/llvm/lib/Target/X86/X86ScheduleSLM.td @@ -52,6 +52,8 @@ def : ReadAdvance<ReadAfterVecLd, 3>; def : ReadAdvance<ReadAfterVecXLd, 3>; def : ReadAdvance<ReadAfterVecYLd, 3>; +def : ReadAdvance<ReadInt2Fpu, 0>; + // Many SchedWrites are defined in pairs with and without a folded load. // Instructions with folded loads are usually micro-fused, so they only appear // as two micro-ops when queued in the reservation station. diff --git a/llvm/lib/Target/X86/X86ScheduleZnver1.td b/llvm/lib/Target/X86/X86ScheduleZnver1.td index 0407afc4203..1a75281cf0c 100644 --- a/llvm/lib/Target/X86/X86ScheduleZnver1.td +++ b/llvm/lib/Target/X86/X86ScheduleZnver1.td @@ -94,6 +94,8 @@ def : ReadAdvance<ReadAfterVecLd, 8>; def : ReadAdvance<ReadAfterVecXLd, 8>; def : ReadAdvance<ReadAfterVecYLd, 8>; +def : ReadAdvance<ReadInt2Fpu, 0>; + // The Integer PRF for Zen is 168 entries, and it holds the architectural and // speculative version of the 64-bit integer registers. // Reference: "Software Optimization Guide for AMD Family 17h Processors" diff --git a/llvm/test/CodeGen/X86/mmx-schedule.ll b/llvm/test/CodeGen/X86/mmx-schedule.ll index 51dc5e102ff..d423b9a2a90 100644 --- a/llvm/test/CodeGen/X86/mmx-schedule.ll +++ b/llvm/test/CodeGen/X86/mmx-schedule.ll @@ -3887,8 +3887,8 @@ define i64 @test_pinsrw(x86_mmx %a0, i32 %a1, i16* %a2) optsize { ; ; BTVER2-LABEL: test_pinsrw: ; BTVER2: # %bb.0: -; BTVER2-NEXT: pinsrw $0, %edi, %mm0 # sched: [7:0.50] ; BTVER2-NEXT: movswl (%rsi), %eax # sched: [4:1.00] +; BTVER2-NEXT: pinsrw $0, %edi, %mm0 # sched: [7:0.50] ; BTVER2-NEXT: pinsrw $1, %eax, %mm0 # sched: [7:0.50] ; BTVER2-NEXT: movq %mm0, %rax # sched: [4:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] diff --git a/llvm/test/CodeGen/X86/sse41-schedule.ll b/llvm/test/CodeGen/X86/sse41-schedule.ll index ea606463fc1..4870434a8ae 100644 --- a/llvm/test/CodeGen/X86/sse41-schedule.ll +++ b/llvm/test/CodeGen/X86/sse41-schedule.ll @@ -2679,15 +2679,15 @@ define <2 x i64> @test_pinsrq(<2 x i64> %a0, <2 x i64> %a1, i64 %a2, i64 *%a3) { ; ; BTVER2-SSE-LABEL: test_pinsrq: ; BTVER2-SSE: # %bb.0: -; BTVER2-SSE-NEXT: pinsrq $1, %rdi, %xmm0 # sched: [7:0.50] ; BTVER2-SSE-NEXT: pinsrq $1, (%rsi), %xmm1 # sched: [4:1.00] +; BTVER2-SSE-NEXT: pinsrq $1, %rdi, %xmm0 # sched: [7:0.50] ; BTVER2-SSE-NEXT: paddq %xmm1, %xmm0 # sched: [1:0.50] ; BTVER2-SSE-NEXT: retq # sched: [4:1.00] ; ; BTVER2-LABEL: test_pinsrq: ; BTVER2: # %bb.0: -; BTVER2-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0 # sched: [7:0.50] ; BTVER2-NEXT: vpinsrq $1, (%rsi), %xmm1, %xmm1 # sched: [4:1.00] +; BTVER2-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0 # sched: [7:0.50] ; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] ; diff --git a/llvm/test/tools/llvm-mca/X86/BtVer2/int-to-fpu-forwarding-1.s b/llvm/test/tools/llvm-mca/X86/BtVer2/int-to-fpu-forwarding-1.s index 9e44702ae79..398a52a8479 100644 --- a/llvm/test/tools/llvm-mca/X86/BtVer2/int-to-fpu-forwarding-1.s +++ b/llvm/test/tools/llvm-mca/X86/BtVer2/int-to-fpu-forwarding-1.s @@ -27,12 +27,12 @@ vpinsrq $1, %rax, %xmm0, %xmm0 # CHECK: Iterations: 500 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 7003 +# CHECK-NEXT: Total Cycles: 1003 # CHECK-NEXT: Total uOps: 2000 # CHECK: Dispatch Width: 2 -# CHECK-NEXT: uOps Per Cycle: 0.29 -# CHECK-NEXT: IPC: 0.14 +# CHECK-NEXT: uOps Per Cycle: 1.99 +# CHECK-NEXT: IPC: 1.00 # CHECK-NEXT: Block RThroughput: 2.0 # CHECK: Instruction Info: @@ -76,12 +76,12 @@ vpinsrq $1, %rax, %xmm0, %xmm0 # CHECK: Iterations: 500 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 7003 +# CHECK-NEXT: Total Cycles: 1003 # CHECK-NEXT: Total uOps: 2000 # CHECK: Dispatch Width: 2 -# CHECK-NEXT: uOps Per Cycle: 0.29 -# CHECK-NEXT: IPC: 0.14 +# CHECK-NEXT: uOps Per Cycle: 1.99 +# CHECK-NEXT: IPC: 1.00 # CHECK-NEXT: Block RThroughput: 2.0 # CHECK: Instruction Info: @@ -125,12 +125,12 @@ vpinsrq $1, %rax, %xmm0, %xmm0 # CHECK: Iterations: 500 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 7003 +# CHECK-NEXT: Total Cycles: 1003 # CHECK-NEXT: Total uOps: 2000 # CHECK: Dispatch Width: 2 -# CHECK-NEXT: uOps Per Cycle: 0.29 -# CHECK-NEXT: IPC: 0.14 +# CHECK-NEXT: uOps Per Cycle: 1.99 +# CHECK-NEXT: IPC: 1.00 # CHECK-NEXT: Block RThroughput: 2.0 # CHECK: Instruction Info: @@ -174,12 +174,12 @@ vpinsrq $1, %rax, %xmm0, %xmm0 # CHECK: Iterations: 500 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 7003 +# CHECK-NEXT: Total Cycles: 1003 # CHECK-NEXT: Total uOps: 2000 # CHECK: Dispatch Width: 2 -# CHECK-NEXT: uOps Per Cycle: 0.29 -# CHECK-NEXT: IPC: 0.14 +# CHECK-NEXT: uOps Per Cycle: 1.99 +# CHECK-NEXT: IPC: 1.00 # CHECK-NEXT: Block RThroughput: 2.0 # CHECK: Instruction Info: diff --git a/llvm/test/tools/llvm-mca/X86/BtVer2/int-to-fpu-forwarding-3.s b/llvm/test/tools/llvm-mca/X86/BtVer2/int-to-fpu-forwarding-3.s index 4e130be8597..00c13f9ef59 100644 --- a/llvm/test/tools/llvm-mca/X86/BtVer2/int-to-fpu-forwarding-3.s +++ b/llvm/test/tools/llvm-mca/X86/BtVer2/int-to-fpu-forwarding-3.s @@ -9,12 +9,12 @@ vpinsrb $1, %eax, %xmm0, %xmm0 # CHECK: Iterations: 500 # CHECK-NEXT: Instructions: 1500 -# CHECK-NEXT: Total Cycles: 7004 +# CHECK-NEXT: Total Cycles: 1509 # CHECK-NEXT: Total uOps: 2500 # CHECK: Dispatch Width: 2 -# CHECK-NEXT: uOps Per Cycle: 0.36 -# CHECK-NEXT: IPC: 0.21 +# CHECK-NEXT: uOps Per Cycle: 1.66 +# CHECK-NEXT: IPC: 0.99 # CHECK-NEXT: Block RThroughput: 2.5 # CHECK: Instruction Info: @@ -57,18 +57,18 @@ vpinsrb $1, %eax, %xmm0, %xmm0 # CHECK-NEXT: - - - - - 1.00 - - - - - 1.00 - - vpinsrb $1, %eax, %xmm0, %xmm0 # CHECK: Timeline view: -# CHECK-NEXT: 0123456789 0123456789 -# CHECK-NEXT: Index 0123456789 0123456789 012345 +# CHECK-NEXT: 01234567 +# CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeER . . . . . . . . . addl %eax, %eax -# CHECK-NEXT: [0,1] .DeeeeeeeER . . . . . . . vpinsrb $0, %eax, %xmm0, %xmm0 -# CHECK-NEXT: [0,2] . D======eeeeeeeER . . . . . . vpinsrb $1, %eax, %xmm0, %xmm0 -# CHECK-NEXT: [1,0] . DeE-----------R . . . . . . addl %eax, %eax -# CHECK-NEXT: [1,1] . D===========eeeeeeeER. . . . . vpinsrb $0, %eax, %xmm0, %xmm0 -# CHECK-NEXT: [1,2] . D=================eeeeeeeER . . . vpinsrb $1, %eax, %xmm0, %xmm0 -# CHECK-NEXT: [2,0] . .DeE----------------------R . . . addl %eax, %eax -# CHECK-NEXT: [2,1] . . D======================eeeeeeeER . . vpinsrb $0, %eax, %xmm0, %xmm0 -# CHECK-NEXT: [2,2] . . D============================eeeeeeeER vpinsrb $1, %eax, %xmm0, %xmm0 +# CHECK: [0,0] DeER . . . . addl %eax, %eax +# CHECK-NEXT: [0,1] .D======eER . . vpinsrb $0, %eax, %xmm0, %xmm0 +# CHECK-NEXT: [0,2] . D======eER . . vpinsrb $1, %eax, %xmm0, %xmm0 +# CHECK-NEXT: [1,0] . DeE-----R . . addl %eax, %eax +# CHECK-NEXT: [1,1] . D======eER . . vpinsrb $0, %eax, %xmm0, %xmm0 +# CHECK-NEXT: [1,2] . D======eER. . vpinsrb $1, %eax, %xmm0, %xmm0 +# CHECK-NEXT: [2,0] . .DeE-----R. . addl %eax, %eax +# CHECK-NEXT: [2,1] . . D======eER. vpinsrb $0, %eax, %xmm0, %xmm0 +# CHECK-NEXT: [2,2] . . D======eER vpinsrb $1, %eax, %xmm0, %xmm0 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -77,6 +77,6 @@ vpinsrb $1, %eax, %xmm0, %xmm0 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 3 1.0 1.0 11.0 addl %eax, %eax -# CHECK-NEXT: 1. 3 12.0 0.0 0.0 vpinsrb $0, %eax, %xmm0, %xmm0 -# CHECK-NEXT: 2. 3 18.0 0.0 0.0 vpinsrb $1, %eax, %xmm0, %xmm0 +# CHECK-NEXT: 0. 3 1.0 1.0 3.3 addl %eax, %eax +# CHECK-NEXT: 1. 3 7.0 0.0 0.0 vpinsrb $0, %eax, %xmm0, %xmm0 +# CHECK-NEXT: 2. 3 7.0 0.0 0.0 vpinsrb $1, %eax, %xmm0, %xmm0 diff --git a/llvm/tools/llvm-mca/Views/InstructionInfoView.cpp b/llvm/tools/llvm-mca/Views/InstructionInfoView.cpp index 60b8b1f5141..1fbffa3e5b6 100644 --- a/llvm/tools/llvm-mca/Views/InstructionInfoView.cpp +++ b/llvm/tools/llvm-mca/Views/InstructionInfoView.cpp @@ -43,6 +43,9 @@ void InstructionInfoView::printView(raw_ostream &OS) const { const MCSchedClassDesc &SCDesc = *SM.getSchedClassDesc(SchedClassID); unsigned NumMicroOpcodes = SCDesc.NumMicroOps; unsigned Latency = MCSchedModel::computeInstrLatency(STI, SCDesc); + // Add extra latency due to delays in the forwarding data paths. + Latency += MCSchedModel::getForwardingDelayCycles( + STI.getReadAdvanceEntries(SCDesc)); Optional<double> RThroughput = MCSchedModel::getReciprocalThroughput(STI, SCDesc); |