diff options
author | Andrea Di Biagio <Andrea_DiBiagio@sn.scee.net> | 2018-06-04 15:43:09 +0000 |
---|---|---|
committer | Andrea Di Biagio <Andrea_DiBiagio@sn.scee.net> | 2018-06-04 15:43:09 +0000 |
commit | 39e5a5695fd3e560565b40d1b60b4a1e78665875 (patch) | |
tree | 3945b3ad7cc4465a52e631ce891e42a057ddb79c | |
parent | ab60a2823f1a6548c17c57abbbafdb4ddb3bb785 (diff) | |
download | bcm5719-llvm-39e5a5695fd3e560565b40d1b60b4a1e78665875.tar.gz bcm5719-llvm-39e5a5695fd3e560565b40d1b60b4a1e78665875.zip |
[RFC][patch 3/3] Add support for variant scheduling classes in llvm-mca.
This patch is the last of a sequence of three patches related to LLVM-dev RFC
"MC support for variant scheduling classes".
http://lists.llvm.org/pipermail/llvm-dev/2018-May/123181.html
This fixes PR36672.
The main goal of this patch is to teach llvm-mca how to resolve variant scheduling
classes. This patch does that, plus it adds new variant scheduling classes to
the BtVer2 scheduling model to identify so-called zero-idioms (i.e.
dependency-breaking instructions that are known to generate zero, and that are
optimized out in hardware at the register renaming stage).
Without the BtVer2 change, this patch would not have had any meaningful tests.
This patch is effectively the union of two changes:
1) a change that teaches llvm-mca how to resolve variant scheduling classes.
2) a change to the BtVer2 scheduling model that allows us to special-case
packed XOR zero-idioms (this partially fixes PR36671).
Differential Revision: https://reviews.llvm.org/D47374
llvm-svn: 333909
-rw-r--r-- | llvm/lib/Target/X86/X86Schedule.td | 5 | ||||
-rw-r--r-- | llvm/lib/Target/X86/X86ScheduleBtVer2.td | 33 | ||||
-rw-r--r-- | llvm/test/CodeGen/X86/sse-schedule.ll | 4 | ||||
-rw-r--r-- | llvm/test/tools/llvm-mca/X86/BtVer2/zero-idioms.s | 153 | ||||
-rw-r--r-- | llvm/tools/llvm-mca/InstrBuilder.cpp | 40 | ||||
-rw-r--r-- | llvm/tools/llvm-mca/InstrBuilder.h | 3 | ||||
-rw-r--r-- | llvm/tools/llvm-mca/InstructionInfoView.cpp | 11 |
7 files changed, 230 insertions, 19 deletions
diff --git a/llvm/lib/Target/X86/X86Schedule.td b/llvm/lib/Target/X86/X86Schedule.td index ccee972c482..77e7f2e0f79 100644 --- a/llvm/lib/Target/X86/X86Schedule.td +++ b/llvm/lib/Target/X86/X86Schedule.td @@ -559,6 +559,11 @@ def SchedWriteFShuffleSizes : X86SchedWriteSizes<SchedWriteFShuffle, SchedWriteFShuffle>; //===----------------------------------------------------------------------===// +// Common MCInstPredicate definitions used by variant scheduling classes. + +def ZeroIdiomPredicate : CheckSameRegOperand<1, 2>; + +//===----------------------------------------------------------------------===// // Generic Processor Scheduler Models. // IssueWidth is analogous to the number of decode units. Core and its diff --git a/llvm/lib/Target/X86/X86ScheduleBtVer2.td b/llvm/lib/Target/X86/X86ScheduleBtVer2.td index 764d097e369..721088457a3 100644 --- a/llvm/lib/Target/X86/X86ScheduleBtVer2.td +++ b/llvm/lib/Target/X86/X86ScheduleBtVer2.td @@ -546,5 +546,36 @@ def JWriteJVZEROUPPER: SchedWriteRes<[]> { let NumMicroOps = 37; } def : InstRW<[JWriteJVZEROUPPER], (instrs VZEROUPPER)>; -} // SchedModel +/////////////////////////////////////////////////////////////////////////////// +// SchedWriteVariant definitions. +/////////////////////////////////////////////////////////////////////////////// + +def JWriteZeroLatency : SchedWriteRes<[]> { + let Latency = 0; +} + +// Vector XOR instructions that use the same register for both source +// operands do not have a real dependency on the previous contents of the +// register, and thus, do not have to wait before completing. They can be +// optimized out at register renaming stage. +// Reference: Section 10.8 of the "Software Optimization Guide for AMD Family +// 15h Processors". +// Reference: Agner's Fog "The microarchitecture of Intel, AMD and VIA CPUs", +// Section 21.8 [Dependency-breaking instructions]. 
+ +def JWriteFZeroIdiom : SchedWriteVariant<[ + SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>, + SchedVar<MCSchedPredicate<TruePred>, [WriteFLogic]> +]>; + +def : InstRW<[JWriteFZeroIdiom], (instrs XORPSrr, VXORPSrr, XORPDrr, VXORPDrr)>; + +def JWriteVZeroIdiom : SchedWriteVariant<[ + SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>, + SchedVar<MCSchedPredicate<TruePred>, [WriteVecLogicX]> +]>; + +def : InstRW<[JWriteVZeroIdiom], (instrs PXORrr, VPXORrr)>; + +} // SchedModel diff --git a/llvm/test/CodeGen/X86/sse-schedule.ll b/llvm/test/CodeGen/X86/sse-schedule.ll index 9207ba502e5..7f0cc213bbc 100644 --- a/llvm/test/CodeGen/X86/sse-schedule.ll +++ b/llvm/test/CodeGen/X86/sse-schedule.ll @@ -6225,7 +6225,7 @@ define <4 x float> @test_fnop() nounwind { ; ; BTVER2-SSE-LABEL: test_fnop: ; BTVER2-SSE: # %bb.0: -; BTVER2-SSE-NEXT: xorps %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-SSE-NEXT: xorps %xmm0, %xmm0 # sched: [0:?] ; BTVER2-SSE-NEXT: #APP ; BTVER2-SSE-NEXT: nop # sched: [1:0.50] ; BTVER2-SSE-NEXT: #NO_APP @@ -6233,7 +6233,7 @@ define <4 x float> @test_fnop() nounwind { ; ; BTVER2-LABEL: test_fnop: ; BTVER2: # %bb.0: -; BTVER2-NEXT: vxorps %xmm0, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vxorps %xmm0, %xmm0, %xmm0 # sched: [0:?] 
; BTVER2-NEXT: #APP ; BTVER2-NEXT: nop # sched: [1:0.50] ; BTVER2-NEXT: #NO_APP diff --git a/llvm/test/tools/llvm-mca/X86/BtVer2/zero-idioms.s b/llvm/test/tools/llvm-mca/X86/BtVer2/zero-idioms.s new file mode 100644 index 00000000000..e51ce28d580 --- /dev/null +++ b/llvm/test/tools/llvm-mca/X86/BtVer2/zero-idioms.s @@ -0,0 +1,153 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -timeline -register-file-stats -iterations=5 < %s | FileCheck %s + +xorps %xmm0, %xmm0 +xorpd %xmm1, %xmm1 +vxorps %xmm2, %xmm2, %xmm2 +vxorpd %xmm1, %xmm1, %xmm1 +pxor %xmm2, %xmm2 +vpxor %xmm3, %xmm3, %xmm3 + +vxorps %xmm4, %xmm4, %xmm5 +vxorpd %xmm1, %xmm1, %xmm3 +vpxor %xmm3, %xmm3, %xmm5 + +# CHECK: Iterations: 5 +# CHECK-NEXT: Instructions: 45 +# CHECK-NEXT: Total Cycles: 24 +# CHECK-NEXT: Dispatch Width: 2 +# CHECK-NEXT: IPC: 1.88 +# CHECK-NEXT: Block RThroughput: 4.5 + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 0 - xorps %xmm0, %xmm0 +# CHECK-NEXT: 1 0 - xorpd %xmm1, %xmm1 +# CHECK-NEXT: 1 0 - vxorps %xmm2, %xmm2, %xmm2 +# CHECK-NEXT: 1 0 - vxorpd %xmm1, %xmm1, %xmm1 +# CHECK-NEXT: 1 0 - pxor %xmm2, %xmm2 +# CHECK-NEXT: 1 0 - vpxor %xmm3, %xmm3, %xmm3 +# CHECK-NEXT: 1 0 - vxorps %xmm4, %xmm4, %xmm5 +# CHECK-NEXT: 1 0 - vxorpd %xmm1, %xmm1, %xmm3 +# CHECK-NEXT: 1 0 - vpxor %xmm3, %xmm3, %xmm5 + +# CHECK: Register File statistics: +# CHECK-NEXT: Total number of mappings created: 0 +# CHECK-NEXT: Max number of mappings used: 0 + +# CHECK: * Register File #1 -- JFpuPRF: +# CHECK-NEXT: Number of physical registers: 72 +# CHECK-NEXT: Total number of mappings created: 0 +# CHECK-NEXT: Max number of mappings used: 0 + +# CHECK: * Register File #2 -- JIntegerPRF: +# 
CHECK-NEXT: Number of physical registers: 64 +# CHECK-NEXT: Total number of mappings created: 0 +# CHECK-NEXT: Max number of mappings used: 0 + +# CHECK: Resources: +# CHECK-NEXT: [0] - JALU0 +# CHECK-NEXT: [1] - JALU1 +# CHECK-NEXT: [2] - JDiv +# CHECK-NEXT: [3] - JFPA +# CHECK-NEXT: [4] - JFPM +# CHECK-NEXT: [5] - JFPU0 +# CHECK-NEXT: [6] - JFPU1 +# CHECK-NEXT: [7] - JLAGU +# CHECK-NEXT: [8] - JMul +# CHECK-NEXT: [9] - JSAGU +# CHECK-NEXT: [10] - JSTC +# CHECK-NEXT: [11] - JVALU0 +# CHECK-NEXT: [12] - JVALU1 +# CHECK-NEXT: [13] - JVIMUL + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] +# CHECK-NEXT: - - - - - - - - - - - - - - + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions: +# CHECK-NEXT: - - - - - - - - - - - - - - xorps %xmm0, %xmm0 +# CHECK-NEXT: - - - - - - - - - - - - - - xorpd %xmm1, %xmm1 +# CHECK-NEXT: - - - - - - - - - - - - - - vxorps %xmm2, %xmm2, %xmm2 +# CHECK-NEXT: - - - - - - - - - - - - - - vxorpd %xmm1, %xmm1, %xmm1 +# CHECK-NEXT: - - - - - - - - - - - - - - pxor %xmm2, %xmm2 +# CHECK-NEXT: - - - - - - - - - - - - - - vpxor %xmm3, %xmm3, %xmm3 +# CHECK-NEXT: - - - - - - - - - - - - - - vxorps %xmm4, %xmm4, %xmm5 +# CHECK-NEXT: - - - - - - - - - - - - - - vxorpd %xmm1, %xmm1, %xmm3 +# CHECK-NEXT: - - - - - - - - - - - - - - vpxor %xmm3, %xmm3, %xmm5 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 0123 + +# CHECK: [0,0] DR . . . . . xorps %xmm0, %xmm0 +# CHECK-NEXT: [0,1] DR . . . . . xorpd %xmm1, %xmm1 +# CHECK-NEXT: [0,2] .DR . . . . . vxorps %xmm2, %xmm2, %xmm2 +# CHECK-NEXT: [0,3] .DR . . . . . vxorpd %xmm1, %xmm1, %xmm1 +# CHECK-NEXT: [0,4] . DR . . . . . pxor %xmm2, %xmm2 +# CHECK-NEXT: [0,5] . DR . . . . . vpxor %xmm3, %xmm3, %xmm3 +# CHECK-NEXT: [0,6] . DR. . . . . vxorps %xmm4, %xmm4, %xmm5 +# CHECK-NEXT: [0,7] . DR. . . . . 
vxorpd %xmm1, %xmm1, %xmm3 +# CHECK-NEXT: [0,8] . DR . . . . vpxor %xmm3, %xmm3, %xmm5 +# CHECK-NEXT: [1,0] . DR . . . . xorps %xmm0, %xmm0 +# CHECK-NEXT: [1,1] . DR . . . . xorpd %xmm1, %xmm1 +# CHECK-NEXT: [1,2] . DR . . . . vxorps %xmm2, %xmm2, %xmm2 +# CHECK-NEXT: [1,3] . .DR . . . . vxorpd %xmm1, %xmm1, %xmm1 +# CHECK-NEXT: [1,4] . .DR . . . . pxor %xmm2, %xmm2 +# CHECK-NEXT: [1,5] . . DR . . . . vpxor %xmm3, %xmm3, %xmm3 +# CHECK-NEXT: [1,6] . . DR . . . . vxorps %xmm4, %xmm4, %xmm5 +# CHECK-NEXT: [1,7] . . DR. . . . vxorpd %xmm1, %xmm1, %xmm3 +# CHECK-NEXT: [1,8] . . DR. . . . vpxor %xmm3, %xmm3, %xmm5 +# CHECK-NEXT: [2,0] . . DR . . . xorps %xmm0, %xmm0 +# CHECK-NEXT: [2,1] . . DR . . . xorpd %xmm1, %xmm1 +# CHECK-NEXT: [2,2] . . DR . . . vxorps %xmm2, %xmm2, %xmm2 +# CHECK-NEXT: [2,3] . . DR . . . vxorpd %xmm1, %xmm1, %xmm1 +# CHECK-NEXT: [2,4] . . .DR . . . pxor %xmm2, %xmm2 +# CHECK-NEXT: [2,5] . . .DR . . . vpxor %xmm3, %xmm3, %xmm3 +# CHECK-NEXT: [2,6] . . . DR . . . vxorps %xmm4, %xmm4, %xmm5 +# CHECK-NEXT: [2,7] . . . DR . . . vxorpd %xmm1, %xmm1, %xmm3 +# CHECK-NEXT: [2,8] . . . DR. . . vpxor %xmm3, %xmm3, %xmm5 +# CHECK-NEXT: [3,0] . . . DR. . . xorps %xmm0, %xmm0 +# CHECK-NEXT: [3,1] . . . DR . . xorpd %xmm1, %xmm1 +# CHECK-NEXT: [3,2] . . . DR . . vxorps %xmm2, %xmm2, %xmm2 +# CHECK-NEXT: [3,3] . . . DR . . vxorpd %xmm1, %xmm1, %xmm1 +# CHECK-NEXT: [3,4] . . . DR . . pxor %xmm2, %xmm2 +# CHECK-NEXT: [3,5] . . . .DR . . vpxor %xmm3, %xmm3, %xmm3 +# CHECK-NEXT: [3,6] . . . .DR . . vxorps %xmm4, %xmm4, %xmm5 +# CHECK-NEXT: [3,7] . . . . DR . . vxorpd %xmm1, %xmm1, %xmm3 +# CHECK-NEXT: [3,8] . . . . DR . . vpxor %xmm3, %xmm3, %xmm5 +# CHECK-NEXT: [4,0] . . . . DR. . xorps %xmm0, %xmm0 +# CHECK-NEXT: [4,1] . . . . DR. . xorpd %xmm1, %xmm1 +# CHECK-NEXT: [4,2] . . . . DR . vxorps %xmm2, %xmm2, %xmm2 +# CHECK-NEXT: [4,3] . . . . DR . vxorpd %xmm1, %xmm1, %xmm1 +# CHECK-NEXT: [4,4] . . . . DR . pxor %xmm2, %xmm2 +# CHECK-NEXT: [4,5] . . . . DR . 
vpxor %xmm3, %xmm3, %xmm3 +# CHECK-NEXT: [4,6] . . . . .DR. vxorps %xmm4, %xmm4, %xmm5 +# CHECK-NEXT: [4,7] . . . . .DR. vxorpd %xmm1, %xmm1, %xmm3 +# CHECK-NEXT: [4,8] . . . . . DR vpxor %xmm3, %xmm3, %xmm5 + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 5 0.0 0.0 0.0 xorps %xmm0, %xmm0 +# CHECK-NEXT: 1. 5 0.0 0.0 0.0 xorpd %xmm1, %xmm1 +# CHECK-NEXT: 2. 5 0.0 0.0 0.0 vxorps %xmm2, %xmm2, %xmm2 +# CHECK-NEXT: 3. 5 0.0 0.0 0.0 vxorpd %xmm1, %xmm1, %xmm1 +# CHECK-NEXT: 4. 5 0.0 0.0 0.0 pxor %xmm2, %xmm2 +# CHECK-NEXT: 5. 5 0.0 0.0 0.0 vpxor %xmm3, %xmm3, %xmm3 +# CHECK-NEXT: 6. 5 0.0 0.0 0.0 vxorps %xmm4, %xmm4, %xmm5 +# CHECK-NEXT: 7. 5 0.0 0.0 0.0 vxorpd %xmm1, %xmm1, %xmm3 +# CHECK-NEXT: 8. 5 0.0 0.0 0.0 vpxor %xmm3, %xmm3, %xmm5 + diff --git a/llvm/tools/llvm-mca/InstrBuilder.cpp b/llvm/tools/llvm-mca/InstrBuilder.cpp index a745e1a6150..8a66a76605f 100644 --- a/llvm/tools/llvm-mca/InstrBuilder.cpp +++ b/llvm/tools/llvm-mca/InstrBuilder.cpp @@ -396,18 +396,22 @@ const InstrDesc &InstrBuilder::createInstrDescImpl(const MCInst &MCI) { // Then obtain the scheduling class information from the instruction. unsigned SchedClassID = MCDesc.getSchedClass(); - const MCSchedClassDesc &SCDesc = *SM.getSchedClassDesc(SchedClassID); + unsigned CPUID = SM.getProcessorID(); + + // Try to solve variant scheduling classes. + if (SchedClassID) { + while (SchedClassID && SM.getSchedClassDesc(SchedClassID)->isVariant()) + SchedClassID = STI.resolveVariantSchedClass(SchedClassID, &MCI, CPUID); + + if (!SchedClassID) + llvm::report_fatal_error("unable to resolve this variant class."); + } // Create a new empty descriptor. 
std::unique_ptr<InstrDesc> ID = llvm::make_unique<InstrDesc>(); - if (SCDesc.isVariant()) { - WithColor::warning() << "don't know how to model variant opcodes.\n"; - WithColor::note() << "assume 1 micro opcode.\n"; - ID->NumMicroOps = 1U; - } else { - ID->NumMicroOps = SCDesc.NumMicroOps; - } + const MCSchedClassDesc &SCDesc = *SM.getSchedClassDesc(SchedClassID); + ID->NumMicroOps = SCDesc.NumMicroOps; if (MCDesc.isCall()) { // We don't correctly model calls. @@ -435,14 +439,24 @@ const InstrDesc &InstrBuilder::createInstrDescImpl(const MCInst &MCI) { LLVM_DEBUG(dbgs() << "\t\tNumMicroOps=" << ID->NumMicroOps << '\n'); // Now add the new descriptor. - Descriptors[Opcode] = std::move(ID); - return *Descriptors[Opcode]; + SchedClassID = MCDesc.getSchedClass(); + if (!SM.getSchedClassDesc(SchedClassID)->isVariant()) { + Descriptors[MCI.getOpcode()] = std::move(ID); + return *Descriptors[MCI.getOpcode()]; + } + + VariantDescriptors[&MCI] = std::move(ID); + return *VariantDescriptors[&MCI]; } const InstrDesc &InstrBuilder::getOrCreateInstrDesc(const MCInst &MCI) { - if (Descriptors.find_as(MCI.getOpcode()) == Descriptors.end()) - return createInstrDescImpl(MCI); - return *Descriptors[MCI.getOpcode()]; + if (Descriptors.find_as(MCI.getOpcode()) != Descriptors.end()) + return *Descriptors[MCI.getOpcode()]; + + if (VariantDescriptors.find(&MCI) != VariantDescriptors.end()) + return *VariantDescriptors[&MCI]; + + return createInstrDescImpl(MCI); } std::unique_ptr<Instruction> diff --git a/llvm/tools/llvm-mca/InstrBuilder.h b/llvm/tools/llvm-mca/InstrBuilder.h index c22b7dcdbc6..146e917eb62 100644 --- a/llvm/tools/llvm-mca/InstrBuilder.h +++ b/llvm/tools/llvm-mca/InstrBuilder.h @@ -40,9 +40,10 @@ class InstrBuilder { llvm::SmallVector<uint64_t, 8> ProcResourceMasks; llvm::DenseMap<unsigned short, std::unique_ptr<const InstrDesc>> Descriptors; + llvm::DenseMap<const llvm::MCInst *, std::unique_ptr<const InstrDesc>> + VariantDescriptors; const InstrDesc 
&createInstrDescImpl(const llvm::MCInst &MCI); - InstrBuilder(const InstrBuilder &) = delete; InstrBuilder &operator=(const InstrBuilder &) = delete; diff --git a/llvm/tools/llvm-mca/InstructionInfoView.cpp b/llvm/tools/llvm-mca/InstructionInfoView.cpp index 76d63d21cb2..3b1e4dc8188 100644 --- a/llvm/tools/llvm-mca/InstructionInfoView.cpp +++ b/llvm/tools/llvm-mca/InstructionInfoView.cpp @@ -36,9 +36,16 @@ void InstructionInfoView::printView(raw_ostream &OS) const { for (unsigned I = 0, E = Instructions; I < E; ++I) { const MCInst &Inst = Source.getMCInstFromIndex(I); const MCInstrDesc &MCDesc = MCII.get(Inst.getOpcode()); - const MCSchedClassDesc &SCDesc = - *SM.getSchedClassDesc(MCDesc.getSchedClass()); + // Obtain the scheduling class information from the instruction. + unsigned SchedClassID = MCDesc.getSchedClass(); + unsigned CPUID = SM.getProcessorID(); + + // Try to solve variant scheduling classes. + while (SchedClassID && SM.getSchedClassDesc(SchedClassID)->isVariant()) + SchedClassID = STI.resolveVariantSchedClass(SchedClassID, &Inst, CPUID); + + const MCSchedClassDesc &SCDesc = *SM.getSchedClassDesc(SchedClassID); unsigned NumMicroOpcodes = SCDesc.NumMicroOps; unsigned Latency = MCSchedModel::computeInstrLatency(STI, SCDesc); Optional<double> RThroughput = |