summaryrefslogtreecommitdiffstats
path: root/llvm
diff options
context:
space:
mode:
Diffstat (limited to 'llvm')
-rw-r--r--llvm/lib/Target/X86/X86Schedule.td5
-rw-r--r--llvm/lib/Target/X86/X86ScheduleBtVer2.td33
-rw-r--r--llvm/test/CodeGen/X86/sse-schedule.ll4
-rw-r--r--llvm/test/tools/llvm-mca/X86/BtVer2/zero-idioms.s153
-rw-r--r--llvm/tools/llvm-mca/InstrBuilder.cpp40
-rw-r--r--llvm/tools/llvm-mca/InstrBuilder.h3
-rw-r--r--llvm/tools/llvm-mca/InstructionInfoView.cpp11
7 files changed, 230 insertions, 19 deletions
diff --git a/llvm/lib/Target/X86/X86Schedule.td b/llvm/lib/Target/X86/X86Schedule.td
index ccee972c482..77e7f2e0f79 100644
--- a/llvm/lib/Target/X86/X86Schedule.td
+++ b/llvm/lib/Target/X86/X86Schedule.td
@@ -559,6 +559,11 @@ def SchedWriteFShuffleSizes
: X86SchedWriteSizes<SchedWriteFShuffle, SchedWriteFShuffle>;
//===----------------------------------------------------------------------===//
+// Common MCInstPredicate definitions used by variant scheduling classes.
+
+def ZeroIdiomPredicate : CheckSameRegOperand<1, 2>;
+
+//===----------------------------------------------------------------------===//
// Generic Processor Scheduler Models.
// IssueWidth is analogous to the number of decode units. Core and its
diff --git a/llvm/lib/Target/X86/X86ScheduleBtVer2.td b/llvm/lib/Target/X86/X86ScheduleBtVer2.td
index 764d097e369..721088457a3 100644
--- a/llvm/lib/Target/X86/X86ScheduleBtVer2.td
+++ b/llvm/lib/Target/X86/X86ScheduleBtVer2.td
@@ -546,5 +546,36 @@ def JWriteJVZEROUPPER: SchedWriteRes<[]> {
let NumMicroOps = 37;
}
def : InstRW<[JWriteJVZEROUPPER], (instrs VZEROUPPER)>;
-} // SchedModel
+///////////////////////////////////////////////////////////////////////////////
+// SchedWriteVariant definitions.
+///////////////////////////////////////////////////////////////////////////////
+
+def JWriteZeroLatency : SchedWriteRes<[]> {
+ let Latency = 0;
+}
+
+// Vector XOR instructions that use the same register for both source
+// operands do not have a real dependency on the previous contents of the
+// register, and thus, do not have to wait before completing. They can be
+// optimized out at the register renaming stage.
+// Reference: Section 10.8 of the "Software Optimization Guide for AMD Family
+// 15h Processors".
+// Reference: Agner Fog's "The microarchitecture of Intel, AMD and VIA CPUs",
+// Section 21.8 [Dependency-breaking instructions].
+
+def JWriteFZeroIdiom : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
+ SchedVar<MCSchedPredicate<TruePred>, [WriteFLogic]>
+]>;
+
+def : InstRW<[JWriteFZeroIdiom], (instrs XORPSrr, VXORPSrr, XORPDrr, VXORPDrr)>;
+
+def JWriteVZeroIdiom : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
+ SchedVar<MCSchedPredicate<TruePred>, [WriteVecLogicX]>
+]>;
+
+def : InstRW<[JWriteVZeroIdiom], (instrs PXORrr, VPXORrr)>;
+
+} // SchedModel
diff --git a/llvm/test/CodeGen/X86/sse-schedule.ll b/llvm/test/CodeGen/X86/sse-schedule.ll
index 9207ba502e5..7f0cc213bbc 100644
--- a/llvm/test/CodeGen/X86/sse-schedule.ll
+++ b/llvm/test/CodeGen/X86/sse-schedule.ll
@@ -6225,7 +6225,7 @@ define <4 x float> @test_fnop() nounwind {
;
; BTVER2-SSE-LABEL: test_fnop:
; BTVER2-SSE: # %bb.0:
-; BTVER2-SSE-NEXT: xorps %xmm0, %xmm0 # sched: [1:0.50]
+; BTVER2-SSE-NEXT: xorps %xmm0, %xmm0 # sched: [0:?]
; BTVER2-SSE-NEXT: #APP
; BTVER2-SSE-NEXT: nop # sched: [1:0.50]
; BTVER2-SSE-NEXT: #NO_APP
@@ -6233,7 +6233,7 @@ define <4 x float> @test_fnop() nounwind {
;
; BTVER2-LABEL: test_fnop:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: vxorps %xmm0, %xmm0, %xmm0 # sched: [1:0.50]
+; BTVER2-NEXT: vxorps %xmm0, %xmm0, %xmm0 # sched: [0:?]
; BTVER2-NEXT: #APP
; BTVER2-NEXT: nop # sched: [1:0.50]
; BTVER2-NEXT: #NO_APP
diff --git a/llvm/test/tools/llvm-mca/X86/BtVer2/zero-idioms.s b/llvm/test/tools/llvm-mca/X86/BtVer2/zero-idioms.s
new file mode 100644
index 00000000000..e51ce28d580
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/X86/BtVer2/zero-idioms.s
@@ -0,0 +1,153 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -timeline -register-file-stats -iterations=5 < %s | FileCheck %s
+
+xorps %xmm0, %xmm0
+xorpd %xmm1, %xmm1
+vxorps %xmm2, %xmm2, %xmm2
+vxorpd %xmm1, %xmm1, %xmm1
+pxor %xmm2, %xmm2
+vpxor %xmm3, %xmm3, %xmm3
+
+vxorps %xmm4, %xmm4, %xmm5
+vxorpd %xmm1, %xmm1, %xmm3
+vpxor %xmm3, %xmm3, %xmm5
+
+# CHECK: Iterations: 5
+# CHECK-NEXT: Instructions: 45
+# CHECK-NEXT: Total Cycles: 24
+# CHECK-NEXT: Dispatch Width: 2
+# CHECK-NEXT: IPC: 1.88
+# CHECK-NEXT: Block RThroughput: 4.5
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 0 - xorps %xmm0, %xmm0
+# CHECK-NEXT: 1 0 - xorpd %xmm1, %xmm1
+# CHECK-NEXT: 1 0 - vxorps %xmm2, %xmm2, %xmm2
+# CHECK-NEXT: 1 0 - vxorpd %xmm1, %xmm1, %xmm1
+# CHECK-NEXT: 1 0 - pxor %xmm2, %xmm2
+# CHECK-NEXT: 1 0 - vpxor %xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 1 0 - vxorps %xmm4, %xmm4, %xmm5
+# CHECK-NEXT: 1 0 - vxorpd %xmm1, %xmm1, %xmm3
+# CHECK-NEXT: 1 0 - vpxor %xmm3, %xmm3, %xmm5
+
+# CHECK: Register File statistics:
+# CHECK-NEXT: Total number of mappings created: 0
+# CHECK-NEXT: Max number of mappings used: 0
+
+# CHECK: * Register File #1 -- JFpuPRF:
+# CHECK-NEXT: Number of physical registers: 72
+# CHECK-NEXT: Total number of mappings created: 0
+# CHECK-NEXT: Max number of mappings used: 0
+
+# CHECK: * Register File #2 -- JIntegerPRF:
+# CHECK-NEXT: Number of physical registers: 64
+# CHECK-NEXT: Total number of mappings created: 0
+# CHECK-NEXT: Max number of mappings used: 0
+
+# CHECK: Resources:
+# CHECK-NEXT: [0] - JALU0
+# CHECK-NEXT: [1] - JALU1
+# CHECK-NEXT: [2] - JDiv
+# CHECK-NEXT: [3] - JFPA
+# CHECK-NEXT: [4] - JFPM
+# CHECK-NEXT: [5] - JFPU0
+# CHECK-NEXT: [6] - JFPU1
+# CHECK-NEXT: [7] - JLAGU
+# CHECK-NEXT: [8] - JMul
+# CHECK-NEXT: [9] - JSAGU
+# CHECK-NEXT: [10] - JSTC
+# CHECK-NEXT: [11] - JVALU0
+# CHECK-NEXT: [12] - JVALU1
+# CHECK-NEXT: [13] - JVIMUL
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13]
+# CHECK-NEXT: - - - - - - - - - - - - - -
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions:
+# CHECK-NEXT: - - - - - - - - - - - - - - xorps %xmm0, %xmm0
+# CHECK-NEXT: - - - - - - - - - - - - - - xorpd %xmm1, %xmm1
+# CHECK-NEXT: - - - - - - - - - - - - - - vxorps %xmm2, %xmm2, %xmm2
+# CHECK-NEXT: - - - - - - - - - - - - - - vxorpd %xmm1, %xmm1, %xmm1
+# CHECK-NEXT: - - - - - - - - - - - - - - pxor %xmm2, %xmm2
+# CHECK-NEXT: - - - - - - - - - - - - - - vpxor %xmm3, %xmm3, %xmm3
+# CHECK-NEXT: - - - - - - - - - - - - - - vxorps %xmm4, %xmm4, %xmm5
+# CHECK-NEXT: - - - - - - - - - - - - - - vxorpd %xmm1, %xmm1, %xmm3
+# CHECK-NEXT: - - - - - - - - - - - - - - vpxor %xmm3, %xmm3, %xmm5
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789
+# CHECK-NEXT: Index 0123456789 0123
+
+# CHECK: [0,0] DR . . . . . xorps %xmm0, %xmm0
+# CHECK-NEXT: [0,1] DR . . . . . xorpd %xmm1, %xmm1
+# CHECK-NEXT: [0,2] .DR . . . . . vxorps %xmm2, %xmm2, %xmm2
+# CHECK-NEXT: [0,3] .DR . . . . . vxorpd %xmm1, %xmm1, %xmm1
+# CHECK-NEXT: [0,4] . DR . . . . . pxor %xmm2, %xmm2
+# CHECK-NEXT: [0,5] . DR . . . . . vpxor %xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,6] . DR. . . . . vxorps %xmm4, %xmm4, %xmm5
+# CHECK-NEXT: [0,7] . DR. . . . . vxorpd %xmm1, %xmm1, %xmm3
+# CHECK-NEXT: [0,8] . DR . . . . vpxor %xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [1,0] . DR . . . . xorps %xmm0, %xmm0
+# CHECK-NEXT: [1,1] . DR . . . . xorpd %xmm1, %xmm1
+# CHECK-NEXT: [1,2] . DR . . . . vxorps %xmm2, %xmm2, %xmm2
+# CHECK-NEXT: [1,3] . .DR . . . . vxorpd %xmm1, %xmm1, %xmm1
+# CHECK-NEXT: [1,4] . .DR . . . . pxor %xmm2, %xmm2
+# CHECK-NEXT: [1,5] . . DR . . . . vpxor %xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [1,6] . . DR . . . . vxorps %xmm4, %xmm4, %xmm5
+# CHECK-NEXT: [1,7] . . DR. . . . vxorpd %xmm1, %xmm1, %xmm3
+# CHECK-NEXT: [1,8] . . DR. . . . vpxor %xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [2,0] . . DR . . . xorps %xmm0, %xmm0
+# CHECK-NEXT: [2,1] . . DR . . . xorpd %xmm1, %xmm1
+# CHECK-NEXT: [2,2] . . DR . . . vxorps %xmm2, %xmm2, %xmm2
+# CHECK-NEXT: [2,3] . . DR . . . vxorpd %xmm1, %xmm1, %xmm1
+# CHECK-NEXT: [2,4] . . .DR . . . pxor %xmm2, %xmm2
+# CHECK-NEXT: [2,5] . . .DR . . . vpxor %xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [2,6] . . . DR . . . vxorps %xmm4, %xmm4, %xmm5
+# CHECK-NEXT: [2,7] . . . DR . . . vxorpd %xmm1, %xmm1, %xmm3
+# CHECK-NEXT: [2,8] . . . DR. . . vpxor %xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [3,0] . . . DR. . . xorps %xmm0, %xmm0
+# CHECK-NEXT: [3,1] . . . DR . . xorpd %xmm1, %xmm1
+# CHECK-NEXT: [3,2] . . . DR . . vxorps %xmm2, %xmm2, %xmm2
+# CHECK-NEXT: [3,3] . . . DR . . vxorpd %xmm1, %xmm1, %xmm1
+# CHECK-NEXT: [3,4] . . . DR . . pxor %xmm2, %xmm2
+# CHECK-NEXT: [3,5] . . . .DR . . vpxor %xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [3,6] . . . .DR . . vxorps %xmm4, %xmm4, %xmm5
+# CHECK-NEXT: [3,7] . . . . DR . . vxorpd %xmm1, %xmm1, %xmm3
+# CHECK-NEXT: [3,8] . . . . DR . . vpxor %xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [4,0] . . . . DR. . xorps %xmm0, %xmm0
+# CHECK-NEXT: [4,1] . . . . DR. . xorpd %xmm1, %xmm1
+# CHECK-NEXT: [4,2] . . . . DR . vxorps %xmm2, %xmm2, %xmm2
+# CHECK-NEXT: [4,3] . . . . DR . vxorpd %xmm1, %xmm1, %xmm1
+# CHECK-NEXT: [4,4] . . . . DR . pxor %xmm2, %xmm2
+# CHECK-NEXT: [4,5] . . . . DR . vpxor %xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [4,6] . . . . .DR. vxorps %xmm4, %xmm4, %xmm5
+# CHECK-NEXT: [4,7] . . . . .DR. vxorpd %xmm1, %xmm1, %xmm3
+# CHECK-NEXT: [4,8] . . . . . DR vpxor %xmm3, %xmm3, %xmm5
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 5 0.0 0.0 0.0 xorps %xmm0, %xmm0
+# CHECK-NEXT: 1. 5 0.0 0.0 0.0 xorpd %xmm1, %xmm1
+# CHECK-NEXT: 2. 5 0.0 0.0 0.0 vxorps %xmm2, %xmm2, %xmm2
+# CHECK-NEXT: 3. 5 0.0 0.0 0.0 vxorpd %xmm1, %xmm1, %xmm1
+# CHECK-NEXT: 4. 5 0.0 0.0 0.0 pxor %xmm2, %xmm2
+# CHECK-NEXT: 5. 5 0.0 0.0 0.0 vpxor %xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 6. 5 0.0 0.0 0.0 vxorps %xmm4, %xmm4, %xmm5
+# CHECK-NEXT: 7. 5 0.0 0.0 0.0 vxorpd %xmm1, %xmm1, %xmm3
+# CHECK-NEXT: 8. 5 0.0 0.0 0.0 vpxor %xmm3, %xmm3, %xmm5
+
diff --git a/llvm/tools/llvm-mca/InstrBuilder.cpp b/llvm/tools/llvm-mca/InstrBuilder.cpp
index a745e1a6150..8a66a76605f 100644
--- a/llvm/tools/llvm-mca/InstrBuilder.cpp
+++ b/llvm/tools/llvm-mca/InstrBuilder.cpp
@@ -396,18 +396,22 @@ const InstrDesc &InstrBuilder::createInstrDescImpl(const MCInst &MCI) {
// Then obtain the scheduling class information from the instruction.
unsigned SchedClassID = MCDesc.getSchedClass();
- const MCSchedClassDesc &SCDesc = *SM.getSchedClassDesc(SchedClassID);
+ unsigned CPUID = SM.getProcessorID();
+
+ // Try to resolve variant scheduling classes.
+ if (SchedClassID) {
+ while (SchedClassID && SM.getSchedClassDesc(SchedClassID)->isVariant())
+ SchedClassID = STI.resolveVariantSchedClass(SchedClassID, &MCI, CPUID);
+
+ if (!SchedClassID)
+ llvm::report_fatal_error("unable to resolve this variant class.");
+ }
// Create a new empty descriptor.
std::unique_ptr<InstrDesc> ID = llvm::make_unique<InstrDesc>();
- if (SCDesc.isVariant()) {
- WithColor::warning() << "don't know how to model variant opcodes.\n";
- WithColor::note() << "assume 1 micro opcode.\n";
- ID->NumMicroOps = 1U;
- } else {
- ID->NumMicroOps = SCDesc.NumMicroOps;
- }
+ const MCSchedClassDesc &SCDesc = *SM.getSchedClassDesc(SchedClassID);
+ ID->NumMicroOps = SCDesc.NumMicroOps;
if (MCDesc.isCall()) {
// We don't correctly model calls.
@@ -435,14 +439,24 @@ const InstrDesc &InstrBuilder::createInstrDescImpl(const MCInst &MCI) {
LLVM_DEBUG(dbgs() << "\t\tNumMicroOps=" << ID->NumMicroOps << '\n');
// Now add the new descriptor.
- Descriptors[Opcode] = std::move(ID);
- return *Descriptors[Opcode];
+ SchedClassID = MCDesc.getSchedClass();
+ if (!SM.getSchedClassDesc(SchedClassID)->isVariant()) {
+ Descriptors[MCI.getOpcode()] = std::move(ID);
+ return *Descriptors[MCI.getOpcode()];
+ }
+
+ VariantDescriptors[&MCI] = std::move(ID);
+ return *VariantDescriptors[&MCI];
}
const InstrDesc &InstrBuilder::getOrCreateInstrDesc(const MCInst &MCI) {
- if (Descriptors.find_as(MCI.getOpcode()) == Descriptors.end())
- return createInstrDescImpl(MCI);
- return *Descriptors[MCI.getOpcode()];
+ if (Descriptors.find_as(MCI.getOpcode()) != Descriptors.end())
+ return *Descriptors[MCI.getOpcode()];
+
+ if (VariantDescriptors.find(&MCI) != VariantDescriptors.end())
+ return *VariantDescriptors[&MCI];
+
+ return createInstrDescImpl(MCI);
}
std::unique_ptr<Instruction>
diff --git a/llvm/tools/llvm-mca/InstrBuilder.h b/llvm/tools/llvm-mca/InstrBuilder.h
index c22b7dcdbc6..146e917eb62 100644
--- a/llvm/tools/llvm-mca/InstrBuilder.h
+++ b/llvm/tools/llvm-mca/InstrBuilder.h
@@ -40,9 +40,10 @@ class InstrBuilder {
llvm::SmallVector<uint64_t, 8> ProcResourceMasks;
llvm::DenseMap<unsigned short, std::unique_ptr<const InstrDesc>> Descriptors;
+ llvm::DenseMap<const llvm::MCInst *, std::unique_ptr<const InstrDesc>>
+ VariantDescriptors;
const InstrDesc &createInstrDescImpl(const llvm::MCInst &MCI);
-
InstrBuilder(const InstrBuilder &) = delete;
InstrBuilder &operator=(const InstrBuilder &) = delete;
diff --git a/llvm/tools/llvm-mca/InstructionInfoView.cpp b/llvm/tools/llvm-mca/InstructionInfoView.cpp
index 76d63d21cb2..3b1e4dc8188 100644
--- a/llvm/tools/llvm-mca/InstructionInfoView.cpp
+++ b/llvm/tools/llvm-mca/InstructionInfoView.cpp
@@ -36,9 +36,16 @@ void InstructionInfoView::printView(raw_ostream &OS) const {
for (unsigned I = 0, E = Instructions; I < E; ++I) {
const MCInst &Inst = Source.getMCInstFromIndex(I);
const MCInstrDesc &MCDesc = MCII.get(Inst.getOpcode());
- const MCSchedClassDesc &SCDesc =
- *SM.getSchedClassDesc(MCDesc.getSchedClass());
+ // Obtain the scheduling class information from the instruction.
+ unsigned SchedClassID = MCDesc.getSchedClass();
+ unsigned CPUID = SM.getProcessorID();
+
+ // Try to resolve variant scheduling classes.
+ while (SchedClassID && SM.getSchedClassDesc(SchedClassID)->isVariant())
+ SchedClassID = STI.resolveVariantSchedClass(SchedClassID, &Inst, CPUID);
+
+ const MCSchedClassDesc &SCDesc = *SM.getSchedClassDesc(SchedClassID);
unsigned NumMicroOpcodes = SCDesc.NumMicroOps;
unsigned Latency = MCSchedModel::computeInstrLatency(STI, SCDesc);
Optional<double> RThroughput =
OpenPOWER on IntegriCloud