path: root/llvm/lib/Target/X86/X86ScheduleZnver1.td
author     Simon Pilgrim <llvm-dev@redking.me.uk>    2018-03-19 14:46:07 +0000
committer  Simon Pilgrim <llvm-dev@redking.me.uk>    2018-03-19 14:46:07 +0000
commit     30c38c38497763d5660fde146e1185c0dbb082d5 (patch)
tree       4ab35d4a334b1c9129e18e7ad03fdaa2700bd28e /llvm/lib/Target/X86/X86ScheduleZnver1.td
parent     10fe9bc79eb38319cf6faf7fe4ffe131b23419af (diff)
[X86] Generalize schedule classes to support multiple stages
Currently the WriteResPair-style multiclasses take a single pipeline stage and latency; this patch generalizes them so that ResourceCycles and NumMicroOps can be overridden from their defaults, making it easier to create complex schedules. This has already been done for the Jaguar scheduler to remove a number of custom schedule classes, and adding it to the other x86 targets will make the models much tidier as we add further classes in the future to replace so many custom cases.

I've converted some instructions, but a lot of the models will need a bit of cleanup after the patch has been committed - memory latencies are not consistent, the classes are not actually used where they could replace some or all of the custom definitions, etc. I'd prefer to keep this as close to NFC as possible so that later patches can be smaller and target specific.

Differential Revision: https://reviews.llvm.org/D44612

llvm-svn: 327855
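As a quick illustration of what the generalization buys (this sketch is not part of the commit: the WriteIMul lines mirror the diff below, while the last defm uses a hypothetical scheduling class and made-up counts purely to show the override syntax):

// Before: register and folded-load forms need separate custom WriteRes defs.
def : WriteRes<WriteIMul,   [ZnALU1, ZnMultiplier]> { let Latency = 4; }
def : WriteRes<WriteIMulLd, [ZnALU1, ZnMultiplier]> { let Latency = 8; }

// After: a single defm; the multiclass derives the folded variant, adding
// a ZnAGU cycle and the extra load latency automatically.
defm : ZnWriteResPair<WriteIMul, [ZnALU1, ZnMultiplier], 4>;

// The optional parameters override ResourceCycles and NumMicroOps from
// their [1]/1 defaults (hypothetical class and values, for syntax only):
defm : ZnWriteResPair<WriteFooExample, [ZnALU2], 3, [2], 2>;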
Diffstat (limited to 'llvm/lib/Target/X86/X86ScheduleZnver1.td')
-rw-r--r--  llvm/lib/Target/X86/X86ScheduleZnver1.td | 105
1 file changed, 55 insertions(+), 50 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ScheduleZnver1.td b/llvm/lib/Target/X86/X86ScheduleZnver1.td
index 4ad05f3a16e..c43dae41c7e 100644
--- a/llvm/lib/Target/X86/X86ScheduleZnver1.td
+++ b/llvm/lib/Target/X86/X86ScheduleZnver1.td
@@ -99,30 +99,41 @@ def : ReadAdvance<ReadAfterLd, 4>;
// b. addpd
// This multiclass is for folded loads for integer units.
multiclass ZnWriteResPair<X86FoldableSchedWrite SchedRW,
- ProcResourceKind ExePort,
- int Lat> {
+ list<ProcResourceKind> ExePorts,
+ int Lat, list<int> Res = [1], int UOps = 1> {
// Register variant takes 1-cycle on Execution Port.
- def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; }
+ def : WriteRes<SchedRW, ExePorts> {
+ let Latency = Lat;
+ let ResourceCycles = Res;
+ let NumMicroOps = UOps;
+ }
// Memory variant also uses a cycle on ZnAGU
// adds 4 cycles to the latency.
- def : WriteRes<SchedRW.Folded, [ZnAGU, ExePort]> {
- let NumMicroOps = 2;
- let Latency = !add(Lat, 4);
+ def : WriteRes<SchedRW.Folded, !listconcat([ZnAGU], ExePorts)> {
+ let Latency = !add(Lat, 4);
+ let ResourceCycles = !listconcat([1], Res);
+ let NumMicroOps = !add(UOps, 1);
}
}
// This multiclass is for folded loads for floating point units.
multiclass ZnWriteResFpuPair<X86FoldableSchedWrite SchedRW,
- ProcResourceKind ExePort,
- int Lat> {
+ list<ProcResourceKind> ExePorts,
+ int Lat, list<int> Res = [1], int UOps = 1> {
// Register variant takes 1-cycle on Execution Port.
- def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; }
+ def : WriteRes<SchedRW, ExePorts> {
+ let Latency = Lat;
+ let ResourceCycles = Res;
+ let NumMicroOps = UOps;
+ }
// Memory variant also uses a cycle on ZnAGU
// adds 7 cycles to the latency.
- def : WriteRes<SchedRW.Folded, [ZnAGU, ExePort]> {
- let Latency = !add(Lat, 7);
+ def : WriteRes<SchedRW.Folded, !listconcat([ZnAGU], ExePorts)> {
+ let Latency = !add(Lat, 7);
+ let ResourceCycles = !listconcat([1], Res);
+ let NumMicroOps = UOps;
}
}
@@ -136,9 +147,10 @@ def : WriteRes<WriteLoad, [ZnAGU]> { let Latency = 8; }
def : WriteRes<WriteZero, []>;
def : WriteRes<WriteLEA, [ZnALU]>;
-defm : ZnWriteResPair<WriteALU, ZnALU, 1>;
-defm : ZnWriteResPair<WriteShift, ZnALU, 1>;
-defm : ZnWriteResPair<WriteJump, ZnALU, 1>;
+defm : ZnWriteResPair<WriteALU, [ZnALU], 1>;
+defm : ZnWriteResPair<WriteIMul, [ZnALU1, ZnMultiplier], 4>;
+defm : ZnWriteResPair<WriteShift, [ZnALU], 1>;
+defm : ZnWriteResPair<WriteJump, [ZnALU], 1>;
// Treat misc copies as a move.
def : InstRW<[WriteMove], (instrs COPY)>;
@@ -154,67 +166,60 @@ def : WriteRes<WriteIDivLd, [ZnALU2, ZnAGU, ZnDivider]> {
let ResourceCycles = [1, 4, 41];
}
-// IMUL
+// IMULH
def : WriteRes<WriteIMulH, [ZnALU1, ZnMultiplier]>{
let Latency = 4;
}
-def : WriteRes<WriteIMul, [ZnALU1, ZnMultiplier]> {
- let Latency = 4;
-}
-
-def : WriteRes<WriteIMulLd,[ZnALU1, ZnMultiplier]> {
- let Latency = 8;
-}
// Floating point operations
def : WriteRes<WriteFStore, [ZnAGU]>;
def : WriteRes<WriteFMove, [ZnFPU]>;
def : WriteRes<WriteFLoad, [ZnAGU]> { let Latency = 8; }
-defm : ZnWriteResFpuPair<WriteFHAdd, ZnFPU0, 3>;
-defm : ZnWriteResFpuPair<WriteFAdd, ZnFPU0, 3>;
-defm : ZnWriteResFpuPair<WriteFBlend, ZnFPU01, 1>;
-defm : ZnWriteResFpuPair<WriteFVarBlend, ZnFPU01, 1>;
-defm : ZnWriteResFpuPair<WriteVarBlend, ZnFPU0, 1>;
-defm : ZnWriteResFpuPair<WriteCvtI2F, ZnFPU3, 5>;
-defm : ZnWriteResFpuPair<WriteCvtF2F, ZnFPU3, 5>;
-defm : ZnWriteResFpuPair<WriteCvtF2I, ZnFPU3, 5>;
-defm : ZnWriteResFpuPair<WriteFDiv, ZnFPU3, 15>;
-defm : ZnWriteResFpuPair<WriteFShuffle, ZnFPU12, 1>;
-defm : ZnWriteResFpuPair<WriteFMul, ZnFPU0, 5>;
-defm : ZnWriteResFpuPair<WriteFMA, ZnFPU03, 5>;
-defm : ZnWriteResFpuPair<WriteFRcp, ZnFPU01, 5>;
-defm : ZnWriteResFpuPair<WriteFRsqrt, ZnFPU01, 5>;
-defm : ZnWriteResFpuPair<WriteFSqrt, ZnFPU3, 20>;
+defm : ZnWriteResFpuPair<WriteFHAdd, [ZnFPU0], 3>;
+defm : ZnWriteResFpuPair<WriteFAdd, [ZnFPU0], 3>;
+defm : ZnWriteResFpuPair<WriteFBlend, [ZnFPU01], 1>;
+defm : ZnWriteResFpuPair<WriteFVarBlend, [ZnFPU01], 1>;
+defm : ZnWriteResFpuPair<WriteVarBlend, [ZnFPU0], 1>;
+defm : ZnWriteResFpuPair<WriteCvtI2F, [ZnFPU3], 5>;
+defm : ZnWriteResFpuPair<WriteCvtF2F, [ZnFPU3], 5>;
+defm : ZnWriteResFpuPair<WriteCvtF2I, [ZnFPU3], 5>;
+defm : ZnWriteResFpuPair<WriteFDiv, [ZnFPU3], 15>;
+defm : ZnWriteResFpuPair<WriteFShuffle, [ZnFPU12], 1>;
+defm : ZnWriteResFpuPair<WriteFMul, [ZnFPU0], 5>;
+defm : ZnWriteResFpuPair<WriteFMA, [ZnFPU03], 5>;
+defm : ZnWriteResFpuPair<WriteFRcp, [ZnFPU01], 5>;
+defm : ZnWriteResFpuPair<WriteFRsqrt, [ZnFPU01], 5>;
+defm : ZnWriteResFpuPair<WriteFSqrt, [ZnFPU3], 20>;
// Vector integer operations which uses FPU units
def : WriteRes<WriteVecStore, [ZnAGU]>;
def : WriteRes<WriteVecMove, [ZnFPU]>;
def : WriteRes<WriteVecLoad, [ZnAGU]> { let Latency = 8; }
-defm : ZnWriteResFpuPair<WriteVecShift, ZnFPU, 1>;
-defm : ZnWriteResFpuPair<WriteVecLogic, ZnFPU, 1>;
-defm : ZnWriteResFpuPair<WritePHAdd, ZnFPU, 1>;
-defm : ZnWriteResFpuPair<WriteVecALU, ZnFPU, 1>;
-defm : ZnWriteResFpuPair<WriteVecIMul, ZnFPU0, 4>;
-defm : ZnWriteResFpuPair<WriteShuffle, ZnFPU, 1>;
-defm : ZnWriteResFpuPair<WriteBlend, ZnFPU01, 1>;
-defm : ZnWriteResFpuPair<WriteShuffle256, ZnFPU, 2>;
+defm : ZnWriteResFpuPair<WriteVecShift, [ZnFPU], 1>;
+defm : ZnWriteResFpuPair<WriteVecLogic, [ZnFPU], 1>;
+defm : ZnWriteResFpuPair<WritePHAdd, [ZnFPU], 1>;
+defm : ZnWriteResFpuPair<WriteVecALU, [ZnFPU], 1>;
+defm : ZnWriteResFpuPair<WriteVecIMul, [ZnFPU0], 4>;
+defm : ZnWriteResFpuPair<WriteShuffle, [ZnFPU], 1>;
+defm : ZnWriteResFpuPair<WriteBlend, [ZnFPU01], 1>;
+defm : ZnWriteResFpuPair<WriteShuffle256, [ZnFPU], 2>;
// Vector Shift Operations
-defm : ZnWriteResFpuPair<WriteVarVecShift, ZnFPU12, 1>;
+defm : ZnWriteResFpuPair<WriteVarVecShift, [ZnFPU12], 1>;
// AES Instructions.
-defm : ZnWriteResFpuPair<WriteAESDecEnc, ZnFPU01, 4>;
-defm : ZnWriteResFpuPair<WriteAESIMC, ZnFPU01, 4>;
-defm : ZnWriteResFpuPair<WriteAESKeyGen, ZnFPU01, 4>;
+defm : ZnWriteResFpuPair<WriteAESDecEnc, [ZnFPU01], 4>;
+defm : ZnWriteResFpuPair<WriteAESIMC, [ZnFPU01], 4>;
+defm : ZnWriteResFpuPair<WriteAESKeyGen, [ZnFPU01], 4>;
def : WriteRes<WriteFence, [ZnAGU]>;
def : WriteRes<WriteNop, []>;
// Following instructions with latency=100 are microcoded.
// We set long latency so as to block the entire pipeline.
-defm : ZnWriteResFpuPair<WriteFShuffle256, ZnFPU, 100>;
+defm : ZnWriteResFpuPair<WriteFShuffle256, [ZnFPU], 100>;
//Microcoded Instructions
let Latency = 100 in {
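For reference, a hand-expanded sketch (not part of the commit itself) of what one of the converted FPU pairs produces with the default Res = [1] and UOps = 1; WriteFAdd.Folded is the load-folded counterpart of WriteFAdd:

defm : ZnWriteResFpuPair<WriteFAdd, [ZnFPU0], 3>;

// ...is roughly equivalent to:
def : WriteRes<WriteFAdd, [ZnFPU0]> {
  let Latency = 3;
  let ResourceCycles = [1];
  let NumMicroOps = 1;
}
def : WriteRes<WriteFAdd.Folded, [ZnAGU, ZnFPU0]> {
  let Latency = 10;              // 3 + 7 cycles for the folded load
  let ResourceCycles = [1, 1];   // one ZnAGU cycle plus the FPU cycle
  let NumMicroOps = 1;           // the FPU pair keeps UOps unchanged
}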