summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--llvm/lib/Target/PowerPC/P9InstrResources.td662
-rw-r--r--llvm/lib/Target/PowerPC/PPCInstrFormats.td1
-rw-r--r--llvm/lib/Target/PowerPC/PPCInstrInfo.td1
-rw-r--r--llvm/lib/Target/PowerPC/PPCScheduleP9.td105
4 files changed, 503 insertions, 266 deletions
diff --git a/llvm/lib/Target/PowerPC/P9InstrResources.td b/llvm/lib/Target/PowerPC/P9InstrResources.td
index aea022f8876..f7310b54448 100644
--- a/llvm/lib/Target/PowerPC/P9InstrResources.td
+++ b/llvm/lib/Target/PowerPC/P9InstrResources.td
@@ -12,11 +12,29 @@
// is listed here. Instructions in this file belong to itinerary classes that
// have instructions with different resource requirements.
//
+// The makeup of the P9 CPU is modeled as follows:
+// - Each CPU is made up of two superslices.
+// - Each superslice is made up of two slices. Therefore, there are 4 slices
+// for each CPU.
+// - Up to 6 instructions can be dispatched to each CPU. Three per superslice.
+// - Each CPU has:
+// - One CY (Crypto) unit P9_CY_*
+// - One DFU (Decimal Floating Point and Quad Precision) unit P9_DFU_*
+// - Two PM (Permute) units. One on each superslice. P9_PM_*
+// - Two DIV (Fixed Point Divide) units. One on each superslice. P9_DIV_*
+// - Four ALU (Fixed Point Arithmetic) units. One on each slice. P9_ALU_*
+// - Four DP (Floating Point) units. One on each slice. P9_DP_*
+// This also includes fixed point multiply add.
+// - Four AGEN (Address Generation) units. One for each slice. P9_AGEN_*
+// - Four Load/Store Queues. P9_LS_*
+// - Each set of instructions will require a number of these resources.
//===----------------------------------------------------------------------===//
-
+// Two cycle ALU vector operation that uses an entire superslice.
+// Uses both ALU units (the even ALUE and odd ALUO units), two pipelines
+// (EXECE, EXECO) and all three dispatches (DISP) to the given superslice.
def : InstRW<[P9_ALUE_2C, P9_ALUO_2C, IP_EXECE_1C, IP_EXECO_1C,
- DISP_1C, DISP_1C],
+ DISP_1C, DISP_1C, DISP_1C],
(instrs
VADDCUW,
VADDUBM,
@@ -26,47 +44,41 @@ def : InstRW<[P9_ALUE_2C, P9_ALUO_2C, IP_EXECE_1C, IP_EXECO_1C,
VAND,
VANDC,
VCMPEQUB,
- VCMPEQUBo,
VCMPEQUD,
- VCMPEQUDo,
VCMPEQUH,
- VCMPEQUHo,
VCMPEQUW,
- VCMPEQUWo,
- VCMPGTSB,
- VCMPGTSBo,
- VCMPGTSD,
- VCMPGTSDo,
- VCMPGTSH,
- VCMPGTSHo,
- VCMPGTSW,
- VCMPGTSWo,
- VCMPGTUB,
- VCMPGTUBo,
- VCMPGTUD,
- VCMPGTUDo,
- VCMPGTUH,
- VCMPGTUHo,
- VCMPGTUW,
- VCMPGTUWo,
VCMPNEB,
- VCMPNEBo,
VCMPNEH,
- VCMPNEHo,
VCMPNEW,
- VCMPNEWo,
VCMPNEZB,
- VCMPNEZBo,
VCMPNEZH,
- VCMPNEZHo,
VCMPNEZW,
- VCMPNEZWo,
VEQV,
VEXTSB2D,
VEXTSB2W,
VEXTSH2D,
VEXTSH2W,
VEXTSW2D,
+ VRLB,
+ VRLD,
+ VRLDMI,
+ VRLDNM,
+ VRLH,
+ VRLW,
+ VRLWMI,
+ VRLWNM,
+ VSRAB,
+ VSRAD,
+ VSRAH,
+ VSRAW,
+ VSRB,
+ VSRD,
+ VSRH,
+ VSRW,
+ VSLB,
+ VSLD,
+ VSLH,
+ VSLW,
VMRGEW,
VMRGOW,
VNAND,
@@ -77,9 +89,7 @@ def : InstRW<[P9_ALUE_2C, P9_ALUO_2C, IP_EXECE_1C, IP_EXECO_1C,
VORC,
VPOPCNTB,
VPOPCNTH,
- VPOPCNTW,
VSEL,
- VSUBCUW,
VSUBUBM,
VSUBUDM,
VSUBUHM,
@@ -98,6 +108,8 @@ def : InstRW<[P9_ALUE_2C, P9_ALUO_2C, IP_EXECE_1C, IP_EXECO_1C,
XVNEGDP,
XVNEGSP,
XVXEXPDP,
+ XVIEXPSP,
+ XVXEXPSP,
XXLAND,
XXLANDC,
XXLEQV,
@@ -107,28 +119,128 @@ def : InstRW<[P9_ALUE_2C, P9_ALUO_2C, IP_EXECE_1C, IP_EXECO_1C,
XXLORf,
XXLORC,
XXLXOR,
- XXSEL
-)>;
-
-def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C],
- (instrs
+ XXSEL,
XSABSQP,
XSCPSGNQP,
XSIEXPQP,
XSNABSQP,
XSNEGQP,
- XSXEXPQP,
- XSABSDP,
- XSCPSGNDP,
- XSIEXPDP,
+ XSXEXPQP
+)>;
+
+// Restricted Dispatch ALU operation for 3 cycles. The operation runs on a
+// single slice. However, since it is Restricted it requires all 3 dispatches
+// (DISP) for that superslice.
+def : InstRW<[P9_ALU_3C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ FCMPUS,
+ FCMPUD,
+ XSTSTDCDP,
+ XSTSTDCSP
+)>;
+
+// Standard Dispatch ALU operation for 3 cycles. Only one slice used.
+def : InstRW<[P9_ALU_3C, IP_EXEC_1C, DISP_1C, DISP_1C],
+ (instrs
+ XSMAXCDP,
+ XSMAXDP,
+ XSMAXJDP,
+ XSMINCDP,
+ XSMINDP,
+ XSMINJDP,
+ XSTDIVDP,
+ XSTSQRTDP,
+ XSCMPEQDP,
+ XSCMPEXPDP,
+ XSCMPGEDP,
+ XSCMPGTDP,
+ XSCMPODP,
+ XSCMPUDP,
+ XSXSIGDP,
+ XSCVSPDPN
+)>;
+
+// Standard Dispatch ALU operation for 2 cycles. Only one slice used.
+def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C],
+ (instrs
+ ADDIStocHA,
+ ADDItocL,
+ MCRF,
+ MCRXRX,
+ SLD,
+ SRD,
+ SRAD,
+ SRADI,
+ RLDIC,
XSNABSDP,
+ XSXEXPDP,
+ XSABSDP,
XSNEGDP,
- XSXEXPDP
+ XSCPSGNDP
)>;
-def : InstRW<[P9_ALUE_3C, P9_ALUO_3C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C],
+// Restricted Dispatch ALU operation for 2 cycles. The operation runs on a
+// single slice. However, since it is Restricted it requires all 3 dispatches
+// (DISP) for that superslice.
+def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
+ RLDCL,
+ RLDCR,
+ RLDIMI,
+ RLDICL,
+ RLDICR,
+ RLDICL_32_64,
+ XSIEXPDP,
+ FMR,
+ FABSD,
+ FABSS,
+ FNABSD,
+ FNABSS,
+ FNEGD,
+ FNEGS,
+ FCPSGND,
+ FCPSGNS
+)>;
+// Three cycle ALU vector operation that uses an entire superslice.
+// Uses both ALU units (the even ALUE and odd ALUO units), two pipelines
+// (EXECE, EXECO) and all three dispatches (DISP) to the given superslice.
+def : InstRW<[P9_ALUE_3C, P9_ALUO_3C, IP_EXECE_1C, IP_EXECO_1C,
+ DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ VBPERMD,
+ VABSDUB,
+ VABSDUH,
+ VABSDUW,
+ VADDUBS,
+ VADDUHS,
+ VADDUWS,
+ VAVGSB,
+ VAVGSH,
+ VAVGSW,
+ VAVGUB,
+ VAVGUH,
+ VAVGUW,
+ VCMPEQFP,
+ VCMPEQFPo,
+ VCMPGEFP,
+ VCMPGEFPo,
+ VCMPBFP,
+ VCMPBFPo,
+ VCMPGTFP,
+ VCMPGTFPo,
+ VCLZB,
+ VCLZD,
+ VCLZH,
+ VCLZW,
+ VCTZB,
+ VCTZD,
+ VCTZH,
+ VCTZW,
+ VADDSBS,
+ VADDSHS,
+ VADDSWS,
+ VMINFP,
VMINSB,
VMINSD,
VMINSH,
@@ -137,55 +249,54 @@ def : InstRW<[P9_ALUE_3C, P9_ALUO_3C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C
VMINUD,
VMINUH,
VMINUW,
+ VMAXFP,
+ VMAXSB,
+ VMAXSD,
+ VMAXSH,
+ VMAXSW,
+ VMAXUB,
+ VMAXUD,
+ VMAXUH,
+ VMAXUW,
+ VPOPCNTW,
VPOPCNTD,
VPRTYBD,
VPRTYBW,
- VRLB,
- VRLD,
- VRLDMI,
- VRLDNM,
- VRLH,
- VRLW,
- VRLWMI,
- VRLWNM,
VSHASIGMAD,
VSHASIGMAW,
- VSLB,
- VSLD,
- VSLH,
- VSLW,
- VSRAB,
- VSRAD,
- VSRAH,
- VSRAW,
- VSRB,
- VSRD,
- VSRH,
- VSRW,
VSUBSBS,
VSUBSHS,
VSUBSWS,
VSUBUBS,
VSUBUHS,
VSUBUWS,
- XSCMPEQDP,
- XSCMPEXPDP,
- XSCMPGEDP,
- XSCMPGTDP,
- XSCMPODP,
- XSCMPUDP,
- XSCVSPDPN,
- XSMAXCDP,
- XSMAXDP,
- XSMAXJDP,
- XSMINCDP,
- XSMINDP,
- XSMINJDP,
- XSTDIVDP,
- XSTSQRTDP,
- XSTSTDCDP,
- XSTSTDCSP,
- XSXSIGDP,
+ VSUBCUW,
+ VCMPGTSB,
+ VCMPGTSBo,
+ VCMPGTSD,
+ VCMPGTSDo,
+ VCMPGTSH,
+ VCMPGTSHo,
+ VCMPGTSW,
+ VCMPGTSWo,
+ VCMPGTUB,
+ VCMPGTUBo,
+ VCMPGTUD,
+ VCMPGTUDo,
+ VCMPGTUH,
+ VCMPGTUHo,
+ VCMPGTUW,
+ VCMPGTUWo,
+ VCMPNEBo,
+ VCMPNEHo,
+ VCMPNEWo,
+ VCMPNEZBo,
+ VCMPNEZHo,
+ VCMPNEZWo,
+ VCMPEQUBo,
+ VCMPEQUDo,
+ VCMPEQUHo,
+ VCMPEQUWo,
XVCMPEQDP,
XVCMPEQDPo,
XVCMPEQSP,
@@ -198,7 +309,6 @@ def : InstRW<[P9_ALUE_3C, P9_ALUO_3C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C
XVCMPGTDPo,
XVCMPGTSP,
XVCMPGTSPo,
- XVIEXPSP,
XVMAXDP,
XVMAXSP,
XVMINDP,
@@ -209,58 +319,15 @@ def : InstRW<[P9_ALUE_3C, P9_ALUO_3C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C
XVTSQRTSP,
XVTSTDCDP,
XVTSTDCSP,
- XVXEXPSP,
XVXSIGDP,
XVXSIGSP
)>;
-def : InstRW<[P9_ALUE_4C, P9_ALUO_4C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C],
- (instrs
- VABSDUB,
- VABSDUH,
- VABSDUW,
- VADDSBS,
- VADDSHS,
- VADDSWS,
- VADDUBS,
- VADDUHS,
- VADDUWS,
- VAVGSB,
- VAVGSH,
- VAVGSW,
- VAVGUB,
- VAVGUH,
- VAVGUW,
- VBPERMD,
- VCLZB,
- VCLZD,
- VCLZH,
- VCLZW,
- VCMPBFP,
- VCMPBFPo,
- VCMPGTFP,
- VCMPGTFPo,
- VCTZB,
- VCTZD,
- VCTZH,
- VCTZW,
- VMAXFP,
- VMAXSB,
- VMAXSD,
- VMAXSH,
- VMAXSW,
- VMAXUB,
- VMAXUD,
- VMAXUH,
- VMAXUW,
- VMINFP,
- VCMPEQFP,
- VCMPEQFPo,
- VCMPGEFP,
- VCMPGEFPo
-)>;
-
-def : InstRW<[P9_DPE_7C, P9_DPO_7C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C],
+// 7 cycle DP vector operation that uses an entire superslice.
+// Uses both DP units (the even DPE and odd DPO units), two pipelines
+// (EXECE, EXECO) and all three dispatches (DISP) to the given superslice.
+def : InstRW<[P9_DPE_7C, P9_DPO_7C, IP_EXECE_1C, IP_EXECO_1C,
+ DISP_1C, DISP_1C, DISP_1C],
(instrs
VADDFP,
VCTSXS,
@@ -367,8 +434,47 @@ def : InstRW<[P9_DPE_7C, P9_DPO_7C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C],
VSUMSWS
)>;
+// 7 cycle Restricted DP operation. One DP unit, one EXEC pipeline and all three
+// dispatch units for the superslice.
def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
+ FRSP,
+ FRIND,
+ FRINS,
+ FRIPD,
+ FRIPS,
+ FRIZD,
+ FRIZS,
+ FRIMD,
+ FRIMS,
+ FRE,
+ FRES,
+ FRSQRTE,
+ FRSQRTES,
+ FMADDS,
+ FMADD,
+ FMSUBS,
+ FMSUB,
+ FNMADDS,
+ FNMADD,
+ FNMSUBS,
+ FNMSUB,
+ FSELD,
+ FSELS,
+ FADDS,
+ FMULS,
+ FMUL,
+ FSUBS,
+ FCFID,
+ FCTID,
+ FCTIDZ,
+ FCFIDU,
+ FCFIDS,
+ FCFIDUS,
+ FCTIDUZ,
+ FCTIWUZ,
+ FCTIW,
+ FCTIWZ,
XSMADDADP,
XSMADDASP,
XSMADDMDP,
@@ -389,7 +495,7 @@ def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
XSNMSUBMSP
)>;
-
+// 7 cycle DP operation. One DP unit, one EXEC pipeline and two dispatch units.
def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C, DISP_1C],
(instrs
XSADDDP,
@@ -397,8 +503,10 @@ def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C, DISP_1C],
XSCVDPHP,
XSCVDPSP,
XSCVDPSXDS,
+ XSCVDPSXDSs,
XSCVDPSXWS,
XSCVDPUXDS,
+ XSCVDPUXDSs,
XSCVDPUXWS,
XSCVHPDP,
XSCVSPDP,
@@ -421,7 +529,10 @@ def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C, DISP_1C],
XSCVDPSPN
)>;
-def : InstRW<[P9_PM_3C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C, DISP_1C],
+// Three Cycle PM operation. Only one PM unit per superslice so we use the whole
+// superslice. That includes both exec pipelines (EXECO, EXECE) and all three
+// dispatches.
+def : InstRW<[P9_PM_3C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
VBPERMQ,
VCLZLSBB,
@@ -469,7 +580,9 @@ def : InstRW<[P9_PM_3C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C, DISP_1C],
VSLO,
VSLV,
VSPLTB,
+ VSPLTBs,
VSPLTH,
+ VSPLTHs,
VSPLTISB,
VSPLTISH,
VSPLTISW,
@@ -498,6 +611,9 @@ def : InstRW<[P9_PM_3C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C, DISP_1C],
XXSLDWI,
XXSPLTIB,
XXSPLTW,
+ XXSPLTWs,
+ XXPERMDI,
+ XXPERMDIs,
VADDCUQ,
VADDECUQ,
VADDEUQM,
@@ -517,7 +633,10 @@ def : InstRW<[P9_PM_3C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C, DISP_1C],
XSXSIGQP
)>;
-def : InstRW<[P9_DFU_12C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C],
+// 12 Cycle DFU operation. Only one DFU unit per CPU so we use a whole
+// superslice. That includes both exec pipelines (EXECO, EXECE) and all three
+// dispatches.
+def : InstRW<[P9_DFU_12C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
XSADDQP,
XSADDQPO,
@@ -536,7 +655,10 @@ def : InstRW<[P9_DFU_12C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C],
XSSUBQPO
)>;
-def : InstRW<[P9_DFU_24C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C],
+// 24 Cycle DFU operation. Only one DFU unit per CPU so we use a whole
+// superslice. That includes both exec pipelines (EXECO, EXECE) and all three
+// dispatches.
+def : InstRW<[P9_DFU_24C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
XSMADDQP,
XSMADDQPO,
@@ -550,45 +672,56 @@ def : InstRW<[P9_DFU_24C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C],
XSNMSUBQPO
)>;
-def : InstRW<[P9_DFU_58C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C],
+// 58 Cycle DFU operation. Only one DFU unit per CPU so we use a whole
+// superslice. That includes both exec pipelines (EXECO, EXECE) and all three
+// dispatches.
+def : InstRW<[P9_DFU_58C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
XSDIVQP,
XSDIVQPO
)>;
-def : InstRW<[P9_DFU_76C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C],
+// 76 Cycle DFU operation. Only one DFU unit per CPU so we use a whole
+// superslice. That includes both exec pipelines (EXECO, EXECE) and all three
+// dispatches.
+def : InstRW<[P9_DFU_76C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
XSSQRTQP,
XSSQRTQPO
)>;
-// Load Operation in IIC_LdStLFD
-
+// 5 Cycle load uses a single slice.
def : InstRW<[P9_LS_5C, IP_AGEN_1C, DISP_1C, DISP_1C],
(instrs
LXSDX,
LXVD2X,
LXSIWZX,
LXV,
- LXSD
+ LXVX,
+ LXSD,
+ DFLOADf64
)>;
-def : InstRW<[P9_LS_5C, IP_AGEN_1C, DISP_1C, DISP_1C, DISP_1C],
+// 4 Cycle load uses a single slice.
+def : InstRW<[P9_LS_4C, IP_AGEN_1C, DISP_1C, DISP_1C],
(instrs
- LFIWZX,
- LFDX,
- LFD
+ COPY
)>;
-def : InstRW<[P9_LoadAndALUOp_7C, IP_AGEN_1C, IP_EXEC_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+// 4 Cycle Restricted load uses a single slice but the dispatch for the whole
+// superslice.
+def : InstRW<[P9_LS_4C, IP_AGEN_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
- LXSSPX,
- LXSIWAX,
- LXSSP
+ LFIWZX,
+ LFDX,
+ LFD
)>;
-def : InstRW<[P9_LoadAndALUOp_7C, IP_AGEN_1C, IP_EXEC_1C,
+// Cracked Restricted Load instruction.
+// Requires consecutive Load and ALU pieces totaling 6 cycles. The Load and ALU
+// operations cannot be done at the same time and so their latencies are added.
+// Full 6 dispatches are required as this is both cracked and restricted.
+def : InstRW<[P9_LoadAndALUOp_6C, IP_EXEC_1C, IP_AGEN_1C,
DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
LFIWAX,
@@ -596,14 +729,35 @@ def : InstRW<[P9_LoadAndALUOp_7C, IP_AGEN_1C, IP_EXEC_1C,
LFS
)>;
-def : InstRW<[P9_LoadAndPMOp_8C, IP_AGEN_1C, IP_EXEC_1C, DISP_1C, DISP_1C],
+// Cracked Load instruction.
+// Requires consecutive Load and ALU pieces totaling 7 cycles. The Load and ALU
+// operations cannot be done at the same time and so their latencies are added.
+// Full 4 dispatches are required as this is a cracked instruction.
+def : InstRW<[P9_LoadAndALUOp_7C, IP_AGEN_1C, IP_EXEC_1C,
+ DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ LXSSPX,
+ LXSIWAX,
+ LXSSP,
+ DFLOADf32
+)>;
+
+// Cracked Load that requires the PM resource.
+// Since the Load and the PM cannot be done at the same time the latencies are
+// added. Requires 8 cycles.
+// Since the PM requires the full superslice we need both EXECE, EXECO pipelines
+// as well as 3 dispatches for the PM. The Load requires the remaining 2
+// dispatches.
+def : InstRW<[P9_LoadAndPMOp_8C, IP_AGEN_1C, IP_EXECE_1C, IP_EXECO_1C,
+ DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
LXVDSX,
+ LXVWSX,
LXVW4X
)>;
-// Store Operations in IIC_LdStSTFD.
-
+// Single slice Restricted store operation. The restricted operation requires
+// all three dispatches for the superslice.
def : InstRW<[P9_LS_1C, IP_EXEC_1C, IP_AGEN_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
STFS,
@@ -613,74 +767,83 @@ def : InstRW<[P9_LS_1C, IP_EXEC_1C, IP_AGEN_1C, DISP_1C, DISP_1C, DISP_1C],
STFDX,
STXSDX,
STXSSPX,
- STXSIWX
+ STXSIWX,
+ DFSTOREf32,
+ DFSTOREf64
)>;
-def : InstRW<[P9_LS_1C, IP_EXEC_1C, IP_EXEC_1C, IP_AGEN_1C, DISP_1C, DISP_1C],
+// Store operation that requires the whole superslice.
+def : InstRW<[P9_LS_1C, IP_EXECE_1C, IP_EXECO_1C, IP_AGEN_1C,
+ DISP_1C, DISP_1C, DISP_1C],
(instrs
STXVD2X,
STXVW4X
)>;
-// Divide Operations in IIC_IntDivW, IIC_IntDivD.
-
-def : InstRW<[P9_DIV_16C_8, IP_EXECE_1C, DISP_1C, DISP_1C],
+// 16 Cycle DIV operation. Only one DIV unit per superslice so we use the whole
+// superslice. That includes both exec pipelines (EXECO, EXECE) and all three
+// dispatches.
+def : InstRW<[P9_DIV_16C_8, IP_EXECO_1C, IP_EXECE_1C,
+ DISP_1C, DISP_1C, DISP_1C],
(instrs
DIVW,
- DIVWU
+ DIVWU,
+ MODSW
)>;
-def : InstRW<[P9_DIV_24C_8, IP_EXECE_1C, DISP_1C, DISP_1C],
+// 24 Cycle DIV operation. Only one DIV unit per superslice so we use the whole
+// superslice. That includes both exec pipelines (EXECO, EXECE) and all three
+// dispatches.
+def : InstRW<[P9_DIV_24C_8, IP_EXECO_1C, IP_EXECE_1C,
+ DISP_1C, DISP_1C, DISP_1C],
(instrs
DIVWE,
DIVD,
DIVWEU,
- DIVDU
+ DIVDU,
+ MODSD,
+ MODUD,
+ MODUW
)>;
-def : InstRW<[P9_DIV_40C_8, IP_EXECE_1C, DISP_1C, DISP_1C],
+// 40 Cycle DIV operation. Only one DIV unit per superslice so we use the whole
+// superslice. That includes both exec pipelines (EXECO, EXECE) and all three
+// dispatches.
+def : InstRW<[P9_DIV_40C_8, IP_EXECO_1C, IP_EXECE_1C,
+ DISP_1C, DISP_1C, DISP_1C],
(instrs
DIVDE,
DIVDEU
)>;
-def : InstRW<[P9_IntDivAndALUOp_26C_8, IP_EXECE_1C, IP_EXEC_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+// Cracked DIV and ALU operation. Requires one full slice for the ALU operation
+// and one full superslice for the DIV operation since there is only one DIV
+// per superslice. Latency of DIV plus ALU is 26.
+def : InstRW<[P9_IntDivAndALUOp_26C_8, IP_EXECE_1C, IP_EXECO_1C, IP_EXEC_1C,
+ DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
DIVWEo,
DIVWEUo
)>;
-def : InstRW<[P9_IntDivAndALUOp_42C_8, IP_EXECE_1C, IP_EXEC_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+// Cracked DIV and ALU operation. Requires one full slice for the ALU operation
+// and one full superslice for the DIV operation since there is only one DIV
+// per superslice. Latency of DIV plus ALU is 42.
+def : InstRW<[P9_IntDivAndALUOp_42C_8, IP_EXECE_1C, IP_EXECO_1C, IP_EXEC_1C,
+ DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
DIVDEo,
DIVDEUo
)>;
-// Rotate Operations in IIC_IntRotateD, IIC_IntRotateDI
-def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C],
- (instrs
- SLD,
- SRD,
- SRAD,
- SRADI,
- RLDIC
-)>;
-
-def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
- (instrs
- RLDCL,
- RLDCR,
- RLDIMI,
- RLDICL,
- RLDICR,
- RLDICL_32_64
-)>;
-
// CR access instructions in _BrMCR, IIC_BrMCRX.
+// Cracked, restricted, ALU operations.
+// Here the two ALU ops can actually be done in parallel and therefore the
+// latencies are not added together. Otherwise this is like having two
+// instructions running together on two pipelines and 6 dispatches.
+// ALU ops are 2 cycles each.
def : InstRW<[P9_ALU_2C, P9_ALU_2C, IP_EXEC_1C, IP_EXEC_1C,
DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
@@ -690,13 +853,12 @@ def : InstRW<[P9_ALU_2C, P9_ALU_2C, IP_EXEC_1C, IP_EXEC_1C,
MTCRF8
)>;
-def : InstRW<[P9_ALU_5C, IP_EXEC_1C, DISP_1C, DISP_1C],
- (instrs
- MCRF,
- MCRXRX
-)>;
-
-def : InstRW<[P9_ALU_5C, P9_ALU_5C, IP_EXEC_1C, IP_EXEC_1C,
+// Cracked ALU operations.
+// Here the two ALU ops can actually be done in parallel and therefore the
+// latencies are not added together. Otherwise this is like having two
+// instructions running together on two pipelines and 4 dispatches.
+// ALU ops are 3 cycles each.
+def : InstRW<[P9_ALU_3C, P9_ALU_3C, IP_EXEC_1C, IP_EXEC_1C,
DISP_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
MCRFS
@@ -704,93 +866,57 @@ def : InstRW<[P9_ALU_5C, P9_ALU_5C, IP_EXEC_1C, IP_EXEC_1C,
// FP Div instructions in IIC_FPDivD and IIC_FPDivS.
+// 33 Cycle DP Instruction Restricted. Takes one slice and 3 dispatches.
def : InstRW<[P9_DP_33C_8, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
- FDIV,
- XSDIVDP
+ FDIV
)>;
-def : InstRW<[P9_DP_22C_5, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
+// 33 Cycle DP Instruction. Takes one slice and 2 dispatches.
+def : InstRW<[P9_DP_33C_8, IP_EXEC_1C, DISP_1C, DISP_1C],
(instrs
- FDIVS,
- XSDIVSP
-)>;
-
-def : InstRW<[P9_DP_24C_8, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C],
- (instrs
- XVDIVSP
+ XSDIVDP
)>;
-def : InstRW<[P9_DP_33C_8, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C],
+// 22 Cycle DP Instruction Restricted. Takes one slice and 3 dispatches.
+def : InstRW<[P9_DP_22C_5, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
- XVDIVDP
+ FDIVS
)>;
-// FP Instructions in IIC_FPGeneral, IIC_FPFused
-
-def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
+// 22 Cycle DP Instruction. Takes one slice and 2 dispatches.
+def : InstRW<[P9_DP_22C_5, IP_EXEC_1C, DISP_1C, DISP_1C],
(instrs
- FRSP,
- FRIND,
- FRINS,
- FRIPD,
- FRIPS,
- FRIZD,
- FRIZS,
- FRIMD,
- FRIMS,
- FRE,
- FRES,
- FRSQRTE,
- FRSQRTES,
- FMADDS,
- FMADD,
- FMSUBS,
- FMSUB,
- FNMADDS,
- FNMADD,
- FNMSUBS,
- FNMSUB,
- FSELD,
- FSELS,
- FADDS,
- FMULS,
- FMUL,
- FSUBS,
- FCFID,
- FCTID,
- FCTIDZ,
- FCFIDU,
- FCFIDS,
- FCFIDUS,
- FCTIDUZ,
- FCTIWUZ,
- FCTIW,
- FCTIWZ
+ XSDIVSP
)>;
-def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
+// 24 Cycle DP Vector Instruction. Takes one full superslice.
+// Includes both EXECE, EXECO pipelines and all 3 dispatches for the given
+// superslice.
+def : InstRW<[P9_DPE_24C_8, P9_DPO_24C_8, IP_EXECE_1C, IP_EXECO_1C,
+ DISP_1C, DISP_1C, DISP_1C],
(instrs
- FMR,
- FABSD,
- FABSS,
- FNABSD,
- FNABSS,
- FNEGD,
- FNEGS,
- FCPSGND,
- FCPSGNS
+ XVDIVSP
)>;
-def : InstRW<[P9_ALU_3C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
+// 33 Cycle DP Vector Instruction. Takes one full superslice.
+// Includes both EXECE, EXECO pipelines and all 3 dispatches for the given
+// superslice.
+def : InstRW<[P9_DPE_33C_8, P9_DPO_33C_8, IP_EXECE_1C, IP_EXECO_1C,
+ DISP_1C, DISP_1C, DISP_1C],
(instrs
- FCMPUS,
- FCMPUD
+ XVDIVDP
)>;
// Load instructions in IIC_LdStLFDU and IIC_LdStLFDUX.
-def : InstRW<[P9_LoadAndALUOp_7C, P9_ALU_2C,
+// Instruction cracked into three pieces. One Load and two ALU operations.
+// The Load and one of the ALU ops cannot be run at the same time and so the
+// latencies are added together for 6 cycles. The remaining ALU is 2 cycles.
+// Both the load and the ALU that depends on it are restricted and so they take
+// a total of 6 dispatches. The final 2 dispatches come from the second ALU op.
+// The two EXEC pipelines are for the 2 ALUs while the AGEN is for the load.
+def : InstRW<[P9_LoadAndALUOp_6C, P9_ALU_2C,
IP_AGEN_1C, IP_EXEC_1C, IP_EXEC_1C,
DISP_1C, DISP_1C, DISP_1C, DISP_1C,
DISP_1C, DISP_1C, DISP_1C, DISP_1C],
@@ -799,10 +925,32 @@ def : InstRW<[P9_LoadAndALUOp_7C, P9_ALU_2C,
LFSUX
)>;
-def : InstRW<[P9_LS_5C, P9_ALU_2C, IP_AGEN_1C, IP_EXEC_1C,
+// Cracked instruction made up of a Load and an ALU. The ALU does not depend on
+// the load and so it can be run at the same time as the load. The load is also
+// restricted. 3 dispatches are from the restricted load while the other two
+// are from the ALU. The AGEN pipeline is from the load and the EXEC pipeline
+// is required for the ALU.
+def : InstRW<[P9_LS_4C, P9_ALU_2C, IP_AGEN_1C, IP_EXEC_1C,
DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
LFDU,
LFDUX
)>;
+// Crypto Instructions
+
+// 6 Cycle CY operation. Only one CY unit per CPU so we use a whole
+// superslice. That includes both exec pipelines (EXECO, EXECE) and all three
+// dispatches.
+def : InstRW<[P9_CY_6C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C, DISP_1C, DISP_1C],
+ (instrs
+ VPMSUMB,
+ VPMSUMD,
+ VPMSUMH,
+ VPMSUMW,
+ VCIPHER,
+ VCIPHERLAST,
+ VNCIPHER,
+ VNCIPHERLAST,
+ VSBOX
+)>;
diff --git a/llvm/lib/Target/PowerPC/PPCInstrFormats.td b/llvm/lib/Target/PowerPC/PPCInstrFormats.td
index 73aee3bcdd5..f2845415ecb 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrFormats.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrFormats.td
@@ -2101,4 +2101,5 @@ class Pseudo<dag OOL, dag IOL, string asmstr, list<dag> pattern>
let PPC64 = 0;
let Pattern = pattern;
let Inst{31-0} = 0;
+ let hasNoSchedulingInfo = 1;
}
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
index 9f2ee521fd5..8fc9574dca4 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
@@ -3951,6 +3951,7 @@ class PPCAsmPseudo<string asm, dag iops>
let AsmString = asm;
let isAsmParserOnly = 1;
let isPseudo = 1;
+ let hasNoSchedulingInfo = 1;
}
def : InstAlias<"sc", (SC 0)>;
diff --git a/llvm/lib/Target/PowerPC/PPCScheduleP9.td b/llvm/lib/Target/PowerPC/PPCScheduleP9.td
index a01995a629c..6830488deb2 100644
--- a/llvm/lib/Target/PowerPC/PPCScheduleP9.td
+++ b/llvm/lib/Target/PowerPC/PPCScheduleP9.td
@@ -22,7 +22,9 @@ def P9Model : SchedMachineModel {
// Try to make sure we have at least 10 dispatch groups in a loop.
let LoopMicroOpBufferSize = 60;
- let CompleteModel = 0;
+ let CompleteModel = 1;
+
+ let UnsupportedFeatures = [HasQPX];
}
@@ -68,6 +70,10 @@ let SchedModel = P9Model in {
def LS : ProcResource<4>;
def PM : ProcResource<2>;
def DFU : ProcResource<1>;
+ def BR : ProcResource<1> {
+ let BufferSize = 16;
+ }
+ def CY : ProcResource<1>;
def TestGroup : ProcResGroup<[ALU, DP]>;
@@ -145,6 +151,10 @@ let SchedModel = P9Model in {
let Latency = 6;
}
+ def P9_DIV_12C : SchedWriteRes<[DIV]> {
+ let Latency = 12;
+ }
+
def P9_DIV_16C_8 : SchedWriteRes<[DIV]> {
let ResourceCycles = [8];
let Latency = 16;
@@ -190,6 +200,16 @@ let SchedModel = P9Model in {
let Latency = 24;
}
+ def P9_DPO_24C_8 : SchedWriteRes<[DPO]> {
+ let ResourceCycles = [8];
+ let Latency = 24;
+ }
+
+ def P9_DPE_24C_8 : SchedWriteRes<[DPE]> {
+ let ResourceCycles = [8];
+ let Latency = 24;
+ }
+
def P9_DP_26C_5 : SchedWriteRes<[DP]> {
let ResourceCycles = [5];
let Latency = 22;
@@ -205,6 +225,16 @@ let SchedModel = P9Model in {
let Latency = 33;
}
+ def P9_DPE_33C_8 : SchedWriteRes<[DPE]> {
+ let ResourceCycles = [8];
+ let Latency = 33;
+ }
+
+ def P9_DPO_33C_8 : SchedWriteRes<[DPO]> {
+ let ResourceCycles = [8];
+ let Latency = 33;
+ }
+
def P9_DP_36C_10 : SchedWriteRes<[DP]> {
let ResourceCycles = [10];
let Latency = 36;
@@ -248,11 +278,25 @@ let SchedModel = P9Model in {
let Latency = 76;
let ResourceCycles = [62];
}
+
+ def P9_BR_2C : SchedWriteRes<[BR]> {
+ let Latency = 2;
+ }
+
+ def P9_BR_5C : SchedWriteRes<[BR]> {
+ let Latency = 5;
+ }
+
+ def P9_CY_6C : SchedWriteRes<[CY]> {
+ let Latency = 6;
+ }
+
// ***************** WriteSeq Definitions *****************
def P9_LoadAndALUOp_6C : WriteSequence<[P9_LS_4C, P9_ALU_2C]>;
def P9_LoadAndALUOp_7C : WriteSequence<[P9_LS_5C, P9_ALU_2C]>;
def P9_LoadAndPMOp_8C : WriteSequence<[P9_LS_5C, P9_PM_3C]>;
+ def P9_LoadAndLoadOp_8C : WriteSequence<[P9_LS_4C, P9_LS_4C]>;
def P9_IntDivAndALUOp_26C_8 : WriteSequence<[P9_DIV_24C_8, P9_ALU_2C]>;
def P9_IntDivAndALUOp_42C_8 : WriteSequence<[P9_DIV_40C_8, P9_ALU_2C]>;
def P9_StoreAndALUOp_4C : WriteSequence<[P9_LS_1C, P9_ALU_3C]>;
@@ -260,19 +304,32 @@ let SchedModel = P9Model in {
// ***************** Defining Itinerary Class Resources *****************
+ // The following itineraries are fully covered by the InstRW definitions in
+ // P9InstrResources.td so aren't listed here.
+ // IIC_FPDivD, IIC_FPDivS, IIC_FPFused, IIC_IntDivD, IIC_LdStLFDU,
+ // IIC_LdStLFDUX
+
def : ItinRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C],
- [IIC_IntSimple, IIC_IntGeneral]>;
+ [IIC_IntSimple, IIC_IntGeneral, IIC_IntRFID,
+ IIC_IntRotateD, IIC_IntRotateDI, IIC_IntTrapD,
+ IIC_SprRFI]>;
+
+ def : ItinRW<[P9_ALU_3C, IP_EXEC_1C, DISP_1C, DISP_1C],
+ [IIC_IntTrapW]>;
def : ItinRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
[IIC_IntISEL, IIC_IntRotate, IIC_IntShift]>;
def : ItinRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C], [IIC_IntCompare]>;
+ def : ItinRW<[P9_ALUE_2C, P9_ALUO_2C, IP_EXECE_1C, IP_EXECO_1C,
+ DISP_1C, DISP_1C], [IIC_VecGeneral, IIC_FPCompare]>;
+
def : ItinRW<[P9_DP_5C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
- [IIC_IntMulHW, IIC_IntMulHWU, IIC_IntMulLI]>;
+ [IIC_IntMulHW, IIC_IntMulHWU, IIC_IntMulLI, IIC_IntMulHD]>;
def : ItinRW<[P9_LS_5C, IP_EXEC_1C, DISP_1C, DISP_1C],
- [IIC_LdStLoad, IIC_LdStLD]>;
+ [IIC_LdStLoad, IIC_LdStLD, IIC_LdStLFD]>;
def : ItinRW<[P9_LS_4C, P9_ALU_2C, IP_EXEC_1C, IP_EXEC_1C,
DISP_1C, DISP_1C, DISP_1C, DISP_1C],
@@ -300,12 +357,18 @@ let SchedModel = P9Model in {
def : ItinRW<[P9_LS_4C, IP_EXEC_1C, DISP_1C, DISP_1C],
[IIC_LdStLWARX, IIC_LdStLDARX, IIC_LdStLMW]>;
+ def : ItinRW<[P9_LS_4C, IP_EXEC_1C, DISP_1C, DISP_1C],
+ [IIC_LdStCOPY, IIC_SprABORT, IIC_LdStPASTE, IIC_LdStDCBF,
+ IIC_LdStICBI, IIC_LdStSync, IIC_SprISYNC, IIC_SprMSGSYNC,
+ IIC_SprSLBIA, IIC_SprSLBSYNC, IIC_SprTLBSYNC]>;
+
def : ItinRW<[P9_LS_1C, IP_EXEC_1C, IP_AGEN_1C, DISP_1C, DISP_1C, DISP_1C],
[IIC_LdStSTFD, IIC_LdStSTD, IIC_LdStStore]>;
def : ItinRW<[P9_LS_1C, P9_ALU_2C, IP_EXEC_1C, IP_EXEC_1C, IP_AGEN_1C,
DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
- [IIC_LdStSTDU, IIC_LdStSTDUX]>;
+ [IIC_LdStSTDU, IIC_LdStSTDUX, IIC_LdStStoreUpd, IIC_SprSLBIEG,
+ IIC_SprTLBIA, IIC_SprTLBIE]>;
def : ItinRW<[P9_StoreAndALUOp_4C, IP_EXEC_1C, IP_EXEC_1C, IP_AGEN_1C,
DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
@@ -315,20 +378,44 @@ let SchedModel = P9Model in {
[IIC_BrCR, IIC_IntMTFSB0]>;
def : ItinRW<[P9_ALUOpAndALUOp_4C, P9_ALU_2C, IP_EXEC_1C, IP_EXEC_1C,
- IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C,
- DISP_1C, DISP_1C, DISP_1C], [IIC_SprMFCR, IIC_SprMFCRF]>;
+ IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C,
+ DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ [IIC_SprMFCR, IIC_SprMFCRF, IIC_BrMCR, IIC_BrMCRX, IIC_IntMFFS]>;
+
+ def : ItinRW<[P9_BR_2C, DISP_1C], [IIC_BrB]>;
+ def : ItinRW<[P9_BR_5C, DISP_1C], [IIC_SprMFSPR]>;
// This class should be broken down to instruction level, once some missing
// info is obtained.
def : ItinRW<[P9_LoadAndALUOp_6C, IP_EXEC_1C, IP_AGEN_1C,
DISP_1C, DISP_1C, DISP_1C], [IIC_SprMTSPR]>;
- def : ItinRW<[P9_DP_7C, IP_EXEC_1C,
- DISP_1C, DISP_1C, DISP_1C], [IIC_FPGeneral, IIC_FPAddSub]>;
+ def : ItinRW<[P9_LoadAndLoadOp_8C, IP_EXEC_1C, DISP_1C, DISP_1C],
+ [IIC_SprSLBIE, IIC_SprSLBMFEE, IIC_SprSLBMFEV, IIC_SprSLBMTE,
+ IIC_SprTLBIEL]>;
+
+ // IIC_VecFP is added here although many instructions with that itinerary
+ // use very different resources. It would appear that instructions were
+ // given that itinerary rather carelessly over time. Specific instructions
+ // that use different resources are listed in various InstRW classes.
+ def : ItinRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
+ [IIC_FPGeneral, IIC_FPAddSub, IIC_VecFP]>;
+
+ def : ItinRW<[P9_ALUE_3C, P9_ALUO_3C, IP_EXECE_1C, IP_EXECO_1C,
+ DISP_1C, DISP_1C], [IIC_VecFPCompare]>;
+
+ def : ItinRW<[P9_PM_3C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C, DISP_1C],
+ [IIC_VecPerm]>;
def : ItinRW<[P9_DP_36C_10, IP_EXEC_1C], [IIC_FPSqrtD]>;
def : ItinRW<[P9_DP_26C_5, P9_DP_26C_5, IP_EXEC_1C, IP_EXEC_1C], [IIC_FPSqrtS]>;
+ def : ItinRW<[P9_DIV_12C, IP_EXECE_1C, DISP_1C, DISP_1C],
+ [IIC_SprMFMSR, IIC_SprMFPMR, IIC_SprMFSR, IIC_SprMFTB,
+ IIC_SprMTMSR, IIC_SprMTMSRD, IIC_SprMTPMR, IIC_SprMTSR]>;
+
+ def : ItinRW<[], [IIC_SprSTOP]>;
+
include "P9InstrResources.td"
}
OpenPOWER on IntegriCloud