summaryrefslogtreecommitdiffstats
path: root/llvm/lib/Target/AMDGPU/SISchedule.td
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib/Target/AMDGPU/SISchedule.td')
-rw-r--r--llvm/lib/Target/AMDGPU/SISchedule.td31
1 files changed, 31 insertions, 0 deletions
diff --git a/llvm/lib/Target/AMDGPU/SISchedule.td b/llvm/lib/Target/AMDGPU/SISchedule.td
index e3066df12d0..824d1aeb0df 100644
--- a/llvm/lib/Target/AMDGPU/SISchedule.td
+++ b/llvm/lib/Target/AMDGPU/SISchedule.td
@@ -24,6 +24,9 @@ def WriteSMEM : SchedWrite;
def WriteVMEM : SchedWrite;
def WriteBarrier : SchedWrite;
+def MIVGPRRead : SchedRead;
+def MIMFMARead : SchedRead;
+
// Vector ALU instructions
def Write32Bit : SchedWrite;
def WriteQuarterRate32 : SchedWrite;
@@ -43,6 +46,11 @@ def WriteDoubleCvt : SchedWrite;
// Half rate 64-bit instructions.
def Write64Bit : SchedWrite;
+// mAI multipass instructions.
+def Write2PassMAI : SchedWrite;
+def Write8PassMAI : SchedWrite;
+def Write16PassMAI : SchedWrite;
+
// FIXME: Should there be a class for instructions which are VALU
// instructions and have VALU rates, but write to the SALU (i.e. VOPC
// instructions)
@@ -97,6 +105,11 @@ class HWWriteRes<SchedWrite write, list<ProcResourceKind> resources,
class HWVALUWriteRes<SchedWrite write, int latency> :
HWWriteRes<write, [HWVALU], latency>;
+def PredMIReadVGPR : SchedPredicate<[{TII->hasVGPRUses(*MI)}]>;
+
+def MIReadVGPR : SchedReadVariant<[
+ SchedVar<PredMIReadVGPR, [MIVGPRRead]>,
+ SchedVar<NoSchedPred, [ReadDefault]>]>;
// The latency numbers are taken from AMD Accelerated Parallel Processing
// guide. They may not be accurate.
@@ -115,6 +128,24 @@ multiclass SICommonWriteRes {
def : HWVALUWriteRes<Write32Bit, 1>;
def : HWVALUWriteRes<Write64Bit, 2>;
def : HWVALUWriteRes<WriteQuarterRate32, 4>;
+ def : HWVALUWriteRes<Write2PassMAI, 2>;
+ def : HWVALUWriteRes<Write8PassMAI, 8>;
+ def : HWVALUWriteRes<Write16PassMAI, 16>;
+
+ def : ReadAdvance<MIVGPRRead, -2>;
+ def : InstRW<[Write64Bit, MIReadVGPR], (instregex "^V_ACCVGPR_WRITE_B32$")>;
+
+ // Technically mfma reads can be from 0 to 4 cycles but that does not make
+ // sense to model because its register setup is huge. In particular if we
+ // properly model read advance as -2 for a vgpr read it will result in
+ // bad scheduling of acc writes before that mfma. To avoid it we would
+ // need to consume 2 or 4 more vgprs to be initialized before the acc
+ // write sequence. Just assume worst case here.
+ def : ReadAdvance<MIMFMARead, -4>;
+
+ def : InstRW<[Write2PassMAI, MIMFMARead], (instregex "^V_MFMA_..._4X4X")>;
+ def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_MFMA_..._16X16X")>;
+ def : InstRW<[Write16PassMAI, MIMFMARead], (instregex "^V_MFMA_..._32X32X")>;
}
def PredIsVGPR32Copy : SchedPredicate<[{TII->isVGPRCopy(*MI) && TII->getOpSize(*MI, 0) <= 32}]>;
OpenPOWER on IntegriCloud