Diffstat (limited to 'llvm/lib')
-rw-r--r-- | llvm/lib/CodeGen/MachineScheduler.cpp | 95
-rw-r--r-- | llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 16
-rw-r--r-- | llvm/lib/Target/AArch64/AArch64InstrInfo.h | 4
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 6
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIInstrInfo.h | 6
5 files changed, 82 insertions, 45 deletions
diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
index 9a7cae6e71f..b7f73fd731b 100644
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -71,8 +71,9 @@ static cl::opt<bool> EnableRegPressure("misched-regpressure", cl::Hidden,
 static cl::opt<bool> EnableCyclicPath("misched-cyclicpath", cl::Hidden,
   cl::desc("Enable cyclic critical path analysis."), cl::init(true));

-static cl::opt<bool> EnableLoadCluster("misched-cluster", cl::Hidden,
-  cl::desc("Enable load clustering."), cl::init(true));
+static cl::opt<bool> EnableMemOpCluster("misched-cluster", cl::Hidden,
+                                        cl::desc("Enable memop clustering."),
+                                        cl::init(true));

 // Experimental heuristics
 static cl::opt<bool> EnableMacroFusion("misched-fusion", cl::Hidden,
@@ -1351,64 +1352,80 @@ void ScheduleDAGMILive::scheduleMI(SUnit *SU, bool IsTopNode) {
 }

 //===----------------------------------------------------------------------===//
-// LoadClusterMutation - DAG post-processing to cluster loads.
+// BaseMemOpClusterMutation - DAG post-processing to cluster loads or stores.
 //===----------------------------------------------------------------------===//

 namespace {
 /// \brief Post-process the DAG to create cluster edges between neighboring
-/// loads.
-class LoadClusterMutation : public ScheduleDAGMutation {
-  struct LoadInfo {
+/// loads or between neighboring stores.
+class BaseMemOpClusterMutation : public ScheduleDAGMutation {
+  struct MemOpInfo {
     SUnit *SU;
     unsigned BaseReg;
     int64_t Offset;
-    LoadInfo(SUnit *su, unsigned reg, int64_t ofs)
-      : SU(su), BaseReg(reg), Offset(ofs) {}
+    MemOpInfo(SUnit *su, unsigned reg, int64_t ofs)
+        : SU(su), BaseReg(reg), Offset(ofs) {}

-    bool operator<(const LoadInfo &RHS) const {
+    bool operator<(const MemOpInfo&RHS) const {
       return std::tie(BaseReg, Offset) < std::tie(RHS.BaseReg, RHS.Offset);
     }
   };

   const TargetInstrInfo *TII;
   const TargetRegisterInfo *TRI;
+  bool IsLoad;
+
 public:
-  LoadClusterMutation(const TargetInstrInfo *tii,
-                      const TargetRegisterInfo *tri)
-    : TII(tii), TRI(tri) {}
+  BaseMemOpClusterMutation(const TargetInstrInfo *tii,
+                           const TargetRegisterInfo *tri, bool IsLoad)
+      : TII(tii), TRI(tri), IsLoad(IsLoad) {}

   void apply(ScheduleDAGInstrs *DAGInstrs) override;
+
 protected:
-  void clusterNeighboringLoads(ArrayRef<SUnit*> Loads, ScheduleDAGMI *DAG);
+  void clusterNeighboringMemOps(ArrayRef<SUnit *> MemOps, ScheduleDAGMI *DAG);
+};
+
+class StoreClusterMutation : public BaseMemOpClusterMutation {
+public:
+  StoreClusterMutation(const TargetInstrInfo *tii,
+                       const TargetRegisterInfo *tri)
+      : BaseMemOpClusterMutation(tii, tri, false) {}
+};
+
+class LoadClusterMutation : public BaseMemOpClusterMutation {
+public:
+  LoadClusterMutation(const TargetInstrInfo *tii, const TargetRegisterInfo *tri)
+      : BaseMemOpClusterMutation(tii, tri, true) {}
 };
 } // anonymous

-void LoadClusterMutation::clusterNeighboringLoads(ArrayRef<SUnit*> Loads,
-                                                  ScheduleDAGMI *DAG) {
-  SmallVector<LoadClusterMutation::LoadInfo,32> LoadRecords;
-  for (unsigned Idx = 0, End = Loads.size(); Idx != End; ++Idx) {
-    SUnit *SU = Loads[Idx];
+void BaseMemOpClusterMutation::clusterNeighboringMemOps(
+    ArrayRef<SUnit *> MemOps, ScheduleDAGMI *DAG) {
+  SmallVector<MemOpInfo, 32> MemOpRecords;
+  for (unsigned Idx = 0, End = MemOps.size(); Idx != End; ++Idx) {
+    SUnit *SU = MemOps[Idx];
     unsigned BaseReg;
     int64_t Offset;
     if (TII->getMemOpBaseRegImmOfs(SU->getInstr(), BaseReg, Offset, TRI))
-      LoadRecords.push_back(LoadInfo(SU, BaseReg, Offset));
+      MemOpRecords.push_back(MemOpInfo(SU, BaseReg, Offset));
   }
-  if (LoadRecords.size() < 2)
+  if (MemOpRecords.size() < 2)
     return;
-  std::sort(LoadRecords.begin(), LoadRecords.end());
+
+  std::sort(MemOpRecords.begin(), MemOpRecords.end());
   unsigned ClusterLength = 1;
-  for (unsigned Idx = 0, End = LoadRecords.size(); Idx < (End - 1); ++Idx) {
-    if (LoadRecords[Idx].BaseReg != LoadRecords[Idx+1].BaseReg) {
+  for (unsigned Idx = 0, End = MemOpRecords.size(); Idx < (End - 1); ++Idx) {
+    if (MemOpRecords[Idx].BaseReg != MemOpRecords[Idx+1].BaseReg) {
       ClusterLength = 1;
       continue;
     }
-    SUnit *SUa = LoadRecords[Idx].SU;
-    SUnit *SUb = LoadRecords[Idx+1].SU;
-    if (TII->shouldClusterLoads(SUa->getInstr(), SUb->getInstr(), ClusterLength)
+    SUnit *SUa = MemOpRecords[Idx].SU;
+    SUnit *SUb = MemOpRecords[Idx+1].SU;
+    if (TII->shouldClusterMemOps(SUa->getInstr(), SUb->getInstr(), ClusterLength)
         && DAG->addEdge(SUb, SDep(SUa, SDep::Cluster))) {
-
-      DEBUG(dbgs() << "Cluster loads SU(" << SUa->NodeNum << ") - SU("
+      DEBUG(dbgs() << "Cluster ld/st SU(" << SUa->NodeNum << ") - SU("
            << SUb->NodeNum << ")\n");
       // Copy successor edges from SUa to SUb. Interleaving computation
       // dependent on SUa can prevent load combining due to register reuse.
@@ -1429,17 +1446,20 @@ void LoadClusterMutation::clusterNeighboringLoads(ArrayRef<SUnit*> Loads,
 }

 /// \brief Callback from DAG postProcessing to create cluster edges for loads.
-void LoadClusterMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
+void BaseMemOpClusterMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
+
   ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
   // Map DAG NodeNum to store chain ID.
   DenseMap<unsigned, unsigned> StoreChainIDs;
-  // Map each store chain to a set of dependent loads.
+  // Map each store chain to a set of dependent MemOps.
   SmallVector<SmallVector<SUnit*,4>, 32> StoreChainDependents;
   for (unsigned Idx = 0, End = DAG->SUnits.size(); Idx != End; ++Idx) {
     SUnit *SU = &DAG->SUnits[Idx];
-    if (!SU->getInstr()->mayLoad())
+    if ((IsLoad && !SU->getInstr()->mayLoad()) ||
+        (!IsLoad && !SU->getInstr()->mayStore()))
       continue;
+
     unsigned ChainPredID = DAG->SUnits.size();
     for (SUnit::const_pred_iterator PI = SU->Preds.begin(),
            PE = SU->Preds.end(); PI != PE; ++PI) {
@@ -1449,7 +1469,7 @@ void LoadClusterMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
       }
     }
     // Check if this chain-like pred has been seen
-    // before. ChainPredID==MaxNodeID for loads at the top of the schedule.
+    // before. ChainPredID==MaxNodeID at the top of the schedule.
     unsigned NumChains = StoreChainDependents.size();
     std::pair<DenseMap<unsigned, unsigned>::iterator, bool> Result =
       StoreChainIDs.insert(std::make_pair(ChainPredID, NumChains));
@@ -1457,9 +1477,10 @@ void LoadClusterMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
       StoreChainDependents.resize(NumChains + 1);
     StoreChainDependents[Result.first->second].push_back(SU);
   }
+
   // Iterate over the store chains.
   for (unsigned Idx = 0, End = StoreChainDependents.size(); Idx != End; ++Idx)
-    clusterNeighboringLoads(StoreChainDependents[Idx], DAG);
+    clusterNeighboringMemOps(StoreChainDependents[Idx], DAG);
 }

 //===----------------------------------------------------------------------===//
@@ -3054,8 +3075,12 @@ static ScheduleDAGInstrs *createGenericSchedLive(MachineSchedContext *C) {
   // data and pass it to later mutations. Have a single mutation that gathers
   // the interesting nodes in one pass.
   DAG->addMutation(make_unique<CopyConstrain>(DAG->TII, DAG->TRI));
-  if (EnableLoadCluster && DAG->TII->enableClusterLoads())
-    DAG->addMutation(make_unique<LoadClusterMutation>(DAG->TII, DAG->TRI));
+  if (EnableMemOpCluster) {
+    if (DAG->TII->enableClusterLoads())
+      DAG->addMutation(make_unique<LoadClusterMutation>(DAG->TII, DAG->TRI));
+    if (DAG->TII->enableClusterStores())
+      DAG->addMutation(make_unique<StoreClusterMutation>(DAG->TII, DAG->TRI));
+  }
   if (EnableMacroFusion)
     DAG->addMutation(make_unique<MacroFusion>(*DAG->TII, *DAG->TRI));
   return DAG;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 904a124f5cd..91b2e143e30 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -1445,6 +1445,11 @@ bool AArch64InstrInfo::getMemOpBaseRegImmOfs(
   case AArch64::LDRWui:
   case AArch64::LDRSWui:
   // Unscaled instructions.
+  case AArch64::STURSi:
+  case AArch64::STURDi:
+  case AArch64::STURQi:
+  case AArch64::STURXi:
+  case AArch64::STURWi:
   case AArch64::LDURSi:
   case AArch64::LDURDi:
   case AArch64::LDURQi:
@@ -1554,15 +1559,20 @@ static bool scaleOffset(unsigned Opc, int64_t &Offset) {
   default:
     return false;
   case AArch64::LDURQi:
+  case AArch64::STURQi:
     OffsetStride = 16;
     break;
   case AArch64::LDURXi:
   case AArch64::LDURDi:
+  case AArch64::STURXi:
+  case AArch64::STURDi:
     OffsetStride = 8;
     break;
   case AArch64::LDURWi:
   case AArch64::LDURSi:
   case AArch64::LDURSWi:
+  case AArch64::STURWi:
+  case AArch64::STURSi:
     OffsetStride = 4;
     break;
   }
@@ -1598,9 +1608,9 @@ static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) {
 /// Detect opportunities for ldp/stp formation.
 ///
 /// Only called for LdSt for which getMemOpBaseRegImmOfs returns true.
-bool AArch64InstrInfo::shouldClusterLoads(MachineInstr *FirstLdSt,
-                                          MachineInstr *SecondLdSt,
-                                          unsigned NumLoads) const {
+bool AArch64InstrInfo::shouldClusterMemOps(MachineInstr *FirstLdSt,
+                                           MachineInstr *SecondLdSt,
+                                           unsigned NumLoads) const {
   // Only cluster up to a single pair.
   if (NumLoads > 1)
     return false;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
index 20967d5c91d..4b7e7e00b45 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -109,7 +109,9 @@ public:

   bool enableClusterLoads() const override { return true; }

-  bool shouldClusterLoads(MachineInstr *FirstLdSt, MachineInstr *SecondLdSt,
+  bool enableClusterStores() const override { return true; }
+
+  bool shouldClusterMemOps(MachineInstr *FirstLdSt, MachineInstr *SecondLdSt,
                           unsigned NumLoads) const override;

   bool shouldScheduleAdjacent(MachineInstr *First,
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index ba5b8a229b7..d4099bfd538 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -292,9 +292,9 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg,
   return false;
 }

-bool SIInstrInfo::shouldClusterLoads(MachineInstr *FirstLdSt,
-                                     MachineInstr *SecondLdSt,
-                                     unsigned NumLoads) const {
+bool SIInstrInfo::shouldClusterMemOps(MachineInstr *FirstLdSt,
+                                      MachineInstr *SecondLdSt,
+                                      unsigned NumLoads) const {
   const MachineOperand *FirstDst = nullptr;
   const MachineOperand *SecondDst = nullptr;

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index ad69f43bf30..a5cd2e18ef4 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -94,9 +94,9 @@ public:
                              int64_t &Offset,
                              const TargetRegisterInfo *TRI) const final;

-  bool shouldClusterLoads(MachineInstr *FirstLdSt,
-                          MachineInstr *SecondLdSt,
-                          unsigned NumLoads) const final;
+  bool shouldClusterMemOps(MachineInstr *FirstLdSt,
+                           MachineInstr *SecondLdSt,
+                           unsigned NumLoads) const final;

   void copyPhysReg(MachineBasicBlock &MBB,
                    MachineBasicBlock::iterator MI, DebugLoc DL,
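
For readers wiring this into another backend: the sketch below shows how a target's InstrInfo subclass might opt into the hooks touched by this change. It is an illustrative sketch only; ExampleInstrInfo and its simple pairing policy are hypothetical and not part of the patch, and the include path reflects the tree layout at the time of this change. The in-tree AArch64 implementation above additionally checks, via canPairLdStOpc and scaleOffset, that the two opcodes can actually form an ldp/stp pair and that their offsets are adjacent.

// Hypothetical target hook overrides mirroring the AArch64 changes above.
// ExampleInstrInfo is not an in-tree class; it stands in for a target's
// <Target>InstrInfo subclass.
#include "llvm/Target/TargetInstrInfo.h" // header location as of this patch

namespace llvm {

class ExampleInstrInfo : public TargetInstrInfo {
public:
  // Ask the MachineScheduler to run LoadClusterMutation for this target.
  bool enableClusterLoads() const override { return true; }

  // New with this patch: also run StoreClusterMutation.
  bool enableClusterStores() const override { return true; }

  // Called by BaseMemOpClusterMutation for two memops that share a base
  // register (per getMemOpBaseRegImmOfs); returning true asks the DAG to
  // add a cluster edge between the two nodes.
  bool shouldClusterMemOps(MachineInstr *FirstLdSt, MachineInstr *SecondLdSt,
                           unsigned NumLoads) const override {
    // Toy policy: only cluster a single pair, like the AArch64 early-exit.
    // A real target would also inspect FirstLdSt/SecondLdSt for opcode
    // pairing and offset adjacency before agreeing to cluster.
    return NumLoads <= 1;
  }
};

} // namespace llvm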