diff options
| author | Gerolf Hoflehner <ghoflehner@apple.com> | 2016-04-24 05:14:01 +0000 |
|---|---|---|
| committer | Gerolf Hoflehner <ghoflehner@apple.com> | 2016-04-24 05:14:01 +0000 |
| commit | 01b3a6184aae0b5ef1e2cccd4bb8da8d6a0df68a (patch) | |
| tree | 935cd91c560d5ade3b118bd4ede0ec519bc26601 /llvm/lib/CodeGen | |
| parent | 9cf3bf659c7f2e447517115443bc1efb1693b541 (diff) | |
| download | bcm5719-llvm-01b3a6184aae0b5ef1e2cccd4bb8da8d6a0df68a.tar.gz bcm5719-llvm-01b3a6184aae0b5ef1e2cccd4bb8da8d6a0df68a.zip | |
[MachineCombiner] Support for floating-point FMA on ARM64 (re-commit r267098)
The original patch caused crashes because it could derefence a null pointer
for SelectionDAGTargetInfo for targets that do not define it.
Evaluates fmul+fadd -> fmadd combines and similar code sequences in the
machine combiner. It adds support for float and double similar to the existing
integer implementation. The key features are:
- DAGCombiner checks whether it should combine greedily or let the machine
combiner do the evaluation. This is only supported on ARM64.
- It gives preference to throughput over latency: the heuristic used is
to combine always in loops. The targets decides whether the machine
combiner should optimize for throughput or latency.
- Supports for fmadd, f(n)msub, fmla, fmls patterns
- On by default at O3 ffast-math
llvm-svn: 267328
Diffstat (limited to 'llvm/lib/CodeGen')
| -rw-r--r-- | llvm/lib/CodeGen/MachineCombiner.cpp | 12 | ||||
| -rw-r--r-- | llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 11 | ||||
| -rw-r--r-- | llvm/lib/CodeGen/TargetInstrInfo.cpp | 6 |
3 files changed, 26 insertions, 3 deletions
diff --git a/llvm/lib/CodeGen/MachineCombiner.cpp b/llvm/lib/CodeGen/MachineCombiner.cpp index 44601d5e462..6b5c6ba8250 100644 --- a/llvm/lib/CodeGen/MachineCombiner.cpp +++ b/llvm/lib/CodeGen/MachineCombiner.cpp @@ -40,6 +40,7 @@ class MachineCombiner : public MachineFunctionPass { const TargetRegisterInfo *TRI; MCSchedModel SchedModel; MachineRegisterInfo *MRI; + MachineLoopInfo *MLI; // Current MachineLoopInfo MachineTraceMetrics *Traces; MachineTraceMetrics::Ensemble *MinInstr; @@ -86,6 +87,7 @@ char &llvm::MachineCombinerID = MachineCombiner::ID; INITIALIZE_PASS_BEGIN(MachineCombiner, "machine-combiner", "Machine InstCombiner", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics) INITIALIZE_PASS_END(MachineCombiner, "machine-combiner", "Machine InstCombiner", false, false) @@ -93,6 +95,7 @@ INITIALIZE_PASS_END(MachineCombiner, "machine-combiner", "Machine InstCombiner", void MachineCombiner::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesCFG(); AU.addPreserved<MachineDominatorTree>(); + AU.addRequired<MachineLoopInfo>(); AU.addPreserved<MachineLoopInfo>(); AU.addRequired<MachineTraceMetrics>(); AU.addPreserved<MachineTraceMetrics>(); @@ -354,6 +357,8 @@ bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) { DEBUG(dbgs() << "Combining MBB " << MBB->getName() << "\n"); auto BlockIter = MBB->begin(); + // Check if the block is in a loop. + const MachineLoop *ML = MLI->getLoopFor(MBB); while (BlockIter != MBB->end()) { auto &MI = *BlockIter++; @@ -406,11 +411,15 @@ bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) { if (!NewInstCount) continue; + bool SubstituteAlways = false; + if (ML && TII->isThroughputPattern(P)) + SubstituteAlways = true; + // Substitute when we optimize for codesize and the new sequence has // fewer instructions OR // the new sequence neither lengthens the critical path nor increases // resource pressure. - if (doSubstitute(NewInstCount, OldInstCount) || + if (SubstituteAlways || doSubstitute(NewInstCount, OldInstCount) || (improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs, InstrIdxForVirtReg, P) && preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs))) { @@ -447,6 +456,7 @@ bool MachineCombiner::runOnMachineFunction(MachineFunction &MF) { SchedModel = STI.getSchedModel(); TSchedModel.init(SchedModel, &STI, TII); MRI = &MF.getRegInfo(); + MLI = &getAnalysis<MachineLoopInfo>(); Traces = &getAnalysis<MachineTraceMetrics>(); MinInstr = nullptr; OptSize = MF.getFunction()->optForSize(); diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 703d33bff17..f740e59af96 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -24,6 +24,7 @@ #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/SelectionDAGTargetInfo.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" @@ -7716,6 +7717,11 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { if (!HasFMAD && !HasFMA) return SDValue(); + const SelectionDAGTargetInfo *STI = DAG.getSubtarget().getSelectionDAGInfo(); + ; + if (AllowFusion && STI && STI->GenerateFMAsInMachineCombiner(OptLevel)) + return SDValue(); + // Always prefer FMAD to FMA for precision. unsigned PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA; bool Aggressive = TLI.enableAggressiveFMAFusion(VT); @@ -7899,6 +7905,10 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { if (!HasFMAD && !HasFMA) return SDValue(); + const SelectionDAGTargetInfo *STI = DAG.getSubtarget().getSelectionDAGInfo(); + if (AllowFusion && STI && STI->GenerateFMAsInMachineCombiner(OptLevel)) + return SDValue(); + // Always prefer FMAD to FMA for precision. unsigned PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA; bool Aggressive = TLI.enableAggressiveFMAFusion(VT); @@ -8367,7 +8377,6 @@ SDValue DAGCombiner::visitFADD(SDNode *N) { AddToWorklist(Fused.getNode()); return Fused; } - return SDValue(); } diff --git a/llvm/lib/CodeGen/TargetInstrInfo.cpp b/llvm/lib/CodeGen/TargetInstrInfo.cpp index 86517d9afbc..800ad6d1bb4 100644 --- a/llvm/lib/CodeGen/TargetInstrInfo.cpp +++ b/llvm/lib/CodeGen/TargetInstrInfo.cpp @@ -655,7 +655,11 @@ bool TargetInstrInfo::getMachineCombinerPatterns( return false; } - +/// Return true when a code sequence can improve loop throughput. +bool +TargetInstrInfo::isThroughputPattern(MachineCombinerPattern Pattern) const { + return false; +} /// Attempt the reassociation transformation to reduce critical path length. /// See the above comments before getMachineCombinerPatterns(). void TargetInstrInfo::reassociateOps( |

