diff options
-rw-r--r-- | llvm/include/llvm/CodeGen/MachineCombinerPattern.h | 35 | ||||
-rw-r--r-- | llvm/include/llvm/CodeGen/SelectionDAGTargetInfo.h | 6 | ||||
-rw-r--r-- | llvm/include/llvm/Target/TargetInstrInfo.h | 5 | ||||
-rw-r--r-- | llvm/lib/CodeGen/MachineCombiner.cpp | 12 | ||||
-rw-r--r-- | llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 13 | ||||
-rw-r--r-- | llvm/lib/CodeGen/TargetInstrInfo.cpp | 6 | ||||
-rw-r--r-- | llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 580 | ||||
-rw-r--r-- | llvm/lib/Target/AArch64/AArch64InstrInfo.h | 5 | ||||
-rw-r--r-- | llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp | 6 | ||||
-rw-r--r-- | llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h | 1 | ||||
-rw-r--r-- | llvm/test/CodeGen/AArch64/arm64-fma-combines.ll | 136 | ||||
-rw-r--r-- | llvm/test/CodeGen/AArch64/arm64-fml-combines.ll | 128 |
12 files changed, 40 insertions, 893 deletions
diff --git a/llvm/include/llvm/CodeGen/MachineCombinerPattern.h b/llvm/include/llvm/CodeGen/MachineCombinerPattern.h index 11238016d44..f3891227746 100644 --- a/llvm/include/llvm/CodeGen/MachineCombinerPattern.h +++ b/llvm/include/llvm/CodeGen/MachineCombinerPattern.h @@ -38,40 +38,7 @@ enum class MachineCombinerPattern { MULSUBX_OP1, MULSUBX_OP2, MULADDXI_OP1, - MULSUBXI_OP1, - // Floating Point - FMULADDS_OP1, - FMULADDS_OP2, - FMULSUBS_OP1, - FMULSUBS_OP2, - FMULADDD_OP1, - FMULADDD_OP2, - FMULSUBD_OP1, - FMULSUBD_OP2, - FMLAv1i32_indexed_OP1, - FMLAv1i32_indexed_OP2, - FMLAv1i64_indexed_OP1, - FMLAv1i64_indexed_OP2, - FMLAv2f32_OP2, - FMLAv2f32_OP1, - FMLAv2f64_OP1, - FMLAv2f64_OP2, - FMLAv2i32_indexed_OP1, - FMLAv2i32_indexed_OP2, - FMLAv2i64_indexed_OP1, - FMLAv2i64_indexed_OP2, - FMLAv4f32_OP1, - FMLAv4f32_OP2, - FMLAv4i32_indexed_OP1, - FMLAv4i32_indexed_OP2, - FMLSv1i32_indexed_OP2, - FMLSv1i64_indexed_OP2, - FMLSv2i32_indexed_OP2, - FMLSv2i64_indexed_OP2, - FMLSv2f32_OP2, - FMLSv2f64_OP2, - FMLSv4i32_indexed_OP2, - FMLSv4f32_OP2 + MULSUBXI_OP1 }; } // end namespace llvm diff --git a/llvm/include/llvm/CodeGen/SelectionDAGTargetInfo.h b/llvm/include/llvm/CodeGen/SelectionDAGTargetInfo.h index 2fe9e342dec..2e0339b92d8 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAGTargetInfo.h +++ b/llvm/include/llvm/CodeGen/SelectionDAGTargetInfo.h @@ -17,7 +17,6 @@ #define LLVM_CODEGEN_SELECTIONDAGTARGETINFO_H #include "llvm/CodeGen/SelectionDAGNodes.h" -#include "llvm/Support/CodeGen.h" namespace llvm { @@ -139,11 +138,6 @@ public: MachinePointerInfo SrcPtrInfo) const { return std::make_pair(SDValue(), SDValue()); } - // Return true when the decision to generate FMA's (or FMS, FMLA etc) rather - // than FMUL and ADD is delegated to the machine combiner. - virtual bool GenerateFMAsInMachineCombiner(CodeGenOpt::Level OptLevel) const { - return false; - } }; } // end llvm namespace diff --git a/llvm/include/llvm/Target/TargetInstrInfo.h b/llvm/include/llvm/Target/TargetInstrInfo.h index bdb3be2d6e4..955b5cfc9c5 100644 --- a/llvm/include/llvm/Target/TargetInstrInfo.h +++ b/llvm/include/llvm/Target/TargetInstrInfo.h @@ -818,11 +818,6 @@ public: MachineInstr &Root, SmallVectorImpl<MachineCombinerPattern> &Patterns) const; - /// Return true when a code sequence can improve throughput. It - /// should be called only for instructions in loops. - /// \param Pattern - combiner pattern - virtual bool isThroughputPattern(MachineCombinerPattern Pattern) const; - /// Return true if the input \P Inst is part of a chain of dependent ops /// that are suitable for reassociation, otherwise return false. /// If the instruction's operands must be commuted to have a previous diff --git a/llvm/lib/CodeGen/MachineCombiner.cpp b/llvm/lib/CodeGen/MachineCombiner.cpp index 6b5c6ba8250..44601d5e462 100644 --- a/llvm/lib/CodeGen/MachineCombiner.cpp +++ b/llvm/lib/CodeGen/MachineCombiner.cpp @@ -40,7 +40,6 @@ class MachineCombiner : public MachineFunctionPass { const TargetRegisterInfo *TRI; MCSchedModel SchedModel; MachineRegisterInfo *MRI; - MachineLoopInfo *MLI; // Current MachineLoopInfo MachineTraceMetrics *Traces; MachineTraceMetrics::Ensemble *MinInstr; @@ -87,7 +86,6 @@ char &llvm::MachineCombinerID = MachineCombiner::ID; INITIALIZE_PASS_BEGIN(MachineCombiner, "machine-combiner", "Machine InstCombiner", false, false) -INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics) INITIALIZE_PASS_END(MachineCombiner, "machine-combiner", "Machine InstCombiner", false, false) @@ -95,7 +93,6 @@ INITIALIZE_PASS_END(MachineCombiner, "machine-combiner", "Machine InstCombiner", void MachineCombiner::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesCFG(); AU.addPreserved<MachineDominatorTree>(); - AU.addRequired<MachineLoopInfo>(); AU.addPreserved<MachineLoopInfo>(); AU.addRequired<MachineTraceMetrics>(); AU.addPreserved<MachineTraceMetrics>(); @@ -357,8 +354,6 @@ bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) { DEBUG(dbgs() << "Combining MBB " << MBB->getName() << "\n"); auto BlockIter = MBB->begin(); - // Check if the block is in a loop. - const MachineLoop *ML = MLI->getLoopFor(MBB); while (BlockIter != MBB->end()) { auto &MI = *BlockIter++; @@ -411,15 +406,11 @@ bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) { if (!NewInstCount) continue; - bool SubstituteAlways = false; - if (ML && TII->isThroughputPattern(P)) - SubstituteAlways = true; - // Substitute when we optimize for codesize and the new sequence has // fewer instructions OR // the new sequence neither lengthens the critical path nor increases // resource pressure. - if (SubstituteAlways || doSubstitute(NewInstCount, OldInstCount) || + if (doSubstitute(NewInstCount, OldInstCount) || (improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs, InstrIdxForVirtReg, P) && preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs))) { @@ -456,7 +447,6 @@ bool MachineCombiner::runOnMachineFunction(MachineFunction &MF) { SchedModel = STI.getSchedModel(); TSchedModel.init(SchedModel, &STI, TII); MRI = &MF.getRegInfo(); - MLI = &getAnalysis<MachineLoopInfo>(); Traces = &getAnalysis<MachineTraceMetrics>(); MinInstr = nullptr; OptSize = MF.getFunction()->optForSize(); diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index ff3fee2b4cb..6c9800824ef 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -24,7 +24,6 @@ #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/SelectionDAGTargetInfo.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" @@ -86,7 +85,6 @@ namespace { class DAGCombiner { SelectionDAG &DAG; - const SelectionDAGTargetInfo &STI; const TargetLowering &TLI; CombineLevel Level; CodeGenOpt::Level OptLevel; @@ -471,9 +469,8 @@ namespace { public: DAGCombiner(SelectionDAG &D, AliasAnalysis &A, CodeGenOpt::Level OL) - : DAG(D), STI(D.getSelectionDAGInfo()), TLI(D.getTargetLoweringInfo()), - Level(BeforeLegalizeTypes), OptLevel(OL), LegalOperations(false), - LegalTypes(false), AA(A) { + : DAG(D), TLI(D.getTargetLoweringInfo()), Level(BeforeLegalizeTypes), + OptLevel(OL), LegalOperations(false), LegalTypes(false), AA(A) { ForCodeSize = DAG.getMachineFunction().getFunction()->optForSize(); } @@ -7718,9 +7715,6 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { if (!HasFMAD && !HasFMA) return SDValue(); - if (AllowFusion && STI.GenerateFMAsInMachineCombiner(OptLevel)) - return SDValue(); - // Always prefer FMAD to FMA for precision. unsigned PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA; bool Aggressive = TLI.enableAggressiveFMAFusion(VT); @@ -7904,9 +7898,6 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { if (!HasFMAD && !HasFMA) return SDValue(); - if (AllowFusion && STI.GenerateFMAsInMachineCombiner(OptLevel)) - return SDValue(); - // Always prefer FMAD to FMA for precision. unsigned PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA; bool Aggressive = TLI.enableAggressiveFMAFusion(VT); diff --git a/llvm/lib/CodeGen/TargetInstrInfo.cpp b/llvm/lib/CodeGen/TargetInstrInfo.cpp index 800ad6d1bb4..86517d9afbc 100644 --- a/llvm/lib/CodeGen/TargetInstrInfo.cpp +++ b/llvm/lib/CodeGen/TargetInstrInfo.cpp @@ -655,11 +655,7 @@ bool TargetInstrInfo::getMachineCombinerPatterns( return false; } -/// Return true when a code sequence can improve loop throughput. -bool -TargetInstrInfo::isThroughputPattern(MachineCombinerPattern Pattern) const { - return false; -} + /// Attempt the reassociation transformation to reduce critical path length. /// See the above comments before getMachineCombinerPatterns(). void TargetInstrInfo::reassociateOps( diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index fd137db1a0b..b0574f2de75 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -2788,75 +2788,37 @@ static bool isCombineInstrCandidate64(unsigned Opc) { return false; } // -// FP Opcodes that can be combined with a FMUL -static bool isCombineInstrCandidateFP(const MachineInstr &Inst) { - switch (Inst.getOpcode()) { - case AArch64::FADDSrr: - case AArch64::FADDDrr: - case AArch64::FADDv2f32: - case AArch64::FADDv2f64: - case AArch64::FADDv4f32: - case AArch64::FSUBSrr: - case AArch64::FSUBDrr: - case AArch64::FSUBv2f32: - case AArch64::FSUBv2f64: - case AArch64::FSUBv4f32: - return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath; - default: - break; - } - return false; -} -// // Opcodes that can be combined with a MUL static bool isCombineInstrCandidate(unsigned Opc) { return (isCombineInstrCandidate32(Opc) || isCombineInstrCandidate64(Opc)); } -// -// Utility routine that checks if \param MO is defined by an -// \param CombineOpc instruction in the basic block \param MBB -static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO, - unsigned CombineOpc, unsigned ZeroReg = 0, - bool CheckZeroReg = false) { +static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO, + unsigned MulOpc, unsigned ZeroReg) { MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); MachineInstr *MI = nullptr; - + // We need a virtual register definition. if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())) MI = MRI.getUniqueVRegDef(MO.getReg()); // And it needs to be in the trace (otherwise, it won't have a depth). - if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != CombineOpc) + if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != MulOpc) + return false; + + assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() && + MI->getOperand(1).isReg() && MI->getOperand(2).isReg() && + MI->getOperand(3).isReg() && "MAdd/MSub must have a least 4 regs"); + + // The third input reg must be zero. + if (MI->getOperand(3).getReg() != ZeroReg) return false; + // Must only used by the user we combine with. if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg())) return false; - if (CheckZeroReg) { - assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() && - MI->getOperand(1).isReg() && MI->getOperand(2).isReg() && - MI->getOperand(3).isReg() && "MAdd/MSub must have a least 4 regs"); - // The third input reg must be zero. - if (MI->getOperand(3).getReg() != ZeroReg) - return false; - } - return true; } -// -// Is \param MO defined by an integer multiply and can be combined? -static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO, - unsigned MulOpc, unsigned ZeroReg) { - return canCombine(MBB, MO, MulOpc, ZeroReg, true); -} - -// -// Is \param MO defined by a floating-point multiply and can be combined? -static bool canCombineWithFMUL(MachineBasicBlock &MBB, MachineOperand &MO, - unsigned MulOpc) { - return canCombine(MBB, MO, MulOpc); -} - // TODO: There are many more machine instruction opcodes to match: // 1. Other data types (integer, vectors) // 2. Other math / logic operations (xor, or) @@ -2990,230 +2952,7 @@ static bool getMaddPatterns(MachineInstr &Root, } return Found; } -/// Floating-Point Support -/// Find instructions that can be turned into madd. -static bool getFMAPatterns(MachineInstr &Root, - SmallVectorImpl<MachineCombinerPattern> &Patterns) { - - if (!isCombineInstrCandidateFP(Root)) - return 0; - - MachineBasicBlock &MBB = *Root.getParent(); - bool Found = false; - - switch (Root.getOpcode()) { - default: - assert(false && "Unsupported FP instruction in combiner\n"); - break; - case AArch64::FADDSrr: - assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() && - "FADDWrr does not have register operands"); - if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULSrr)) { - Patterns.push_back(MachineCombinerPattern::FMULADDS_OP1); - Found = true; - } else if (canCombineWithFMUL(MBB, Root.getOperand(1), - AArch64::FMULv1i32_indexed)) { - Patterns.push_back(MachineCombinerPattern::FMLAv1i32_indexed_OP1); - Found = true; - } - if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULSrr)) { - Patterns.push_back(MachineCombinerPattern::FMULADDS_OP2); - Found = true; - } else if (canCombineWithFMUL(MBB, Root.getOperand(2), - AArch64::FMULv1i32_indexed)) { - Patterns.push_back(MachineCombinerPattern::FMLAv1i32_indexed_OP2); - Found = true; - } - break; - case AArch64::FADDDrr: - if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULDrr)) { - Patterns.push_back(MachineCombinerPattern::FMULADDD_OP1); - Found = true; - } else if (canCombineWithFMUL(MBB, Root.getOperand(1), - AArch64::FMULv1i64_indexed)) { - Patterns.push_back(MachineCombinerPattern::FMLAv1i64_indexed_OP1); - Found = true; - } - if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULDrr)) { - Patterns.push_back(MachineCombinerPattern::FMULADDD_OP2); - Found = true; - } else if (canCombineWithFMUL(MBB, Root.getOperand(2), - AArch64::FMULv1i64_indexed)) { - Patterns.push_back(MachineCombinerPattern::FMLAv1i64_indexed_OP2); - Found = true; - } - break; - case AArch64::FADDv2f32: - if (canCombineWithFMUL(MBB, Root.getOperand(1), - AArch64::FMULv2i32_indexed)) { - Patterns.push_back(MachineCombinerPattern::FMLAv2i32_indexed_OP1); - Found = true; - } else if (canCombineWithFMUL(MBB, Root.getOperand(1), - AArch64::FMULv2f32)) { - Patterns.push_back(MachineCombinerPattern::FMLAv2f32_OP1); - Found = true; - } - if (canCombineWithFMUL(MBB, Root.getOperand(2), - AArch64::FMULv2i32_indexed)) { - Patterns.push_back(MachineCombinerPattern::FMLAv2i32_indexed_OP2); - Found = true; - } else if (canCombineWithFMUL(MBB, Root.getOperand(2), - AArch64::FMULv2f32)) { - Patterns.push_back(MachineCombinerPattern::FMLAv2f32_OP2); - Found = true; - } - break; - case AArch64::FADDv2f64: - if (canCombineWithFMUL(MBB, Root.getOperand(1), - AArch64::FMULv2i64_indexed)) { - Patterns.push_back(MachineCombinerPattern::FMLAv2i64_indexed_OP1); - Found = true; - } else if (canCombineWithFMUL(MBB, Root.getOperand(1), - AArch64::FMULv2f64)) { - Patterns.push_back(MachineCombinerPattern::FMLAv2f64_OP1); - Found = true; - } - if (canCombineWithFMUL(MBB, Root.getOperand(2), - AArch64::FMULv2i64_indexed)) { - Patterns.push_back(MachineCombinerPattern::FMLAv2i64_indexed_OP2); - Found = true; - } else if (canCombineWithFMUL(MBB, Root.getOperand(2), - AArch64::FMULv2f64)) { - Patterns.push_back(MachineCombinerPattern::FMLAv2f64_OP2); - Found = true; - } - break; - case AArch64::FADDv4f32: - if (canCombineWithFMUL(MBB, Root.getOperand(1), - AArch64::FMULv4i32_indexed)) { - Patterns.push_back(MachineCombinerPattern::FMLAv4i32_indexed_OP1); - Found = true; - } else if (canCombineWithFMUL(MBB, Root.getOperand(1), - AArch64::FMULv4f32)) { - Patterns.push_back(MachineCombinerPattern::FMLAv4f32_OP1); - Found = true; - } - if (canCombineWithFMUL(MBB, Root.getOperand(2), - AArch64::FMULv4i32_indexed)) { - Patterns.push_back(MachineCombinerPattern::FMLAv4i32_indexed_OP2); - Found = true; - } else if (canCombineWithFMUL(MBB, Root.getOperand(2), - AArch64::FMULv4f32)) { - Patterns.push_back(MachineCombinerPattern::FMLAv4f32_OP2); - Found = true; - } - break; - - case AArch64::FSUBSrr: - if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULSrr)) { - Patterns.push_back(MachineCombinerPattern::FMULSUBS_OP1); - Found = true; - } - if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULSrr)) { - Patterns.push_back(MachineCombinerPattern::FMULSUBS_OP2); - Found = true; - } else if (canCombineWithFMUL(MBB, Root.getOperand(2), - AArch64::FMULv1i32_indexed)) { - Patterns.push_back(MachineCombinerPattern::FMLSv1i32_indexed_OP2); - Found = true; - } - break; - case AArch64::FSUBDrr: - if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULDrr)) { - Patterns.push_back(MachineCombinerPattern::FMULSUBD_OP1); - Found = true; - } - if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULDrr)) { - Patterns.push_back(MachineCombinerPattern::FMULSUBD_OP2); - Found = true; - } else if (canCombineWithFMUL(MBB, Root.getOperand(2), - AArch64::FMULv1i64_indexed)) { - Patterns.push_back(MachineCombinerPattern::FMLSv1i64_indexed_OP2); - Found = true; - } - break; - case AArch64::FSUBv2f32: - if (canCombineWithFMUL(MBB, Root.getOperand(2), - AArch64::FMULv2i32_indexed)) { - Patterns.push_back(MachineCombinerPattern::FMLSv2i32_indexed_OP2); - Found = true; - } else if (canCombineWithFMUL(MBB, Root.getOperand(2), - AArch64::FMULv2f32)) { - Patterns.push_back(MachineCombinerPattern::FMLSv2f32_OP2); - Found = true; - } - break; - case AArch64::FSUBv2f64: - if (canCombineWithFMUL(MBB, Root.getOperand(2), - AArch64::FMULv2i64_indexed)) { - Patterns.push_back(MachineCombinerPattern::FMLSv2i64_indexed_OP2); - Found = true; - } else if (canCombineWithFMUL(MBB, Root.getOperand(2), - AArch64::FMULv2f64)) { - Patterns.push_back(MachineCombinerPattern::FMLSv2f64_OP2); - Found = true; - } - break; - case AArch64::FSUBv4f32: - if (canCombineWithFMUL(MBB, Root.getOperand(2), - AArch64::FMULv4i32_indexed)) { - Patterns.push_back(MachineCombinerPattern::FMLSv4i32_indexed_OP2); - Found = true; - } else if (canCombineWithFMUL(MBB, Root.getOperand(2), - AArch64::FMULv4f32)) { - Patterns.push_back(MachineCombinerPattern::FMLSv4f32_OP2); - Found = true; - } - break; - } - return Found; -} - -/// Return true when a code sequence can improve throughput. It -/// should be called only for instructions in loops. -/// \param Pattern - combiner pattern -bool -AArch64InstrInfo::isThroughputPattern(MachineCombinerPattern Pattern) const { - switch (Pattern) { - default: - break; - case MachineCombinerPattern::FMULADDS_OP1: - case MachineCombinerPattern::FMULADDS_OP2: - case MachineCombinerPattern::FMULSUBS_OP1: - case MachineCombinerPattern::FMULSUBS_OP2: - case MachineCombinerPattern::FMULADDD_OP1: - case MachineCombinerPattern::FMULADDD_OP2: - case MachineCombinerPattern::FMULSUBD_OP1: - case MachineCombinerPattern::FMULSUBD_OP2: - case MachineCombinerPattern::FMLAv1i32_indexed_OP1: - case MachineCombinerPattern::FMLAv1i32_indexed_OP2: - case MachineCombinerPattern::FMLAv1i64_indexed_OP1: - case MachineCombinerPattern::FMLAv1i64_indexed_OP2: - case MachineCombinerPattern::FMLAv2f32_OP2: - case MachineCombinerPattern::FMLAv2f32_OP1: - case MachineCombinerPattern::FMLAv2f64_OP1: - case MachineCombinerPattern::FMLAv2f64_OP2: - case MachineCombinerPattern::FMLAv2i32_indexed_OP1: - case MachineCombinerPattern::FMLAv2i32_indexed_OP2: - case MachineCombinerPattern::FMLAv2i64_indexed_OP1: - case MachineCombinerPattern::FMLAv2i64_indexed_OP2: - case MachineCombinerPattern::FMLAv4f32_OP1: - case MachineCombinerPattern::FMLAv4f32_OP2: - case MachineCombinerPattern::FMLAv4i32_indexed_OP1: - case MachineCombinerPattern::FMLAv4i32_indexed_OP2: - case MachineCombinerPattern::FMLSv1i32_indexed_OP2: - case MachineCombinerPattern::FMLSv1i64_indexed_OP2: - case MachineCombinerPattern::FMLSv2i32_indexed_OP2: - case MachineCombinerPattern::FMLSv2i64_indexed_OP2: - case MachineCombinerPattern::FMLSv2f32_OP2: - case MachineCombinerPattern::FMLSv2f64_OP2: - case MachineCombinerPattern::FMLSv4i32_indexed_OP2: - case MachineCombinerPattern::FMLSv4f32_OP2: - return true; - } // end switch (Pattern) - return false; -} /// Return true when there is potentially a faster code sequence for an /// instruction chain ending in \p Root. All potential patterns are listed in /// the \p Pattern vector. Pattern should be sorted in priority order since the @@ -3222,35 +2961,28 @@ AArch64InstrInfo::isThroughputPattern(MachineCombinerPattern Pattern) const { bool AArch64InstrInfo::getMachineCombinerPatterns( MachineInstr &Root, SmallVectorImpl<MachineCombinerPattern> &Patterns) const { - // Integer patterns if (getMaddPatterns(Root, Patterns)) return true; - // Floating point patterns - if (getFMAPatterns(Root, Patterns)) - return true; return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns); } -enum class FMAInstKind { Default, Indexed, Accumulator }; -/// genFusedMultiply - Generate fused multiply instructions. -/// This function supports both integer and floating point instructions. -/// A typical example: -/// F|MUL I=A,B,0 -/// F|ADD R,I,C -/// ==> F|MADD R,A,B,C -/// \param Root is the F|ADD instruction +/// genMadd - Generate madd instruction and combine mul and add. +/// Example: +/// MUL I=A,B,0 +/// ADD R,I,C +/// ==> MADD R,A,B,C +/// \param Root is the ADD instruction /// \param [out] InsInstrs is a vector of machine instructions and will /// contain the generated madd instruction /// \param IdxMulOpd is index of operand in Root that is the result of -/// the F|MUL. In the example above IdxMulOpd is 1. -/// \param MaddOpc the opcode fo the f|madd instruction -static MachineInstr * -genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI, - const TargetInstrInfo *TII, MachineInstr &Root, - SmallVectorImpl<MachineInstr *> &InsInstrs, unsigned IdxMulOpd, - unsigned MaddOpc, const TargetRegisterClass *RC, - FMAInstKind kind = FMAInstKind::Default) { +/// the MUL. In the example above IdxMulOpd is 1. +/// \param MaddOpc the opcode fo the madd instruction +static MachineInstr *genMadd(MachineFunction &MF, MachineRegisterInfo &MRI, + const TargetInstrInfo *TII, MachineInstr &Root, + SmallVectorImpl<MachineInstr *> &InsInstrs, + unsigned IdxMulOpd, unsigned MaddOpc, + const TargetRegisterClass *RC) { assert(IdxMulOpd == 1 || IdxMulOpd == 2); unsigned IdxOtherOpd = IdxMulOpd == 1 ? 2 : 1; @@ -3272,26 +3004,12 @@ genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI, if (TargetRegisterInfo::isVirtualRegister(SrcReg2)) MRI.constrainRegClass(SrcReg2, RC); - MachineInstrBuilder MIB; - if (kind == FMAInstKind::Default) - MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg) - .addReg(SrcReg0, getKillRegState(Src0IsKill)) - .addReg(SrcReg1, getKillRegState(Src1IsKill)) - .addReg(SrcReg2, getKillRegState(Src2IsKill)); - else if (kind == FMAInstKind::Indexed) - MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg) - .addReg(SrcReg2, getKillRegState(Src2IsKill)) - .addReg(SrcReg0, getKillRegState(Src0IsKill)) - .addReg(SrcReg1, getKillRegState(Src1IsKill)) - .addImm(MUL->getOperand(3).getImm()); - else if (kind == FMAInstKind::Accumulator) - MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg) - .addReg(SrcReg2, getKillRegState(Src2IsKill)) - .addReg(SrcReg0, getKillRegState(Src0IsKill)) - .addReg(SrcReg1, getKillRegState(Src1IsKill)); - else - assert(false && "Invalid FMA instruction kind \n"); - // Insert the MADD (MADD, FMA, FMS, FMLA, FMSL) + MachineInstrBuilder MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), + ResultReg) + .addReg(SrcReg0, getKillRegState(Src0IsKill)) + .addReg(SrcReg1, getKillRegState(Src1IsKill)) + .addReg(SrcReg2, getKillRegState(Src2IsKill)); + // Insert the MADD InsInstrs.push_back(MIB); return MUL; } @@ -3379,7 +3097,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence( Opc = AArch64::MADDXrrr; RC = &AArch64::GPR64RegClass; } - MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); + MUL = genMadd(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); break; case MachineCombinerPattern::MULADDW_OP2: case MachineCombinerPattern::MULADDX_OP2: @@ -3394,7 +3112,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence( Opc = AArch64::MADDXrrr; RC = &AArch64::GPR64RegClass; } - MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); + MUL = genMadd(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); break; case MachineCombinerPattern::MULADDWI_OP1: case MachineCombinerPattern::MULADDXI_OP1: { @@ -3486,7 +3204,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence( Opc = AArch64::MSUBXrrr; RC = &AArch64::GPR64RegClass; } - MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); + MUL = genMadd(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); break; case MachineCombinerPattern::MULSUBWI_OP1: case MachineCombinerPattern::MULSUBXI_OP1: { @@ -3531,234 +3249,6 @@ void AArch64InstrInfo::genAlternativeCodeSequence( } break; } - // Floating Point Support - case MachineCombinerPattern::FMULADDS_OP1: - case MachineCombinerPattern::FMULADDD_OP1: - // MUL I=A,B,0 - // ADD R,I,C - // ==> MADD R,A,B,C - // --- Create(MADD); - if (Pattern == MachineCombinerPattern::FMULADDS_OP1) { - Opc = AArch64::FMADDSrrr; - RC = &AArch64::FPR32RegClass; - } else { - Opc = AArch64::FMADDDrrr; - RC = &AArch64::FPR64RegClass; - } - MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); - break; - case MachineCombinerPattern::FMULADDS_OP2: - case MachineCombinerPattern::FMULADDD_OP2: - // FMUL I=A,B,0 - // FADD R,C,I - // ==> FMADD R,A,B,C - // --- Create(FMADD); - if (Pattern == MachineCombinerPattern::FMULADDS_OP2) { - Opc = AArch64::FMADDSrrr; - RC = &AArch64::FPR32RegClass; - } else { - Opc = AArch64::FMADDDrrr; - RC = &AArch64::FPR64RegClass; - } - MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); - break; - - case MachineCombinerPattern::FMLAv1i32_indexed_OP1: - Opc = AArch64::FMLAv1i32_indexed; - RC = &AArch64::FPR32RegClass; - MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, - FMAInstKind::Indexed); - break; - case MachineCombinerPattern::FMLAv1i32_indexed_OP2: - Opc = AArch64::FMLAv1i32_indexed; - RC = &AArch64::FPR32RegClass; - MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, - FMAInstKind::Indexed); - break; - - case MachineCombinerPattern::FMLAv1i64_indexed_OP1: - Opc = AArch64::FMLAv1i64_indexed; - RC = &AArch64::FPR64RegClass; - MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, - FMAInstKind::Indexed); - break; - case MachineCombinerPattern::FMLAv1i64_indexed_OP2: - Opc = AArch64::FMLAv1i64_indexed; - RC = &AArch64::FPR64RegClass; - MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, - FMAInstKind::Indexed); - break; - - case MachineCombinerPattern::FMLAv2i32_indexed_OP1: - case MachineCombinerPattern::FMLAv2f32_OP1: - RC = &AArch64::FPR64RegClass; - if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP1) { - Opc = AArch64::FMLAv2i32_indexed; - MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, - FMAInstKind::Indexed); - } else { - Opc = AArch64::FMLAv2f32; - MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, - FMAInstKind::Accumulator); - } - break; - case MachineCombinerPattern::FMLAv2i32_indexed_OP2: - case MachineCombinerPattern::FMLAv2f32_OP2: - RC = &AArch64::FPR64RegClass; - if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP2) { - Opc = AArch64::FMLAv2i32_indexed; - MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, - FMAInstKind::Indexed); - } else { - Opc = AArch64::FMLAv2f32; - MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, - FMAInstKind::Accumulator); - } - break; - - case MachineCombinerPattern::FMLAv2i64_indexed_OP1: - case MachineCombinerPattern::FMLAv2f64_OP1: - RC = &AArch64::FPR128RegClass; - if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP1) { - Opc = AArch64::FMLAv2i64_indexed; - MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, - FMAInstKind::Indexed); - } else { - Opc = AArch64::FMLAv2f64; - MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, - FMAInstKind::Accumulator); - } - break; - case MachineCombinerPattern::FMLAv2i64_indexed_OP2: - case MachineCombinerPattern::FMLAv2f64_OP2: - RC = &AArch64::FPR128RegClass; - if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP2) { - Opc = AArch64::FMLAv2i64_indexed; - MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, - FMAInstKind::Indexed); - } else { - Opc = AArch64::FMLAv2f64; - MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, - FMAInstKind::Accumulator); - } - break; - - case MachineCombinerPattern::FMLAv4i32_indexed_OP1: - case MachineCombinerPattern::FMLAv4f32_OP1: - RC = &AArch64::FPR128RegClass; - if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP1) { - Opc = AArch64::FMLAv4i32_indexed; - MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, - FMAInstKind::Indexed); - } else { - Opc = AArch64::FMLAv4f32; - MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, - FMAInstKind::Accumulator); - } - break; - - case MachineCombinerPattern::FMLAv4i32_indexed_OP2: - case MachineCombinerPattern::FMLAv4f32_OP2: - RC = &AArch64::FPR128RegClass; - if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP2) { - Opc = AArch64::FMLAv4i32_indexed; - MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, - FMAInstKind::Indexed); - } else { - Opc = AArch64::FMLAv4f32; - MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, - FMAInstKind::Accumulator); - } - break; - - case MachineCombinerPattern::FMULSUBS_OP1: - case MachineCombinerPattern::FMULSUBD_OP1: { - // FMUL I=A,B,0 - // FSUB R,I,C - // ==> FNMSUB R,A,B,C // = -C + A*B - // --- Create(FNMSUB); - if (Pattern == MachineCombinerPattern::FMULSUBS_OP1) { - Opc = AArch64::FNMSUBSrrr; - RC = &AArch64::FPR32RegClass; - } else { - Opc = AArch64::FNMSUBDrrr; - RC = &AArch64::FPR64RegClass; - } - MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); - break; - } - case MachineCombinerPattern::FMULSUBS_OP2: - case MachineCombinerPattern::FMULSUBD_OP2: { - // FMUL I=A,B,0 - // FSUB R,C,I - // ==> FMSUB R,A,B,C (computes C - A*B) - // --- Create(FMSUB); - if (Pattern == MachineCombinerPattern::FMULSUBS_OP2) { - Opc = AArch64::FMSUBSrrr; - RC = &AArch64::FPR32RegClass; - } else { - Opc = AArch64::FMSUBDrrr; - RC = &AArch64::FPR64RegClass; - } - MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); - break; - - case MachineCombinerPattern::FMLSv1i32_indexed_OP2: - Opc = AArch64::FMLSv1i32_indexed; - RC = &AArch64::FPR32RegClass; - MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, - FMAInstKind::Indexed); - break; - - case MachineCombinerPattern::FMLSv1i64_indexed_OP2: - Opc = AArch64::FMLSv1i64_indexed; - RC = &AArch64::FPR64RegClass; - MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, - FMAInstKind::Indexed); - break; - - case MachineCombinerPattern::FMLSv2f32_OP2: - case MachineCombinerPattern::FMLSv2i32_indexed_OP2: - RC = &AArch64::FPR64RegClass; - if (Pattern == MachineCombinerPattern::FMLSv2i32_indexed_OP2) { - Opc = AArch64::FMLSv2i32_indexed; - MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, - FMAInstKind::Indexed); - } else { - Opc = AArch64::FMLSv2f32; - MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, - FMAInstKind::Accumulator); - } - break; - - case MachineCombinerPattern::FMLSv2f64_OP2: - case MachineCombinerPattern::FMLSv2i64_indexed_OP2: - RC = &AArch64::FPR128RegClass; - if (Pattern == MachineCombinerPattern::FMLSv2i64_indexed_OP2) { - Opc = AArch64::FMLSv2i64_indexed; - MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, - FMAInstKind::Indexed); - } else { - Opc = AArch64::FMLSv2f64; - MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, - FMAInstKind::Accumulator); - } - break; - - case MachineCombinerPattern::FMLSv4f32_OP2: - case MachineCombinerPattern::FMLSv4i32_indexed_OP2: - RC = &AArch64::FPR128RegClass; - if (Pattern == MachineCombinerPattern::FMLSv4i32_indexed_OP2) { - Opc = AArch64::FMLSv4i32_indexed; - MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, - FMAInstKind::Indexed); - } else { - Opc = AArch64::FMLSv4f32; - MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, - FMAInstKind::Accumulator); - } - break; - } } // end switch (Pattern) // Record MUL and ADD/SUB for deletion DelInstrs.push_back(MUL); diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h index 353ef735dac..a592f91dd4e 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h @@ -174,11 +174,6 @@ public: unsigned SrcReg2, int CmpMask, int CmpValue, const MachineRegisterInfo *MRI) const override; bool optimizeCondBranch(MachineInstr *MI) const override; - - /// Return true when a code sequence can improve throughput. It - /// should be called only for instructions in loops. - /// \param Pattern - combiner pattern - bool isThroughputPattern(MachineCombinerPattern Pattern) const override; /// Return true when there is potentially a faster code sequence /// for an instruction chain ending in <Root>. All potential patterns are /// listed in the <Patterns> array. diff --git a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp index 4e4aaf8e553..f40293021d7 100644 --- a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp @@ -51,9 +51,3 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset( } return SDValue(); } -bool AArch64SelectionDAGInfo::GenerateFMAsInMachineCombiner( - CodeGenOpt::Level OptLevel) const { - if (OptLevel >= CodeGenOpt::Aggressive) - return true; - return false; -} diff --git a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h index e61f177f2ef..8adb030555a 100644 --- a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h +++ b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h @@ -25,7 +25,6 @@ public: SDValue Dst, SDValue Src, SDValue Size, unsigned Align, bool isVolatile, MachinePointerInfo DstPtrInfo) const override; - bool GenerateFMAsInMachineCombiner(CodeGenOpt::Level OptLevel) const override; }; } diff --git a/llvm/test/CodeGen/AArch64/arm64-fma-combines.ll b/llvm/test/CodeGen/AArch64/arm64-fma-combines.ll deleted file mode 100644 index ab875c06cc6..00000000000 --- a/llvm/test/CodeGen/AArch64/arm64-fma-combines.ll +++ /dev/null @@ -1,136 +0,0 @@ -; RUN: llc < %s -O=3 -mtriple=arm64-apple-ios -mcpu=cyclone -enable-unsafe-fp-math | FileCheck %s -define void @foo_2d(double* %src) { -; CHECK-LABEL: %entry -; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} -; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} -entry: - %arrayidx1 = getelementptr inbounds double, double* %src, i64 5 - %arrayidx2 = getelementptr inbounds double, double* %src, i64 11 - %tmp = bitcast double* %arrayidx1 to <2 x double>* - %tmp1 = load double, double* %arrayidx2, align 8 - %tmp2 = load double, double* %arrayidx1, align 8 - %fmul = fmul fast double %tmp1, %tmp1 - %fmul2 = fmul fast double %tmp2, 0x3F94AFD6A052BF5B - %fadd = fadd fast double %fmul, %fmul2 - br label %for.body - -; CHECK-LABEL: %for.body -; CHECK: fmla.2d {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -; CHECK: fmla.2d {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[0] -; CHECK: fmla.d {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}[0] -for.body: ; preds = %for.body, %entry - %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %arrayidx3 = getelementptr inbounds double, double* %src, i64 %indvars.iv.next - %tmp3 = load double, double* %arrayidx3, align 8 - %add = fadd fast double %tmp3, %tmp3 - %mul = fmul fast double %add, %fadd - %e1 = insertelement <2 x double> undef, double %add, i32 0 - %e2 = insertelement <2 x double> %e1, double %add, i32 1 - %add2 = fadd fast <2 x double> %e2, <double 3.000000e+00, double -3.000000e+00> - %e3 = insertelement <2 x double> undef, double %mul, i32 0 - %e4 = insertelement <2 x double> %e3, double %mul, i32 1 - %mul2 = fmul fast <2 x double> %add2,<double 3.000000e+00, double -3.000000e+00> - %e5 = insertelement <2 x double> undef, double %add, i32 0 - %e6 = insertelement <2 x double> %e5, double %add, i32 1 - %add3 = fadd fast <2 x double> %mul2, <double 3.000000e+00, double -3.000000e+00> - %mulx = fmul fast <2 x double> %add2, %e2 - %addx = fadd fast <2 x double> %mulx, %e4 - %e7 = insertelement <2 x double> undef, double %mul, i32 0 - %e8 = insertelement <2 x double> %e7, double %mul, i32 1 - %e9 = fmul fast <2 x double> %addx, %add3 - store <2 x double> %e9, <2 x double>* %tmp, align 8 - %e10 = extractelement <2 x double> %add3, i32 0 - %mul3 = fmul fast double %mul, %e10 - %add4 = fadd fast double %mul3, %mul - store double %add4, double* %arrayidx2, align 8 - %exitcond = icmp eq i64 %indvars.iv.next, 25 - br i1 %exitcond, label %for.end, label %for.body - -for.end: ; preds = %for.body - ret void -} -define void @foo_2s(float* %src) { -entry: - %arrayidx1 = getelementptr inbounds float, float* %src, i64 5 - %arrayidx2 = getelementptr inbounds float, float* %src, i64 11 - %tmp = bitcast float* %arrayidx1 to <2 x float>* - br label %for.body - -; CHECK-LABEL: %for.body -; CHECK: fmla.2s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -; CHECK: fmla.2s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[0] -; CHECK: fmla.s {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}[0] -for.body: ; preds = %for.body, %entry - %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %arrayidx3 = getelementptr inbounds float, float* %src, i64 %indvars.iv.next - %tmp1 = load float, float* %arrayidx3, align 8 - %add = fadd fast float %tmp1, %tmp1 - %mul = fmul fast float %add, %add - %e1 = insertelement <2 x float> undef, float %add, i32 0 - %e2 = insertelement <2 x float> %e1, float %add, i32 1 - %add2 = fadd fast <2 x float> %e2, <float 3.000000e+00, float -3.000000e+00> - %e3 = insertelement <2 x float> undef, float %mul, i32 0 - %e4 = insertelement <2 x float> %e3, float %mul, i32 1 - %mul2 = fmul fast <2 x float> %add2,<float 3.000000e+00, float -3.000000e+00> - %e5 = insertelement <2 x float> undef, float %add, i32 0 - %e6 = insertelement <2 x float> %e5, float %add, i32 1 - %add3 = fadd fast <2 x float> %mul2, <float 3.000000e+00, float -3.000000e+00> - %mulx = fmul fast <2 x float> %add2, %e2 - %addx = fadd fast <2 x float> %mulx, %e4 - %e7 = insertelement <2 x float> undef, float %mul, i32 0 - %e8 = insertelement <2 x float> %e7, float %mul, i32 1 - %e9 = fmul fast <2 x float> %addx, %add3 - store <2 x float> %e9, <2 x float>* %tmp, align 8 - %e10 = extractelement <2 x float> %add3, i32 0 - %mul3 = fmul fast float %mul, %e10 - %add4 = fadd fast float %mul3, %mul - store float %add4, float* %arrayidx2, align 8 - %exitcond = icmp eq i64 %indvars.iv.next, 25 - br i1 %exitcond, label %for.end, label %for.body - -for.end: ; preds = %for.body - ret void -} -define void @foo_4s(float* %src) { -entry: - %arrayidx1 = getelementptr inbounds float, float* %src, i64 5 - %arrayidx2 = getelementptr inbounds float, float* %src, i64 11 - %tmp = bitcast float* %arrayidx1 to <4 x float>* - br label %for.body - -; CHECK-LABEL: %for.body -; CHECK: fmla.4s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -; CHECK: fmla.4s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[0] -for.body: ; preds = %for.body, %entry - %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %arrayidx3 = getelementptr inbounds float, float* %src, i64 %indvars.iv.next - %tmp1 = load float, float* %arrayidx3, align 8 - %add = fadd fast float %tmp1, %tmp1 - %mul = fmul fast float %add, %add - %e1 = insertelement <4 x float> undef, float %add, i32 0 - %e2 = insertelement <4 x float> %e1, float %add, i32 1 - %add2 = fadd fast <4 x float> %e2, <float 3.000000e+00, float -3.000000e+00, float 5.000000e+00, float 7.000000e+00> - %e3 = insertelement <4 x float> undef, float %mul, i32 0 - %e4 = insertelement <4 x float> %e3, float %mul, i32 1 - %mul2 = fmul fast <4 x float> %add2,<float 3.000000e+00, float -3.000000e+00, float 5.000000e+00, float 7.000000e+00> - %e5 = insertelement <4 x float> undef, float %add, i32 0 - %e6 = insertelement <4 x float> %e5, float %add, i32 1 - %add3 = fadd fast <4 x float> %mul2, <float 3.000000e+00, float -3.000000e+00, float 5.000000e+00, float 7.000000e+00> - %mulx = fmul fast <4 x float> %add2, %e2 - %addx = fadd fast <4 x float> %mulx, %e4 - %e7 = insertelement <4 x float> undef, float %mul, i32 0 - %e8 = insertelement <4 x float> %e7, float %mul, i32 1 - %e9 = fmul fast <4 x float> %addx, %add3 - store <4 x float> %e9, <4 x float>* %tmp, align 8 - %e10 = extractelement <4 x float> %add3, i32 0 - %mul3 = fmul fast float %mul, %e10 - store float %mul3, float* %arrayidx2, align 8 - %exitcond = icmp eq i64 %indvars.iv.next, 25 - br i1 %exitcond, label %for.end, label %for.body - -for.end: ; preds = %for.body - ret void -} diff --git a/llvm/test/CodeGen/AArch64/arm64-fml-combines.ll b/llvm/test/CodeGen/AArch64/arm64-fml-combines.ll deleted file mode 100644 index 840d1dcbf06..00000000000 --- a/llvm/test/CodeGen/AArch64/arm64-fml-combines.ll +++ /dev/null @@ -1,128 +0,0 @@ -; RUN: llc < %s -O=3 -mtriple=arm64-apple-ios -mcpu=cyclone -enable-unsafe-fp-math | FileCheck %s -define void @foo_2d(double* %src) { -entry: - %arrayidx1 = getelementptr inbounds double, double* %src, i64 5 - %arrayidx2 = getelementptr inbounds double, double* %src, i64 11 - %tmp = bitcast double* %arrayidx1 to <2 x double>* - br label %for.body - -; CHECK-LABEL: %for.body -; CHECK: fmls.2d {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -; CHECK: fmls.2d {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[0] -; CHECK: fmls.d {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}[0] -for.body: ; preds = %for.body, %entry - %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - %indvars.iv.next = sub nuw nsw i64 %indvars.iv, 1 - %arrayidx3 = getelementptr inbounds double, double* %src, i64 %indvars.iv.next - %tmp1 = load double, double* %arrayidx3, align 8 - %add = fadd fast double %tmp1, %tmp1 - %mul = fmul fast double %add, %add - %e1 = insertelement <2 x double> undef, double %add, i32 0 - %e2 = insertelement <2 x double> %e1, double %add, i32 1 - %sub2 = fsub fast <2 x double> %e2, <double 3.000000e+00, double -3.000000e+00> - %e3 = insertelement <2 x double> undef, double %mul, i32 0 - %e4 = insertelement <2 x double> %e3, double %mul, i32 1 - %mul2 = fmul fast <2 x double> %sub2,<double 3.000000e+00, double -3.000000e+00> - %e5 = insertelement <2 x double> undef, double %add, i32 0 - %e6 = insertelement <2 x double> %e5, double %add, i32 1 - %sub3 = fsub fast <2 x double> <double 3.000000e+00, double -3.000000e+00>, %mul2 - %mulx = fmul fast <2 x double> %sub2, %e2 - %subx = fsub fast <2 x double> %e4, %mulx - %e7 = insertelement <2 x double> undef, double %mul, i32 0 - %e8 = insertelement <2 x double> %e7, double %mul, i32 1 - %e9 = fmul fast <2 x double> %subx, %sub3 - store <2 x double> %e9, <2 x double>* %tmp, align 8 - %e10 = extractelement <2 x double> %sub3, i32 0 - %mul3 = fmul fast double %mul, %e10 - %sub4 = fsub fast double %mul, %mul3 - store double %sub4, double* %arrayidx2, align 8 - %exitcond = icmp eq i64 %indvars.iv.next, 25 - br i1 %exitcond, label %for.end, label %for.body - -for.end: ; preds = %for.body - ret void -} -define void @foo_2s(float* %src) { -entry: - %arrayidx1 = getelementptr inbounds float, float* %src, i64 5 - %arrayidx2 = getelementptr inbounds float, float* %src, i64 11 - %tmp = bitcast float* %arrayidx1 to <2 x float>* - br label %for.body - -; CHECK-LABEL: %for.body -; CHECK: fmls.2s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -; CHECK: fmls.2s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[0] -; CHECK: fmls.s {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}[0] -for.body: ; preds = %for.body, %entry - %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %arrayidx3 = getelementptr inbounds float, float* %src, i64 %indvars.iv.next - %tmp1 = load float, float* %arrayidx3, align 8 - %add = fadd fast float %tmp1, %tmp1 - %mul = fmul fast float %add, %add - %e1 = insertelement <2 x float> undef, float %add, i32 0 - %e2 = insertelement <2 x float> %e1, float %add, i32 1 - %add2 = fsub fast <2 x float> %e2, <float 3.000000e+00, float -3.000000e+00> - %e3 = insertelement <2 x float> undef, float %mul, i32 0 - %e4 = insertelement <2 x float> %e3, float %mul, i32 1 - %mul2 = fmul fast <2 x float> %add2,<float 3.000000e+00, float -3.000000e+00> - %e5 = insertelement <2 x float> undef, float %add, i32 0 - %e6 = insertelement <2 x float> %e5, float %add, i32 1 - %add3 = fsub fast <2 x float> <float 3.000000e+00, float -3.000000e+00>, %mul2 - %mulx = fmul fast <2 x float> %add2, %e2 - %addx = fsub fast <2 x float> %e4, %mulx - %e7 = insertelement <2 x float> undef, float %mul, i32 0 - %e8 = insertelement <2 x float> %e7, float %mul, i32 1 - %e9 = fmul fast <2 x float> %addx, %add3 - store <2 x float> %e9, <2 x float>* %tmp, align 8 - %e10 = extractelement <2 x float> %add3, i32 0 - %mul3 = fmul fast float %mul, %e10 - %add4 = fsub fast float %mul, %mul3 - store float %add4, float* %arrayidx2, align 8 - %exitcond = icmp eq i64 %indvars.iv.next, 25 - br i1 %exitcond, label %for.end, label %for.body - -for.end: ; preds = %for.body - ret void -} -define void @foo_4s(float* %src) { -entry: - %arrayidx1 = getelementptr inbounds float, float* %src, i64 5 - %arrayidx2 = getelementptr inbounds float, float* %src, i64 11 - %tmp = bitcast float* %arrayidx1 to <4 x float>* - br label %for.body - -; CHECK-LABEL: %for.body -; CHECK: fmls.4s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -; CHECK: fmls.4s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[0] -for.body: ; preds = %for.body, %entry - %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %arrayidx3 = getelementptr inbounds float, float* %src, i64 %indvars.iv.next - %tmp1 = load float, float* %arrayidx3, align 8 - %add = fadd fast float %tmp1, %tmp1 - %mul = fmul fast float %add, %add - %e1 = insertelement <4 x float> undef, float %add, i32 0 - %e2 = insertelement <4 x float> %e1, float %add, i32 1 - %add2 = fadd fast <4 x float> %e2, <float 3.000000e+00, float -3.000000e+00, float 5.000000e+00, float 7.000000e+00> - %e3 = insertelement <4 x float> undef, float %mul, i32 0 - %e4 = insertelement <4 x float> %e3, float %mul, i32 1 - %mul2 = fmul fast <4 x float> %add2,<float 3.000000e+00, float -3.000000e+00, float 5.000000e+00, float 7.000000e+00> - %e5 = insertelement <4 x float> undef, float %add, i32 0 - %e6 = insertelement <4 x float> %e5, float %add, i32 1 - %add3 = fsub fast <4 x float> <float 3.000000e+00, float -3.000000e+00, float 5.000000e+00, float 7.000000e+00> , %mul2 - %mulx = fmul fast <4 x float> %add2, %e2 - %addx = fsub fast <4 x float> %e4, %mulx - %e7 = insertelement <4 x float> undef, float %mul, i32 0 - %e8 = insertelement <4 x float> %e7, float %mul, i32 1 - %e9 = fmul fast <4 x float> %addx, %add3 - store <4 x float> %e9, <4 x float>* %tmp, align 8 - %e10 = extractelement <4 x float> %add3, i32 0 - %mul3 = fmul fast float %mul, %e10 - store float %mul3, float* %arrayidx2, align 8 - %exitcond = icmp eq i64 %indvars.iv.next, 25 - br i1 %exitcond, label %for.end, label %for.body - -for.end: ; preds = %for.body - ret void -} |