Diffstat (limited to 'llvm/lib/Target/AArch64/AArch64VectorByElementOpt.cpp')
-rw-r--r--  llvm/lib/Target/AArch64/AArch64VectorByElementOpt.cpp  371
1 file changed, 371 insertions, 0 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64VectorByElementOpt.cpp b/llvm/lib/Target/AArch64/AArch64VectorByElementOpt.cpp
new file mode 100644
index 00000000000..e3b1d7cea48
--- /dev/null
+++ b/llvm/lib/Target/AArch64/AArch64VectorByElementOpt.cpp
@@ -0,0 +1,371 @@
+//=- AArch64VectorByElementOpt.cpp - AArch64 vector by element inst opt pass =//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that performs optimization for vector by element
+// SIMD instructions.
+//
+// Certain SIMD instructions with vector element operand are not efficient.
+// Rewrite them into SIMD instructions with vector operands. This rewrite
+// is driven by the latency of the instructions.
+//
+// Example:
+// fmla v0.4s, v1.4s, v2.s[1]
+// is rewritten into
+// dup v3.4s, v2.s[1]
+// fmla v0.4s, v1.4s, v3.4s
+//===----------------------------------------------------------------------===//
+
+#include "AArch64InstrInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetSchedule.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-vectorbyelement-opt"
+
+STATISTIC(NumModifiedInstr,
+ "Number of vector by element instructions modified");
+
+#define AARCH64_VECTOR_BY_ELEMENT_OPT_NAME \
+ "AArch64 vector by element instruction optimization pass"
+
+namespace {
+
+struct AArch64VectorByElementOpt : public MachineFunctionPass {
+ static char ID;
+ AArch64VectorByElementOpt() : MachineFunctionPass(ID) {
+ initializeAArch64VectorByElementOptPass(*PassRegistry::getPassRegistry());
+ }
+
+ const TargetInstrInfo *TII;
+ MachineRegisterInfo *MRI;
+ TargetSchedModel SchedModel;
+
+ /// Based only on latency of instructions, determine if it is cost efficient
+ /// to replace the instruction InstDesc by the two instructions InstDescRep1
+ /// and InstDescRep2.
+ /// Return true if replacement is recommended.
+ bool
+ shouldReplaceInstruction(MachineFunction *MF, const MCInstrDesc *InstDesc,
+ const MCInstrDesc *InstDescRep1,
+ const MCInstrDesc *InstDescRep2,
+ std::map<unsigned, bool> &VecInstElemTable) const;
+
+ /// Determine if we need to exit the vector by element instruction
+ /// optimization pass early. This makes sure that Targets with no need
+ /// for this optimization do not spent any compile time on this pass.
+ /// This check is done by comparing the latency of an indexed FMLA
+ /// instruction to the latency of the DUP + the latency of a vector
+ /// FMLA instruction. We do not check on other related instructions such
+ /// as FMLS as we assume that if the situation shows up for one
+ /// instruction, then it is likely to show up for the related ones.
+ /// Return true if early exit of the pass is recommended.
+ bool earlyExitVectElement(MachineFunction *MF);
+
+ /// Check whether an equivalent DUP instruction has already been
+ /// created or not.
+ /// Return true when the dup instruction already exists. In this case,
+ /// DestReg will point to the destination of the already created DUP.
+ bool reuseDUP(MachineInstr &MI, unsigned DupOpcode, unsigned SrcReg,
+ unsigned LaneNumber, unsigned *DestReg) const;
+
+ /// Certain SIMD instructions with vector element operand are not efficient.
+ /// Rewrite them into SIMD instructions with vector operands. This rewrite
+ /// is driven by the latency of the instructions.
+ /// Return true if the SIMD instruction is modified.
+ bool optimizeVectElement(MachineInstr &MI,
+ std::map<unsigned, bool> *VecInstElemTable) const;
+
+ bool runOnMachineFunction(MachineFunction &Fn) override;
+
+ StringRef getPassName() const override {
+ return AARCH64_VECTOR_BY_ELEMENT_OPT_NAME;
+ }
+};
+char AArch64VectorByElementOpt::ID = 0;
+} // namespace
+
+INITIALIZE_PASS(AArch64VectorByElementOpt, "aarch64-vectorbyelement-opt",
+ AARCH64_VECTOR_BY_ELEMENT_OPT_NAME, false, false)
+
+/// Based only on latency of instructions, determine if it is cost efficient
+/// to replace the instruction InstDesc by the two instructions InstDescRep1
+/// and InstDescRep2. Note that it is assumed in this function that an
+/// instruction of type InstDesc is always replaced by the same two
+/// instructions as results are cached here.
+/// Return true if replacement is recommended.
+bool AArch64VectorByElementOpt::shouldReplaceInstruction(
+ MachineFunction *MF, const MCInstrDesc *InstDesc,
+ const MCInstrDesc *InstDescRep1, const MCInstrDesc *InstDescRep2,
+ std::map<unsigned, bool> &VecInstElemTable) const {
+ // Check if the replacement decision is already available in the cached
+ // table. If so, return it.
+ if (!VecInstElemTable.empty() &&
+ VecInstElemTable.find(InstDesc->getOpcode()) != VecInstElemTable.end())
+ return VecInstElemTable[InstDesc->getOpcode()];
+
+ unsigned SCIdx = InstDesc->getSchedClass();
+ unsigned SCIdxRep1 = InstDescRep1->getSchedClass();
+ unsigned SCIdxRep2 = InstDescRep2->getSchedClass();
+ const MCSchedClassDesc *SCDesc =
+ SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdx);
+ const MCSchedClassDesc *SCDescRep1 =
+ SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdxRep1);
+ const MCSchedClassDesc *SCDescRep2 =
+ SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdxRep2);
+
+ // If a subtarget does not define resources for any of the instructions
+ // of interest, then return false for no replacement.
+ if (!SCDesc->isValid() || SCDesc->isVariant() || !SCDescRep1->isValid() ||
+ SCDescRep1->isVariant() || !SCDescRep2->isValid() ||
+ SCDescRep2->isVariant()) {
+ VecInstElemTable[InstDesc->getOpcode()] = false;
+ return false;
+ }
+
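+ // Replace only when the latency of the indexed instruction is strictly
+ // greater than the combined latency of the two replacement instructions.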
+ if (SchedModel.computeInstrLatency(InstDesc->getOpcode()) >
+ SchedModel.computeInstrLatency(InstDescRep1->getOpcode()) +
+ SchedModel.computeInstrLatency(InstDescRep2->getOpcode())) {
+ VecInstElemTable[InstDesc->getOpcode()] = true;
+ return true;
+ }
+ VecInstElemTable[InstDesc->getOpcode()] = false;
+ return false;
+}
+
+/// Determine if we need to exit the vector by element instruction
+/// optimization pass early. This makes sure that targets with no need
+/// for this optimization do not spend any compile time on this pass.
+/// This check is done by comparing the latency of an indexed FMLA
+/// instruction to the latency of the DUP + the latency of a vector
+/// FMLA instruction. We do not check on other related instructions such
+/// as FMLS as we assume that if the situation shows up for one
+/// instruction, then it is likely to show up for the related ones.
+/// Return true if early exit of the pass is recommended.
+bool AArch64VectorByElementOpt::earlyExitVectElement(MachineFunction *MF) {
+ std::map<unsigned, bool> VecInstElemTable;
+ const MCInstrDesc *IndexMulMCID = &TII->get(AArch64::FMLAv4i32_indexed);
+ const MCInstrDesc *DupMCID = &TII->get(AArch64::DUPv4i32lane);
+ const MCInstrDesc *MulMCID = &TII->get(AArch64::FMULv4f32);
+
+ if (!shouldReplaceInstruction(MF, IndexMulMCID, DupMCID, MulMCID,
+ VecInstElemTable))
+ return true;
+ return false;
+}
+
+/// Check whether an equivalent DUP instruction has already been
+/// created or not.
+/// Return true when the dup instruction already exists. In this case,
+/// DestReg will point to the destination of the already created DUP.
+bool AArch64VectorByElementOpt::reuseDUP(MachineInstr &MI, unsigned DupOpcode,
+ unsigned SrcReg, unsigned LaneNumber,
+ unsigned *DestReg) const {
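+ // Scan backwards from MI to the beginning of the basic block, looking for
+ // an earlier DUP with the same opcode, source register, and lane number.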
+ for (MachineBasicBlock::iterator MII = MI, MIE = MI.getParent()->begin();
+ MII != MIE;) {
+ MII--;
+ MachineInstr *CurrentMI = &*MII;
+
+ if (CurrentMI->getOpcode() == DupOpcode &&
+ CurrentMI->getNumOperands() == 3 &&
+ CurrentMI->getOperand(1).getReg() == SrcReg &&
+ CurrentMI->getOperand(2).getImm() == LaneNumber) {
+ *DestReg = CurrentMI->getOperand(0).getReg();
+ return true;
+ }
+ }
+
+ return false;
+}
+
+/// Certain SIMD instructions with vector element operand are not efficient.
+/// Rewrite them into SIMD instructions with vector operands. This rewrite
+/// is driven by the latency of the instructions.
+/// The instructions of concern are, for the time being, fmla, fmls, fmul,
+/// and fmulx, and hence they are hardcoded.
+///
+/// Example:
+/// fmla v0.4s, v1.4s, v2.s[1]
+/// is rewritten into
+/// dup v3.4s, v2.s[1] // dup not necessary if redundant
+/// fmla v0.4s, v1.4s, v3.4s
+/// Return true if the SIMD instruction is modified.
+bool AArch64VectorByElementOpt::optimizeVectElement(
+ MachineInstr &MI, std::map<unsigned, bool> *VecInstElemTable) const {
+ const MCInstrDesc *MulMCID, *DupMCID;
+ const TargetRegisterClass *RC = &AArch64::FPR128RegClass;
+
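+ // Select the DUP and vector-form opcodes that correspond to the indexed
+ // opcode. The default register class is FPR128; the 2x32 variants operate
+ // on 64-bit vectors and switch to FPR64 below.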
+ switch (MI.getOpcode()) {
+ default:
+ return false;
+
+ // 4X32 instructions
+ case AArch64::FMLAv4i32_indexed:
+ DupMCID = &TII->get(AArch64::DUPv4i32lane);
+ MulMCID = &TII->get(AArch64::FMLAv4f32);
+ break;
+ case AArch64::FMLSv4i32_indexed:
+ DupMCID = &TII->get(AArch64::DUPv4i32lane);
+ MulMCID = &TII->get(AArch64::FMLSv4f32);
+ break;
+ case AArch64::FMULXv4i32_indexed:
+ DupMCID = &TII->get(AArch64::DUPv4i32lane);
+ MulMCID = &TII->get(AArch64::FMULXv4f32);
+ break;
+ case AArch64::FMULv4i32_indexed:
+ DupMCID = &TII->get(AArch64::DUPv4i32lane);
+ MulMCID = &TII->get(AArch64::FMULv4f32);
+ break;
+
+ // 2X64 instructions
+ case AArch64::FMLAv2i64_indexed:
+ DupMCID = &TII->get(AArch64::DUPv2i64lane);
+ MulMCID = &TII->get(AArch64::FMLAv2f64);
+ break;
+ case AArch64::FMLSv2i64_indexed:
+ DupMCID = &TII->get(AArch64::DUPv2i64lane);
+ MulMCID = &TII->get(AArch64::FMLSv2f64);
+ break;
+ case AArch64::FMULXv2i64_indexed:
+ DupMCID = &TII->get(AArch64::DUPv2i64lane);
+ MulMCID = &TII->get(AArch64::FMULXv2f64);
+ break;
+ case AArch64::FMULv2i64_indexed:
+ DupMCID = &TII->get(AArch64::DUPv2i64lane);
+ MulMCID = &TII->get(AArch64::FMULv2f64);
+ break;
+
+ // 2X32 instructions
+ case AArch64::FMLAv2i32_indexed:
+ RC = &AArch64::FPR64RegClass;
+ DupMCID = &TII->get(AArch64::DUPv2i32lane);
+ MulMCID = &TII->get(AArch64::FMLAv2f32);
+ break;
+ case AArch64::FMLSv2i32_indexed:
+ RC = &AArch64::FPR64RegClass;
+ DupMCID = &TII->get(AArch64::DUPv2i32lane);
+ MulMCID = &TII->get(AArch64::FMLSv2f32);
+ break;
+ case AArch64::FMULXv2i32_indexed:
+ RC = &AArch64::FPR64RegClass;
+ DupMCID = &TII->get(AArch64::DUPv2i32lane);
+ MulMCID = &TII->get(AArch64::FMULXv2f32);
+ break;
+ case AArch64::FMULv2i32_indexed:
+ RC = &AArch64::FPR64RegClass;
+ DupMCID = &TII->get(AArch64::DUPv2i32lane);
+ MulMCID = &TII->get(AArch64::FMULv2f32);
+ break;
+ }
+
+ if (!shouldReplaceInstruction(MI.getParent()->getParent(),
+ &TII->get(MI.getOpcode()), DupMCID, MulMCID,
+ *VecInstElemTable))
+ return false;
+
+ const DebugLoc &DL = MI.getDebugLoc();
+ MachineBasicBlock &MBB = *MI.getParent();
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+
+ // Get the operands of the current SIMD arithmetic instruction.
+ unsigned MulDest = MI.getOperand(0).getReg();
+ unsigned SrcReg0 = MI.getOperand(1).getReg();
+ unsigned Src0IsKill = getKillRegState(MI.getOperand(1).isKill());
+ unsigned SrcReg1 = MI.getOperand(2).getReg();
+ unsigned Src1IsKill = getKillRegState(MI.getOperand(2).isKill());
+ unsigned DupDest;
+
+ // Instructions of interest have either 4 or 5 operands.
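+ // FMLA/FMLS carry the accumulator as an extra operand
+ // (dst, accumulator, src, by-element src, lane), whereas FMUL/FMULX have
+ // (dst, src, by-element src, lane).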
+ if (MI.getNumOperands() == 5) {
+ unsigned SrcReg2 = MI.getOperand(3).getReg();
+ unsigned Src2IsKill = getKillRegState(MI.getOperand(3).isKill());
+ unsigned LaneNumber = MI.getOperand(4).getImm();
+
+ // Create a new DUP instruction. Note that if an equivalent DUP instruction
+ // has already been created before, then use that one instead of creating
+ // a new one.
+ if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg2, LaneNumber, &DupDest)) {
+ DupDest = MRI.createVirtualRegister(RC);
+ BuildMI(MBB, MI, DL, *DupMCID, DupDest)
+ .addReg(SrcReg2, Src2IsKill)
+ .addImm(LaneNumber);
+ }
+ BuildMI(MBB, MI, DL, *MulMCID, MulDest)
+ .addReg(SrcReg0, Src0IsKill)
+ .addReg(SrcReg1, Src1IsKill)
+ .addReg(DupDest, Src2IsKill);
+ } else if (MI.getNumOperands() == 4) {
+ unsigned LaneNumber = MI.getOperand(3).getImm();
+ if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg1, LaneNumber, &DupDest)) {
+ DupDest = MRI.createVirtualRegister(RC);
+ BuildMI(MBB, MI, DL, *DupMCID, DupDest)
+ .addReg(SrcReg1, Src1IsKill)
+ .addImm(LaneNumber);
+ }
+ BuildMI(MBB, MI, DL, *MulMCID, MulDest)
+ .addReg(SrcReg0, Src0IsKill)
+ .addReg(DupDest, Src1IsKill);
+ } else {
+ return false;
+ }
+
+ ++NumModifiedInstr;
+ return true;
+}
+
+bool AArch64VectorByElementOpt::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(*MF.getFunction()))
+ return false;
+
+ TII = MF.getSubtarget().getInstrInfo();
+ MRI = &MF.getRegInfo();
+ const TargetSubtargetInfo &ST = MF.getSubtarget();
+ const AArch64InstrInfo *AAII =
+ static_cast<const AArch64InstrInfo *>(ST.getInstrInfo());
+ if (!AAII)
+ return false;
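+ // The rewrite decision is driven entirely by instruction latencies, so
+ // bail out if the subtarget does not provide an instruction scheduling
+ // model.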
+ SchedModel.init(ST.getSchedModel(), &ST, AAII);
+ if (!SchedModel.hasInstrSchedModel())
+ return false;
+
+ // A simple check to exit this pass early for targets that do not need it.
+ if (earlyExitVectElement(&MF))
+ return false;
+
+ bool Changed = false;
+ std::map<unsigned, bool> VecInstElemTable;
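+ // Rewritten instructions are collected here and erased only after the
+ // scan over the function is complete.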
+ SmallVector<MachineInstr *, 8> RemoveMIs;
+
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineBasicBlock::iterator MII = MBB.begin(), MIE = MBB.end();
+ MII != MIE;) {
+ MachineInstr &MI = *MII;
+ if (optimizeVectElement(MI, &VecInstElemTable)) {
+ // Add MI to the list of instructions to be removed given that it has
+ // been replaced.
+ RemoveMIs.push_back(&MI);
+ Changed = true;
+ }
+ ++MII;
+ }
+ }
+
+ for (MachineInstr *MI : RemoveMIs)
+ MI->eraseFromParent();
+
+ return Changed;
+}
+
+/// createAArch64VectorByElementOptPass - returns an instance of the
+/// vector by element optimization pass.
+FunctionPass *llvm::createAArch64VectorByElementOptPass() {
+ return new AArch64VectorByElementOpt();
+}