Diffstat (limited to 'llvm/lib')
-rw-r--r--  llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp     |  18
-rw-r--r--  llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp |  83
-rw-r--r--  llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp      | 200
-rw-r--r--  llvm/lib/Target/ARM/ARMISelLowering.cpp      |   8
-rw-r--r--  llvm/lib/Target/ARM/ARMInstrNEON.td          |  23
5 files changed, 262 insertions, 70 deletions
diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
index 4b589f5b447..b1c2031c7d7 100644
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -4310,12 +4310,30 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
   case ARM::VLD2DUPd8wb_register:
   case ARM::VLD2DUPd16wb_register:
   case ARM::VLD2DUPd32wb_register:
+  case ARM::VLD2DUPq8EvenPseudo:
+  case ARM::VLD2DUPq8OddPseudo:
+  case ARM::VLD2DUPq16EvenPseudo:
+  case ARM::VLD2DUPq16OddPseudo:
+  case ARM::VLD2DUPq32EvenPseudo:
+  case ARM::VLD2DUPq32OddPseudo:
+  case ARM::VLD3DUPq8EvenPseudo:
+  case ARM::VLD3DUPq8OddPseudo:
+  case ARM::VLD3DUPq16EvenPseudo:
+  case ARM::VLD3DUPq16OddPseudo:
+  case ARM::VLD3DUPq32EvenPseudo:
+  case ARM::VLD3DUPq32OddPseudo:
   case ARM::VLD4DUPd8Pseudo:
   case ARM::VLD4DUPd16Pseudo:
   case ARM::VLD4DUPd32Pseudo:
   case ARM::VLD4DUPd8Pseudo_UPD:
   case ARM::VLD4DUPd16Pseudo_UPD:
   case ARM::VLD4DUPd32Pseudo_UPD:
+  case ARM::VLD4DUPq8EvenPseudo:
+  case ARM::VLD4DUPq8OddPseudo:
+  case ARM::VLD4DUPq16EvenPseudo:
+  case ARM::VLD4DUPq16OddPseudo:
+  case ARM::VLD4DUPq32EvenPseudo:
+  case ARM::VLD4DUPq32OddPseudo:
   case ARM::VLD1LNq8Pseudo:
   case ARM::VLD1LNq16Pseudo:
   case ARM::VLD1LNq32Pseudo:
diff --git a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
index d82bef5b759..439ba9e2c78 100644
--- a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -186,6 +186,13 @@ static const NEONLdStTableEntry NEONLdStTable[] = {
 { ARM::VLD1q8LowQPseudo_UPD, ARM::VLD1d8Qwb_fixed, true, true, true, SingleLowSpc, 4, 8 ,false},
 { ARM::VLD1q8LowTPseudo_UPD, ARM::VLD1d8Twb_fixed, true, true, true, SingleLowSpc, 3, 8 ,false},

+{ ARM::VLD2DUPq16EvenPseudo, ARM::VLD2DUPd16x2, true, false, false, EvenDblSpc, 2, 4 ,false},
+{ ARM::VLD2DUPq16OddPseudo,  ARM::VLD2DUPd16x2, true, false, false, OddDblSpc,  2, 4 ,false},
+{ ARM::VLD2DUPq32EvenPseudo, ARM::VLD2DUPd32x2, true, false, false, EvenDblSpc, 2, 2 ,false},
+{ ARM::VLD2DUPq32OddPseudo,  ARM::VLD2DUPd32x2, true, false, false, OddDblSpc,  2, 2 ,false},
+{ ARM::VLD2DUPq8EvenPseudo,  ARM::VLD2DUPd8x2,  true, false, false, EvenDblSpc, 2, 8 ,false},
+{ ARM::VLD2DUPq8OddPseudo,   ARM::VLD2DUPd8x2,  true, false, false, OddDblSpc,  2, 8 ,false},
+
 { ARM::VLD2LNd16Pseudo,     ARM::VLD2LNd16,     true, false, false, SingleSpc, 2, 4 ,true},
 { ARM::VLD2LNd16Pseudo_UPD, ARM::VLD2LNd16_UPD, true, true,  true,  SingleSpc, 2, 4 ,true},
 { ARM::VLD2LNd32Pseudo,     ARM::VLD2LNd32,     true, false, false, SingleSpc, 2, 2 ,true},
@@ -213,6 +220,12 @@ static const NEONLdStTableEntry NEONLdStTable[] = {
 { ARM::VLD3DUPd32Pseudo_UPD, ARM::VLD3DUPd32_UPD, true, true, true, SingleSpc, 3, 2,true},
 { ARM::VLD3DUPd8Pseudo,      ARM::VLD3DUPd8,      true, false, false, SingleSpc, 3, 8,true},
 { ARM::VLD3DUPd8Pseudo_UPD,  ARM::VLD3DUPd8_UPD,  true, true, true, SingleSpc, 3, 8,true},
+{ ARM::VLD3DUPq16EvenPseudo, ARM::VLD3DUPq16, true, false, false, EvenDblSpc, 3, 4 ,true},
+{ ARM::VLD3DUPq16OddPseudo,  ARM::VLD3DUPq16, true, false, false, OddDblSpc,  3, 4 ,true},
+{ ARM::VLD3DUPq32EvenPseudo, ARM::VLD3DUPq32, true, false, false, EvenDblSpc, 3, 2 ,true},
+{ ARM::VLD3DUPq32OddPseudo,  ARM::VLD3DUPq32, true, false, false, OddDblSpc,  3, 2 ,true},
+{ ARM::VLD3DUPq8EvenPseudo,  ARM::VLD3DUPq8,  true, false, false, EvenDblSpc, 3, 8 ,true},
+{ ARM::VLD3DUPq8OddPseudo,   ARM::VLD3DUPq8,  true, false, false, OddDblSpc,  3, 8 ,true},

 { ARM::VLD3LNd16Pseudo,     ARM::VLD3LNd16,     true, false, false, SingleSpc, 3, 4 ,true},
 { ARM::VLD3LNd16Pseudo_UPD, ARM::VLD3LNd16_UPD, true, true,  true,  SingleSpc, 3, 4 ,true},
@@ -248,6 +261,12 @@ static const NEONLdStTableEntry NEONLdStTable[] = {
 { ARM::VLD4DUPd32Pseudo_UPD, ARM::VLD4DUPd32_UPD, true, true, true, SingleSpc, 4, 2,true},
 { ARM::VLD4DUPd8Pseudo,      ARM::VLD4DUPd8,      true, false, false, SingleSpc, 4, 8,true},
 { ARM::VLD4DUPd8Pseudo_UPD,  ARM::VLD4DUPd8_UPD,  true, true, true, SingleSpc, 4, 8,true},
+{ ARM::VLD4DUPq16EvenPseudo, ARM::VLD4DUPq16, true, false, false, EvenDblSpc, 4, 4 ,true},
+{ ARM::VLD4DUPq16OddPseudo,  ARM::VLD4DUPq16, true, false, false, OddDblSpc,  4, 4 ,true},
+{ ARM::VLD4DUPq32EvenPseudo, ARM::VLD4DUPq32, true, false, false, EvenDblSpc, 4, 2 ,true},
+{ ARM::VLD4DUPq32OddPseudo,  ARM::VLD4DUPq32, true, false, false, OddDblSpc,  4, 2 ,true},
+{ ARM::VLD4DUPq8EvenPseudo,  ARM::VLD4DUPq8,  true, false, false, EvenDblSpc, 4, 8 ,true},
+{ ARM::VLD4DUPq8OddPseudo,   ARM::VLD4DUPq8,  true, false, false, OddDblSpc,  4, 8 ,true},

 { ARM::VLD4LNd16Pseudo,     ARM::VLD4LNd16,     true, false, false, SingleSpc, 4, 4 ,true},
 { ARM::VLD4LNd16Pseudo_UPD, ARM::VLD4LNd16_UPD, true, true,  true,  SingleSpc, 4, 4 ,true},
@@ -463,15 +482,31 @@ void ARMExpandPseudo::ExpandVLD(MachineBasicBlock::iterator &MBBI) {

   bool DstIsDead = MI.getOperand(OpIdx).isDead();
   unsigned DstReg = MI.getOperand(OpIdx++).getReg();
-  unsigned D0, D1, D2, D3;
-  GetDSubRegs(DstReg, RegSpc, TRI, D0, D1, D2, D3);
-  MIB.addReg(D0, RegState::Define | getDeadRegState(DstIsDead));
-  if (NumRegs > 1 && TableEntry->copyAllListRegs)
-    MIB.addReg(D1, RegState::Define | getDeadRegState(DstIsDead));
-  if (NumRegs > 2 && TableEntry->copyAllListRegs)
-    MIB.addReg(D2, RegState::Define | getDeadRegState(DstIsDead));
-  if (NumRegs > 3 && TableEntry->copyAllListRegs)
-    MIB.addReg(D3, RegState::Define | getDeadRegState(DstIsDead));
+  if(TableEntry->RealOpc == ARM::VLD2DUPd8x2 ||
+     TableEntry->RealOpc == ARM::VLD2DUPd16x2 ||
+     TableEntry->RealOpc == ARM::VLD2DUPd32x2) {
+    unsigned SubRegIndex;
+    if (RegSpc == EvenDblSpc) {
+      SubRegIndex = ARM::dsub_0;
+    } else {
+      assert(RegSpc == OddDblSpc && "Unexpected spacing!");
+      SubRegIndex = ARM::dsub_1;
+    }
+    unsigned SubReg = TRI->getSubReg(DstReg, SubRegIndex);
+    unsigned DstRegPair = TRI->getMatchingSuperReg(SubReg, ARM::dsub_0,
+                                                   &ARM::DPairSpcRegClass);
+    MIB.addReg(DstRegPair, RegState::Define | getDeadRegState(DstIsDead));
+  } else {
+    unsigned D0, D1, D2, D3;
+    GetDSubRegs(DstReg, RegSpc, TRI, D0, D1, D2, D3);
+    MIB.addReg(D0, RegState::Define | getDeadRegState(DstIsDead));
+    if (NumRegs > 1 && TableEntry->copyAllListRegs)
+      MIB.addReg(D1, RegState::Define | getDeadRegState(DstIsDead));
+    if (NumRegs > 2 && TableEntry->copyAllListRegs)
+      MIB.addReg(D2, RegState::Define | getDeadRegState(DstIsDead));
+    if (NumRegs > 3 && TableEntry->copyAllListRegs)
+      MIB.addReg(D3, RegState::Define | getDeadRegState(DstIsDead));
+  }

   if (TableEntry->isUpdating)
     MIB.add(MI.getOperand(OpIdx++));
@@ -510,10 +545,14 @@ void ARMExpandPseudo::ExpandVLD(MachineBasicBlock::iterator &MBBI) {
   // has an extra operand that is a use of the super-register.  Record the
   // operand index and skip over it.
   unsigned SrcOpIdx = 0;
-  if (RegSpc == EvenDblSpc || RegSpc == OddDblSpc ||
-      RegSpc == SingleLowSpc || RegSpc == SingleHighQSpc ||
-      RegSpc == SingleHighTSpc)
-    SrcOpIdx = OpIdx++;
+  if(TableEntry->RealOpc != ARM::VLD2DUPd8x2 &&
+     TableEntry->RealOpc != ARM::VLD2DUPd16x2 &&
+     TableEntry->RealOpc != ARM::VLD2DUPd32x2) {
+    if (RegSpc == EvenDblSpc || RegSpc == OddDblSpc ||
+        RegSpc == SingleLowSpc || RegSpc == SingleHighQSpc ||
+        RegSpc == SingleHighTSpc)
+      SrcOpIdx = OpIdx++;
+  }

   // Copy the predicate operands.
   MIB.add(MI.getOperand(OpIdx++));
@@ -1674,6 +1713,24 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
     case ARM::VLD4DUPd8Pseudo_UPD:
     case ARM::VLD4DUPd16Pseudo_UPD:
     case ARM::VLD4DUPd32Pseudo_UPD:
+    case ARM::VLD2DUPq8EvenPseudo:
+    case ARM::VLD2DUPq8OddPseudo:
+    case ARM::VLD2DUPq16EvenPseudo:
+    case ARM::VLD2DUPq16OddPseudo:
+    case ARM::VLD2DUPq32EvenPseudo:
+    case ARM::VLD2DUPq32OddPseudo:
+    case ARM::VLD3DUPq8EvenPseudo:
+    case ARM::VLD3DUPq8OddPseudo:
+    case ARM::VLD3DUPq16EvenPseudo:
+    case ARM::VLD3DUPq16OddPseudo:
+    case ARM::VLD3DUPq32EvenPseudo:
+    case ARM::VLD3DUPq32OddPseudo:
+    case ARM::VLD4DUPq8EvenPseudo:
+    case ARM::VLD4DUPq8OddPseudo:
+    case ARM::VLD4DUPq16EvenPseudo:
+    case ARM::VLD4DUPq16OddPseudo:
+    case ARM::VLD4DUPq32EvenPseudo:
+    case ARM::VLD4DUPq32OddPseudo:
       ExpandVLD(MBBI);
       return true;

diff --git a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
index c3c44d71092..b5eb57cfb8e 100644
--- a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -203,10 +203,11 @@ private:

   /// SelectVLDDup - Select NEON load-duplicate intrinsics.  NumVecs
   /// should be 1, 2, 3 or 4.  The opcode array specifies the instructions used
-  /// for loading D registers.  (Q registers are not supported.)
-  void SelectVLDDup(SDNode *N, bool isUpdating, unsigned NumVecs,
-                    const uint16_t *DOpcodes,
-                    const uint16_t *QOpcodes = nullptr);
+  /// for loading D registers.
+  void SelectVLDDup(SDNode *N, bool IsIntrinsic, bool isUpdating,
+                    unsigned NumVecs, const uint16_t *DOpcodes,
+                    const uint16_t *QOpcodes0 = nullptr,
+                    const uint16_t *QOpcodes1 = nullptr);

   /// Try to select SBFX/UBFX instructions for ARM.
   bool tryV6T2BitfieldExtractOp(SDNode *N, bool isSigned);
@@ -1747,7 +1748,9 @@ void ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs,
   SDLoc dl(N);

   SDValue MemAddr, Align;
-  unsigned AddrOpIdx = isUpdating ? 1 : 2;
+  bool IsIntrinsic = !isUpdating; // By coincidence, all supported updating
+                                  // nodes are not intrinsics.
+  unsigned AddrOpIdx = IsIntrinsic ? 2 : 1;
   if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align))
     return;

@@ -1883,7 +1886,9 @@ void ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs,
   SDLoc dl(N);

   SDValue MemAddr, Align;
-  unsigned AddrOpIdx = isUpdating ? 1 : 2;
+  bool IsIntrinsic = !isUpdating; // By coincidence, all supported updating
+                                  // nodes are not intrinsics.
+  unsigned AddrOpIdx = IsIntrinsic ? 2 : 1;
   unsigned Vec0Idx = 3; // AddrOpIdx + (isUpdating ? 2 : 1)
   if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align))
     return;
@@ -2033,7 +2038,9 @@ void ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, bool isUpdating,
   SDLoc dl(N);

   SDValue MemAddr, Align;
-  unsigned AddrOpIdx = isUpdating ? 1 : 2;
+  bool IsIntrinsic = !isUpdating; // By coincidence, all supported updating
+                                  // nodes are not intrinsics.
+  unsigned AddrOpIdx = IsIntrinsic ? 2 : 1;
   unsigned Vec0Idx = 3; // AddrOpIdx + (isUpdating ? 2 : 1)
   if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align))
     return;
@@ -2149,21 +2156,22 @@ void ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, bool isUpdating,
   CurDAG->RemoveDeadNode(N);
 }

-void ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating, unsigned NumVecs,
+void ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool IsIntrinsic,
+                                   bool isUpdating, unsigned NumVecs,
                                    const uint16_t *DOpcodes,
-                                   const uint16_t *QOpcodes) {
+                                   const uint16_t *QOpcodes0,
+                                   const uint16_t *QOpcodes1) {
   assert(NumVecs >= 1 && NumVecs <= 4 && "VLDDup NumVecs out-of-range");
   SDLoc dl(N);

   SDValue MemAddr, Align;
-  if (!SelectAddrMode6(N, N->getOperand(1), MemAddr, Align))
+  unsigned AddrOpIdx = IsIntrinsic ? 2 : 1;
+  if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align))
     return;

-  MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
-  MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
-
   SDValue Chain = N->getOperand(0);
   EVT VT = N->getValueType(0);
+  bool is64BitVector = VT.is64BitVector();

   unsigned Alignment = 0;
   if (NumVecs != 3) {
@@ -2180,49 +2188,84 @@ void ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating, unsigned NumVecs,
   }
   Align = CurDAG->getTargetConstant(Alignment, dl, MVT::i32);

-  unsigned Opc;
+  unsigned OpcodeIndex;
   switch (VT.getSimpleVT().SimpleTy) {
   default: llvm_unreachable("unhandled vld-dup type");
-  case MVT::v8i8: Opc = DOpcodes[0]; break;
-  case MVT::v16i8: Opc = QOpcodes[0]; break;
-  case MVT::v4i16: Opc = DOpcodes[1]; break;
-  case MVT::v8i16: Opc = QOpcodes[1]; break;
+  case MVT::v8i8:
+  case MVT::v16i8: OpcodeIndex = 0; break;
+  case MVT::v4i16:
+  case MVT::v8i16: OpcodeIndex = 1; break;
   case MVT::v2f32:
-  case MVT::v2i32: Opc = DOpcodes[2]; break;
+  case MVT::v2i32:
   case MVT::v4f32:
-  case MVT::v4i32: Opc = QOpcodes[2]; break;
-  }
-
-  SDValue Pred = getAL(CurDAG, dl);
-  SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
-  SmallVector<SDValue, 6> Ops;
-  Ops.push_back(MemAddr);
-  Ops.push_back(Align);
-  if (isUpdating) {
-    // fixed-stride update instructions don't have an explicit writeback
-    // operand. It's implicit in the opcode itself.
-    SDValue Inc = N->getOperand(2);
-    bool IsImmUpdate =
-        isPerfectIncrement(Inc, VT.getVectorElementType(), NumVecs);
-    if (NumVecs <= 2 && !IsImmUpdate)
-      Opc = getVLDSTRegisterUpdateOpcode(Opc);
-    if (!IsImmUpdate)
-      Ops.push_back(Inc);
-    // FIXME: VLD3 and VLD4 haven't been updated to that form yet.
-    else if (NumVecs > 2)
-      Ops.push_back(Reg0);
+  case MVT::v4i32: OpcodeIndex = 2; break;
+  case MVT::v1f64:
+  case MVT::v1i64: OpcodeIndex = 3; break;
   }
-  Ops.push_back(Pred);
-  Ops.push_back(Reg0);
-  Ops.push_back(Chain);

   unsigned ResTyElts = (NumVecs == 3) ? 4 : NumVecs;
+  if (!is64BitVector)
+    ResTyElts *= 2;
+  EVT ResTy = EVT::getVectorVT(*CurDAG->getContext(), MVT::i64, ResTyElts);
+
   std::vector<EVT> ResTys;
-  ResTys.push_back(EVT::getVectorVT(*CurDAG->getContext(), MVT::i64,ResTyElts));
+  ResTys.push_back(ResTy);
   if (isUpdating)
     ResTys.push_back(MVT::i32);
   ResTys.push_back(MVT::Other);
-  SDNode *VLdDup = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
+
+  SDValue Pred = getAL(CurDAG, dl);
+  SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
+
+  SDNode *VLdDup;
+  if (is64BitVector || NumVecs == 1) {
+    SmallVector<SDValue, 6> Ops;
+    Ops.push_back(MemAddr);
+    Ops.push_back(Align);
+    unsigned Opc = is64BitVector ? DOpcodes[OpcodeIndex] :
+                                   QOpcodes0[OpcodeIndex];
+    if (isUpdating) {
+      // fixed-stride update instructions don't have an explicit writeback
+      // operand. It's implicit in the opcode itself.
+      SDValue Inc = N->getOperand(2);
+      bool IsImmUpdate =
+          isPerfectIncrement(Inc, VT.getVectorElementType(), NumVecs);
+      if (NumVecs <= 2 && !IsImmUpdate)
+        Opc = getVLDSTRegisterUpdateOpcode(Opc);
+      if (!IsImmUpdate)
+        Ops.push_back(Inc);
+      // FIXME: VLD3 and VLD4 haven't been updated to that form yet.
+      else if (NumVecs > 2)
+        Ops.push_back(Reg0);
+    }
+    Ops.push_back(Pred);
+    Ops.push_back(Reg0);
+    Ops.push_back(Chain);
+    VLdDup = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
+  } else if (NumVecs == 2) {
+    const SDValue OpsA[] = { MemAddr, Align, Pred, Reg0, Chain };
+    SDNode *VLdA = CurDAG->getMachineNode(QOpcodes0[OpcodeIndex],
+                                          dl, ResTys, OpsA);
+
+    Chain = SDValue(VLdA, 1);
+    const SDValue OpsB[] = { MemAddr, Align, Pred, Reg0, Chain };
+    VLdDup = CurDAG->getMachineNode(QOpcodes1[OpcodeIndex], dl, ResTys, OpsB);
+  } else {
+    SDValue ImplDef =
+      SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, ResTy), 0);
+    const SDValue OpsA[] = { MemAddr, Align, ImplDef, Pred, Reg0, Chain };
+    SDNode *VLdA = CurDAG->getMachineNode(QOpcodes0[OpcodeIndex],
+                                          dl, ResTys, OpsA);
+
+    SDValue SuperReg = SDValue(VLdA, 0);
+    Chain = SDValue(VLdA, 1);
+    const SDValue OpsB[] = { MemAddr, Align, SuperReg, Pred, Reg0, Chain };
+    VLdDup = CurDAG->getMachineNode(QOpcodes1[OpcodeIndex], dl, ResTys, OpsB);
+  }
+
+  // Transfer memoperands.
+  MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+  MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
   cast<MachineSDNode>(VLdDup)->setMemRefs(MemOp, MemOp + 1);

   // Extract the subregisters.
@@ -2231,10 +2274,11 @@ void ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating, unsigned NumVecs,
   } else {
     SDValue SuperReg = SDValue(VLdDup, 0);
     static_assert(ARM::dsub_7 == ARM::dsub_0 + 7, "Unexpected subreg numbering");
-    unsigned SubIdx = ARM::dsub_0;
-    for (unsigned Vec = 0; Vec < NumVecs; ++Vec)
+    unsigned SubIdx = is64BitVector ? ARM::dsub_0 : ARM::qsub_0;
+    for (unsigned Vec = 0; Vec != NumVecs; ++Vec) {
       ReplaceUses(SDValue(N, Vec),
                   CurDAG->getTargetExtractSubreg(SubIdx+Vec, dl, VT, SuperReg));
+    }
   }
   ReplaceUses(SDValue(N, NumVecs), SDValue(VLdDup, 1));
   if (isUpdating)
@@ -3066,14 +3110,14 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
                                          ARM::VLD1DUPd32 };
     static const uint16_t QOpcodes[] = { ARM::VLD1DUPq8, ARM::VLD1DUPq16,
                                          ARM::VLD1DUPq32 };
-    SelectVLDDup(N, false, 1, DOpcodes, QOpcodes);
+    SelectVLDDup(N, /* IsIntrinsic= */ false, false, 1, DOpcodes, QOpcodes);
     return;
   }

   case ARMISD::VLD2DUP: {
     static const uint16_t Opcodes[] = { ARM::VLD2DUPd8, ARM::VLD2DUPd16,
                                         ARM::VLD2DUPd32 };
-    SelectVLDDup(N, false, 2, Opcodes);
+    SelectVLDDup(N, /* IsIntrinsic= */ false, false, 2, Opcodes);
     return;
   }

@@ -3081,7 +3125,7 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
     static const uint16_t Opcodes[] = { ARM::VLD3DUPd8Pseudo,
                                         ARM::VLD3DUPd16Pseudo,
                                         ARM::VLD3DUPd32Pseudo };
-    SelectVLDDup(N, false, 3, Opcodes);
+    SelectVLDDup(N, /* IsIntrinsic= */ false, false, 3, Opcodes);
     return;
   }

@@ -3089,7 +3133,7 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
     static const uint16_t Opcodes[] = { ARM::VLD4DUPd8Pseudo,
                                         ARM::VLD4DUPd16Pseudo,
                                         ARM::VLD4DUPd32Pseudo };
-    SelectVLDDup(N, false, 4, Opcodes);
+    SelectVLDDup(N, /* IsIntrinsic= */ false, false, 4, Opcodes);
     return;
   }

@@ -3100,7 +3144,7 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
     static const uint16_t QOpcodes[] = { ARM::VLD1DUPq8wb_fixed,
                                          ARM::VLD1DUPq16wb_fixed,
                                          ARM::VLD1DUPq32wb_fixed };
-    SelectVLDDup(N, true, 1, DOpcodes, QOpcodes);
+    SelectVLDDup(N, /* IsIntrinsic= */ false, true, 1, DOpcodes, QOpcodes);
     return;
   }

@@ -3108,7 +3152,7 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
     static const uint16_t Opcodes[] = { ARM::VLD2DUPd8wb_fixed,
                                         ARM::VLD2DUPd16wb_fixed,
                                         ARM::VLD2DUPd32wb_fixed };
-    SelectVLDDup(N, true, 2, Opcodes);
+    SelectVLDDup(N, /* IsIntrinsic= */ false, true, 2, Opcodes);
     return;
   }

@@ -3116,7 +3160,7 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
     static const uint16_t Opcodes[] = { ARM::VLD3DUPd8Pseudo_UPD,
                                         ARM::VLD3DUPd16Pseudo_UPD,
                                         ARM::VLD3DUPd32Pseudo_UPD };
-    SelectVLDDup(N, true, 3, Opcodes);
+    SelectVLDDup(N, /* IsIntrinsic= */ false, true, 3, Opcodes);
     return;
   }

@@ -3124,7 +3168,7 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
     static const uint16_t Opcodes[] = { ARM::VLD4DUPd8Pseudo_UPD,
                                         ARM::VLD4DUPd16Pseudo_UPD,
                                         ARM::VLD4DUPd32Pseudo_UPD };
-    SelectVLDDup(N, true, 4, Opcodes);
+    SelectVLDDup(N, /* IsIntrinsic= */ false, true, 4, Opcodes);
     return;
   }

@@ -3531,6 +3575,52 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
       return;
     }

+    case Intrinsic::arm_neon_vld2dup: {
+      static const uint16_t DOpcodes[] = { ARM::VLD2DUPd8, ARM::VLD2DUPd16,
+                                           ARM::VLD2DUPd32, ARM::VLD1q64 };
+      static const uint16_t QOpcodes0[] = { ARM::VLD2DUPq8EvenPseudo,
+                                            ARM::VLD2DUPq16EvenPseudo,
+                                            ARM::VLD2DUPq32EvenPseudo };
+      static const uint16_t QOpcodes1[] = { ARM::VLD2DUPq8OddPseudo,
+                                            ARM::VLD2DUPq16OddPseudo,
+                                            ARM::VLD2DUPq32OddPseudo };
+      SelectVLDDup(N, /* IsIntrinsic= */ true, false, 2,
+                   DOpcodes, QOpcodes0, QOpcodes1);
+      return;
+    }
+
+    case Intrinsic::arm_neon_vld3dup: {
+      static const uint16_t DOpcodes[] = { ARM::VLD3DUPd8Pseudo,
+                                           ARM::VLD3DUPd16Pseudo,
+                                           ARM::VLD3DUPd32Pseudo,
+                                           ARM::VLD1d64TPseudo };
+      static const uint16_t QOpcodes0[] = { ARM::VLD3DUPq8EvenPseudo,
+                                            ARM::VLD3DUPq16EvenPseudo,
+                                            ARM::VLD3DUPq32EvenPseudo };
+      static const uint16_t QOpcodes1[] = { ARM::VLD3DUPq8OddPseudo,
+                                            ARM::VLD3DUPq16OddPseudo,
+                                            ARM::VLD3DUPq32OddPseudo };
+      SelectVLDDup(N, /* IsIntrinsic= */ true, false, 3,
+                   DOpcodes, QOpcodes0, QOpcodes1);
+      return;
+    }
+
+    case Intrinsic::arm_neon_vld4dup: {
+      static const uint16_t DOpcodes[] = { ARM::VLD4DUPd8Pseudo,
+                                           ARM::VLD4DUPd16Pseudo,
+                                           ARM::VLD4DUPd32Pseudo,
+                                           ARM::VLD1d64QPseudo };
+      static const uint16_t QOpcodes0[] = { ARM::VLD4DUPq8EvenPseudo,
+                                            ARM::VLD4DUPq16EvenPseudo,
+                                            ARM::VLD4DUPq32EvenPseudo };
+      static const uint16_t QOpcodes1[] = { ARM::VLD4DUPq8OddPseudo,
+                                            ARM::VLD4DUPq16OddPseudo,
+                                            ARM::VLD4DUPq32OddPseudo };
+      SelectVLDDup(N, /* IsIntrinsic= */ true, false, 4,
+                   DOpcodes, QOpcodes0, QOpcodes1);
+      return;
+    }
+
     case Intrinsic::arm_neon_vld2lane: {
       static const uint16_t DOpcodes[] = { ARM::VLD2LNd8Pseudo,
                                            ARM::VLD2LNd16Pseudo,
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 673bc8dd47a..f3288cb0cfa 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -12772,6 +12772,9 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
     case Intrinsic::arm_neon_vld2lane:
     case Intrinsic::arm_neon_vld3lane:
     case Intrinsic::arm_neon_vld4lane:
+    case Intrinsic::arm_neon_vld2dup:
+    case Intrinsic::arm_neon_vld3dup:
+    case Intrinsic::arm_neon_vld4dup:
     case Intrinsic::arm_neon_vst1:
     case Intrinsic::arm_neon_vst1x2:
     case Intrinsic::arm_neon_vst1x3:
@@ -14066,7 +14069,10 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
   case Intrinsic::arm_neon_vld4:
   case Intrinsic::arm_neon_vld2lane:
   case Intrinsic::arm_neon_vld3lane:
-  case Intrinsic::arm_neon_vld4lane: {
+  case Intrinsic::arm_neon_vld4lane:
+  case Intrinsic::arm_neon_vld2dup:
+  case Intrinsic::arm_neon_vld3dup:
+  case Intrinsic::arm_neon_vld4dup: {
     Info.opc = ISD::INTRINSIC_W_CHAIN;
     // Conservatively set memVT to the entire set of vectors loaded.
     auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
diff --git a/llvm/lib/Target/ARM/ARMInstrNEON.td b/llvm/lib/Target/ARM/ARMInstrNEON.td
index 58fcf21dd19..4525eec8da0 100644
--- a/llvm/lib/Target/ARM/ARMInstrNEON.td
+++ b/llvm/lib/Target/ARM/ARMInstrNEON.td
@@ -209,7 +209,7 @@ def VecListDPairSpacedAllLanesAsmOperand : AsmOperandClass {
   let ParserMethod = "parseVectorList";
   let RenderMethod = "addVecListOperands";
 }
-def VecListDPairSpacedAllLanes : RegisterOperand<DPair,
+def VecListDPairSpacedAllLanes : RegisterOperand<DPairSpc,
                                          "printVectorListTwoSpacedAllLanes"> {
   let ParserMatchClass = VecListDPairSpacedAllLanesAsmOperand;
 }
@@ -1518,6 +1518,13 @@ def VLD2DUPd16x2 : VLD2DUP<{0,1,1,?}, "16", VecListDPairSpacedAllLanes,
 def VLD2DUPd32x2 : VLD2DUP<{1,0,1,?}, "32", VecListDPairSpacedAllLanes,
                            addrmode6dupalign64>;

+def VLD2DUPq8EvenPseudo  : VLDQQPseudo<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+def VLD2DUPq8OddPseudo   : VLDQQPseudo<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+def VLD2DUPq16EvenPseudo : VLDQQPseudo<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+def VLD2DUPq16OddPseudo  : VLDQQPseudo<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+def VLD2DUPq32EvenPseudo : VLDQQPseudo<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+def VLD2DUPq32OddPseudo  : VLDQQPseudo<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+
 // ...with address register writeback:
 multiclass VLD2DUPWB<bits<4> op7_4, string Dt, RegisterOperand VdTy,
                      Operand AddrMode> {
@@ -1578,6 +1585,13 @@ def VLD3DUPq8 : VLD3DUP<{0,0,1,?}, "8">;
 def VLD3DUPq16 : VLD3DUP<{0,1,1,?}, "16">;
 def VLD3DUPq32 : VLD3DUP<{1,0,1,?}, "32">;

+def VLD3DUPq8EvenPseudo  : VLDQQQQPseudo<IIC_VLD3dup>, Sched<[WriteVLD2]>;
+def VLD3DUPq8OddPseudo   : VLDQQQQPseudo<IIC_VLD3dup>, Sched<[WriteVLD2]>;
+def VLD3DUPq16EvenPseudo : VLDQQQQPseudo<IIC_VLD3dup>, Sched<[WriteVLD2]>;
+def VLD3DUPq16OddPseudo  : VLDQQQQPseudo<IIC_VLD3dup>, Sched<[WriteVLD2]>;
+def VLD3DUPq32EvenPseudo : VLDQQQQPseudo<IIC_VLD3dup>, Sched<[WriteVLD2]>;
+def VLD3DUPq32OddPseudo  : VLDQQQQPseudo<IIC_VLD3dup>, Sched<[WriteVLD2]>;
+
 // ...with address register writeback:
 class VLD3DUPWB<bits<4> op7_4, string Dt, Operand AddrMode>
   : NLdSt<1, 0b10, 0b1110, op7_4, (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, GPR:$wb),
@@ -1624,6 +1638,13 @@ def VLD4DUPq8 : VLD4DUP<{0,0,1,?}, "8">;
 def VLD4DUPq16 : VLD4DUP<{0,1,1,?}, "16">;
 def VLD4DUPq32 : VLD4DUP<{1,?,1,?}, "32"> { let Inst{6} = Rn{5}; }

+def VLD4DUPq8EvenPseudo  : VLDQQQQPseudo<IIC_VLD4dup>, Sched<[WriteVLD2]>;
+def VLD4DUPq8OddPseudo   : VLDQQQQPseudo<IIC_VLD4dup>, Sched<[WriteVLD2]>;
+def VLD4DUPq16EvenPseudo : VLDQQQQPseudo<IIC_VLD4dup>, Sched<[WriteVLD2]>;
+def VLD4DUPq16OddPseudo  : VLDQQQQPseudo<IIC_VLD4dup>, Sched<[WriteVLD2]>;
+def VLD4DUPq32EvenPseudo : VLDQQQQPseudo<IIC_VLD4dup>, Sched<[WriteVLD2]>;
+def VLD4DUPq32OddPseudo  : VLDQQQQPseudo<IIC_VLD4dup>, Sched<[WriteVLD2]>;
+
 // ...with address register writeback:
 class VLD4DUPWB<bits<4> op7_4, string Dt>
   : NLdSt<1, 0b10, 0b1111, op7_4,

