diff options
| author | Ivan A. Kosarev <ikosarev@accesssoftek.com> | 2018-06-10 09:27:27 +0000 |
|---|---|---|
| committer | Ivan A. Kosarev <ikosarev@accesssoftek.com> | 2018-06-10 09:27:27 +0000 |
| commit | 847daa11f8ff61553ffc45fee92f78cfc5066f94 (patch) | |
| tree | 206072a54a732c46253d9093f3424a35736d074d /llvm/lib/Target | |
| parent | 301d5263297616bee528136b62ddd95e6e321a69 (diff) | |
| download | bcm5719-llvm-847daa11f8ff61553ffc45fee92f78cfc5066f94.tar.gz bcm5719-llvm-847daa11f8ff61553ffc45fee92f78cfc5066f94.zip | |
[NEON] Support VST1xN intrinsics in AArch32 mode (LLVM part)
We currently support them only in AArch64. The NEON Reference,
however, says they are 'ARMv7, ARMv8' intrinsics.
Differential Revision: https://reviews.llvm.org/D47447
llvm-svn: 334361
Diffstat (limited to 'llvm/lib/Target')
| -rw-r--r-- | llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp | 76 | ||||
| -rw-r--r-- | llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp | 49 | ||||
| -rw-r--r-- | llvm/lib/Target/ARM/ARMISelLowering.cpp | 24 | ||||
| -rw-r--r-- | llvm/lib/Target/ARM/ARMInstrNEON.td | 24 |
4 files changed, 165 insertions, 8 deletions
diff --git a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp index e3d1b1d6968..d82bef5b759 100644 --- a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -284,12 +284,34 @@ static const NEONLdStTableEntry NEONLdStTable[] = { { ARM::VST1LNq8Pseudo, ARM::VST1LNd8, false, false, false, EvenDblSpc, 1, 8 ,true}, { ARM::VST1LNq8Pseudo_UPD, ARM::VST1LNd8_UPD, false, true, true, EvenDblSpc, 1, 8 ,true}, +{ ARM::VST1d16QPseudo, ARM::VST1d16Q, false, false, false, SingleSpc, 4, 4 ,false}, +{ ARM::VST1d16TPseudo, ARM::VST1d16T, false, false, false, SingleSpc, 3, 4 ,false}, +{ ARM::VST1d32QPseudo, ARM::VST1d32Q, false, false, false, SingleSpc, 4, 2 ,false}, +{ ARM::VST1d32TPseudo, ARM::VST1d32T, false, false, false, SingleSpc, 3, 2 ,false}, { ARM::VST1d64QPseudo, ARM::VST1d64Q, false, false, false, SingleSpc, 4, 1 ,false}, { ARM::VST1d64QPseudoWB_fixed, ARM::VST1d64Qwb_fixed, false, true, false, SingleSpc, 4, 1 ,false}, { ARM::VST1d64QPseudoWB_register, ARM::VST1d64Qwb_register, false, true, true, SingleSpc, 4, 1 ,false}, { ARM::VST1d64TPseudo, ARM::VST1d64T, false, false, false, SingleSpc, 3, 1 ,false}, { ARM::VST1d64TPseudoWB_fixed, ARM::VST1d64Twb_fixed, false, true, false, SingleSpc, 3, 1 ,false}, { ARM::VST1d64TPseudoWB_register, ARM::VST1d64Twb_register, false, true, true, SingleSpc, 3, 1 ,false}, +{ ARM::VST1d8QPseudo, ARM::VST1d8Q, false, false, false, SingleSpc, 4, 8 ,false}, +{ ARM::VST1d8TPseudo, ARM::VST1d8T, false, false, false, SingleSpc, 3, 8 ,false}, +{ ARM::VST1q16HighQPseudo, ARM::VST1d16Q, false, false, false, SingleHighQSpc, 4, 4 ,false}, +{ ARM::VST1q16HighTPseudo, ARM::VST1d16T, false, false, false, SingleHighTSpc, 3, 4 ,false}, +{ ARM::VST1q16LowQPseudo_UPD, ARM::VST1d16Qwb_fixed, false, true, true, SingleLowSpc, 4, 4 ,false}, +{ ARM::VST1q16LowTPseudo_UPD, ARM::VST1d16Twb_fixed, false, true, true, SingleLowSpc, 3, 4 ,false}, +{ ARM::VST1q32HighQPseudo, ARM::VST1d32Q, false, false, false, SingleHighQSpc, 4, 2 ,false}, +{ ARM::VST1q32HighTPseudo, ARM::VST1d32T, false, false, false, SingleHighTSpc, 3, 2 ,false}, +{ ARM::VST1q32LowQPseudo_UPD, ARM::VST1d32Qwb_fixed, false, true, true, SingleLowSpc, 4, 2 ,false}, +{ ARM::VST1q32LowTPseudo_UPD, ARM::VST1d32Twb_fixed, false, true, true, SingleLowSpc, 3, 2 ,false}, +{ ARM::VST1q64HighQPseudo, ARM::VST1d64Q, false, false, false, SingleHighQSpc, 4, 1 ,false}, +{ ARM::VST1q64HighTPseudo, ARM::VST1d64T, false, false, false, SingleHighTSpc, 3, 1 ,false}, +{ ARM::VST1q64LowQPseudo_UPD, ARM::VST1d64Qwb_fixed, false, true, true, SingleLowSpc, 4, 1 ,false}, +{ ARM::VST1q64LowTPseudo_UPD, ARM::VST1d64Twb_fixed, false, true, true, SingleLowSpc, 3, 1 ,false}, +{ ARM::VST1q8HighQPseudo, ARM::VST1d8Q, false, false, false, SingleHighQSpc, 4, 8 ,false}, +{ ARM::VST1q8HighTPseudo, ARM::VST1d8T, false, false, false, SingleHighTSpc, 3, 8 ,false}, +{ ARM::VST1q8LowQPseudo_UPD, ARM::VST1d8Qwb_fixed, false, true, true, SingleLowSpc, 4, 8 ,false}, +{ ARM::VST1q8LowTPseudo_UPD, ARM::VST1d8Twb_fixed, false, true, true, SingleLowSpc, 3, 8 ,false}, { ARM::VST2LNd16Pseudo, ARM::VST2LNd16, false, false, false, SingleSpc, 2, 4 ,true}, { ARM::VST2LNd16Pseudo_UPD, ARM::VST2LNd16_UPD, false, true, true, SingleSpc, 2, 4 ,true}, @@ -465,7 +487,7 @@ void ARMExpandPseudo::ExpandVLD(MachineBasicBlock::iterator &MBBI) { // and register forms. Some real instructions, however, do not rely on // am6offset and have separate definitions for such forms. When this is the // case, fixed forms do not take any offset nodes, so here we skip them for - // such intructions. Once all real and pseudo writing-back instructions are + // such instructions. Once all real and pseudo writing-back instructions are // rewritten without use of am6offset nodes, this code will go away. const MachineOperand &AM6Offset = MI.getOperand(OpIdx++); if (TableEntry->RealOpc == ARM::VLD1d8Qwb_fixed || @@ -477,7 +499,7 @@ void ARMExpandPseudo::ExpandVLD(MachineBasicBlock::iterator &MBBI) { TableEntry->RealOpc == ARM::VLD1d32Twb_fixed || TableEntry->RealOpc == ARM::VLD1d64Twb_fixed) { assert(AM6Offset.getReg() == 0 && - "A fixed writing-back pseudo intruction provides an offset " + "A fixed writing-back pseudo instruction provides an offset " "register!"); } else { MIB.add(AM6Offset); @@ -534,9 +556,31 @@ void ARMExpandPseudo::ExpandVST(MachineBasicBlock::iterator &MBBI) { // Copy the addrmode6 operands. MIB.add(MI.getOperand(OpIdx++)); MIB.add(MI.getOperand(OpIdx++)); - // Copy the am6offset operand. - if (TableEntry->hasWritebackOperand) - MIB.add(MI.getOperand(OpIdx++)); + + if (TableEntry->hasWritebackOperand) { + // TODO: The writing-back pseudo instructions we translate here are all + // defined to take am6offset nodes that are capable to represent both fixed + // and register forms. Some real instructions, however, do not rely on + // am6offset and have separate definitions for such forms. When this is the + // case, fixed forms do not take any offset nodes, so here we skip them for + // such instructions. Once all real and pseudo writing-back instructions are + // rewritten without use of am6offset nodes, this code will go away. + const MachineOperand &AM6Offset = MI.getOperand(OpIdx++); + if (TableEntry->RealOpc == ARM::VST1d8Qwb_fixed || + TableEntry->RealOpc == ARM::VST1d16Qwb_fixed || + TableEntry->RealOpc == ARM::VST1d32Qwb_fixed || + TableEntry->RealOpc == ARM::VST1d64Qwb_fixed || + TableEntry->RealOpc == ARM::VST1d8Twb_fixed || + TableEntry->RealOpc == ARM::VST1d16Twb_fixed || + TableEntry->RealOpc == ARM::VST1d32Twb_fixed || + TableEntry->RealOpc == ARM::VST1d64Twb_fixed) { + assert(AM6Offset.getReg() == 0 && + "A fixed writing-back pseudo instruction provides an offset " + "register!"); + } else { + MIB.add(AM6Offset); + } + } bool SrcIsKill = MI.getOperand(OpIdx).isKill(); bool SrcIsUndef = MI.getOperand(OpIdx).isUndef(); @@ -1645,6 +1689,9 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, case ARM::VST3d8Pseudo: case ARM::VST3d16Pseudo: case ARM::VST3d32Pseudo: + case ARM::VST1d8TPseudo: + case ARM::VST1d16TPseudo: + case ARM::VST1d32TPseudo: case ARM::VST1d64TPseudo: case ARM::VST3d8Pseudo_UPD: case ARM::VST3d16Pseudo_UPD: @@ -1663,12 +1710,31 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, case ARM::VST4d8Pseudo: case ARM::VST4d16Pseudo: case ARM::VST4d32Pseudo: + case ARM::VST1d8QPseudo: + case ARM::VST1d16QPseudo: + case ARM::VST1d32QPseudo: case ARM::VST1d64QPseudo: case ARM::VST4d8Pseudo_UPD: case ARM::VST4d16Pseudo_UPD: case ARM::VST4d32Pseudo_UPD: case ARM::VST1d64QPseudoWB_fixed: case ARM::VST1d64QPseudoWB_register: + case ARM::VST1q8HighQPseudo: + case ARM::VST1q8LowQPseudo_UPD: + case ARM::VST1q8HighTPseudo: + case ARM::VST1q8LowTPseudo_UPD: + case ARM::VST1q16HighQPseudo: + case ARM::VST1q16LowQPseudo_UPD: + case ARM::VST1q16HighTPseudo: + case ARM::VST1q16LowTPseudo_UPD: + case ARM::VST1q32HighQPseudo: + case ARM::VST1q32LowQPseudo_UPD: + case ARM::VST1q32HighTPseudo: + case ARM::VST1q32LowTPseudo_UPD: + case ARM::VST1q64HighQPseudo: + case ARM::VST1q64LowQPseudo_UPD: + case ARM::VST1q64HighTPseudo: + case ARM::VST1q64LowTPseudo_UPD: case ARM::VST4q8Pseudo_UPD: case ARM::VST4q16Pseudo_UPD: case ARM::VST4q32Pseudo_UPD: diff --git a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp index 1a0ffe4e176..7d6963c3608 100644 --- a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -1903,9 +1903,7 @@ void ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs, case MVT::v4f32: case MVT::v4i32: OpcodeIndex = 2; break; case MVT::v2f64: - case MVT::v2i64: OpcodeIndex = 3; - assert(NumVecs == 1 && "v2i64 type only supported for VST1"); - break; + case MVT::v2i64: OpcodeIndex = 3; break; } std::vector<EVT> ResTys; @@ -3562,6 +3560,51 @@ void ARMDAGToDAGISel::Select(SDNode *N) { return; } + case Intrinsic::arm_neon_vst1x2: { + static const uint16_t DOpcodes[] = { ARM::VST1q8, ARM::VST1q16, + ARM::VST1q32, ARM::VST1q64 }; + static const uint16_t QOpcodes[] = { ARM::VST1d8QPseudo, + ARM::VST1d16QPseudo, + ARM::VST1d32QPseudo, + ARM::VST1d64QPseudo }; + SelectVST(N, false, 2, DOpcodes, QOpcodes, nullptr); + return; + } + + case Intrinsic::arm_neon_vst1x3: { + static const uint16_t DOpcodes[] = { ARM::VST1d8TPseudo, + ARM::VST1d16TPseudo, + ARM::VST1d32TPseudo, + ARM::VST1d64TPseudo }; + static const uint16_t QOpcodes0[] = { ARM::VST1q8LowTPseudo_UPD, + ARM::VST1q16LowTPseudo_UPD, + ARM::VST1q32LowTPseudo_UPD, + ARM::VST1q64LowTPseudo_UPD }; + static const uint16_t QOpcodes1[] = { ARM::VST1q8HighTPseudo, + ARM::VST1q16HighTPseudo, + ARM::VST1q32HighTPseudo, + ARM::VST1q64HighTPseudo }; + SelectVST(N, false, 3, DOpcodes, QOpcodes0, QOpcodes1); + return; + } + + case Intrinsic::arm_neon_vst1x4: { + static const uint16_t DOpcodes[] = { ARM::VST1d8QPseudo, + ARM::VST1d16QPseudo, + ARM::VST1d32QPseudo, + ARM::VST1d64QPseudo }; + static const uint16_t QOpcodes0[] = { ARM::VST1q8LowQPseudo_UPD, + ARM::VST1q16LowQPseudo_UPD, + ARM::VST1q32LowQPseudo_UPD, + ARM::VST1q64LowQPseudo_UPD }; + static const uint16_t QOpcodes1[] = { ARM::VST1q8HighQPseudo, + ARM::VST1q16HighQPseudo, + ARM::VST1q32HighQPseudo, + ARM::VST1q64HighQPseudo }; + SelectVST(N, false, 4, DOpcodes, QOpcodes0, QOpcodes1); + return; + } + case Intrinsic::arm_neon_vst2: { static const uint16_t DOpcodes[] = { ARM::VST2d8, ARM::VST2d16, ARM::VST2d32, ARM::VST1q64 }; diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 03ccaa33366..673bc8dd47a 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -12773,6 +12773,9 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, case Intrinsic::arm_neon_vld3lane: case Intrinsic::arm_neon_vld4lane: case Intrinsic::arm_neon_vst1: + case Intrinsic::arm_neon_vst1x2: + case Intrinsic::arm_neon_vst1x3: + case Intrinsic::arm_neon_vst1x4: case Intrinsic::arm_neon_vst2: case Intrinsic::arm_neon_vst3: case Intrinsic::arm_neon_vst4: @@ -14118,6 +14121,27 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.flags = MachineMemOperand::MOStore; return true; } + case Intrinsic::arm_neon_vst1x2: + case Intrinsic::arm_neon_vst1x3: + case Intrinsic::arm_neon_vst1x4: { + Info.opc = ISD::INTRINSIC_VOID; + // Conservatively set memVT to the entire set of vectors stored. + auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); + unsigned NumElts = 0; + for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) { + Type *ArgTy = I.getArgOperand(ArgI)->getType(); + if (!ArgTy->isVectorTy()) + break; + NumElts += DL.getTypeSizeInBits(ArgTy) / 64; + } + Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + Info.align = 0; + // volatile stores with NEON intrinsics not supported + Info.flags = MachineMemOperand::MOStore; + return true; + } case Intrinsic::arm_ldaex: case Intrinsic::arm_ldrex: { auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); diff --git a/llvm/lib/Target/ARM/ARMInstrNEON.td b/llvm/lib/Target/ARM/ARMInstrNEON.td index ade303505d3..8010352d343 100644 --- a/llvm/lib/Target/ARM/ARMInstrNEON.td +++ b/llvm/lib/Target/ARM/ARMInstrNEON.td @@ -1801,10 +1801,22 @@ defm VST1d16Twb : VST1D3WB<{0,1,0,?}, "16", addrmode6align64>; defm VST1d32Twb : VST1D3WB<{1,0,0,?}, "32", addrmode6align64>; defm VST1d64Twb : VST1D3WB<{1,1,0,?}, "64", addrmode6align64>; +def VST1d8TPseudo : VSTQQPseudo<IIC_VST1x3>, Sched<[WriteVST3]>; +def VST1d16TPseudo : VSTQQPseudo<IIC_VST1x3>, Sched<[WriteVST3]>; +def VST1d32TPseudo : VSTQQPseudo<IIC_VST1x3>, Sched<[WriteVST3]>; def VST1d64TPseudo : VSTQQPseudo<IIC_VST1x3>, Sched<[WriteVST3]>; def VST1d64TPseudoWB_fixed : VSTQQWBfixedPseudo<IIC_VST1x3u>, Sched<[WriteVST3]>; def VST1d64TPseudoWB_register : VSTQQWBPseudo<IIC_VST1x3u>, Sched<[WriteVST3]>; +def VST1q8HighTPseudo : VSTQQQQPseudo<IIC_VST1x3>, Sched<[WriteVST3]>; +def VST1q8LowTPseudo_UPD : VSTQQQQWBPseudo<IIC_VST1x3>, Sched<[WriteVST3]>; +def VST1q16HighTPseudo : VSTQQQQPseudo<IIC_VST1x3>, Sched<[WriteVST3]>; +def VST1q16LowTPseudo_UPD : VSTQQQQWBPseudo<IIC_VST1x3>, Sched<[WriteVST3]>; +def VST1q32HighTPseudo : VSTQQQQPseudo<IIC_VST1x3>, Sched<[WriteVST3]>; +def VST1q32LowTPseudo_UPD : VSTQQQQWBPseudo<IIC_VST1x3>, Sched<[WriteVST3]>; +def VST1q64HighTPseudo : VSTQQQQPseudo<IIC_VST1x3>, Sched<[WriteVST3]>; +def VST1q64LowTPseudo_UPD : VSTQQQQWBPseudo<IIC_VST1x3>, Sched<[WriteVST3]>; + // ...with 4 registers class VST1D4<bits<4> op7_4, string Dt, Operand AddrMode> : NLdSt<0, 0b00, 0b0010, op7_4, (outs), @@ -1844,10 +1856,22 @@ defm VST1d16Qwb : VST1D4WB<{0,1,?,?}, "16", addrmode6align64or128or256>; defm VST1d32Qwb : VST1D4WB<{1,0,?,?}, "32", addrmode6align64or128or256>; defm VST1d64Qwb : VST1D4WB<{1,1,?,?}, "64", addrmode6align64or128or256>; +def VST1d8QPseudo : VSTQQPseudo<IIC_VST1x4>, Sched<[WriteVST4]>; +def VST1d16QPseudo : VSTQQPseudo<IIC_VST1x4>, Sched<[WriteVST4]>; +def VST1d32QPseudo : VSTQQPseudo<IIC_VST1x4>, Sched<[WriteVST4]>; def VST1d64QPseudo : VSTQQPseudo<IIC_VST1x4>, Sched<[WriteVST4]>; def VST1d64QPseudoWB_fixed : VSTQQWBfixedPseudo<IIC_VST1x4u>, Sched<[WriteVST4]>; def VST1d64QPseudoWB_register : VSTQQWBPseudo<IIC_VST1x4u>, Sched<[WriteVST4]>; +def VST1q8HighQPseudo : VSTQQQQPseudo<IIC_VST1x4>, Sched<[WriteVST4]>; +def VST1q8LowQPseudo_UPD : VSTQQQQWBPseudo<IIC_VST1x4>, Sched<[WriteVST4]>; +def VST1q16HighQPseudo : VSTQQQQPseudo<IIC_VST1x4>, Sched<[WriteVST4]>; +def VST1q16LowQPseudo_UPD : VSTQQQQWBPseudo<IIC_VST1x4>, Sched<[WriteVST4]>; +def VST1q32HighQPseudo : VSTQQQQPseudo<IIC_VST1x4>, Sched<[WriteVST4]>; +def VST1q32LowQPseudo_UPD : VSTQQQQWBPseudo<IIC_VST1x4>, Sched<[WriteVST4]>; +def VST1q64HighQPseudo : VSTQQQQPseudo<IIC_VST1x4>, Sched<[WriteVST4]>; +def VST1q64LowQPseudo_UPD : VSTQQQQWBPseudo<IIC_VST1x4>, Sched<[WriteVST4]>; + // VST2 : Vector Store (multiple 2-element structures) class VST2<bits<4> op11_8, bits<4> op7_4, string Dt, RegisterOperand VdTy, InstrItinClass itin, Operand AddrMode> |

