diff options
author | Ivan A. Kosarev <ikosarev@accesssoftek.com> | 2018-06-27 13:57:52 +0000 |
---|---|---|
committer | Ivan A. Kosarev <ikosarev@accesssoftek.com> | 2018-06-27 13:57:52 +0000 |
commit | 7231598fce4f89be34a93b328032d3ee3c7bae04 (patch) | |
tree | ea496bd1fda64fde8471e5db6893fdc7e25caef5 /llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp | |
parent | de54f584ccca72d069a456d71c7d3d4d400a9198 (diff) | |
download | bcm5719-llvm-7231598fce4f89be34a93b328032d3ee3c7bae04.tar.gz bcm5719-llvm-7231598fce4f89be34a93b328032d3ee3c7bae04.zip |
[NEON] Support vldNq intrinsics in AArch32 (LLVM part)
This patch adds support for the q versions of the dup
(load-to-all-lanes) NEON intrinsics, such as vld2q_dup_f16() for
example.
Currently, non-q versions of the dup intrinsics are implemented
in clang by generating IR that first loads the elements of the
structure into the first lane with the lane (to-single-lane)
intrinsics, and then propagating it to the other lanes. There are at
least two problems with this approach. First, there are no
double-spaced to-single-lane byte-element instructions. For
example, there is no such instruction as 'vld2.8 { d0[0], d2[0]
}, [r0]'. That means we cannot rely on the to-single-lane
intrinsics and instructions to implement the q versions of the
dup intrinsics. Note that to-all-lanes instructions do support
all sizes of data items, including bytes.
The second problem with the current approach is that we need a
separate vdup instruction to propagate the structure to each
lane. So for vld4q_dup_f16() we would need four vdup instructions
in addition to the initial vld instruction.
This patch introduces dup LLVM intrinsics and reworks handling of
the currently supported (non-q) NEON dup intrinsics to expand
them into those LLVM intrinsics, thus eliminating the need for
using to-single-lane intrinsics and instructions.
Additionally, this patch adds support for u64 and s64 dup NEON
intrinsics. These are marked as AArch64-only in the ARM NEON
Reference, but it seems there are no reasons to not support them
in AArch32 mode. Please correct me if that is wrong.
That's what we generate with this patch applied:
vld2q_dup_f16:
vld2.16 {d0[], d2[]}, [r0]
vld2.16 {d1[], d3[]}, [r0]
vld3q_dup_f16:
vld3.16 {d0[], d2[], d4[]}, [r0]
vld3.16 {d1[], d3[], d5[]}, [r0]
vld4q_dup_f16:
vld4.16 {d0[], d2[], d4[], d6[]}, [r0]
vld4.16 {d1[], d3[], d5[], d7[]}, [r0]
Differential Revision: https://reviews.llvm.org/D48439
llvm-svn: 335733
Diffstat (limited to 'llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp')
-rw-r--r-- | llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp | 83 |
1 files changed, 70 insertions, 13 deletions
diff --git a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp index d82bef5b759..439ba9e2c78 100644 --- a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -186,6 +186,13 @@ static const NEONLdStTableEntry NEONLdStTable[] = { { ARM::VLD1q8LowQPseudo_UPD, ARM::VLD1d8Qwb_fixed, true, true, true, SingleLowSpc, 4, 8 ,false}, { ARM::VLD1q8LowTPseudo_UPD, ARM::VLD1d8Twb_fixed, true, true, true, SingleLowSpc, 3, 8 ,false}, +{ ARM::VLD2DUPq16EvenPseudo, ARM::VLD2DUPd16x2, true, false, false, EvenDblSpc, 2, 4 ,false}, +{ ARM::VLD2DUPq16OddPseudo, ARM::VLD2DUPd16x2, true, false, false, OddDblSpc, 2, 4 ,false}, +{ ARM::VLD2DUPq32EvenPseudo, ARM::VLD2DUPd32x2, true, false, false, EvenDblSpc, 2, 2 ,false}, +{ ARM::VLD2DUPq32OddPseudo, ARM::VLD2DUPd32x2, true, false, false, OddDblSpc, 2, 2 ,false}, +{ ARM::VLD2DUPq8EvenPseudo, ARM::VLD2DUPd8x2, true, false, false, EvenDblSpc, 2, 8 ,false}, +{ ARM::VLD2DUPq8OddPseudo, ARM::VLD2DUPd8x2, true, false, false, OddDblSpc, 2, 8 ,false}, + { ARM::VLD2LNd16Pseudo, ARM::VLD2LNd16, true, false, false, SingleSpc, 2, 4 ,true}, { ARM::VLD2LNd16Pseudo_UPD, ARM::VLD2LNd16_UPD, true, true, true, SingleSpc, 2, 4 ,true}, { ARM::VLD2LNd32Pseudo, ARM::VLD2LNd32, true, false, false, SingleSpc, 2, 2 ,true}, @@ -213,6 +220,12 @@ static const NEONLdStTableEntry NEONLdStTable[] = { { ARM::VLD3DUPd32Pseudo_UPD, ARM::VLD3DUPd32_UPD, true, true, true, SingleSpc, 3, 2,true}, { ARM::VLD3DUPd8Pseudo, ARM::VLD3DUPd8, true, false, false, SingleSpc, 3, 8,true}, { ARM::VLD3DUPd8Pseudo_UPD, ARM::VLD3DUPd8_UPD, true, true, true, SingleSpc, 3, 8,true}, +{ ARM::VLD3DUPq16EvenPseudo, ARM::VLD3DUPq16, true, false, false, EvenDblSpc, 3, 4 ,true}, +{ ARM::VLD3DUPq16OddPseudo, ARM::VLD3DUPq16, true, false, false, OddDblSpc, 3, 4 ,true}, +{ ARM::VLD3DUPq32EvenPseudo, ARM::VLD3DUPq32, true, false, false, EvenDblSpc, 3, 2 ,true}, +{ ARM::VLD3DUPq32OddPseudo, ARM::VLD3DUPq32, true, 
false, false, OddDblSpc, 3, 2 ,true}, +{ ARM::VLD3DUPq8EvenPseudo, ARM::VLD3DUPq8, true, false, false, EvenDblSpc, 3, 8 ,true}, +{ ARM::VLD3DUPq8OddPseudo, ARM::VLD3DUPq8, true, false, false, OddDblSpc, 3, 8 ,true}, { ARM::VLD3LNd16Pseudo, ARM::VLD3LNd16, true, false, false, SingleSpc, 3, 4 ,true}, { ARM::VLD3LNd16Pseudo_UPD, ARM::VLD3LNd16_UPD, true, true, true, SingleSpc, 3, 4 ,true}, @@ -248,6 +261,12 @@ static const NEONLdStTableEntry NEONLdStTable[] = { { ARM::VLD4DUPd32Pseudo_UPD, ARM::VLD4DUPd32_UPD, true, true, true, SingleSpc, 4, 2,true}, { ARM::VLD4DUPd8Pseudo, ARM::VLD4DUPd8, true, false, false, SingleSpc, 4, 8,true}, { ARM::VLD4DUPd8Pseudo_UPD, ARM::VLD4DUPd8_UPD, true, true, true, SingleSpc, 4, 8,true}, +{ ARM::VLD4DUPq16EvenPseudo, ARM::VLD4DUPq16, true, false, false, EvenDblSpc, 4, 4 ,true}, +{ ARM::VLD4DUPq16OddPseudo, ARM::VLD4DUPq16, true, false, false, OddDblSpc, 4, 4 ,true}, +{ ARM::VLD4DUPq32EvenPseudo, ARM::VLD4DUPq32, true, false, false, EvenDblSpc, 4, 2 ,true}, +{ ARM::VLD4DUPq32OddPseudo, ARM::VLD4DUPq32, true, false, false, OddDblSpc, 4, 2 ,true}, +{ ARM::VLD4DUPq8EvenPseudo, ARM::VLD4DUPq8, true, false, false, EvenDblSpc, 4, 8 ,true}, +{ ARM::VLD4DUPq8OddPseudo, ARM::VLD4DUPq8, true, false, false, OddDblSpc, 4, 8 ,true}, { ARM::VLD4LNd16Pseudo, ARM::VLD4LNd16, true, false, false, SingleSpc, 4, 4 ,true}, { ARM::VLD4LNd16Pseudo_UPD, ARM::VLD4LNd16_UPD, true, true, true, SingleSpc, 4, 4 ,true}, @@ -463,15 +482,31 @@ void ARMExpandPseudo::ExpandVLD(MachineBasicBlock::iterator &MBBI) { bool DstIsDead = MI.getOperand(OpIdx).isDead(); unsigned DstReg = MI.getOperand(OpIdx++).getReg(); - unsigned D0, D1, D2, D3; - GetDSubRegs(DstReg, RegSpc, TRI, D0, D1, D2, D3); - MIB.addReg(D0, RegState::Define | getDeadRegState(DstIsDead)); - if (NumRegs > 1 && TableEntry->copyAllListRegs) - MIB.addReg(D1, RegState::Define | getDeadRegState(DstIsDead)); - if (NumRegs > 2 && TableEntry->copyAllListRegs) - MIB.addReg(D2, RegState::Define | 
getDeadRegState(DstIsDead)); - if (NumRegs > 3 && TableEntry->copyAllListRegs) - MIB.addReg(D3, RegState::Define | getDeadRegState(DstIsDead)); + if(TableEntry->RealOpc == ARM::VLD2DUPd8x2 || + TableEntry->RealOpc == ARM::VLD2DUPd16x2 || + TableEntry->RealOpc == ARM::VLD2DUPd32x2) { + unsigned SubRegIndex; + if (RegSpc == EvenDblSpc) { + SubRegIndex = ARM::dsub_0; + } else { + assert(RegSpc == OddDblSpc && "Unexpected spacing!"); + SubRegIndex = ARM::dsub_1; + } + unsigned SubReg = TRI->getSubReg(DstReg, SubRegIndex); + unsigned DstRegPair = TRI->getMatchingSuperReg(SubReg, ARM::dsub_0, + &ARM::DPairSpcRegClass); + MIB.addReg(DstRegPair, RegState::Define | getDeadRegState(DstIsDead)); + } else { + unsigned D0, D1, D2, D3; + GetDSubRegs(DstReg, RegSpc, TRI, D0, D1, D2, D3); + MIB.addReg(D0, RegState::Define | getDeadRegState(DstIsDead)); + if (NumRegs > 1 && TableEntry->copyAllListRegs) + MIB.addReg(D1, RegState::Define | getDeadRegState(DstIsDead)); + if (NumRegs > 2 && TableEntry->copyAllListRegs) + MIB.addReg(D2, RegState::Define | getDeadRegState(DstIsDead)); + if (NumRegs > 3 && TableEntry->copyAllListRegs) + MIB.addReg(D3, RegState::Define | getDeadRegState(DstIsDead)); + } if (TableEntry->isUpdating) MIB.add(MI.getOperand(OpIdx++)); @@ -510,10 +545,14 @@ void ARMExpandPseudo::ExpandVLD(MachineBasicBlock::iterator &MBBI) { // has an extra operand that is a use of the super-register. Record the // operand index and skip over it. unsigned SrcOpIdx = 0; - if (RegSpc == EvenDblSpc || RegSpc == OddDblSpc || - RegSpc == SingleLowSpc || RegSpc == SingleHighQSpc || - RegSpc == SingleHighTSpc) - SrcOpIdx = OpIdx++; + if(TableEntry->RealOpc != ARM::VLD2DUPd8x2 && + TableEntry->RealOpc != ARM::VLD2DUPd16x2 && + TableEntry->RealOpc != ARM::VLD2DUPd32x2) { + if (RegSpc == EvenDblSpc || RegSpc == OddDblSpc || + RegSpc == SingleLowSpc || RegSpc == SingleHighQSpc || + RegSpc == SingleHighTSpc) + SrcOpIdx = OpIdx++; + } // Copy the predicate operands. 
MIB.add(MI.getOperand(OpIdx++)); @@ -1674,6 +1713,24 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, case ARM::VLD4DUPd8Pseudo_UPD: case ARM::VLD4DUPd16Pseudo_UPD: case ARM::VLD4DUPd32Pseudo_UPD: + case ARM::VLD2DUPq8EvenPseudo: + case ARM::VLD2DUPq8OddPseudo: + case ARM::VLD2DUPq16EvenPseudo: + case ARM::VLD2DUPq16OddPseudo: + case ARM::VLD2DUPq32EvenPseudo: + case ARM::VLD2DUPq32OddPseudo: + case ARM::VLD3DUPq8EvenPseudo: + case ARM::VLD3DUPq8OddPseudo: + case ARM::VLD3DUPq16EvenPseudo: + case ARM::VLD3DUPq16OddPseudo: + case ARM::VLD3DUPq32EvenPseudo: + case ARM::VLD3DUPq32OddPseudo: + case ARM::VLD4DUPq8EvenPseudo: + case ARM::VLD4DUPq8OddPseudo: + case ARM::VLD4DUPq16EvenPseudo: + case ARM::VLD4DUPq16OddPseudo: + case ARM::VLD4DUPq32EvenPseudo: + case ARM::VLD4DUPq32OddPseudo: ExpandVLD(MBBI); return true; |