diff options
| author | Nemanja Ivanovic <nemanja.i.ibm@gmail.com> | 2019-09-17 16:45:20 +0000 |
|---|---|---|
| committer | Nemanja Ivanovic <nemanja.i.ibm@gmail.com> | 2019-09-17 16:45:20 +0000 |
| commit | 1461fb6e783cb946b061f66689b419f74f7fad63 (patch) | |
| tree | ea1e7a3d9569550c4a1c217572e7ae6e868cb7e8 /llvm/lib/Target/PowerPC | |
| parent | 4e9082ef95db5d760df4cce00a4351fa122176d6 (diff) | |
| download | bcm5719-llvm-1461fb6e783cb946b061f66689b419f74f7fad63.tar.gz bcm5719-llvm-1461fb6e783cb946b061f66689b419f74f7fad63.zip | |
[PowerPC] Exploit single instruction load-and-splat for word and doubleword
We currently produce a load followed by a splat (and, for integers, possibly an
intervening move) as separate instructions. VSX has always had a splatting load for
doublewords, but as of Power9, we have it for words as well. This patch just
exploits these instructions.
Differential revision: https://reviews.llvm.org/D63624
llvm-svn: 372139
Diffstat (limited to 'llvm/lib/Target/PowerPC')
| -rw-r--r-- | llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 97 | ||||
| -rw-r--r-- | llvm/lib/Target/PowerPC/PPCISelLowering.h | 12 | ||||
| -rw-r--r-- | llvm/lib/Target/PowerPC/PPCInstrAltivec.td | 6 | ||||
| -rw-r--r-- | llvm/lib/Target/PowerPC/PPCInstrVSX.td | 14 |
4 files changed, 115 insertions, 14 deletions
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 1ddc63d3200..e68012cee40 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -1406,6 +1406,7 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { case PPCISD::EXTSWSLI: return "PPCISD::EXTSWSLI"; case PPCISD::LD_VSX_LH: return "PPCISD::LD_VSX_LH"; case PPCISD::FP_EXTEND_HALF: return "PPCISD::FP_EXTEND_HALF"; + case PPCISD::LD_SPLAT: return "PPCISD::LD_SPLAT"; } return nullptr; } @@ -1778,10 +1779,10 @@ int PPC::isVSLDOIShuffleMask(SDNode *N, unsigned ShuffleKind, /// isSplatShuffleMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a splat of a single element that is suitable for input to -/// VSPLTB/VSPLTH/VSPLTW. +/// one of the splat operations (VSPLTB/VSPLTH/VSPLTW/XXSPLTW/LXVDSX/etc.). bool PPC::isSplatShuffleMask(ShuffleVectorSDNode *N, unsigned EltSize) { - assert(N->getValueType(0) == MVT::v16i8 && - (EltSize == 1 || EltSize == 2 || EltSize == 4)); + assert(N->getValueType(0) == MVT::v16i8 && isPowerOf2_32(EltSize) && + EltSize <= 8 && "Can only handle 1,2,4,8 byte element sizes"); // The consecutive indices need to specify an element, not part of two // different elements. So abandon ship early if this isn't the case. @@ -2074,10 +2075,11 @@ bool PPC::isXXPERMDIShuffleMask(ShuffleVectorSDNode *N, unsigned &DM, } -/// getVSPLTImmediate - Return the appropriate VSPLT* immediate to splat the -/// specified isSplatShuffleMask VECTOR_SHUFFLE mask. -unsigned PPC::getVSPLTImmediate(SDNode *N, unsigned EltSize, - SelectionDAG &DAG) { +/// getSplatIdxForPPCMnemonics - Return the splat index as a value that is +/// appropriate for PPC mnemonics (which have a big endian bias - namely +/// elements are counted from the left of the vector register). 
+unsigned PPC::getSplatIdxForPPCMnemonics(SDNode *N, unsigned EltSize, + SelectionDAG &DAG) { ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); assert(isSplatShuffleMask(SVOp, EltSize)); if (DAG.getDataLayout().isLittleEndian()) @@ -8185,6 +8187,18 @@ SDValue PPCTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const { Op0.getOperand(1)); } +const SDValue *getNormalLoadInput(const SDValue &Op) { + const SDValue *InputLoad = &Op; + if (InputLoad->getOpcode() == ISD::BITCAST) + InputLoad = &InputLoad->getOperand(0); + if (InputLoad->getOpcode() == ISD::SCALAR_TO_VECTOR) + InputLoad = &InputLoad->getOperand(0); + if (InputLoad->getOpcode() != ISD::LOAD) + return nullptr; + LoadSDNode *LD = cast<LoadSDNode>(*InputLoad); + return ISD::isNormalLoad(LD) ? InputLoad : nullptr; +} + // If this is a case we can't handle, return null and let the default // expansion code take care of it. If we CAN select this case, and if it // selects to a single instruction, return Op. Otherwise, if we can codegen @@ -8307,6 +8321,34 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, if (! BVN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize, HasAnyUndefs, 0, !Subtarget.isLittleEndian()) || SplatBitSize > 32) { + + const SDValue *InputLoad = getNormalLoadInput(Op.getOperand(0)); + // Handle load-and-splat patterns as we have instructions that will do this + // in one go. + if (InputLoad && DAG.isSplatValue(Op, true)) { + LoadSDNode *LD = cast<LoadSDNode>(*InputLoad); + + // We have handling for 4 and 8 byte elements. + unsigned ElementSize = LD->getMemoryVT().getScalarSizeInBits(); + + // Checking for a single use of this load, we have to check for vector + // width (128 bits) / ElementSize uses (since each operand of the + // BUILD_VECTOR is a separate use of the value. 
+ if (InputLoad->getNode()->hasNUsesOfValue(128 / ElementSize, 0) && + ((Subtarget.hasVSX() && ElementSize == 64) || + (Subtarget.hasP9Vector() && ElementSize == 32))) { + SDValue Ops[] = { + LD->getChain(), // Chain + LD->getBasePtr(), // Ptr + DAG.getValueType(Op.getValueType()) // VT + }; + return + DAG.getMemIntrinsicNode(PPCISD::LD_SPLAT, dl, + DAG.getVTList(Op.getValueType(), MVT::Other), + Ops, LD->getMemoryVT(), LD->getMemOperand()); + } + } + // BUILD_VECTOR nodes that are not constant splats of up to 32-bits can be // lowered to VSX instructions under certain conditions. // Without VSX, there is no pattern more efficient than expanding the node. @@ -8792,6 +8834,45 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, unsigned ShiftElts, InsertAtByte; bool Swap = false; + + // If this is a load-and-splat, we can do that with a single instruction + // in some cases. However if the load has multiple uses, we don't want to + // combine it because that will just produce multiple loads. + const SDValue *InputLoad = getNormalLoadInput(V1); + if (InputLoad && Subtarget.hasVSX() && V2.isUndef() && + (PPC::isSplatShuffleMask(SVOp, 4) || PPC::isSplatShuffleMask(SVOp, 8)) && + InputLoad->hasOneUse()) { + bool IsFourByte = PPC::isSplatShuffleMask(SVOp, 4); + int SplatIdx = + PPC::getSplatIdxForPPCMnemonics(SVOp, IsFourByte ? 4 : 8, DAG); + + LoadSDNode *LD = cast<LoadSDNode>(*InputLoad); + // For 4-byte load-and-splat, we need Power9. + if ((IsFourByte && Subtarget.hasP9Vector()) || !IsFourByte) { + uint64_t Offset = 0; + if (IsFourByte) + Offset = isLittleEndian ? (3 - SplatIdx) * 4 : SplatIdx * 4; + else + Offset = isLittleEndian ? 
(1 - SplatIdx) * 8 : SplatIdx * 8; + SDValue BasePtr = LD->getBasePtr(); + if (Offset != 0) + BasePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), + BasePtr, DAG.getIntPtrConstant(Offset, dl)); + SDValue Ops[] = { + LD->getChain(), // Chain + BasePtr, // BasePtr + DAG.getValueType(Op.getValueType()) // VT + }; + SDVTList VTL = + DAG.getVTList(IsFourByte ? MVT::v4i32 : MVT::v2i64, MVT::Other); + SDValue LdSplt = + DAG.getMemIntrinsicNode(PPCISD::LD_SPLAT, dl, VTL, + Ops, LD->getMemoryVT(), LD->getMemOperand()); + if (LdSplt.getValueType() != SVOp->getValueType(0)) + LdSplt = DAG.getBitcast(SVOp->getValueType(0), LdSplt); + return LdSplt; + } + } if (Subtarget.hasP9Vector() && PPC::isXXINSERTWMask(SVOp, ShiftElts, InsertAtByte, Swap, isLittleEndian)) { @@ -8868,7 +8949,7 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, if (Subtarget.hasVSX()) { if (V2.isUndef() && PPC::isSplatShuffleMask(SVOp, 4)) { - int SplatIdx = PPC::getVSPLTImmediate(SVOp, 4, DAG); + int SplatIdx = PPC::getSplatIdxForPPCMnemonics(SVOp, 4, DAG); SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1); SDValue Splat = DAG.getNode(PPCISD::XXSPLT, dl, MVT::v4i32, Conv, diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h index f143d52870b..29cf75c62a1 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.h +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -466,6 +466,10 @@ namespace llvm { /// v2f32 value into the lower half of a VSR register. LD_VSX_LH, + /// VSRC, CHAIN = LD_SPLAT, CHAIN, Ptr - a splatting load memory + /// instructions such as LXVDSX, LXVWSX. + LD_SPLAT, + /// CHAIN = STXVD2X CHAIN, VSRC, Ptr - Occurs only for little endian. /// Maps directly to an stxvd2x instruction that will be preceded by /// an xxswapd. 
@@ -574,9 +578,11 @@ namespace llvm { bool isXXINSERTWMask(ShuffleVectorSDNode *N, unsigned &ShiftElts, unsigned &InsertAtByte, bool &Swap, bool IsLE); - /// getVSPLTImmediate - Return the appropriate VSPLT* immediate to splat the - /// specified isSplatShuffleMask VECTOR_SHUFFLE mask. - unsigned getVSPLTImmediate(SDNode *N, unsigned EltSize, SelectionDAG &DAG); + /// getSplatIdxForPPCMnemonics - Return the splat index as a value that is + /// appropriate for PPC mnemonics (which have a big endian bias - namely + /// elements are counted from the left of the vector register). + unsigned getSplatIdxForPPCMnemonics(SDNode *N, unsigned EltSize, + SelectionDAG &DAG); /// get_VSPLTI_elt - If this is a build_vector of constants which can be /// formed by using a vspltis[bhw] instruction of the specified element diff --git a/llvm/lib/Target/PowerPC/PPCInstrAltivec.td b/llvm/lib/Target/PowerPC/PPCInstrAltivec.td index 8176c5120a8..4cef5b7eb99 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrAltivec.td +++ b/llvm/lib/Target/PowerPC/PPCInstrAltivec.td @@ -215,21 +215,21 @@ def vsldoi_swapped_shuffle : PatFrag<(ops node:$lhs, node:$rhs), // VSPLT*_get_imm xform function: convert vector_shuffle mask to VSPLT* imm. 
def VSPLTB_get_imm : SDNodeXForm<vector_shuffle, [{ - return getI32Imm(PPC::getVSPLTImmediate(N, 1, *CurDAG), SDLoc(N)); + return getI32Imm(PPC::getSplatIdxForPPCMnemonics(N, 1, *CurDAG), SDLoc(N)); }]>; def vspltb_shuffle : PatFrag<(ops node:$lhs, node:$rhs), (vector_shuffle node:$lhs, node:$rhs), [{ return PPC::isSplatShuffleMask(cast<ShuffleVectorSDNode>(N), 1); }], VSPLTB_get_imm>; def VSPLTH_get_imm : SDNodeXForm<vector_shuffle, [{ - return getI32Imm(PPC::getVSPLTImmediate(N, 2, *CurDAG), SDLoc(N)); + return getI32Imm(PPC::getSplatIdxForPPCMnemonics(N, 2, *CurDAG), SDLoc(N)); }]>; def vsplth_shuffle : PatFrag<(ops node:$lhs, node:$rhs), (vector_shuffle node:$lhs, node:$rhs), [{ return PPC::isSplatShuffleMask(cast<ShuffleVectorSDNode>(N), 2); }], VSPLTH_get_imm>; def VSPLTW_get_imm : SDNodeXForm<vector_shuffle, [{ - return getI32Imm(PPC::getVSPLTImmediate(N, 4, *CurDAG), SDLoc(N)); + return getI32Imm(PPC::getSplatIdxForPPCMnemonics(N, 4, *CurDAG), SDLoc(N)); }]>; def vspltw_shuffle : PatFrag<(ops node:$lhs, node:$rhs), (vector_shuffle node:$lhs, node:$rhs), [{ diff --git a/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/llvm/lib/Target/PowerPC/PPCInstrVSX.td index f6e2a3259a2..e883ede6543 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrVSX.td +++ b/llvm/lib/Target/PowerPC/PPCInstrVSX.td @@ -62,6 +62,10 @@ def SDT_PPCfpexth : SDTypeProfile<1, 2, [ SDTCisVT<0, v2f64>, SDTCisVT<1, v4f32>, SDTCisPtrTy<2> ]>; +def SDT_PPCldsplat : SDTypeProfile<1, 1, [ + SDTCisVec<0>, SDTCisPtrTy<1> +]>; + // Little-endian-specific nodes. 
def SDT_PPClxvd2x : SDTypeProfile<1, 1, [ SDTCisVT<0, v2f64>, SDTCisPtrTy<1> @@ -105,6 +109,8 @@ def PPCvabsd : SDNode<"PPCISD::VABSD", SDTVabsd, []>; def PPCfpexth : SDNode<"PPCISD::FP_EXTEND_HALF", SDT_PPCfpexth, []>; def PPCldvsxlh : SDNode<"PPCISD::LD_VSX_LH", SDT_PPCldvsxlh, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; +def PPCldsplat : SDNode<"PPCISD::LD_SPLAT", SDT_PPCldsplat, + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; multiclass XX3Form_Rcr<bits<6> opcode, bits<7> xo, string asmbase, string asmstr, InstrItinClass itin, Intrinsic Int, @@ -3931,6 +3937,10 @@ let AddedComplexity = 400 in { (XSCVDPUXWSs (XFLOADf32 xoaddr:$A)), VSRC), 1))>; def : Pat<(v4f32 (build_vector f32:$A, f32:$A, f32:$A, f32:$A)), (v4f32 (XXSPLTW (v4f32 (XSCVDPSPN $A)), 0))>; + def : Pat<(v2f64 (PPCldsplat xoaddr:$A)), + (v2f64 (LXVDSX xoaddr:$A))>; + def : Pat<(v2i64 (PPCldsplat xoaddr:$A)), + (v2i64 (LXVDSX xoaddr:$A))>; // Build vectors of floating point converted to i64. def : Pat<(v2i64 (build_vector FltToLong.A, FltToLong.A)), @@ -4180,6 +4190,10 @@ let AddedComplexity = 400 in { (v2i64 (XXPERMDIs (XSCVDPUXDS (COPY_TO_REGCLASS (DFLOADf32 iaddrX4:$A), VSFRC)), 0))>; + def : Pat<(v4f32 (PPCldsplat xoaddr:$A)), + (v4f32 (LXVWSX xoaddr:$A))>; + def : Pat<(v4i32 (PPCldsplat xoaddr:$A)), + (v4i32 (LXVWSX xoaddr:$A))>; } let Predicates = [IsISA3_0, HasDirectMove, IsBigEndian] in { |

