summaryrefslogtreecommitdiffstats
path: root/llvm/lib/Target/PowerPC
diff options
context:
space:
mode:
authorNemanja Ivanovic <nemanja.i.ibm@gmail.com>2019-09-17 16:45:20 +0000
committerNemanja Ivanovic <nemanja.i.ibm@gmail.com>2019-09-17 16:45:20 +0000
commit1461fb6e783cb946b061f66689b419f74f7fad63 (patch)
treeea1e7a3d9569550c4a1c217572e7ae6e868cb7e8 /llvm/lib/Target/PowerPC
parent4e9082ef95db5d760df4cce00a4351fa122176d6 (diff)
downloadbcm5719-llvm-1461fb6e783cb946b061f66689b419f74f7fad63.tar.gz
bcm5719-llvm-1461fb6e783cb946b061f66689b419f74f7fad63.zip
[PowerPC] Exploit single instruction load-and-splat for word and doubleword
We currently produce a load, followed by (possibly a move for integers and) a splat as separate instructions. VSX has always had a splatting load for doublewords, but as of Power9, we have it for words as well. This patch just exploits these instructions. Differential revision: https://reviews.llvm.org/D63624 llvm-svn: 372139
Diffstat (limited to 'llvm/lib/Target/PowerPC')
-rw-r--r--llvm/lib/Target/PowerPC/PPCISelLowering.cpp97
-rw-r--r--llvm/lib/Target/PowerPC/PPCISelLowering.h12
-rw-r--r--llvm/lib/Target/PowerPC/PPCInstrAltivec.td6
-rw-r--r--llvm/lib/Target/PowerPC/PPCInstrVSX.td14
4 files changed, 115 insertions, 14 deletions
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 1ddc63d3200..e68012cee40 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -1406,6 +1406,7 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
case PPCISD::EXTSWSLI: return "PPCISD::EXTSWSLI";
case PPCISD::LD_VSX_LH: return "PPCISD::LD_VSX_LH";
case PPCISD::FP_EXTEND_HALF: return "PPCISD::FP_EXTEND_HALF";
+ case PPCISD::LD_SPLAT: return "PPCISD::LD_SPLAT";
}
return nullptr;
}
@@ -1778,10 +1779,10 @@ int PPC::isVSLDOIShuffleMask(SDNode *N, unsigned ShuffleKind,
/// isSplatShuffleMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a splat of a single element that is suitable for input to
-/// VSPLTB/VSPLTH/VSPLTW.
+/// one of the splat operations (VSPLTB/VSPLTH/VSPLTW/XXSPLTW/LXVDSX/etc.).
bool PPC::isSplatShuffleMask(ShuffleVectorSDNode *N, unsigned EltSize) {
- assert(N->getValueType(0) == MVT::v16i8 &&
- (EltSize == 1 || EltSize == 2 || EltSize == 4));
+ assert(N->getValueType(0) == MVT::v16i8 && isPowerOf2_32(EltSize) &&
+ EltSize <= 8 && "Can only handle 1,2,4,8 byte element sizes");
// The consecutive indices need to specify an element, not part of two
// different elements. So abandon ship early if this isn't the case.
@@ -2074,10 +2075,11 @@ bool PPC::isXXPERMDIShuffleMask(ShuffleVectorSDNode *N, unsigned &DM,
}
-/// getVSPLTImmediate - Return the appropriate VSPLT* immediate to splat the
-/// specified isSplatShuffleMask VECTOR_SHUFFLE mask.
-unsigned PPC::getVSPLTImmediate(SDNode *N, unsigned EltSize,
- SelectionDAG &DAG) {
+/// getSplatIdxForPPCMnemonics - Return the splat index as a value that is
+/// appropriate for PPC mnemonics (which have a big endian bias - namely
+/// elements are counted from the left of the vector register).
+unsigned PPC::getSplatIdxForPPCMnemonics(SDNode *N, unsigned EltSize,
+ SelectionDAG &DAG) {
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
assert(isSplatShuffleMask(SVOp, EltSize));
if (DAG.getDataLayout().isLittleEndian())
@@ -8185,6 +8187,18 @@ SDValue PPCTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {
Op0.getOperand(1));
}
+const SDValue *getNormalLoadInput(const SDValue &Op) {
+ const SDValue *InputLoad = &Op;
+ if (InputLoad->getOpcode() == ISD::BITCAST)
+ InputLoad = &InputLoad->getOperand(0);
+ if (InputLoad->getOpcode() == ISD::SCALAR_TO_VECTOR)
+ InputLoad = &InputLoad->getOperand(0);
+ if (InputLoad->getOpcode() != ISD::LOAD)
+ return nullptr;
+ LoadSDNode *LD = cast<LoadSDNode>(*InputLoad);
+ return ISD::isNormalLoad(LD) ? InputLoad : nullptr;
+}
+
// If this is a case we can't handle, return null and let the default
// expansion code take care of it. If we CAN select this case, and if it
// selects to a single instruction, return Op. Otherwise, if we can codegen
@@ -8307,6 +8321,34 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
if (! BVN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize,
HasAnyUndefs, 0, !Subtarget.isLittleEndian()) ||
SplatBitSize > 32) {
+
+ const SDValue *InputLoad = getNormalLoadInput(Op.getOperand(0));
+ // Handle load-and-splat patterns as we have instructions that will do this
+ // in one go.
+ if (InputLoad && DAG.isSplatValue(Op, true)) {
+ LoadSDNode *LD = cast<LoadSDNode>(*InputLoad);
+
+ // We have handling for 4 and 8 byte elements.
+ unsigned ElementSize = LD->getMemoryVT().getScalarSizeInBits();
+
+ // Checking for a single use of this load, we have to check for vector
+ // width (128 bits) / ElementSize uses (since each operand of the
+ // BUILD_VECTOR is a separate use of the value).
+ if (InputLoad->getNode()->hasNUsesOfValue(128 / ElementSize, 0) &&
+ ((Subtarget.hasVSX() && ElementSize == 64) ||
+ (Subtarget.hasP9Vector() && ElementSize == 32))) {
+ SDValue Ops[] = {
+ LD->getChain(), // Chain
+ LD->getBasePtr(), // Ptr
+ DAG.getValueType(Op.getValueType()) // VT
+ };
+ return
+ DAG.getMemIntrinsicNode(PPCISD::LD_SPLAT, dl,
+ DAG.getVTList(Op.getValueType(), MVT::Other),
+ Ops, LD->getMemoryVT(), LD->getMemOperand());
+ }
+ }
+
// BUILD_VECTOR nodes that are not constant splats of up to 32-bits can be
// lowered to VSX instructions under certain conditions.
// Without VSX, there is no pattern more efficient than expanding the node.
@@ -8792,6 +8834,45 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
unsigned ShiftElts, InsertAtByte;
bool Swap = false;
+
+ // If this is a load-and-splat, we can do that with a single instruction
+ // in some cases. However if the load has multiple uses, we don't want to
+ // combine it because that will just produce multiple loads.
+ const SDValue *InputLoad = getNormalLoadInput(V1);
+ if (InputLoad && Subtarget.hasVSX() && V2.isUndef() &&
+ (PPC::isSplatShuffleMask(SVOp, 4) || PPC::isSplatShuffleMask(SVOp, 8)) &&
+ InputLoad->hasOneUse()) {
+ bool IsFourByte = PPC::isSplatShuffleMask(SVOp, 4);
+ int SplatIdx =
+ PPC::getSplatIdxForPPCMnemonics(SVOp, IsFourByte ? 4 : 8, DAG);
+
+ LoadSDNode *LD = cast<LoadSDNode>(*InputLoad);
+ // For 4-byte load-and-splat, we need Power9.
+ if ((IsFourByte && Subtarget.hasP9Vector()) || !IsFourByte) {
+ uint64_t Offset = 0;
+ if (IsFourByte)
+ Offset = isLittleEndian ? (3 - SplatIdx) * 4 : SplatIdx * 4;
+ else
+ Offset = isLittleEndian ? (1 - SplatIdx) * 8 : SplatIdx * 8;
+ SDValue BasePtr = LD->getBasePtr();
+ if (Offset != 0)
+ BasePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
+ BasePtr, DAG.getIntPtrConstant(Offset, dl));
+ SDValue Ops[] = {
+ LD->getChain(), // Chain
+ BasePtr, // BasePtr
+ DAG.getValueType(Op.getValueType()) // VT
+ };
+ SDVTList VTL =
+ DAG.getVTList(IsFourByte ? MVT::v4i32 : MVT::v2i64, MVT::Other);
+ SDValue LdSplt =
+ DAG.getMemIntrinsicNode(PPCISD::LD_SPLAT, dl, VTL,
+ Ops, LD->getMemoryVT(), LD->getMemOperand());
+ if (LdSplt.getValueType() != SVOp->getValueType(0))
+ LdSplt = DAG.getBitcast(SVOp->getValueType(0), LdSplt);
+ return LdSplt;
+ }
+ }
if (Subtarget.hasP9Vector() &&
PPC::isXXINSERTWMask(SVOp, ShiftElts, InsertAtByte, Swap,
isLittleEndian)) {
@@ -8868,7 +8949,7 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
if (Subtarget.hasVSX()) {
if (V2.isUndef() && PPC::isSplatShuffleMask(SVOp, 4)) {
- int SplatIdx = PPC::getVSPLTImmediate(SVOp, 4, DAG);
+ int SplatIdx = PPC::getSplatIdxForPPCMnemonics(SVOp, 4, DAG);
SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1);
SDValue Splat = DAG.getNode(PPCISD::XXSPLT, dl, MVT::v4i32, Conv,
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
index f143d52870b..29cf75c62a1 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -466,6 +466,10 @@ namespace llvm {
/// v2f32 value into the lower half of a VSR register.
LD_VSX_LH,
+ /// VSRC, CHAIN = LD_SPLAT, CHAIN, Ptr - a splatting load memory
+ /// instruction such as LXVDSX or LXVWSX.
+ LD_SPLAT,
+
/// CHAIN = STXVD2X CHAIN, VSRC, Ptr - Occurs only for little endian.
/// Maps directly to an stxvd2x instruction that will be preceded by
/// an xxswapd.
@@ -574,9 +578,11 @@ namespace llvm {
bool isXXINSERTWMask(ShuffleVectorSDNode *N, unsigned &ShiftElts,
unsigned &InsertAtByte, bool &Swap, bool IsLE);
- /// getVSPLTImmediate - Return the appropriate VSPLT* immediate to splat the
- /// specified isSplatShuffleMask VECTOR_SHUFFLE mask.
- unsigned getVSPLTImmediate(SDNode *N, unsigned EltSize, SelectionDAG &DAG);
+ /// getSplatIdxForPPCMnemonics - Return the splat index as a value that is
+ /// appropriate for PPC mnemonics (which have a big endian bias - namely
+ /// elements are counted from the left of the vector register).
+ unsigned getSplatIdxForPPCMnemonics(SDNode *N, unsigned EltSize,
+ SelectionDAG &DAG);
/// get_VSPLTI_elt - If this is a build_vector of constants which can be
/// formed by using a vspltis[bhw] instruction of the specified element
diff --git a/llvm/lib/Target/PowerPC/PPCInstrAltivec.td b/llvm/lib/Target/PowerPC/PPCInstrAltivec.td
index 8176c5120a8..4cef5b7eb99 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrAltivec.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrAltivec.td
@@ -215,21 +215,21 @@ def vsldoi_swapped_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
// VSPLT*_get_imm xform function: convert vector_shuffle mask to VSPLT* imm.
def VSPLTB_get_imm : SDNodeXForm<vector_shuffle, [{
- return getI32Imm(PPC::getVSPLTImmediate(N, 1, *CurDAG), SDLoc(N));
+ return getI32Imm(PPC::getSplatIdxForPPCMnemonics(N, 1, *CurDAG), SDLoc(N));
}]>;
def vspltb_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
(vector_shuffle node:$lhs, node:$rhs), [{
return PPC::isSplatShuffleMask(cast<ShuffleVectorSDNode>(N), 1);
}], VSPLTB_get_imm>;
def VSPLTH_get_imm : SDNodeXForm<vector_shuffle, [{
- return getI32Imm(PPC::getVSPLTImmediate(N, 2, *CurDAG), SDLoc(N));
+ return getI32Imm(PPC::getSplatIdxForPPCMnemonics(N, 2, *CurDAG), SDLoc(N));
}]>;
def vsplth_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
(vector_shuffle node:$lhs, node:$rhs), [{
return PPC::isSplatShuffleMask(cast<ShuffleVectorSDNode>(N), 2);
}], VSPLTH_get_imm>;
def VSPLTW_get_imm : SDNodeXForm<vector_shuffle, [{
- return getI32Imm(PPC::getVSPLTImmediate(N, 4, *CurDAG), SDLoc(N));
+ return getI32Imm(PPC::getSplatIdxForPPCMnemonics(N, 4, *CurDAG), SDLoc(N));
}]>;
def vspltw_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
(vector_shuffle node:$lhs, node:$rhs), [{
diff --git a/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/llvm/lib/Target/PowerPC/PPCInstrVSX.td
index f6e2a3259a2..e883ede6543 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrVSX.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrVSX.td
@@ -62,6 +62,10 @@ def SDT_PPCfpexth : SDTypeProfile<1, 2, [
SDTCisVT<0, v2f64>, SDTCisVT<1, v4f32>, SDTCisPtrTy<2>
]>;
+def SDT_PPCldsplat : SDTypeProfile<1, 1, [
+ SDTCisVec<0>, SDTCisPtrTy<1>
+]>;
+
// Little-endian-specific nodes.
def SDT_PPClxvd2x : SDTypeProfile<1, 1, [
SDTCisVT<0, v2f64>, SDTCisPtrTy<1>
@@ -105,6 +109,8 @@ def PPCvabsd : SDNode<"PPCISD::VABSD", SDTVabsd, []>;
def PPCfpexth : SDNode<"PPCISD::FP_EXTEND_HALF", SDT_PPCfpexth, []>;
def PPCldvsxlh : SDNode<"PPCISD::LD_VSX_LH", SDT_PPCldvsxlh,
[SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def PPCldsplat : SDNode<"PPCISD::LD_SPLAT", SDT_PPCldsplat,
+ [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
multiclass XX3Form_Rcr<bits<6> opcode, bits<7> xo, string asmbase,
string asmstr, InstrItinClass itin, Intrinsic Int,
@@ -3931,6 +3937,10 @@ let AddedComplexity = 400 in {
(XSCVDPUXWSs (XFLOADf32 xoaddr:$A)), VSRC), 1))>;
def : Pat<(v4f32 (build_vector f32:$A, f32:$A, f32:$A, f32:$A)),
(v4f32 (XXSPLTW (v4f32 (XSCVDPSPN $A)), 0))>;
+ def : Pat<(v2f64 (PPCldsplat xoaddr:$A)),
+ (v2f64 (LXVDSX xoaddr:$A))>;
+ def : Pat<(v2i64 (PPCldsplat xoaddr:$A)),
+ (v2i64 (LXVDSX xoaddr:$A))>;
// Build vectors of floating point converted to i64.
def : Pat<(v2i64 (build_vector FltToLong.A, FltToLong.A)),
@@ -4180,6 +4190,10 @@ let AddedComplexity = 400 in {
(v2i64 (XXPERMDIs (XSCVDPUXDS (COPY_TO_REGCLASS
(DFLOADf32 iaddrX4:$A),
VSFRC)), 0))>;
+ def : Pat<(v4f32 (PPCldsplat xoaddr:$A)),
+ (v4f32 (LXVWSX xoaddr:$A))>;
+ def : Pat<(v4i32 (PPCldsplat xoaddr:$A)),
+ (v4i32 (LXVWSX xoaddr:$A))>;
}
let Predicates = [IsISA3_0, HasDirectMove, IsBigEndian] in {
OpenPOWER on IntegriCloud