diff options
author:    Simon Pilgrim <llvm-dev@redking.me.uk>  2016-12-18 14:26:02 +0000
committer: Simon Pilgrim <llvm-dev@redking.me.uk>  2016-12-18 14:26:02 +0000
commit:    e940daf5329275e57a3bde6cba1ed2d345292764 (patch)
tree:      6f031fafcf910f2032dc016f1c75f3fd8c6eb6e0 /llvm/lib
parent:    ccfbf384ba6622cb6610ff3cb265ac3ba88ee019 (diff)
download:  bcm5719-llvm-e940daf5329275e57a3bde6cba1ed2d345292764.tar.gz
           bcm5719-llvm-e940daf5329275e57a3bde6cba1ed2d345292764.zip
[X86][SSE] Add support for combining target shuffles to SHUFPS.
As discussed on D27692, the next step will be to allow cross-domain shuffles once the combined shuffle depth passes a certain point.
llvm-svn: 290064
Diffstat (limited to 'llvm/lib')
-rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.cpp | 110 |
1 files changed, 108 insertions, 2 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 351a22c46fe..e29c7cd6e3b 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -7779,6 +7779,42 @@ is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
   return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
 }
 
+/// Test whether a target shuffle mask is equivalent within each sub-lane.
+/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
+static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
+                                        ArrayRef<int> Mask,
+                                        SmallVectorImpl<int> &RepeatedMask) {
+  int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
+  RepeatedMask.assign(LaneSize, SM_SentinelUndef);
+  int Size = Mask.size();
+  for (int i = 0; i < Size; ++i) {
+    assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
+    if (Mask[i] == SM_SentinelUndef)
+      continue;
+    if (Mask[i] == SM_SentinelZero) {
+      if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
+        return false;
+      RepeatedMask[i % LaneSize] = SM_SentinelZero;
+      continue;
+    }
+    if ((Mask[i] % Size) / LaneSize != i / LaneSize)
+      // This entry crosses lanes, so there is no way to model this shuffle.
+      return false;
+
+    // Ok, handle the in-lane shuffles by detecting if and when they repeat.
+    // Adjust second vector indices to start at LaneSize instead of Size.
+    int LocalM =
+        Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + LaneSize;
+    if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
+      // This is the first non-undef entry in this slot of a 128-bit lane.
+      RepeatedMask[i % LaneSize] = LocalM;
+    else if (RepeatedMask[i % LaneSize] != LocalM)
+      // Found a mismatch with the repeated mask.
+      return false;
+  }
+  return true;
+}
+
 /// \brief Checks whether a shuffle mask is equivalent to an explicit list of
 /// arguments.
 ///
@@ -26274,6 +26310,50 @@ static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
     }
   }
 
+  // Attempt to combine to SHUFPS.
+  if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
+      (MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
+      (MaskVT == MVT::v16f32 && Subtarget.hasAVX512())) {
+    SmallVector<int, 4> RepeatedMask;
+    if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
+      auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
+        int M0 = RepeatedMask[Offset];
+        int M1 = RepeatedMask[Offset + 1];
+
+        if (isUndefInRange(RepeatedMask, Offset, 2)) {
+          return DAG.getUNDEF(MaskVT);
+        } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
+          S0 = (SM_SentinelUndef == M0 ? -1 : 0);
+          S1 = (SM_SentinelUndef == M1 ? -1 : 1);
+          return getZeroVector(MaskVT, Subtarget, DAG, DL);
+        } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
+          S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
+          S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
+          return V1;
+        } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
+          S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
+          S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
+          return V2;
+        }
+
+        return SDValue();
+      };
+
+      int ShufMask[4] = {-1, -1, -1, -1};
+      SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
+      SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
+
+      if (Lo && Hi) {
+        V1 = Lo;
+        V2 = Hi;
+        Shuffle = X86ISD::SHUFP;
+        ShuffleVT = MaskVT;
+        PermuteImm = getV4X86ShuffleImm(ShufMask);
+        return true;
+      }
+    }
+  }
+
   return false;
 }
 
@@ -27294,7 +27374,8 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
   MVT VT = N.getSimpleValueType();
   SmallVector<int, 4> Mask;
 
-  switch (N.getOpcode()) {
+  unsigned Opcode = N.getOpcode();
+  switch (Opcode) {
   case X86ISD::PSHUFD:
   case X86ISD::PSHUFLW:
   case X86ISD::PSHUFHW:
@@ -27369,6 +27450,31 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
     return SDValue();
   }
 
+  case X86ISD::MOVSD:
+  case X86ISD::MOVSS: {
+    bool isFloat = VT.isFloatingPoint();
+    SDValue V0 = peekThroughBitcasts(N->getOperand(0));
+    SDValue V1 = peekThroughBitcasts(N->getOperand(1));
+    bool isFloat0 = V0.getSimpleValueType().isFloatingPoint();
+    bool isFloat1 = V1.getSimpleValueType().isFloatingPoint();
+    bool isZero0 = ISD::isBuildVectorAllZeros(V0.getNode());
+    bool isZero1 = ISD::isBuildVectorAllZeros(V1.getNode());
+    assert(!(isZero0 && isZero1) && "Zeroable shuffle detected.");
+
+    // We often lower to MOVSD/MOVSS from integer as well as native float
+    // types; remove unnecessary domain-crossing bitcasts if we can to make it
+    // easier to combine shuffles later on. We've already accounted for the
+    // domain switching cost when we decided to lower with it.
+    if ((isFloat != isFloat0 || isZero0) && (isFloat != isFloat1 || isZero1)) {
+      MVT NewVT = isFloat ? (X86ISD::MOVSD == Opcode ? MVT::v2i64 : MVT::v4i32)
+                          : (X86ISD::MOVSD == Opcode ? MVT::v2f64 : MVT::v4f32);
+      V0 = DAG.getBitcast(NewVT, V0);
+      V1 = DAG.getBitcast(NewVT, V1);
+      return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, NewVT, V0, V1));
+    }
+
+    return SDValue();
+  }
   case X86ISD::INSERTPS: {
     assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
     SDValue Op0 = N.getOperand(0);
@@ -28275,7 +28381,7 @@ static SDValue combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
     return SDValue();
 
   bool FValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
-  // Check if the first operand is all zeros.This situation only 
+  // Check if the first operand is all zeros.This situation only
   // applies to avx512.
   if (FValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse()) {
     //Invert the cond to not(cond) : xor(op,allones)=not(op)