diff options
| author | Simon Pilgrim <llvm-dev@redking.me.uk> | 2018-11-12 21:12:38 +0000 |
|---|---|---|
| committer | Simon Pilgrim <llvm-dev@redking.me.uk> | 2018-11-12 21:12:38 +0000 |
| commit | e565e5a9621de42e7b82ad50803fe03beafa0334 (patch) | |
| tree | ab5944465c43952aa2175632d4b8f8a74916aa6d /llvm/lib/Target | |
| parent | a946acd58534764267bc8e3f005f70f52a51c19e (diff) | |
| download | bcm5719-llvm-e565e5a9621de42e7b82ad50803fe03beafa0334.tar.gz bcm5719-llvm-e565e5a9621de42e7b82ad50803fe03beafa0334.zip | |
[X86][SSE] Add lowerVectorShuffleAsByteRotateAndPermute (PR39387)
This patch adds the ability to use a PALIGNR to rotate a pair of inputs to select a range containing all of the referenced elements, followed by a single-input permute to put each element in its final location.
Differential Revision: https://reviews.llvm.org/D54267
llvm-svn: 346706
Diffstat (limited to 'llvm/lib/Target')
| -rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.cpp | 123 |
1 file changed, 115 insertions, 8 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 16b3a6c26ac..8359e37b023 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -10143,7 +10143,8 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, - SelectionDAG &DAG) { + SelectionDAG &DAG, + bool ImmBlends = false) { // We build up the blend mask while checking whether a blend is a viable way // to reduce the shuffle. SmallVector<int, 32> BlendMask(Mask.size(), -1); @@ -10163,6 +10164,12 @@ static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT, PermuteMask[i] = Mask[i] % Size; } + // If only immediate blends, then bail if the blend mask can't be widened to + // i16. + unsigned EltSize = VT.getScalarSizeInBits(); + if (ImmBlends && EltSize == 8 && !canWidenShuffleElements(BlendMask)) + return SDValue(); + SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask); return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask); } @@ -10233,6 +10240,92 @@ static SDValue lowerVectorShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT, return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask); } +/// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then +/// permuting the elements of the result in place. +static SDValue lowerVectorShuffleAsByteRotateAndPermute( + const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, + const X86Subtarget &Subtarget, SelectionDAG &DAG) { + if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) || + (VT.is256BitVector() && !Subtarget.hasAVX2()) || + (VT.is512BitVector() && !Subtarget.hasBWI())) + return SDValue(); + + // We don't currently support lane crossing permutes. 
+ if (is128BitLaneCrossingShuffleMask(VT, Mask)) + return SDValue(); + + int Scale = VT.getScalarSizeInBits() / 8; + int NumLanes = VT.getSizeInBits() / 128; + int NumElts = VT.getVectorNumElements(); + int NumEltsPerLane = NumElts / NumLanes; + + // Determine range of mask elts. + bool Blend1 = true; + bool Blend2 = true; + std::pair<int, int> Range1 = std::make_pair(INT_MAX, INT_MIN); + std::pair<int, int> Range2 = std::make_pair(INT_MAX, INT_MIN); + for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) { + for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) { + int M = Mask[Lane + Elt]; + if (M < 0) + continue; + if (M < NumElts) { + Blend1 &= (M == (Lane + Elt)); + assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask"); + M = M % NumEltsPerLane; + Range1.first = std::min(Range1.first, M); + Range1.second = std::max(Range1.second, M); + } else { + M -= NumElts; + Blend2 &= (M == (Lane + Elt)); + assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask"); + M = M % NumEltsPerLane; + Range2.first = std::min(Range2.first, M); + Range2.second = std::max(Range2.second, M); + } + } + } + + // Bail if we don't need both elements. + // TODO - it might be worth doing this for unary shuffles if the permute + // can be widened. + if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) || + !(0 <= Range2.first && Range2.second < NumEltsPerLane)) + return SDValue(); + + if (VT.getSizeInBits() > 128 && (Blend1 || Blend2)) + return SDValue(); + + // Rotate the 2 ops so we can access both ranges, then permute the result. 
+ auto RotateAndPermute = [&](SDValue Lo, SDValue Hi, int RotAmt, int Ofs) { + MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8); + SDValue Rotate = DAG.getBitcast( + VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi), + DAG.getBitcast(ByteVT, Lo), + DAG.getConstant(Scale * RotAmt, DL, MVT::i8))); + SmallVector<int, 64> PermMask(NumElts, SM_SentinelUndef); + for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) { + for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) { + int M = Mask[Lane + Elt]; + if (M < 0) + continue; + if (M < NumElts) + PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane); + else + PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane); + } + } + return DAG.getVectorShuffle(VT, DL, Rotate, DAG.getUNDEF(VT), PermMask); + }; + + // Check if the ranges are small enough to rotate from either direction. + if (Range2.second < Range1.first) + return RotateAndPermute(V1, V2, Range1.first, 0); + if (Range1.second < Range2.first) + return RotateAndPermute(V2, V1, Range2.first, NumElts); + return SDValue(); +} + /// Generic routine to decompose a shuffle and blend into independent /// blends and permutes. /// @@ -10257,18 +10350,26 @@ static SDValue lowerVectorShuffleAsDecomposedShuffleBlend( BlendMask[i] = i + Size; } - // Try to lower with the simpler initial blend/unpack strategies unless one of - // the input shuffles would be a no-op. We prefer to shuffle inputs as the - // shuffle may be able to fold with a load or other benefit. However, when - // we'll have to do 2x as many shuffles in order to achieve this, - // blending/unpacking first is a better strategy. + // Try to lower with the simpler initial blend/unpack/rotate strategies unless + // one of the input shuffles would be a no-op. We prefer to shuffle inputs as + // the shuffle may be able to fold with a load or other benefit. 
However, when + // we'll have to do 2x as many shuffles in order to achieve this, a 2-input + // pre-shuffle first is a better strategy. if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) { - if (SDValue BlendPerm = - lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG)) + // Only prefer immediate blends to unpack/rotate. + if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute( + DL, VT, V1, V2, Mask, DAG, true)) return BlendPerm; if (SDValue UnpackPerm = lowerVectorShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask, DAG)) return UnpackPerm; + if (SDValue RotatePerm = lowerVectorShuffleAsByteRotateAndPermute( + DL, VT, V1, V2, Mask, Subtarget, DAG)) + return RotatePerm; + // Unpack/rotate failed - try again with variable blends. + if (SDValue BlendPerm = + lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG)) + return BlendPerm; } V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask); @@ -13104,6 +13205,12 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // If we have VBMI we can use one VPERM instead of multiple PSHUFBs. if (Subtarget.hasVBMI() && Subtarget.hasVLX()) return lowerVectorShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, DAG); + + // Use PALIGNR+Permute if possible - permute might become PSHUFB but the + // PALIGNR will be cheaper than the second PSHUFB+OR. + if (SDValue V = lowerVectorShuffleAsByteRotateAndPermute( + DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG)) + return V; } return PSHUFB; |

