summaryrefslogtreecommitdiffstats
path: root/llvm/lib/Target
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib/Target')
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp  123
1 file changed, 115 insertions(+), 8 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 16b3a6c26ac..8359e37b023 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -10143,7 +10143,8 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
SDValue V1, SDValue V2,
ArrayRef<int> Mask,
- SelectionDAG &DAG) {
+ SelectionDAG &DAG,
+ bool ImmBlends = false) {
// We build up the blend mask while checking whether a blend is a viable way
// to reduce the shuffle.
SmallVector<int, 32> BlendMask(Mask.size(), -1);
@@ -10163,6 +10164,12 @@ static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
PermuteMask[i] = Mask[i] % Size;
}
+ // If only immediate blends, then bail if the blend mask can't be widened to
+ // i16.
+ unsigned EltSize = VT.getScalarSizeInBits();
+ if (ImmBlends && EltSize == 8 && !canWidenShuffleElements(BlendMask))
+ return SDValue();
+
SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
}
@@ -10233,6 +10240,92 @@ static SDValue lowerVectorShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask);
}
+/// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
+/// permuting the elements of the result in place.
+static SDValue lowerVectorShuffleAsByteRotateAndPermute(
+ const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
+ const X86Subtarget &Subtarget, SelectionDAG &DAG) {
+ if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) ||
+ (VT.is256BitVector() && !Subtarget.hasAVX2()) ||
+ (VT.is512BitVector() && !Subtarget.hasBWI()))
+ return SDValue();
+
+ // We don't currently support lane crossing permutes.
+ if (is128BitLaneCrossingShuffleMask(VT, Mask))
+ return SDValue();
+
+ int Scale = VT.getScalarSizeInBits() / 8;
+ int NumLanes = VT.getSizeInBits() / 128;
+ int NumElts = VT.getVectorNumElements();
+ int NumEltsPerLane = NumElts / NumLanes;
+
+ // Determine range of mask elts.
+ bool Blend1 = true;
+ bool Blend2 = true;
+ std::pair<int, int> Range1 = std::make_pair(INT_MAX, INT_MIN);
+ std::pair<int, int> Range2 = std::make_pair(INT_MAX, INT_MIN);
+ for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
+ for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
+ int M = Mask[Lane + Elt];
+ if (M < 0)
+ continue;
+ if (M < NumElts) {
+ Blend1 &= (M == (Lane + Elt));
+ assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
+ M = M % NumEltsPerLane;
+ Range1.first = std::min(Range1.first, M);
+ Range1.second = std::max(Range1.second, M);
+ } else {
+ M -= NumElts;
+ Blend2 &= (M == (Lane + Elt));
+ assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
+ M = M % NumEltsPerLane;
+ Range2.first = std::min(Range2.first, M);
+ Range2.second = std::max(Range2.second, M);
+ }
+ }
+ }
+
+ // Bail if we don't need both elements.
+ // TODO - it might be worth doing this for unary shuffles if the permute
+ // can be widened.
+ if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) ||
+ !(0 <= Range2.first && Range2.second < NumEltsPerLane))
+ return SDValue();
+
+ if (VT.getSizeInBits() > 128 && (Blend1 || Blend2))
+ return SDValue();
+
+ // Rotate the 2 ops so we can access both ranges, then permute the result.
+ auto RotateAndPermute = [&](SDValue Lo, SDValue Hi, int RotAmt, int Ofs) {
+ MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
+ SDValue Rotate = DAG.getBitcast(
+ VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi),
+ DAG.getBitcast(ByteVT, Lo),
+ DAG.getConstant(Scale * RotAmt, DL, MVT::i8)));
+ SmallVector<int, 64> PermMask(NumElts, SM_SentinelUndef);
+ for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
+ for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
+ int M = Mask[Lane + Elt];
+ if (M < 0)
+ continue;
+ if (M < NumElts)
+ PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane);
+ else
+ PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane);
+ }
+ }
+ return DAG.getVectorShuffle(VT, DL, Rotate, DAG.getUNDEF(VT), PermMask);
+ };
+
+ // Check if the ranges are small enough to rotate from either direction.
+ if (Range2.second < Range1.first)
+ return RotateAndPermute(V1, V2, Range1.first, 0);
+ if (Range1.second < Range2.first)
+ return RotateAndPermute(V2, V1, Range2.first, NumElts);
+ return SDValue();
+}
+
/// Generic routine to decompose a shuffle and blend into independent
/// blends and permutes.
///
@@ -10257,18 +10350,26 @@ static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(
BlendMask[i] = i + Size;
}
- // Try to lower with the simpler initial blend/unpack strategies unless one of
- // the input shuffles would be a no-op. We prefer to shuffle inputs as the
- // shuffle may be able to fold with a load or other benefit. However, when
- // we'll have to do 2x as many shuffles in order to achieve this,
- // blending/unpacking first is a better strategy.
+ // Try to lower with the simpler initial blend/unpack/rotate strategies unless
+ // one of the input shuffles would be a no-op. We prefer to shuffle inputs as
+ // the shuffle may be able to fold with a load or other benefit. However, when
+ // we'll have to do 2x as many shuffles in order to achieve this, a 2-input
+ // pre-shuffle first is a better strategy.
if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) {
- if (SDValue BlendPerm =
- lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG))
+ // Only prefer immediate blends to unpack/rotate.
+ if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute(
+ DL, VT, V1, V2, Mask, DAG, true))
return BlendPerm;
if (SDValue UnpackPerm =
lowerVectorShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask, DAG))
return UnpackPerm;
+ if (SDValue RotatePerm = lowerVectorShuffleAsByteRotateAndPermute(
+ DL, VT, V1, V2, Mask, Subtarget, DAG))
+ return RotatePerm;
+ // Unpack/rotate failed - try again with variable blends.
+ if (SDValue BlendPerm =
+ lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG))
+ return BlendPerm;
}
V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
@@ -13104,6 +13205,12 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// If we have VBMI we can use one VPERM instead of multiple PSHUFBs.
if (Subtarget.hasVBMI() && Subtarget.hasVLX())
return lowerVectorShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, DAG);
+
+ // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
+ // PALIGNR will be cheaper than the second PSHUFB+OR.
+ if (SDValue V = lowerVectorShuffleAsByteRotateAndPermute(
+ DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
+ return V;
}
return PSHUFB;
OpenPOWER on IntegriCloud