Diffstat (limited to 'llvm/lib')
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp | 68
1 file changed, 52 insertions(+), 16 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 70bb7d2f337..92db12abfdf 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -8480,10 +8480,15 @@ static SDValue lowerVectorShuffleAsInsertPS(SDValue Op, SDValue V1, SDValue V2,
///
/// This specifically targets cases where we end up with alternating between
/// the two inputs, and so can permute them into something that feeds a single
-/// UNPCK instruction.
+/// UNPCK instruction. Note that this routine only targets integer vectors
+/// because for floating point vectors we have a generalized SHUFPS lowering
+/// strategy that handles everything that doesn't *exactly* match an unpack,
+/// making this clever lowering unnecessary.
static SDValue lowerVectorShuffleAsUnpack(MVT VT, SDLoc DL, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
SelectionDAG &DAG) {
+ assert(!VT.isFloatingPoint() &&
+ "This routine only supports integer vectors.");
assert(!isSingleInputShuffleMask(Mask) &&
"This routine should only be used when blending two inputs.");
assert(Mask.size() >= 2 && "Single element masks are invalid.");
@@ -8498,25 +8503,56 @@ static SDValue lowerVectorShuffleAsUnpack(MVT VT, SDLoc DL, SDValue V1,
bool UnpackLo = NumLoInputs >= NumHiInputs;
- SmallVector<int, 32> V1Mask(Mask.size(), -1);
- SmallVector<int, 32> V2Mask(Mask.size(), -1);
- for (int i = 0; i < Size; ++i) {
- if (Mask[i] < 0)
- continue;
+ auto TryUnpack = [&](MVT UnpackVT, int Scale) {
+ SmallVector<int, 32> V1Mask(Mask.size(), -1);
+ SmallVector<int, 32> V2Mask(Mask.size(), -1);
- // We only handle the case where V1 feeds even mask slots and V2 feeds odd
- // mask slots. We rely on canonicalization to ensure this is the case.
- if ((i % 2 == 0) != (Mask[i] < Size))
- return SDValue();
+ for (int i = 0; i < Size; ++i) {
+ if (Mask[i] < 0)
+ continue;
+
+ // Each element of the unpack contains Scale elements from this mask.
+ int UnpackIdx = i / Scale;
+
+ // We only handle the case where V1 feeds the first slots of the unpack.
+ // We rely on canonicalization to ensure this is the case.
+ if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
+ return SDValue();
+
+ // Setup the mask for this input. The indexing is tricky as we have to
+ // handle the unpack stride.
+ SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
+ VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
+ Mask[i] % Size;
+ }
- SmallVectorImpl<int> &VMask = (i % 2 == 0) ? V1Mask : V2Mask;
- VMask[i / 2 + (UnpackLo ? 0 : Size / 2)] = Mask[i] % Size;
+ // Shuffle the inputs into place.
+ V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
+ V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
+
+ // Cast the inputs to the type we will use to unpack them.
+ V1 = DAG.getNode(ISD::BITCAST, DL, UnpackVT, V1);
+ V2 = DAG.getNode(ISD::BITCAST, DL, UnpackVT, V2);
+
+ // Unpack the inputs and cast the result back to the desired type.
+ return DAG.getNode(ISD::BITCAST, DL, VT,
+ DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH,
+ DL, UnpackVT, V1, V2));
+ };
+
+ // We try each unpack from the largest to the smallest to try and find one
+ // that fits this mask.
+ int OrigNumElements = VT.getVectorNumElements();
+ int OrigScalarSize = VT.getScalarSizeInBits();
+ for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2) {
+ int Scale = ScalarSize / OrigScalarSize;
+ int NumElements = OrigNumElements / Scale;
+ MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), NumElements);
+ if (SDValue Unpack = TryUnpack(UnpackVT, Scale))
+ return Unpack;
}
- V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
- V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
- return DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL, VT, V1,
- V2);
+ return SDValue();
}
/// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
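For reference, the following standalone sketch (not part of the commit; the helper name splitForUnpack, the main driver, and the example mask values are illustrative assumptions) mirrors only the mask-splitting arithmetic that the new TryUnpack lambda performs at a widened unpack granularity. It omits the SelectionDAG shuffle/bitcast/unpack node construction and just shows how a v8i16 mask whose element pairs alternate between the two inputs is divided into the two pre-shuffle masks that feed a single UNPCKL.

// Standalone sketch of the index arithmetic in TryUnpack (assumed names).
#include <cstdio>
#include <vector>

// Distribute a Size-element shuffle mask onto per-input pre-shuffle masks so
// that a single UNPCKL/UNPCKH at a granularity of Scale original elements
// produces the requested result. Returns false when V1 does not feed the even
// unpack slots and V2 the odd ones, the only arrangement handled.
static bool splitForUnpack(const std::vector<int> &Mask, int Scale,
                           bool UnpackLo, std::vector<int> &V1Mask,
                           std::vector<int> &V2Mask) {
  int Size = (int)Mask.size();
  V1Mask.assign(Size, -1);
  V2Mask.assign(Size, -1);
  for (int i = 0; i < Size; ++i) {
    if (Mask[i] < 0)
      continue;
    // Each element of the widened unpack covers Scale original elements.
    int UnpackIdx = i / Scale;
    if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
      return false;
    std::vector<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
    VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
        Mask[i] % Size;
  }
  return true;
}

int main() {
  // v8i16 shuffle <0,1,10,11,4,5,14,15>: pairs alternate between V1 (0..7)
  // and V2 (8..15), so it matches a low unpack of i32 lanes (Scale = 2) once
  // each input has been pre-shuffled.
  std::vector<int> Mask = {0, 1, 10, 11, 4, 5, 14, 15};
  std::vector<int> V1Mask, V2Mask;
  if (!splitForUnpack(Mask, /*Scale=*/2, /*UnpackLo=*/true, V1Mask, V2Mask))
    return 1;
  // Prints V1Mask = 0 1 4 5 -1 -1 -1 -1 and V2Mask = 2 3 6 7 -1 -1 -1 -1;
  // interleaving those pre-shuffled halves as i32 reproduces the mask above.
  for (int M : V1Mask)
    printf("%d ", M);
  printf("| ");
  for (int M : V2Mask)
    printf("%d ", M);
  printf("\n");
  return 0;
}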