Diffstat (limited to 'llvm/lib/Target/X86/X86ISelLowering.cpp')
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp | 183
1 file changed, 133 insertions(+), 50 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 65954f5bfc2..2042ee4c920 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -13334,79 +13334,162 @@ static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
/// Lower a vector shuffle by first fixing the 128-bit lanes and then
/// shuffling each lane.
///
-/// This will only succeed when the result of fixing the 128-bit lanes results
-/// in a single-input non-lane-crossing shuffle with a repeating shuffle mask in
-/// each 128-bit lanes. This handles many cases where we can quickly blend away
-/// the lane crosses early and then use simpler shuffles within each lane.
+/// This attempts to create a repeated lane shuffle where each lane of the
+/// result uses one or two lanes of the inputs. The input lanes are first
+/// moved into place with one or two independent shuffles, and the final
+/// shuffle then operates within each 128-bit lane.
///
-/// FIXME: It might be worthwhile at some point to support this without
-/// requiring the 128-bit lane-relative shuffles to be repeating, but currently
-/// in x86 only floating point has interesting non-repeating shuffles, and even
-/// those are still *marginally* more expensive.
+/// FIXME: This should be generalized to 512-bit shuffles.
static SDValue lowerVectorShuffleByMerging128BitLanes(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
const X86Subtarget &Subtarget, SelectionDAG &DAG) {
assert(!V2.isUndef() && "This is only useful with multiple inputs.");
+ if (is128BitLaneRepeatedShuffleMask(VT, Mask))
+ return SDValue();
+
int Size = Mask.size();
int LaneSize = 128 / VT.getScalarSizeInBits();
int NumLanes = Size / LaneSize;
- assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles.");
+ assert(NumLanes == 2 && "Only handles 256-bit shuffles.");
+
+ SmallVector<int, 16> RepeatMask(LaneSize, -1);
+ int LaneSrcs[2][2] = { { -1, -1 }, { -1, -1 } };
+
+ // First pass will try to fill in the RepeatMask from lanes that need two
+ // sources.
+ for (int Lane = 0; Lane != NumLanes; ++Lane) {
+ int Srcs[2] = { -1, -1 };
+ SmallVector<int, 16> InLaneMask(LaneSize, -1);
+ for (int i = 0; i != LaneSize; ++i) {
+ int M = Mask[(Lane * LaneSize) + i];
+ if (M < 0)
+ continue;
+ // Determine which of the 4 possible input lanes (2 from each source)
+ // this element comes from. Assign that as one of the sources for this
+ // lane. We can assign up to 2 sources for this lane. If we run out of
+ // sources we can't do anything.
+ int LaneSrc = M / LaneSize;
+ int Src;
+ if (Srcs[0] < 0 || Srcs[0] == LaneSrc)
+ Src = 0;
+ else if (Srcs[1] < 0 || Srcs[1] == LaneSrc)
+ Src = 1;
+ else
+ return SDValue();
- // See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also
- // check whether the in-128-bit lane shuffles share a repeating pattern.
- SmallVector<int, 4> Lanes((unsigned)NumLanes, -1);
- SmallVector<int, 4> InLaneMask((unsigned)LaneSize, -1);
- for (int i = 0; i < Size; ++i) {
- if (Mask[i] < 0)
+ Srcs[Src] = LaneSrc;
+ InLaneMask[i] = (M % LaneSize) + Src * Size;
+ }
+
+ // If this lane has two sources, see if it fits with the repeat mask so far.
+ if (Srcs[1] < 0)
continue;
- int j = i / LaneSize;
+ LaneSrcs[Lane][0] = Srcs[0];
+ LaneSrcs[Lane][1] = Srcs[1];
- if (Lanes[j] < 0) {
- // First entry we've seen for this lane.
- Lanes[j] = Mask[i] / LaneSize;
- } else if (Lanes[j] != Mask[i] / LaneSize) {
- // This doesn't match the lane selected previously!
- return SDValue();
+ auto MatchMasks = [](ArrayRef<int> M1, ArrayRef<int> M2) {
+ assert(M1.size() == M2.size() && "Unexpected mask size");
+ for (int i = 0, e = M1.size(); i != e; ++i)
+ if (M1[i] >= 0 && M2[i] >= 0 && M1[i] != M2[i])
+ return false;
+ return true;
+ };
+
+ auto MergeMasks = [](ArrayRef<int> Mask, MutableArrayRef<int> MergedMask) {
+ assert(Mask.size() == MergedMask.size() && "Unexpected mask size");
+ for (int i = 0, e = MergedMask.size(); i != e; ++i) {
+ int M = Mask[i];
+ if (M < 0)
+ continue;
+ assert((MergedMask[i] < 0 || MergedMask[i] == M) &&
+ "Unexpected mask element");
+ MergedMask[i] = M;
+ }
+ };
+
+ if (MatchMasks(InLaneMask, RepeatMask)) {
+ // Merge this lane mask into the final repeat mask.
+ MergeMasks(InLaneMask, RepeatMask);
+ continue;
}
- // Check that within each lane we have a consistent shuffle mask.
- int k = i % LaneSize;
- if (InLaneMask[k] < 0) {
- InLaneMask[k] = Mask[i] % LaneSize;
- } else if (InLaneMask[k] != Mask[i] % LaneSize) {
- // This doesn't fit a repeating in-lane mask.
- return SDValue();
+ // Didn't find a match. Swap the operands and try again.
+ std::swap(LaneSrcs[Lane][0], LaneSrcs[Lane][1]);
+ ShuffleVectorSDNode::commuteMask(InLaneMask);
+
+ if (MatchMasks(InLaneMask, RepeatMask)) {
+ // Merge this lane mask into the final repeat mask.
+ MergeMasks(InLaneMask, RepeatMask);
+ continue;
}
+
+ // Couldn't find a match with the operands in either order.
+ return SDValue();
}
- // First shuffle the lanes into place.
- MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64,
- VT.getSizeInBits() / 64);
- SmallVector<int, 8> LaneMask((unsigned)NumLanes * 2, -1);
- for (int i = 0; i < NumLanes; ++i)
- if (Lanes[i] >= 0) {
- LaneMask[2 * i + 0] = 2*Lanes[i] + 0;
- LaneMask[2 * i + 1] = 2*Lanes[i] + 1;
+ // Now handle any lanes with only one source.
+ for (int Lane = 0; Lane != NumLanes; ++Lane) {
+ // If this lane has already been processed, skip it.
+ if (LaneSrcs[Lane][0] >= 0)
+ continue;
+
+ for (int i = 0; i != LaneSize; ++i) {
+ int M = Mask[(Lane * LaneSize) + i];
+ if (M < 0)
+ continue;
+
+ // If RepeatMask isn't defined yet we can define it ourselves.
+ if (RepeatMask[i] < 0)
+ RepeatMask[i] = M % LaneSize;
+
+ if (RepeatMask[i] < Size) {
+ if (RepeatMask[i] != M % LaneSize)
+ return SDValue();
+ LaneSrcs[Lane][0] = M / LaneSize;
+ } else {
+ if (RepeatMask[i] != ((M % LaneSize) + Size))
+ return SDValue();
+ LaneSrcs[Lane][1] = M / LaneSize;
+ }
}
- V1 = DAG.getBitcast(LaneVT, V1);
- V2 = DAG.getBitcast(LaneVT, V2);
- SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask);
+ if (LaneSrcs[Lane][0] < 0 && LaneSrcs[Lane][1] < 0)
+ return SDValue();
+ }
- // Cast it back to the type we actually want.
- LaneShuffle = DAG.getBitcast(VT, LaneShuffle);
+ SmallVector<int, 16> NewMask(Size, -1);
+ for (int Lane = 0; Lane != NumLanes; ++Lane) {
+ int Src = LaneSrcs[Lane][0];
+ for (int i = 0; i != LaneSize; ++i) {
+ int M = -1;
+ if (Src >= 0)
+ M = Src * LaneSize + i;
+ NewMask[Lane * LaneSize + i] = M;
+ }
+ }
+ SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
- // Now do a simple shuffle that isn't lane crossing.
- SmallVector<int, 8> NewMask((unsigned)Size, -1);
- for (int i = 0; i < Size; ++i)
- if (Mask[i] >= 0)
- NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize;
- assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) &&
- "Must not introduce lane crosses at this point!");
+ for (int Lane = 0; Lane != NumLanes; ++Lane) {
+ int Src = LaneSrcs[Lane][1];
+ for (int i = 0; i != LaneSize; ++i) {
+ int M = -1;
+ if (Src >= 0)
+ M = Src * LaneSize + i;
+ NewMask[Lane * LaneSize + i] = M;
+ }
+ }
+ SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
- return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);
+ for (int i = 0; i != Size; ++i) {
+ NewMask[i] = RepeatMask[i % LaneSize];
+ if (NewMask[i] < 0)
+ continue;
+
+ NewMask[i] += (i / LaneSize) * LaneSize;
+ }
+ return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask);
}
/// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
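
The new lowering splits a lane-crossing two-input shuffle into two lane-permuting shuffles (NewV1 and NewV2 above) followed by a single shuffle whose mask repeats in every 128-bit lane. As a standalone sanity check (not part of the patch), the C++ sketch below replays the decomposition on the v8f32-style mask {0,8,1,9,12,4,13,5}: the first pass derives LaneSrcs = {{0,2},{3,1}} and the repeated in-lane mask {0,8,1,9}. The shuffle() helper is a hypothetical stand-in for DAG.getVectorShuffle acting on concat(V1,V2); the masks are the ones the patch's code would compute.

// Standalone illustration of the lane-merging decomposition; elements are
// tagged with their index into concat(V1,V2) so results are easy to read.
#include <array>
#include <cassert>
#include <cstdio>

using V8 = std::array<int, 8>;

// Mimics DAG.getVectorShuffle on concat(A, B). Mask entries 0-7 select
// from A, 8-15 from B (no undef entries in this example).
static V8 shuffle(const V8 &A, const V8 &B, const V8 &Mask) {
  V8 R{};
  for (int i = 0; i != 8; ++i)
    R[i] = Mask[i] < 8 ? A[Mask[i]] : B[Mask[i] - 8];
  return R;
}

int main() {
  V8 V1{0, 1, 2, 3, 4, 5, 6, 7}, V2{8, 9, 10, 11, 12, 13, 14, 15};

  // Lane 0 reads V1.lane0/V2.lane0; lane 1 reads V2.lane1/V1.lane1.
  // The mask crosses lanes and does not repeat, so the early bail-out
  // on is128BitLaneRepeatedShuffleMask does not fire.
  V8 Mask{0, 8, 1, 9, 12, 4, 13, 5};

  // NewV1 gathers each output lane's first source lane (LaneSrcs[i][0]),
  // NewV2 the second (LaneSrcs[i][1]).
  V8 NewV1 = shuffle(V1, V2, {0, 1, 2, 3, 12, 13, 14, 15});
  V8 NewV2 = shuffle(V1, V2, {8, 9, 10, 11, 4, 5, 6, 7});

  // The final mask repeats {0,8,1,9} in both lanes (an unpcklps pattern),
  // so it no longer crosses 128-bit lanes.
  V8 Result = shuffle(NewV1, NewV2, {0, 8, 1, 9, 4, 12, 5, 13});

  for (int i = 0; i != 8; ++i)
    assert(Result[i] == Mask[i] && "decomposition must match original mask");
  std::puts("lane-merged decomposition reproduces the original shuffle");
}

The payoff is that the two lane moves are cheap whole-lane operations (VPERM2F128/VPERM2I128-style), and the remaining work is a repeated in-lane mask that the existing 128-bit shuffle lowering already handles.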
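Pass 1's bookkeeping rests on three mask operations: MatchMasks treats two masks as compatible when they never disagree on an element both define, MergeMasks copies the defined elements into the accumulated RepeatMask, and the commute retry swaps which operand each element refers to before matching once more. The sketch below imitates that logic outside LLVM under the patch's encoding, where first-source entries live in [0, LaneSize) and second-source entries in [Size, Size + LaneSize); commute() is a hand-rolled stand-in for the intended effect of ShuffleVectorSDNode::commuteMask in that encoding.

// Standalone imitation of the MatchMasks/MergeMasks/commute bookkeeping;
// these are illustrations, not the LLVM implementations.
#include <cassert>
#include <vector>

// Compatible when the masks never disagree on an element both define.
static bool matchMasks(const std::vector<int> &M1, const std::vector<int> &M2) {
  assert(M1.size() == M2.size() && "Unexpected mask size");
  for (int i = 0, e = (int)M1.size(); i != e; ++i)
    if (M1[i] >= 0 && M2[i] >= 0 && M1[i] != M2[i])
      return false;
  return true;
}

// Copy every defined element of Mask into the accumulated repeat mask.
static void mergeMasks(const std::vector<int> &Mask, std::vector<int> &Merged) {
  assert(Mask.size() == Merged.size() && "Unexpected mask size");
  for (int i = 0, e = (int)Merged.size(); i != e; ++i)
    if (Mask[i] >= 0)
      Merged[i] = Mask[i];
}

// Swap which operand each defined element refers to: first-source entries
// are in [0, LaneSize), second-source entries in [Size, Size + LaneSize).
static void commute(std::vector<int> &Mask, int Size) {
  for (int &M : Mask)
    if (M >= 0)
      M = (M < Size) ? M + Size : M - Size;
}

int main() {
  const int Size = 8; // v8f32: 8 elements total, LaneSize = 4.
  std::vector<int> Repeat = {0, -1, 1, 9}; // repeat mask built so far
  std::vector<int> InLane = {8, 2, 9, 1};  // this lane, operands swapped

  assert(!matchMasks(InLane, Repeat)); // direct match fails: 8 vs 0

  commute(InLane, Size);               // becomes {0, 10, 1, 9}
  assert(matchMasks(InLane, Repeat));  // now fits the repeat mask
  mergeMasks(InLane, Repeat);          // Repeat becomes {0, 10, 1, 9}
  assert(Repeat[1] == 10);             // the undef slot got filled in
}

Undefined (-1) entries stay unconstrained until some lane defines them, which is why the second pass can still check single-source lanes against whatever RepeatMask the two-source lanes produced.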