diff options
Diffstat (limited to 'llvm/lib/Target/X86/X86ISelLowering.cpp')
-rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.cpp | 166 |
1 files changed, 166 insertions, 0 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index a1050381cfd..75caf9edea6 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -10755,6 +10755,136 @@ static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) { return true; } +/// Handle case where shuffle sources are coming from the same 128-bit lane and +/// every lane can be represented as the same repeating mask - allowing us to +/// shuffle the sources with the repeating shuffle and then permute the result +/// to the destination lanes. +static SDValue lowerShuffleAsRepeatedMaskAndLanePermute( + SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, + const X86Subtarget &Subtarget, SelectionDAG &DAG) { + int NumElts = VT.getVectorNumElements(); + int NumLanes = VT.getSizeInBits() / 128; + int NumLaneElts = NumElts / NumLanes; + + // On AVX2 we may be able to just shuffle the lowest elements and then + // broadcast the result. + if (Subtarget.hasAVX2()) { + for (unsigned BroadcastSize : {16, 32, 64}) { + if (BroadcastSize <= VT.getScalarSizeInBits()) + continue; + int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits(); + + // Attempt to match a repeating pattern every NumBroadcastElts, + // accounting for UNDEFs but only references the lowest 128-bit + // lane of the inputs. + auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) { + for (int i = 0; i != NumElts; i += NumBroadcastElts) + for (int j = 0; j != NumBroadcastElts; ++j) { + int M = Mask[i + j]; + if (M < 0) + continue; + int &R = RepeatMask[j]; + if (0 != ((M % NumElts) / NumLaneElts)) + return false; + else if (0 <= R && R != M) + return false; + else + R = M; + } + return true; + }; + + SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1); + if (!FindRepeatingBroadcastMask(RepeatMask)) + continue; + + // Shuffle the (lowest) repeated elements in place for broadcast. + SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask); + + // Shuffle the actual broadcast. + SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1); + for (int i = 0; i != NumElts; i += NumBroadcastElts) + for (int j = 0; j != NumBroadcastElts; ++j) + BroadcastMask[i + j] = j; + return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT), + BroadcastMask); + } + } + + // Bail if we already have a repeated lane shuffle mask. + SmallVector<int, 8> RepeatedShuffleMask((unsigned)NumLaneElts, -1); + if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask)) + return SDValue(); + + // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes + // (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes. + int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1; + int NumSubLanes = NumLanes * SubLaneScale; + int NumSubLaneElts = NumLaneElts / SubLaneScale; + + // Check that all the sources are coming from the same lane and see if we + // can form a repeating shuffle mask (local to each lane). At the same time, + // determine the source sub-lane for each destination sub-lane. + int TopSrcSubLane = -1; + SmallVector<int, 8> RepeatedLaneMask((unsigned)NumLaneElts, -1); + SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1); + for (int i = 0; i != NumElts; ++i) { + int M = Mask[i]; + if (M < 0) + continue; + assert(0 <= M && M < 2 * NumElts); + + // Check that the local mask index is the same for every lane. We always do + // this with 128-bit lanes to match in is128BitLaneRepeatedShuffleMask. + int LocalM = M < NumElts ? (M % NumLaneElts) : (M % NumLaneElts) + NumElts; + int &RepeatM = RepeatedLaneMask[i % NumLaneElts]; + if (0 <= RepeatM && RepeatM != LocalM) + return SDValue(); + RepeatM = LocalM; + + // Check that the whole of each destination sub-lane comes from the same + // sub-lane, we need to calculate the source based off where the repeated + // lane mask will have left it. + int SrcLane = (M % NumElts) / NumLaneElts; + int SrcSubLane = (SrcLane * SubLaneScale) + + ((i % NumLaneElts) / NumSubLaneElts); + int &Dst2SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts]; + if (0 <= Dst2SrcSubLane && SrcSubLane != Dst2SrcSubLane) + return SDValue(); + Dst2SrcSubLane = SrcSubLane; + + // Track the top most source sub-lane - by setting the remaining to UNDEF + // we can greatly simplify shuffle matching. + TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane); + } + assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes && + "Unexpected source lane"); + + // Create a repeating shuffle mask for the entire vector. + SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1); + for (int i = 0, e = ((TopSrcSubLane + 1) * NumSubLaneElts); i != e; ++i) { + int M = RepeatedLaneMask[i % NumLaneElts]; + if (M < 0) + continue; + int Lane = i / NumLaneElts; + RepeatedMask[i] = M + (Lane * NumLaneElts); + } + SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask); + + // Shuffle each source sub-lane to its destination. + SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1); + for (int i = 0; i != NumElts; i += NumSubLaneElts) { + int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts]; + if (SrcSubLane < 0) + continue; + for (int j = 0; j != NumSubLaneElts; ++j) + SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts); + } + + return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT), + SubLaneMask); +} + static SDValue lowerVectorShuffleWithSHUFPD(SDLoc DL, MVT VT, ArrayRef<int> Mask, SDValue V1, SDValue V2, SelectionDAG &DAG) { @@ -10829,6 +10959,12 @@ static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1, getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); + // Try to create an in-lane repeating shuffle mask and then shuffle the + // the results into the target lanes. + if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( + DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG)) + return V; + // Otherwise, fall back. return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask, DAG); @@ -10848,6 +10984,12 @@ static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, lowerVectorShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG)) return Op; + // Try to create an in-lane repeating shuffle mask and then shuffle the + // the results into the target lanes. + if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( + DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG)) + return V; + // Try to simplify this by merging 128-bit lanes to enable a lane-based // shuffle. However, if we have AVX2 and either inputs are already in place, // we will be able to shuffle even across lanes the other input in a single @@ -11001,6 +11143,12 @@ static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG); } + // Try to create an in-lane repeating shuffle mask and then shuffle the + // the results into the target lanes. + if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( + DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG)) + return V; + // If we have a single input shuffle with different shuffle patterns in the // two 128-bit lanes use the variable mask to VPERMILPS. if (isSingleInputShuffleMask(Mask)) { @@ -11096,6 +11244,12 @@ static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) return Rotate; + // Try to create an in-lane repeating shuffle mask and then shuffle the + // the results into the target lanes. + if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( + DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) + return V; + // If the shuffle patterns aren't repeated but it is a single input, directly // generate a cross-lane VPERMD instruction. if (isSingleInputShuffleMask(Mask)) { @@ -11165,6 +11319,12 @@ static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG)) return Rotate; + // Try to create an in-lane repeating shuffle mask and then shuffle the + // the results into the target lanes. + if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( + DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG)) + return V; + if (isSingleInputShuffleMask(Mask)) { // There are no generalized cross-lane shuffle operations available on i16 // element types. @@ -11256,6 +11416,12 @@ static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG)) return Rotate; + // Try to create an in-lane repeating shuffle mask and then shuffle the + // the results into the target lanes. + if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( + DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG)) + return V; + if (isSingleInputShuffleMask(Mask)) { // There are no generalized cross-lane shuffle operations available on i8 // element types. |