diff options
author | Simon Pilgrim <llvm-dev@redking.me.uk> | 2016-06-28 08:08:15 +0000 |
---|---|---|
committer | Simon Pilgrim <llvm-dev@redking.me.uk> | 2016-06-28 08:08:15 +0000 |
commit | c15d2178313a3b904158a472e4aa1f118e040395 (patch) | |
tree | c4106e39019e725374e21f045e25343eb5a59c90 /llvm/lib | |
parent | 36b2c0a683ccfafbaad948ced64b6912aa5bf826 (diff) | |
download | bcm5719-llvm-c15d2178313a3b904158a472e4aa1f118e040395.tar.gz bcm5719-llvm-c15d2178313a3b904158a472e4aa1f118e040395.zip |
[X86][SSE] Added support for combining target shuffles to (V)PSHUFD/VPERMILPD/VPERMILPS immediate permutes
This patch allows target shuffles to be combined to single input immediate permute instructions - (V)PSHUFD/VPERMILPD/VPERMILPS - allowing more general pattern matching than what we currently do and improves the likelihood of memory folding compared to existing patterns which tend to reuse the input in multiple arguments.
Further permute instructions (V)PSHUFLW/(V)PSHUFHW/(V)PERMQ/(V)PERMPD may be added in the future, but it's proven tricky to create test cases for them so far. (V)PSHUFLW/(V)PSHUFHW is already handled quite well in combineTargetShuffle, so it may be that removing some of that code may allow us to perform more of the combining in one place without duplication.
Differential Revision: http://reviews.llvm.org/D21148
llvm-svn: 273999
Diffstat (limited to 'llvm/lib')
-rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.cpp | 106 |
1 files changed, 101 insertions, 5 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index f9ebaea4d12..a5ee5372e99 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -7148,8 +7148,7 @@ static bool isTargetShuffleEquivalent(ArrayRef<int> Mask, /// example. /// /// NB: We rely heavily on "undef" masks preserving the input lane. -static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL, - SelectionDAG &DAG) { +static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) { assert(Mask.size() == 4 && "Only 4-lane shuffle masks"); assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!"); assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!"); @@ -7161,7 +7160,12 @@ static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL, Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2; Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4; Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6; - return DAG.getConstant(Imm, DL, MVT::i8); + return Imm; +} + +static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, SDLoc DL, + SelectionDAG &DAG) { + return DAG.getConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8); } /// \brief Compute whether each element of a shuffle is zeroable. @@ -24529,7 +24533,8 @@ static SDValue combineShuffle256(SDNode *N, SelectionDAG &DAG, static bool matchUnaryVectorShuffle(MVT SrcVT, ArrayRef<int> Mask, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &ShuffleVT) { - bool FloatDomain = SrcVT.isFloatingPoint(); + bool FloatDomain = SrcVT.isFloatingPoint() || + (!Subtarget.hasAVX2() && SrcVT.is256BitVector()); // Match a 128-bit integer vector against a VZEXT_MOVL (MOVQ) instruction. if (!FloatDomain && SrcVT.is128BitVector() && @@ -24607,6 +24612,83 @@ static bool matchUnaryVectorShuffle(MVT SrcVT, ArrayRef<int> Mask, return false; } +// Attempt to match a combined shuffle mask against supported unary immediate +// permute instructions. 
+// TODO: Investigate sharing more of this with shuffle lowering. +static bool matchPermuteVectorShuffle(MVT SrcVT, ArrayRef<int> Mask, + const X86Subtarget &Subtarget, + unsigned &Shuffle, MVT &ShuffleVT, + unsigned &PermuteImm) { + // Ensure we don't contain any zero elements. + for (int M : Mask) { + if (M == SM_SentinelZero) + return false; + assert(SM_SentinelUndef <= M && M < (int)Mask.size() && + "Expected unary shuffle"); + } + + // We only support permutation of 32/64 bit elements. + // TODO - support PSHUFLW/PSHUFHW. + unsigned MaskScalarSizeInBits = SrcVT.getSizeInBits() / Mask.size(); + if (MaskScalarSizeInBits != 32 && MaskScalarSizeInBits != 64) + return false; + MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits); + + // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we + // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here). + bool FloatDomain = SrcVT.isFloatingPoint(); + if (FloatDomain && !Subtarget.hasAVX()) + return false; + + // Pre-AVX2 we must use float shuffles on 256-bit vectors. + if (SrcVT.is256BitVector() && !Subtarget.hasAVX2()) + FloatDomain = true; + + // TODO - support LaneCrossing for AVX2 PERMQ/PERMPD + if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) + return false; + + // VPERMILPD can permute with a non-repeating shuffle. + if (FloatDomain && MaskScalarSizeInBits == 64) { + Shuffle = X86ISD::VPERMILPI; + ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size()); + PermuteImm = 0; + for (int i = 0, e = Mask.size(); i != e; ++i) { + int M = Mask[i]; + if (M == SM_SentinelUndef) + continue; + assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index"); + PermuteImm |= (M & 1) << i; + } + return true; + } + + // We need a repeating shuffle mask for VPERMILPS/PSHUFD. + SmallVector<int, 4> RepeatedMask; + if (!is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) + return false; + + // Narrow the repeated mask for 32-bit element permutes. 
+ SmallVector<int, 4> WordMask = RepeatedMask; + if (MaskScalarSizeInBits == 64) { + WordMask.clear(); + for (int M : RepeatedMask) { + if (M == SM_SentinelUndef) { + WordMask.append(2, SM_SentinelUndef); + continue; + } + WordMask.push_back((M * 2) + 0); + WordMask.push_back((M * 2) + 1); + } + } + + Shuffle = (FloatDomain ? X86ISD::VPERMILPI : X86ISD::PSHUFD); + ShuffleVT = (FloatDomain ? MVT::f32 : MVT::i32); + ShuffleVT = MVT::getVectorVT(ShuffleVT, SrcVT.getSizeInBits() / 32); + PermuteImm = getV4X86ShuffleImm(WordMask); + return true; +} + // Attempt to match a combined unary shuffle mask against supported binary // shuffle instructions. // TODO: Investigate sharing more of this with shuffle lowering. @@ -24708,7 +24790,7 @@ static bool combineX86ShuffleChain(SDValue Input, SDValue Root, // Attempt to match the mask against known shuffle patterns. MVT ShuffleVT; - unsigned Shuffle; + unsigned Shuffle, PermuteImm; if (matchUnaryVectorShuffle(VT, Mask, Subtarget, Shuffle, ShuffleVT)) { if (Depth == 1 && Root.getOpcode() == Shuffle) @@ -24722,6 +24804,20 @@ static bool combineX86ShuffleChain(SDValue Input, SDValue Root, return true; } + if (matchPermuteVectorShuffle(VT, Mask, Subtarget, Shuffle, ShuffleVT, + PermuteImm)) { + if (Depth == 1 && Root.getOpcode() == Shuffle) + return false; // Nothing to do! + Res = DAG.getBitcast(ShuffleVT, Input); + DCI.AddToWorklist(Res.getNode()); + Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res, + DAG.getConstant(PermuteImm, DL, MVT::i8)); + DCI.AddToWorklist(Res.getNode()); + DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res), + /*AddTo*/ true); + return true; + } + if (matchBinaryVectorShuffle(VT, Mask, Shuffle, ShuffleVT)) { if (Depth == 1 && Root.getOpcode() == Shuffle) return false; // Nothing to do! |