diff options
| author | Simon Pilgrim <llvm-dev@redking.me.uk> | 2017-09-27 20:19:53 +0000 |
|---|---|---|
| committer | Simon Pilgrim <llvm-dev@redking.me.uk> | 2017-09-27 20:19:53 +0000 |
| commit | 870007b4f892794051675c5bfe8dc31dc6be017b (patch) | |
| tree | 71fbf9a5a6342af456b48f394661dfd726b877b2 /llvm/lib/Target | |
| parent | 967d438439ac23afe4f6c684a27ac15daff11872 (diff) | |
| download | bcm5719-llvm-870007b4f892794051675c5bfe8dc31dc6be017b.tar.gz bcm5719-llvm-870007b4f892794051675c5bfe8dc31dc6be017b.zip | |
[X86][SSE] Pull out variable shuffle mask combine logic. NFCI.
Hopefully this will make it easier to vary the combine depth threshold per-target.
llvm-svn: 314337
Diffstat (limited to 'llvm/lib/Target')
| -rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.cpp | 23 |
1 file changed, 13 insertions, 10 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 02ce5eb34f2..e78f70cc52c 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -27779,12 +27779,16 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, if (Depth < 2) return SDValue(); + // Depth threshold above which we can efficiently use variable mask shuffles. + // TODO This should probably be target specific. + bool AllowVariableMask = (Depth >= 3) || HasVariableMask; + bool MaskContainsZeros = any_of(Mask, [](int M) { return M == SM_SentinelZero; }); if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) { // If we have a single input lane-crossing shuffle then lower to VPERMV. - if (UnaryShuffle && (Depth >= 3 || HasVariableMask) && !MaskContainsZeros && + if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros && ((Subtarget.hasAVX2() && (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) || (Subtarget.hasAVX512() && @@ -27805,7 +27809,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero // vector as the second source. - if (UnaryShuffle && (Depth >= 3 || HasVariableMask) && + if (UnaryShuffle && AllowVariableMask && ((Subtarget.hasAVX512() && (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 || MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) || @@ -27833,7 +27837,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, } // If we have a dual input lane-crossing shuffle then lower to VPERMV3. 
- if ((Depth >= 3 || HasVariableMask) && !MaskContainsZeros && + if (AllowVariableMask && !MaskContainsZeros && ((Subtarget.hasAVX512() && (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 || MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) || @@ -27859,7 +27863,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, // See if we can combine a single input shuffle with zeros to a bit-mask, // which is much simpler than any shuffle. - if (UnaryShuffle && MaskContainsZeros && (Depth >= 3 || HasVariableMask) && + if (UnaryShuffle && MaskContainsZeros && AllowVariableMask && isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) && DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) { APInt Zero = APInt::getNullValue(MaskEltSizeInBits); @@ -27890,7 +27894,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, // If we have a single input shuffle with different shuffle patterns in the // the 128-bit lanes use the variable mask to VPERMILPS. // TODO Combine other mask types at higher depths. - if (UnaryShuffle && HasVariableMask && !MaskContainsZeros && + if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros && ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) || (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) { SmallVector<SDValue, 16> VPermIdx; @@ -27910,7 +27914,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, // With XOP, binary shuffles of 128/256-bit floating point vectors can combine // to VPERMIL2PD/VPERMIL2PS. - if ((Depth >= 3 || HasVariableMask) && Subtarget.hasXOP() && + if (AllowVariableMask && Subtarget.hasXOP() && (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 || MaskVT == MVT::v8f32)) { // VPERMIL2 Operation. 
@@ -27952,7 +27956,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, // Intel's manuals suggest only using PSHUFB if doing so replacing 5 // instructions, but in practice PSHUFB tends to be *very* fast so we're // more aggressive. - if (UnaryShuffle && (Depth >= 3 || HasVariableMask) && + if (UnaryShuffle && AllowVariableMask && ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) || (RootVT.is256BitVector() && Subtarget.hasAVX2()) || (RootVT.is512BitVector() && Subtarget.hasBWI()))) { @@ -27970,7 +27974,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, continue; } M = Ratio * M + i % Ratio; - assert ((M / 16) == (i / 16) && "Lane crossing detected"); + assert((M / 16) == (i / 16) && "Lane crossing detected"); PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8)); } MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes); @@ -27986,8 +27990,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, // With XOP, if we have a 128-bit binary input shuffle we can always combine // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never // slower than PSHUFB on targets that support both. - if ((Depth >= 3 || HasVariableMask) && RootVT.is128BitVector() && - Subtarget.hasXOP()) { + if (AllowVariableMask && RootVT.is128BitVector() && Subtarget.hasXOP()) { // VPPERM Mask Operation // Bits[4:0] - Byte Index (0 - 31) // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO) |

