diff options
| author | Simon Pilgrim <llvm-dev@redking.me.uk> | 2016-08-05 17:36:14 +0000 | 
|---|---|---|
| committer | Simon Pilgrim <llvm-dev@redking.me.uk> | 2016-08-05 17:36:14 +0000 | 
| commit | 69b6a7083466d8d2fd7ae39782ebb6e34f73b50e (patch) | |
| tree | ff815f959f79f3dc9440f54722ebfd7d43c3fab3 /llvm | |
| parent | eea45bc4781686fa99e72eeb1fc0a6d7d6d700b8 (diff) | |
| download | bcm5719-llvm-69b6a7083466d8d2fd7ae39782ebb6e34f73b50e.tar.gz bcm5719-llvm-69b6a7083466d8d2fd7ae39782ebb6e34f73b50e.zip  | |
[X86][SSE] Add initial support for 2 input target shuffle combining.
At the moment only the INSERTPS matching can actually use 2 inputs, but the plumbing is now in place.
llvm-svn: 277839
Diffstat (limited to 'llvm')
| -rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.cpp | 120 | ||||
| -rw-r--r-- | llvm/test/CodeGen/X86/buildvec-insertvec.ll | 11 | ||||
| -rw-r--r-- | llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll | 6 | 
3 files changed, 71 insertions, 66 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 4d1dbe13293..baf6d06da7d 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -25077,20 +25077,26 @@ static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,  /// into either a single instruction if there is a special purpose instruction  /// for this operation, or into a PSHUFB instruction which is a fully general  /// instruction but should only be used to replace chains over a certain depth. -static bool combineX86ShuffleChain(SDValue Input, SDValue Root, +static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,                                     ArrayRef<int> BaseMask, int Depth,                                     bool HasVariableMask, SelectionDAG &DAG,                                     TargetLowering::DAGCombinerInfo &DCI,                                     const X86Subtarget &Subtarget) {    assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!"); +  assert((Inputs.size() == 1 || Inputs.size() == 2) && +         "Unexpected number of shuffle inputs!"); -  // Find the operand that enters the chain. Note that multiple uses are OK -  // here, we're not going to remove the operand we find. -  Input = peekThroughBitcasts(Input); +  // Find the inputs that enter the chain. Note that multiple uses are OK +  // here, we're not going to remove the operands we find. +  bool UnaryShuffle = (Inputs.size() == 1); +  SDValue V1 = peekThroughBitcasts(Inputs[0]); +  SDValue V2 = (UnaryShuffle ? 
V1 : peekThroughBitcasts(Inputs[1])); -  MVT VT = Input.getSimpleValueType(); +  MVT VT1 = V1.getSimpleValueType(); +  MVT VT2 = V2.getSimpleValueType();    MVT RootVT = Root.getSimpleValueType(); -  assert(VT.getSizeInBits() == RootVT.getSizeInBits() && +  assert(VT1.getSizeInBits() == RootVT.getSizeInBits() && +         VT2.getSizeInBits() == RootVT.getSizeInBits() &&           "Vector size mismatch");    SDLoc DL(Root); @@ -25099,14 +25105,14 @@ static bool combineX86ShuffleChain(SDValue Input, SDValue Root,    unsigned NumBaseMaskElts = BaseMask.size();    if (NumBaseMaskElts == 1) {      assert(BaseMask[0] == 0 && "Invalid shuffle index found!"); -    DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Input), +    DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, V1),                    /*AddTo*/ true);      return true;    }    unsigned RootSizeInBits = RootVT.getSizeInBits();    unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts; -  bool FloatDomain = VT.isFloatingPoint() || +  bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||                       (RootVT.is256BitVector() && !Subtarget.hasAVX2());    // Don't combine if we are a AVX512/EVEX target and the mask element size @@ -25124,7 +25130,8 @@ static bool combineX86ShuffleChain(SDValue Input, SDValue Root,    // TODO - handle 128/256-bit lane shuffles of 512-bit vectors.    // Handle 128-bit lane shuffles of 256-bit vectors. -  if (RootVT.is256BitVector() && NumBaseMaskElts == 2 && +  // TODO - this should support binary shuffles. +  if (UnaryShuffle && RootVT.is256BitVector() && NumBaseMaskElts == 2 &&        !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) {      if (Depth == 1 && Root.getOpcode() == X86ISD::VPERM2X128)        return false; // Nothing to do! @@ -25133,7 +25140,7 @@ static bool combineX86ShuffleChain(SDValue Input, SDValue Root,      PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);      PermMask |= ((BaseMask[1] < 0 ? 
0x8 : (BaseMask[1] & 1)) << 4); -    Res = DAG.getBitcast(ShuffleVT, Input); +    Res = DAG.getBitcast(ShuffleVT, V1);      DCI.AddToWorklist(Res.getNode());      Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res,                        DAG.getUNDEF(ShuffleVT), @@ -25168,45 +25175,47 @@ static bool combineX86ShuffleChain(SDValue Input, SDValue Root,    MVT ShuffleVT;    unsigned Shuffle, PermuteImm; -  if (matchUnaryVectorShuffle(MaskVT, Mask, Subtarget, Shuffle, ShuffleVT)) { -    if (Depth == 1 && Root.getOpcode() == Shuffle) -      return false; // Nothing to do! -    Res = DAG.getBitcast(ShuffleVT, Input); -    DCI.AddToWorklist(Res.getNode()); -    Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res); -    DCI.AddToWorklist(Res.getNode()); -    DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res), -                  /*AddTo*/ true); -    return true; -  } +  if (UnaryShuffle) { +    if (matchUnaryVectorShuffle(MaskVT, Mask, Subtarget, Shuffle, ShuffleVT)) { +      if (Depth == 1 && Root.getOpcode() == Shuffle) +        return false; // Nothing to do! +      Res = DAG.getBitcast(ShuffleVT, V1); +      DCI.AddToWorklist(Res.getNode()); +      Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res); +      DCI.AddToWorklist(Res.getNode()); +      DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res), +                    /*AddTo*/ true); +      return true; +    } -  if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, Subtarget, Shuffle, ShuffleVT, -                                     PermuteImm)) { -    if (Depth == 1 && Root.getOpcode() == Shuffle) -      return false; // Nothing to do! 
-    Res = DAG.getBitcast(ShuffleVT, Input); -    DCI.AddToWorklist(Res.getNode()); -    Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res, -                      DAG.getConstant(PermuteImm, DL, MVT::i8)); -    DCI.AddToWorklist(Res.getNode()); -    DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res), -                  /*AddTo*/ true); -    return true; -  } +    if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, Subtarget, Shuffle, ShuffleVT, +                                       PermuteImm)) { +      if (Depth == 1 && Root.getOpcode() == Shuffle) +        return false; // Nothing to do! +      Res = DAG.getBitcast(ShuffleVT, V1); +      DCI.AddToWorklist(Res.getNode()); +      Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res, +                        DAG.getConstant(PermuteImm, DL, MVT::i8)); +      DCI.AddToWorklist(Res.getNode()); +      DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res), +                    /*AddTo*/ true); +      return true; +    } -  if (matchBinaryVectorShuffle(MaskVT, Mask, Shuffle, ShuffleVT)) { -    if (Depth == 1 && Root.getOpcode() == Shuffle) -      return false; // Nothing to do! -    Res = DAG.getBitcast(ShuffleVT, Input); -    DCI.AddToWorklist(Res.getNode()); -    Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res, Res); -    DCI.AddToWorklist(Res.getNode()); -    DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res), -                  /*AddTo*/ true); -    return true; +    // TODO - this should support binary shuffles. +    if (matchBinaryVectorShuffle(MaskVT, Mask, Shuffle, ShuffleVT)) { +      if (Depth == 1 && Root.getOpcode() == Shuffle) +        return false; // Nothing to do! 
+      Res = DAG.getBitcast(ShuffleVT, V1); +      DCI.AddToWorklist(Res.getNode()); +      Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res, Res); +      DCI.AddToWorklist(Res.getNode()); +      DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res), +                    /*AddTo*/ true); +      return true; +    }    } -  SDValue V1 = Input, V2 = Input;    if (matchBinaryPermuteVectorShuffle(MaskVT, Mask, V1, V2, DL, DAG, Subtarget,                                        Shuffle, ShuffleVT, PermuteImm)) {      if (Depth == 1 && Root.getOpcode() == Shuffle) @@ -25237,7 +25246,7 @@ static bool combineX86ShuffleChain(SDValue Input, SDValue Root,    // If we have a single input shuffle with different shuffle patterns in the    // the 128-bit lanes use the variable mask to VPERMILPS.    // TODO Combine other mask types at higher depths. -  if (HasVariableMask && !MaskContainsZeros && +  if (UnaryShuffle && HasVariableMask && !MaskContainsZeros &&        ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||         (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {      SmallVector<SDValue, 16> VPermIdx; @@ -25249,7 +25258,7 @@ static bool combineX86ShuffleChain(SDValue Input, SDValue Root,      MVT VPermMaskVT = MVT::getVectorVT(MVT::i32, NumMaskElts);      SDValue VPermMask = DAG.getBuildVector(VPermMaskVT, DL, VPermIdx);      DCI.AddToWorklist(VPermMask.getNode()); -    Res = DAG.getBitcast(MaskVT, Input); +    Res = DAG.getBitcast(MaskVT, V1);      DCI.AddToWorklist(Res.getNode());      Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);      DCI.AddToWorklist(Res.getNode()); @@ -25263,7 +25272,7 @@ static bool combineX86ShuffleChain(SDValue Input, SDValue Root,    // Intel's manuals suggest only using PSHUFB if doing so replacing 5    // instructions, but in practice PSHUFB tends to be *very* fast so we're    // more aggressive. 
-  if ((Depth >= 3 || HasVariableMask) && +  if (UnaryShuffle && (Depth >= 3 || HasVariableMask) &&        ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||         (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||         (RootVT.is512BitVector() && Subtarget.hasBWI()))) { @@ -25285,7 +25294,7 @@ static bool combineX86ShuffleChain(SDValue Input, SDValue Root,        PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));      }      MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes); -    Res = DAG.getBitcast(ByteVT, Input); +    Res = DAG.getBitcast(ByteVT, V1);      DCI.AddToWorklist(Res.getNode());      SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);      DCI.AddToWorklist(PSHUFBMaskOp.getNode()); @@ -25486,8 +25495,8 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,                                          HasVariableMask, DAG, DCI, Subtarget))          return true; -  // At the moment we can only combine unary shuffle mask cases. -  if (Ops.size() != 1) +  // We can only combine unary and binary shuffle mask cases. +  if (Ops.size() > 2)      return false;    // Minor canonicalization of the accumulated shuffle mask to make it easier @@ -25500,7 +25509,14 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,      Mask = std::move(WidenedMask);    } -  return combineX86ShuffleChain(Ops[0], Root, Mask, Depth, HasVariableMask, DAG, +  // Canonicalization of binary shuffle masks to improve pattern matching by +  // commuting the inputs. 
+  if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) { +    ShuffleVectorSDNode::commuteMask(Mask); +    std::swap(Ops[0], Ops[1]); +  } + +  return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask, DAG,                                  DCI, Subtarget);  } diff --git a/llvm/test/CodeGen/X86/buildvec-insertvec.ll b/llvm/test/CodeGen/X86/buildvec-insertvec.ll index 27d71ca9262..616d352a75d 100644 --- a/llvm/test/CodeGen/X86/buildvec-insertvec.ll +++ b/llvm/test/CodeGen/X86/buildvec-insertvec.ll @@ -19,20 +19,11 @@ define void @foo(<3 x float> %in, <4 x i8>* nocapture %out) nounwind {  ; Verify that the DAGCombiner doesn't wrongly fold a build_vector into a  ; blend with a zero vector if the build_vector contains negative zero. -; -; TODO: the codegen for function 'test_negative_zero_1' is sub-optimal. -; Ideally, we should generate a single shuffle blend operation.  define <4 x float> @test_negative_zero_1(<4 x float> %A) {  ; CHECK-LABEL: test_negative_zero_1:  ; CHECK:       # BB#0: # %entry -; CHECK-NEXT:    movapd %xmm0, %xmm1 -; CHECK-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[1,0] -; CHECK-NEXT:    xorps %xmm2, %xmm2 -; CHECK-NEXT:    blendps {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] -; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; CHECK-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2],zero  ; CHECK-NEXT:    retq  entry:    %0 = extractelement <4 x float> %A, i32 0 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll index aaf5fa673a1..a1c5a97b4de 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll @@ -1064,14 +1064,12 @@ define <4 x float> @shuffle_v4f32_0zz4(<4 x float> %a, <4 x float> %b) {  ;  ; SSE41-LABEL: shuffle_v4f32_0zz4:  ; SSE41:       # BB#0: -; SSE41-NEXT:    
insertps {{.*#+}} xmm1 = zero,zero,zero,xmm1[0] -; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[0]  ; SSE41-NEXT:    retq  ;  ; AVX-LABEL: shuffle_v4f32_0zz4:  ; AVX:       # BB#0: -; AVX-NEXT:    vinsertps {{.*#+}} xmm1 = zero,zero,zero,xmm1[0] -; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[0]  ; AVX-NEXT:    retq    %shuffle = shufflevector <4 x float> %b, <4 x float> zeroinitializer, <4 x i32> <i32 undef, i32 5, i32 6, i32 0>    %shuffle1 = shufflevector <4 x float> %a, <4 x float> %shuffle, <4 x i32> <i32 0, i32 5, i32 6, i32 7>  | 

