diff options
| author | Simon Pilgrim <llvm-dev@redking.me.uk> | 2016-07-09 21:47:55 +0000 |
|---|---|---|
| committer | Simon Pilgrim <llvm-dev@redking.me.uk> | 2016-07-09 21:47:55 +0000 |
| commit | 606126e848b256fb9d0a91371b15e5ec024a63f8 (patch) | |
| tree | 1bba188a65a70bf44414ac7003b856d89d2759e6 /llvm/lib | |
| parent | bcb8190f9947c0fb573dce679447bdb05b4e4ae3 (diff) | |
| download | bcm5719-llvm-606126e848b256fb9d0a91371b15e5ec024a63f8.tar.gz bcm5719-llvm-606126e848b256fb9d0a91371b15e5ec024a63f8.zip | |
[X86][SSE] Add support for target shuffle combining to INSERTPS
llvm-svn: 274990
Diffstat (limited to 'llvm/lib')
| -rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.cpp | 58 |
1 files changed, 48 insertions, 10 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 4e3a85a70c6..83e910d4f14 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -8677,16 +8677,14 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT, // are much smaller to encode than a SHUFPS and an INSERTPS. We can also // perform INSERTPS if a single V1 element is out of place and all V2 // elements are zeroable. -static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, - SDValue V1, SDValue V2, - ArrayRef<int> Mask, - SelectionDAG &DAG) { +static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2, + unsigned &InsertPSMask, + const SmallBitVector &Zeroable, + ArrayRef<int> Mask, + SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); - - SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); - unsigned ZMask = 0; int V1DstIndex = -1; int V2DstIndex = -1; @@ -8707,7 +8705,7 @@ static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, // We can only insert a single non-zeroable element. if (V1DstIndex >= 0 || V2DstIndex >= 0) - return SDValue(); + return false; if (Mask[i] < 4) { // V1 input out of place for insertion. @@ -8720,7 +8718,7 @@ static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, // Don't bother if we have no (non-zeroable) element for insertion. if (V1DstIndex < 0 && V2DstIndex < 0) - return SDValue(); + return false; // Determine element insertion src/dst indices. The src index is from the // start of the inserted vector, not the start of the concatenated vector. @@ -8740,8 +8738,21 @@ static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, if (!V1UsedInPlace) V1 = DAG.getUNDEF(MVT::v4f32); - unsigned InsertPSMask = V2SrcIndex << 6 | V2DstIndex << 4 | ZMask; + // Insert the V2 element into the desired position. + InsertPSMask = V2SrcIndex << 6 | V2DstIndex << 4 | ZMask; assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!"); + return true; +} + +static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + SelectionDAG &DAG) { + SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); + + // Attempt to match the insertps pattern. + unsigned InsertPSMask; + if (!matchVectorShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG)) + return SDValue(); // Insert the V2 element into the desired position. return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2, @@ -25081,6 +25092,33 @@ static bool combineX86ShuffleChain(SDValue Input, SDValue Root, } } + // Attempt to combine to INSERTPS. + if (Subtarget.hasSSE41() && NumMaskElts == 4 && + (VT == MVT::v2f64 || VT == MVT::v4f32)) { + SmallBitVector Zeroable(4, false); + for (unsigned i = 0; i != NumMaskElts; ++i) + if (Mask[i] < 0) + Zeroable[i] = true; + + unsigned InsertPSMask; + SDValue V1 = Input, V2 = Input; + if (Zeroable.any() && matchVectorShuffleAsInsertPS(V1, V2, InsertPSMask, + Zeroable, Mask, DAG)) { + if (Depth == 1 && Root.getOpcode() == X86ISD::INSERTPS) + return false; // Nothing to do! + V1 = DAG.getBitcast(MVT::v4f32, V1); + DCI.AddToWorklist(V1.getNode()); + V2 = DAG.getBitcast(MVT::v4f32, V2); + DCI.AddToWorklist(V2.getNode()); + Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2, + DAG.getConstant(InsertPSMask, DL, MVT::i8)); + DCI.AddToWorklist(Res.getNode()); + DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res), + /*AddTo*/ true); + return true; + } + } + // Don't try to re-form single instruction chains under any circumstances now // that we've done encoding canonicalization for them. if (Depth < 2) |

