diff options
Diffstat (limited to 'llvm/lib/Target/X86/X86ISelLowering.cpp')
-rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.cpp | 124 |
1 files changed, 82 insertions, 42 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 06808a5e111..927ff2c8101 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -8163,6 +8163,84 @@ static SDValue lowerVectorShuffleAsBroadcast(MVT VT, SDLoc DL, SDValue V, return DAG.getNode(X86ISD::VBROADCAST, DL, VT, V); } +// Check for whether we can use INSERTPS to perform the shuffle. We only use +// INSERTPS when the V1 elements are already in the correct locations +// because otherwise we can just always use two SHUFPS instructions which +// are much smaller to encode than a SHUFPS and an INSERTPS. We can also +// perform INSERTPS if a single V1 element is out of place and all V2 +// elements are zeroable. +static SDValue lowerVectorShuffleAsInsertPS(SDValue Op, SDValue V1, SDValue V2, + ArrayRef<int> Mask, + SelectionDAG &DAG) { + assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!"); + assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); + assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); + + SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); + + unsigned ZMask = 0; + int V1DstIndex = -1; + int V2DstIndex = -1; + bool V1UsedInPlace = false; + + for (int i = 0; i < 4; i++) { + // Synthesize a zero mask from the zeroable elements (includes undefs). + if (Zeroable[i]) { + ZMask |= 1 << i; + continue; + } + + // Flag if we use any V1 inputs in place. + if (i == Mask[i]) { + V1UsedInPlace = true; + continue; + } + + // We can only insert a single non-zeroable element. + if (V1DstIndex != -1 || V2DstIndex != -1) + return SDValue(); + + if (Mask[i] < 4) { + // V1 input out of place for insertion. + V1DstIndex = i; + } else { + // V2 input for insertion. + V2DstIndex = i; + } + } + + // Don't bother if we have no (non-zeroable) element for insertion. + if (V1DstIndex == -1 && V2DstIndex == -1) + return SDValue(); + + // Determine element insertion src/dst indices. The src index is from the + // start of the inserted vector, not the start of the concatenated vector. + unsigned V2SrcIndex = 0; + if (V1DstIndex != -1) { + // If we have a V1 input out of place, we use V1 as the V2 element insertion + // and don't use the original V2 at all. + V2SrcIndex = Mask[V1DstIndex]; + V2DstIndex = V1DstIndex; + V2 = V1; + } else { + V2SrcIndex = Mask[V2DstIndex] - 4; + } + + // If no V1 inputs are used in place, then the result is created only from + // the zero mask and the V2 insertion - so remove V1 dependency. + if (!V1UsedInPlace) + V1 = DAG.getUNDEF(MVT::v4f32); + + unsigned InsertPSMask = V2SrcIndex << 6 | V2DstIndex << 4 | ZMask; + assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!"); + + // Insert the V2 element into the desired position. + SDLoc DL(Op); + return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2, + DAG.getConstant(InsertPSMask, MVT::i8)); +} + /// \brief Handle lowering of 2-lane 64-bit floating point shuffles. /// /// This is the basis function for the 2-lane 64-bit shuffles as we have full @@ -8468,52 +8546,14 @@ static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, Mask, Subtarget, DAG)) return V; - if (Subtarget->hasSSE41()) + if (Subtarget->hasSSE41()) { if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask, Subtarget, DAG)) return Blend; - // Check for whether we can use INSERTPS to perform the blend. We only use - // INSERTPS when the V1 elements are already in the correct locations - // because otherwise we can just always use two SHUFPS instructions which - // are much smaller to encode than a SHUFPS and an INSERTPS. - if (NumV2Elements == 1 && Subtarget->hasSSE41()) { - int V2Index = - std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) - - Mask.begin(); - - // When using INSERTPS we can zero any lane of the destination. Collect - // the zero inputs into a mask and drop them from the lanes of V1 which - // actually need to be present as inputs to the INSERTPS. - SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); - - // Synthesize a shuffle mask for the non-zero and non-v2 inputs. - bool InsertNeedsShuffle = false; - unsigned ZMask = 0; - for (int i = 0; i < 4; ++i) - if (i != V2Index) { - if (Zeroable[i]) { - ZMask |= 1 << i; - } else if (Mask[i] != i) { - InsertNeedsShuffle = true; - break; - } - } - - // We don't want to use INSERTPS or other insertion techniques if it will - // require shuffling anyways. - if (!InsertNeedsShuffle) { - // If all of V1 is zeroable, replace it with undef. - if ((ZMask | 1 << V2Index) == 0xF) - V1 = DAG.getUNDEF(MVT::v4f32); - - unsigned InsertPSMask = (Mask[V2Index] - 4) << 6 | V2Index << 4 | ZMask; - assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!"); - - // Insert the V2 element into the desired position. - return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2, - DAG.getConstant(InsertPSMask, MVT::i8)); - } + // Use INSERTPS if we can complete the shuffle efficiently. + if (SDValue V = lowerVectorShuffleAsInsertPS(Op, V1, V2, Mask, DAG)) + return V; } // Otherwise fall back to a SHUFPS lowering strategy. |