summaryrefslogtreecommitdiffstats
path: root/llvm/lib/Target/X86/X86ISelLowering.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib/Target/X86/X86ISelLowering.cpp')
-rw-r--r--llvm/lib/Target/X86/X86ISelLowering.cpp124
1 files changed, 82 insertions, 42 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 06808a5e111..927ff2c8101 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -8163,6 +8163,84 @@ static SDValue lowerVectorShuffleAsBroadcast(MVT VT, SDLoc DL, SDValue V,
return DAG.getNode(X86ISD::VBROADCAST, DL, VT, V);
}
+// Check for whether we can use INSERTPS to perform the shuffle. We only use
+// INSERTPS when the V1 elements are already in the correct locations
+// because otherwise we can just always use two SHUFPS instructions which
+// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
+// perform INSERTPS if a single V1 element is out of place and all V2
+// elements are zeroable.
+static SDValue lowerVectorShuffleAsInsertPS(SDValue Op, SDValue V1, SDValue V2,
+ ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
+ assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!");
+ assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
+ assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
+
+ SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+
+ unsigned ZMask = 0;
+ int V1DstIndex = -1;
+ int V2DstIndex = -1;
+ bool V1UsedInPlace = false;
+
+ for (int i = 0; i < 4; i++) {
+ // Synthesize a zero mask from the zeroable elements (includes undefs).
+ if (Zeroable[i]) {
+ ZMask |= 1 << i;
+ continue;
+ }
+
+ // Flag if we use any V1 inputs in place.
+ if (i == Mask[i]) {
+ V1UsedInPlace = true;
+ continue;
+ }
+
+ // We can only insert a single non-zeroable element.
+ if (V1DstIndex != -1 || V2DstIndex != -1)
+ return SDValue();
+
+ if (Mask[i] < 4) {
+ // V1 input out of place for insertion.
+ V1DstIndex = i;
+ } else {
+ // V2 input for insertion.
+ V2DstIndex = i;
+ }
+ }
+
+ // Don't bother if we have no (non-zeroable) element for insertion.
+ if (V1DstIndex == -1 && V2DstIndex == -1)
+ return SDValue();
+
+ // Determine element insertion src/dst indices. The src index is from the
+ // start of the inserted vector, not the start of the concatenated vector.
+ unsigned V2SrcIndex = 0;
+ if (V1DstIndex != -1) {
+ // If we have a V1 input out of place, we use V1 as the V2 element insertion
+ // and don't use the original V2 at all.
+ V2SrcIndex = Mask[V1DstIndex];
+ V2DstIndex = V1DstIndex;
+ V2 = V1;
+ } else {
+ V2SrcIndex = Mask[V2DstIndex] - 4;
+ }
+
+ // If no V1 inputs are used in place, then the result is created only from
+ // the zero mask and the V2 insertion - so remove V1 dependency.
+ if (!V1UsedInPlace)
+ V1 = DAG.getUNDEF(MVT::v4f32);
+
+ unsigned InsertPSMask = V2SrcIndex << 6 | V2DstIndex << 4 | ZMask;
+ assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
+
+ // Insert the V2 element into the desired position.
+ SDLoc DL(Op);
+ return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
+ DAG.getConstant(InsertPSMask, MVT::i8));
+}
+
/// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
///
/// This is the basis function for the 2-lane 64-bit shuffles as we have full
@@ -8468,52 +8546,14 @@ static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
Mask, Subtarget, DAG))
return V;
- if (Subtarget->hasSSE41())
+ if (Subtarget->hasSSE41()) {
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
Subtarget, DAG))
return Blend;
- // Check for whether we can use INSERTPS to perform the blend. We only use
- // INSERTPS when the V1 elements are already in the correct locations
- // because otherwise we can just always use two SHUFPS instructions which
- // are much smaller to encode than a SHUFPS and an INSERTPS.
- if (NumV2Elements == 1 && Subtarget->hasSSE41()) {
- int V2Index =
- std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) -
- Mask.begin();
-
- // When using INSERTPS we can zero any lane of the destination. Collect
- // the zero inputs into a mask and drop them from the lanes of V1 which
- // actually need to be present as inputs to the INSERTPS.
- SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
-
- // Synthesize a shuffle mask for the non-zero and non-v2 inputs.
- bool InsertNeedsShuffle = false;
- unsigned ZMask = 0;
- for (int i = 0; i < 4; ++i)
- if (i != V2Index) {
- if (Zeroable[i]) {
- ZMask |= 1 << i;
- } else if (Mask[i] != i) {
- InsertNeedsShuffle = true;
- break;
- }
- }
-
- // We don't want to use INSERTPS or other insertion techniques if it will
- // require shuffling anyways.
- if (!InsertNeedsShuffle) {
- // If all of V1 is zeroable, replace it with undef.
- if ((ZMask | 1 << V2Index) == 0xF)
- V1 = DAG.getUNDEF(MVT::v4f32);
-
- unsigned InsertPSMask = (Mask[V2Index] - 4) << 6 | V2Index << 4 | ZMask;
- assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
-
- // Insert the V2 element into the desired position.
- return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
- DAG.getConstant(InsertPSMask, MVT::i8));
- }
+ // Use INSERTPS if we can complete the shuffle efficiently.
+ if (SDValue V = lowerVectorShuffleAsInsertPS(Op, V1, V2, Mask, DAG))
+ return V;
}
// Otherwise fall back to a SHUFPS lowering strategy.
OpenPOWER on IntegriCloud