Diffstat (limited to 'llvm/lib/Target/X86/X86ISelLowering.cpp')
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp | 140
1 file changed, 95 insertions(+), 45 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index f85b00a84f8..5b0d315a6d0 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -7488,6 +7488,81 @@ static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
getV4X86ShuffleImm8ForMask(NewMask, DAG));
}
+static SDValue lowerIntegerElementInsertionVectorShuffle(
+ MVT VT, SDLoc DL, SDValue V1, SDValue V2, ArrayRef<int> Mask,
+ const X86Subtarget *Subtarget, SelectionDAG &DAG) {
+ int V2Index = std::find_if(Mask.begin(), Mask.end(),
+ [&Mask](int M) { return M >= (int)Mask.size(); }) -
+ Mask.begin();
+
+ // Check for a single input from a SCALAR_TO_VECTOR node.
+ // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
+ // all the smarts here sunk into that routine. However, the current
+ // lowering of BUILD_VECTOR makes that nearly impossible until the old
+ // vector shuffle lowering is dead.
+ if ((Mask[V2Index] == (int)Mask.size() &&
+ V2.getOpcode() == ISD::SCALAR_TO_VECTOR) ||
+ V2.getOpcode() == ISD::BUILD_VECTOR) {
+ SDValue V2S = V2.getOperand(Mask[V2Index] - Mask.size());
+
+ bool V1IsAllZero = false;
+ if (ISD::isBuildVectorAllZeros(V1.getNode())) {
+ V1IsAllZero = true;
+ } else if (V1.getOpcode() == ISD::BUILD_VECTOR) {
+ V1IsAllZero = true;
+ for (int M : Mask) {
+ if (M < 0 || M >= (int)Mask.size())
+ continue;
+ SDValue Input = V1.getOperand(M);
+ if (Input.getOpcode() != ISD::UNDEF && !X86::isZeroNode(Input)) {
+ // A non-zero input!
+ V1IsAllZero = false;
+ break;
+ }
+ }
+ }
+ if (V1IsAllZero) {
+ // First, we need to zext the scalar if it is smaller than an i32.
+ MVT EltVT = VT.getVectorElementType();
+ assert(EltVT == V2S.getSimpleValueType() &&
+ "Different scalar and element types!");
+ MVT ExtVT = VT;
+ if (EltVT == MVT::i8 || EltVT == MVT::i16) {
+ // Zero-extend directly to i32.
+ ExtVT = MVT::v4i32;
+ V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
+ }
+
+ V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT,
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S));
+ if (ExtVT != VT)
+ V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2);
+
+ if (V2Index != 0) {
+ // If we have 4 or fewer lanes we can cheaply shuffle the element into
+ // the desired position. Otherwise it is more efficient to do a vector
+ // shift left. We know that we can do a vector shift left because all
+ // the inputs are zero.
+ if (VT.getVectorNumElements() <= 4) {
+ SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
+ V2Shuffle[V2Index] = 0;
+ V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
+ } else {
+ V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, V2);
+ V2 = DAG.getNode(
+ X86ISD::VSHLDQ, DL, MVT::v2i64, V2,
+ DAG.getConstant(
+ V2Index * EltVT.getSizeInBits(),
+ DAG.getTargetLoweringInfo().getScalarShiftAmountTy(MVT::v2i64)));
+ V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2);
+ }
+ }
+ return V2;
+ }
+ }
+ return SDValue();
+}
+
/// \brief Lower 4-lane i32 vector shuffles.
///
/// We try to handle these with integer-domain shuffles where we can, but for
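For reference, the mask analysis the new helper performs can be sketched standalone, outside SelectionDAG. In the mask convention used here, entries 0..Size-1 read from V1, entries Size..2*Size-1 read from V2, and -1 is undef; the helper wants exactly one V2 entry, with every used V1 lane known zero. A minimal sketch (the helper name and driver are illustrative, not LLVM's):

#include <algorithm>
#include <cassert>
#include <cstdio>
#include <vector>

// Position of the first mask entry that reads from V2, mirroring the
// std::find_if computation of V2Index in the patch.
static int findV2Index(const std::vector<int> &Mask) {
  int Size = static_cast<int>(Mask.size());
  return static_cast<int>(std::find_if(Mask.begin(), Mask.end(),
                                       [Size](int M) { return M >= Size; }) -
                          Mask.begin());
}

int main() {
  // A v4i32-style shuffle placing V2's lane 0 into result lane 2.
  std::vector<int> Mask = {0, 1, 4, 3};
  int V2Index = findV2Index(Mask);
  // The SCALAR_TO_VECTOR fast path additionally requires that the V2 entry
  // be lane 0 of V2, i.e. Mask[V2Index] == Size.
  assert(Mask[V2Index] == static_cast<int>(Mask.size()));
  std::printf("V2 element lands in result lane %d\n", V2Index); // lane 2
  return 0;
}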
@@ -7519,50 +7594,10 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V1, V2);
// There are special ways we can lower some single-element blends.
- if (NumV2Elements == 1) {
- int V2Index =
- std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) -
- Mask.begin();
-
- // Check for a single input from a SCALAR_TO_VECTOR node.
- // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
- // all the smarts here sunk into that routine. However, the current
- // lowering of BUILD_VECTOR makes that nearly impossible until the old
- // vector shuffle lowering is dead.
- if ((Mask[V2Index] == 4 && V2.getOpcode() == ISD::SCALAR_TO_VECTOR) ||
- V2.getOpcode() == ISD::BUILD_VECTOR) {
- SDValue V2S = V2.getOperand(Mask[V2Index] - 4);
-
- bool V1IsAllZero = false;
- if (ISD::isBuildVectorAllZeros(V1.getNode())) {
- V1IsAllZero = true;
- } else if (V1.getOpcode() == ISD::BUILD_VECTOR) {
- V1IsAllZero = true;
- for (int M : Mask) {
- if (M < 0 || M >= 4)
- continue;
- SDValue Input = V1.getOperand(M);
- if (Input.getOpcode() != ISD::UNDEF && !X86::isZeroNode(Input)) {
- // A non-zero input!
- V1IsAllZero = false;
- break;
- }
- }
- }
- if (V1IsAllZero) {
- V2 = DAG.getNode(
- X86ISD::VZEXT_MOVL, DL, MVT::v4i32,
- DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, V2S));
- if (V2Index != 0) {
- int V2Shuffle[] = {1, 1, 1, 1};
- V2Shuffle[V2Index] = 0;
- V2 = DAG.getVectorShuffle(MVT::v4i32, DL, V2,
- DAG.getUNDEF(MVT::v4i32), V2Shuffle);
- }
- return V2;
- }
- }
- }
+ if (NumV2Elements == 1)
+ if (SDValue V = lowerIntegerElementInsertionVectorShuffle(
+ MVT::v4i32, DL, V1, V2, Mask, Subtarget, DAG))
+ return V;
// We implement this with SHUFPS because it can blend from two vectors.
// Because we're going to eventually use SHUFPS, we use SHUFPS even to build
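Concretely, the pattern the v4i32 path now recognizes corresponds to a MOVD followed, for a non-zero insertion lane, by a PSHUFD. A hedged illustration with SSE2 intrinsics (the lowering itself runs on the DAG, not via intrinsics):

#include <immintrin.h>
#include <cstdio>

int main() {
  int x = 42;
  // VZEXT_MOVL of SCALAR_TO_VECTOR: <x, 0, 0, 0>, i.e. the V2Index == 0 case.
  __m128i v = _mm_cvtsi32_si128(x);
  // The V2Index != 0 case shuffles the element into place; this matches the
  // patch's V2Shuffle of {1, 1, 0, 1} for V2Index == 2 (lane 1 is known zero).
  __m128i r = _mm_shuffle_epi32(v, _MM_SHUFFLE(1, 0, 1, 1));
  int out[4];
  _mm_storeu_si128(reinterpret_cast<__m128i *>(out), r);
  std::printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]); // 0 0 42 0
  return 0;
}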
@@ -8210,6 +8245,12 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
assert(NumV1Inputs > 0 && "All single-input shuffles should be canonicalized "
"to be V1-input shuffles.");
+ // There are special ways we can lower some single-element blends.
+ if (NumV2Inputs == 1)
+ if (SDValue V = lowerIntegerElementInsertionVectorShuffle(
+ MVT::v8i16, DL, V1, V2, Mask, Subtarget, DAG))
+ return V;
+
if (NumV1Inputs + NumV2Inputs <= 4)
return lowerV8I16BasicBlendVectorShuffle(DL, V1, V2, Mask, Subtarget, DAG);
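For v8i16 the helper's other half kicks in: an i16 scalar is first zero-extended to i32 and inserted with MOVD, and because the vector has more than four lanes the element is then moved into place with a whole-vector byte shift (PSLLDQ) of V2Index * 16 bits rather than a shuffle. A hedged intrinsics sketch of the resulting sequence for V2Index == 5:

#include <immintrin.h>
#include <cstdio>

int main() {
  unsigned short s = 0xBEEF;
  // Zero-extend i16 -> i32, then MOVD into lane 0 of a zero vector.
  __m128i v = _mm_cvtsi32_si128(static_cast<int>(s));
  // PSLLDQ shifts whole bytes: lane 5 of a v8i16 is 10 bytes (5 * 16 bits).
  __m128i r = _mm_slli_si128(v, 10);
  unsigned short out[8];
  _mm_storeu_si128(reinterpret_cast<__m128i *>(out), r);
  for (int i = 0; i < 8; ++i)
    std::printf("%04x ", out[i]); // 0000 for lanes 0-4, beef in lane 5
  std::printf("\n");
  return 0;
}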
@@ -8347,8 +8388,11 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
MutableArrayRef<int> LoMask = Mask.slice(0, 8);
MutableArrayRef<int> HiMask = Mask.slice(8, 8);
+ int NumV2Elements =
+ std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 16; });
+
// For single-input shuffles, there are some nicer lowering tricks we can use.
- if (isSingleInputShuffleMask(Mask)) {
+ if (NumV2Elements == 0) {
// Check whether we can widen this to an i16 shuffle by duplicating bytes.
// Notably, this handles splat and partial-splat shuffles more efficiently.
// However, it only makes sense if the pre-duplication shuffle simplifies
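The guard change from isSingleInputShuffleMask(Mask) to NumV2Elements == 0 is behavior-preserving; counting up front simply lets the same value feed the NumV2Elements == 1 insertion case added further down. A trivial standalone sketch of the count, assuming the v16i8 convention that entries >= 16 read from V2:

#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  // One lane (result lane 4) comes from V2; everything else from V1.
  std::vector<int> Mask = {0, 1,  2,  3, 16,  5,  6,  7,
                           8, 9, 10, 11, 12, 13, 14, 15};
  int NumV2Elements = static_cast<int>(std::count_if(
      Mask.begin(), Mask.end(), [](int M) { return M >= 16; }));
  std::printf("NumV2Elements = %d\n", NumV2Elements); // 1 -> insertion path
  return 0;
}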
@@ -8495,6 +8539,12 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
return DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
}
+ // There are special ways we can lower some single-element blends.
+ if (NumV2Elements == 1)
+ if (SDValue V = lowerIntegerElementInsertionVectorShuffle(
+ MVT::v16i8, DL, V1, V2, Mask, Subtarget, DAG))
+ return V;
+
// Check whether a compaction lowering can be done. This handles shuffles
// which take every Nth element for some even N. See the helper function for
// details.
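The "compaction" referred to here is a mask that takes every Nth element for some even N, as in a truncation from wider lanes. A standalone sketch of such a mask, with an illustrative checker (the name and exact criteria are mine, not the helper's):

#include <cstdio>
#include <vector>

// True if every defined lane i selects element i * N, i.e. the shuffle keeps
// every Nth element and leaves the remaining lanes undef (-1).
static bool isEveryNthElementMask(const std::vector<int> &Mask, int N) {
  for (int i = 0, e = static_cast<int>(Mask.size()); i != e; ++i)
    if (Mask[i] >= 0 && Mask[i] != i * N)
      return false;
  return true;
}

int main() {
  // v16i8 mask compacting every 2nd byte into the low half.
  std::vector<int> Mask = {0,  2,  4,  6,  8,  10, 12, 14,
                           -1, -1, -1, -1, -1, -1, -1, -1};
  std::printf("every-2nd-element mask: %s\n",
              isEveryNthElementMask(Mask, 2) ? "yes" : "no");
  return 0;
}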