Diffstat (limited to 'llvm/lib/Target/X86/X86ISelLowering.cpp')
 llvm/lib/Target/X86/X86ISelLowering.cpp | 140
 1 file changed, 95 insertions(+), 45 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index f85b00a84f8..5b0d315a6d0 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -7488,6 +7488,81 @@ static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                      getV4X86ShuffleImm8ForMask(NewMask, DAG));
 }
 
+static SDValue lowerIntegerElementInsertionVectorShuffle(
+    MVT VT, SDLoc DL, SDValue V1, SDValue V2, ArrayRef<int> Mask,
+    const X86Subtarget *Subtarget, SelectionDAG &DAG) {
+  int V2Index = std::find_if(Mask.begin(), Mask.end(),
+                             [&Mask](int M) { return M >= (int)Mask.size(); }) -
+                Mask.begin();
+
+  // Check for a single input from a SCALAR_TO_VECTOR node.
+  // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
+  // all the smarts here sunk into that routine. However, the current
+  // lowering of BUILD_VECTOR makes that nearly impossible until the old
+  // vector shuffle lowering is dead.
+  if ((Mask[V2Index] == (int)Mask.size() &&
+       V2.getOpcode() == ISD::SCALAR_TO_VECTOR) ||
+      V2.getOpcode() == ISD::BUILD_VECTOR) {
+    SDValue V2S = V2.getOperand(Mask[V2Index] - Mask.size());
+
+    bool V1IsAllZero = false;
+    if (ISD::isBuildVectorAllZeros(V1.getNode())) {
+      V1IsAllZero = true;
+    } else if (V1.getOpcode() == ISD::BUILD_VECTOR) {
+      V1IsAllZero = true;
+      for (int M : Mask) {
+        if (M < 0 || M >= (int)Mask.size())
+          continue;
+        SDValue Input = V1.getOperand(M);
+        if (Input.getOpcode() != ISD::UNDEF && !X86::isZeroNode(Input)) {
+          // A non-zero input!
+          V1IsAllZero = false;
+          break;
+        }
+      }
+    }
+    if (V1IsAllZero) {
+      // First, we need to zext the scalar if it is smaller than an i32.
+      MVT EltVT = VT.getVectorElementType();
+      assert(EltVT == V2S.getSimpleValueType() &&
+             "Different scalar and element types!");
+      MVT ExtVT = VT;
+      if (EltVT == MVT::i8 || EltVT == MVT::i16) {
+        // Zero-extend directly to i32.
+        ExtVT = MVT::v4i32;
+        V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
+      }
+
+      V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT,
+                       DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S));
+      if (ExtVT != VT)
+        V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2);
+
+      if (V2Index != 0) {
+        // If we have 4 or fewer lanes we can cheaply shuffle the element into
+        // the desired position. Otherwise it is more efficient to do a vector
+        // shift left. We know that we can do a vector shift left because all
+        // the inputs are zero.
+        if (VT.getVectorNumElements() <= 4) {
+          SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
+          V2Shuffle[V2Index] = 0;
+          V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
+        } else {
+          V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, V2);
+          V2 = DAG.getNode(
+              X86ISD::VSHLDQ, DL, MVT::v2i64, V2,
+              DAG.getConstant(
+                  V2Index * EltVT.getSizeInBits(),
+                  DAG.getTargetLoweringInfo().getScalarShiftAmountTy(
+                      MVT::v2i64)));
+          V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2);
+        }
+      }
+      return V2;
+    }
+  }
+  return SDValue();
+}
+
 /// \brief Lower 4-lane i32 vector shuffles.
 ///
 /// We try to handle these with integer-domain shuffles where we can, but for
@@ -7519,50 +7594,10 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V1, V2);
 
   // There are special ways we can lower some single-element blends.
-  if (NumV2Elements == 1) {
-    int V2Index =
-        std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) -
-        Mask.begin();
-
-    // Check for a single input from a SCALAR_TO_VECTOR node.
-    // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
-    // all the smarts here sunk into that routine. However, the current
-    // lowering of BUILD_VECTOR makes that nearly impossible until the old
-    // vector shuffle lowering is dead.
-    if ((Mask[V2Index] == 4 && V2.getOpcode() == ISD::SCALAR_TO_VECTOR) ||
-        V2.getOpcode() == ISD::BUILD_VECTOR) {
-      SDValue V2S = V2.getOperand(Mask[V2Index] - 4);
-
-      bool V1IsAllZero = false;
-      if (ISD::isBuildVectorAllZeros(V1.getNode())) {
-        V1IsAllZero = true;
-      } else if (V1.getOpcode() == ISD::BUILD_VECTOR) {
-        V1IsAllZero = true;
-        for (int M : Mask) {
-          if (M < 0 || M >= 4)
-            continue;
-          SDValue Input = V1.getOperand(M);
-          if (Input.getOpcode() != ISD::UNDEF && !X86::isZeroNode(Input)) {
-            // A non-zero input!
-            V1IsAllZero = false;
-            break;
-          }
-        }
-      }
-      if (V1IsAllZero) {
-        V2 = DAG.getNode(
-            X86ISD::VZEXT_MOVL, DL, MVT::v4i32,
-            DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, V2S));
-        if (V2Index != 0) {
-          int V2Shuffle[] = {1, 1, 1, 1};
-          V2Shuffle[V2Index] = 0;
-          V2 = DAG.getVectorShuffle(MVT::v4i32, DL, V2,
-                                    DAG.getUNDEF(MVT::v4i32), V2Shuffle);
-        }
-        return V2;
-      }
-    }
-  }
+  if (NumV2Elements == 1)
+    if (SDValue V = lowerIntegerElementInsertionVectorShuffle(
+            MVT::v4i32, DL, V1, V2, Mask, Subtarget, DAG))
+      return V;
 
   // We implement this with SHUFPS because it can blend from two vectors.
   // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
@@ -8210,6 +8245,12 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   assert(NumV1Inputs > 0 && "All single-input shuffles should be canonicalized "
                             "to be V1-input shuffles.");
 
+  // There are special ways we can lower some single-element blends.
+  if (NumV2Inputs == 1)
+    if (SDValue V = lowerIntegerElementInsertionVectorShuffle(
+            MVT::v8i16, DL, V1, V2, Mask, Subtarget, DAG))
+      return V;
+
   if (NumV1Inputs + NumV2Inputs <= 4)
     return lowerV8I16BasicBlendVectorShuffle(DL, V1, V2, Mask, Subtarget, DAG);
 
@@ -8347,8 +8388,11 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   MutableArrayRef<int> LoMask = Mask.slice(0, 8);
   MutableArrayRef<int> HiMask = Mask.slice(8, 8);
 
+  int NumV2Elements =
+      std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 16; });
+
   // For single-input shuffles, there are some nicer lowering tricks we can use.
-  if (isSingleInputShuffleMask(Mask)) {
+  if (NumV2Elements == 0) {
     // Check whether we can widen this to an i16 shuffle by duplicating bytes.
     // Notably, this handles splat and partial-splat shuffles more efficiently.
     // However, it only makes sense if the pre-duplication shuffle simplifies
@@ -8495,6 +8539,12 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
     return DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
   }
 
+  // There are special ways we can lower some single-element blends.
+  if (NumV2Elements == 1)
+    if (SDValue V = lowerIntegerElementInsertionVectorShuffle(
+            MVT::v16i8, DL, V1, V2, Mask, Subtarget, DAG))
+      return V;
+
   // Check whether a compaction lowering can be done. This handles shuffles
   // which take every Nth element for some even N. See the helper function for
   // details.
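
For illustration, the following is a minimal standalone sketch of the placement decision made by the new lowerIntegerElementInsertionVectorShuffle helper. It is not LLVM code: chooseInsertionPlan, InsertionPlan, and Placement are hypothetical names used only to model the logic. The idea being modeled: the single V2 element is zero-extended into lane 0 via VZEXT_MOVL (a movd/movss-style move that zeros the upper lanes), and if the destination lane is not lane 0, the element is then placed either with one extra shuffle (4 or fewer lanes) or with a whole-vector byte shift left (VSHLDQ, i.e. pslldq), which is sound only because every other lane is already zero.

// Standalone sketch; names below are hypothetical, not LLVM APIs.
#include <algorithm>
#include <cassert>
#include <cstdio>
#include <vector>

enum class Placement { None, Shuffle, ByteShiftLeft };

struct InsertionPlan {
  int V2Index;    // destination lane of the inserted element
  Placement How;  // how the element is moved out of lane 0
  int ShiftBits;  // shift amount when How == ByteShiftLeft
};

// Mask entries in [0, NumElts) select from V1; entries in
// [NumElts, 2*NumElts) select from V2. Exactly one V2 entry is assumed.
InsertionPlan chooseInsertionPlan(const std::vector<int> &Mask, int EltBits) {
  int NumElts = (int)Mask.size();
  int V2Index = (int)(std::find_if(Mask.begin(), Mask.end(),
                                   [&](int M) { return M >= NumElts; }) -
                      Mask.begin());
  assert(V2Index < NumElts && "expected a V2 element in the mask");

  InsertionPlan Plan = {V2Index, Placement::None, 0};
  if (V2Index == 0)
    return Plan; // VZEXT_MOVL already leaves the element in lane 0.
  if (NumElts <= 4) {
    Plan.How = Placement::Shuffle; // one cheap shuffle places the element
  } else {
    // Shift the whole vector left; the lanes shifted in are already zero.
    Plan.How = Placement::ByteShiftLeft;
    Plan.ShiftBits = V2Index * EltBits;
  }
  return Plan;
}

int main() {
  // v8i16 shuffle <0,1,2,8,4,5,6,7>: insert V2[0] into lane 3 of a zero V1.
  std::vector<int> Mask = {0, 1, 2, 8, 4, 5, 6, 7};
  InsertionPlan P = chooseInsertionPlan(Mask, 16);
  const char *How = P.How == Placement::ByteShiftLeft ? "byte shift left"
                    : P.How == Placement::Shuffle     ? "extra shuffle"
                                                      : "none";
  std::printf("insert into lane %d via %s (%d bits)\n", P.V2Index, How,
              P.ShiftBits);
}

Under these assumptions, the v8i16 example prints "insert into lane 3 via byte shift left (48 bits)", i.e. roughly a zero-extending scalar move followed by pslldq $6; the shift trick avoids needing a blend afterwards precisely because the all-zero V1 contributes nothing to the other lanes.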