1 files changed, 261 insertions, 149 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index a2482f26730..ed542560742 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -7143,87 +7143,6 @@ static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
 }
 
 
-/// \brief Try to lower a VSELECT instruction to an immediate-controlled blend
-/// instruction.
-static SDValue lowerVSELECTtoBLENDI(SDLoc DL, SDValue Cond, SDValue LHS,
-                                    SDValue RHS, const X86Subtarget *Subtarget,
-                                    SelectionDAG &DAG) {
-  MVT VT = LHS.getSimpleValueType();
-  MVT EltVT = VT.getVectorElementType();
-
-  // There is no blend with immediate in AVX-512.
-  if (VT.is512BitVector())
-    return SDValue();
-
-  // No blend instruction before SSE4.1.
-  if (!Subtarget->hasSSE41())
-    return SDValue();
-  // There is no byte-blend immediate controlled instruction.
-  if (EltVT == MVT::i8)
-    return SDValue();
-
-  if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
-    return SDValue();
-
-  auto *CondBV = cast<BuildVectorSDNode>(Cond);
-
-  unsigned BlendMask = 0;
-  MVT BlendVT = VT;
-  if (VT == MVT::v16i16) {
-    // v16i16 blends are completely special. We can only do them when we have
-    // a repeated blend across the two 128-bit halves and we have AVX2.
-    if (!Subtarget->hasAVX2())
-      return SDValue();
-
-    for (int i = 0; i < 8; ++i) {
-      SDValue Lo = CondBV->getOperand(i);
-      SDValue Hi = CondBV->getOperand(i + 8);
-      bool IsLoZero = X86::isZeroNode(Lo);
-      bool IsHiZero = X86::isZeroNode(Hi);
-      if (Lo->getOpcode() != ISD::UNDEF && Hi->getOpcode() != ISD::UNDEF &&
-          IsLoZero != IsHiZero)
-        // Asymmetric blends, bail.
-        return SDValue();
-      BlendMask |= (unsigned)(IsLoZero || IsHiZero) << i;
-    }
-  } else {
-    // Everything else uses a generic blend mask computation with a custom type.
-    if (VT.isInteger()) {
-      if (VT.is256BitVector())
-        // We cast to floating point types if integer blends aren't available,
-        // and we coerce integer blends when available to occur on the v8i32
-        // type.
-        BlendVT = Subtarget->hasAVX2()
-                      ? MVT::v8i32
-                      : MVT::getVectorVT(
-                            MVT::getFloatingPointVT(VT.getScalarSizeInBits()),
-                            VT.getVectorNumElements());
-      else
-        // For 128-bit vectors we do the blend on v8i16 types.
-        BlendVT = MVT::v8i16;
-    }
-    assert(BlendVT.getVectorNumElements() <= 8 &&
-           "Cannot blend more than 8 elements with an immediate!");
-    // Scale the blend mask based on the number of elements in the selected
-    // blend type.
-    int Scale = BlendVT.getVectorNumElements() / VT.getVectorNumElements();
-    for (int i = 0, e = CondBV->getNumOperands(); i < e; ++i) {
-      SDValue CondElement = CondBV->getOperand(i);
-      if (CondElement->getOpcode() != ISD::UNDEF &&
-          X86::isZeroNode(CondElement))
-        for (int j = 0; j < Scale; ++j)
-          BlendMask |= 1u << (i * Scale + j);
-    }
-  }
-
-  LHS = DAG.getNode(ISD::BITCAST, DL, BlendVT, LHS);
-  RHS = DAG.getNode(ISD::BITCAST, DL, BlendVT, RHS);
-
-  return DAG.getNode(ISD::BITCAST, DL, VT,
-                     DAG.getNode(X86ISD::BLENDI, DL, BlendVT, LHS, RHS,
-                                 DAG.getConstant(BlendMask, MVT::i8)));
-}
-
 //===----------------------------------------------------------------------===//
 // Vector shuffle lowering
 //
@@ -7381,48 +7300,119 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
                                          SDValue V2, ArrayRef<int> Mask,
                                          const X86Subtarget *Subtarget,
                                          SelectionDAG &DAG) {
-  // Compute the VSELECT mask. Note that VSELECT is really confusing in the
-  // mix of LLVM's code generator and the x86 backend. We tell the code
-  // generator that boolean values in the elements of an x86 vector register
-  // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
-  // mapping a select to operand #1, and 'false' mapping to operand #2. The
-  // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
-  // of the element (the remaining are ignored) and 0 in that high bit would
-  // mean operand #1 while 1 in the high bit would mean operand #2. So while
-  // the LLVM model for boolean values in vector elements gets the relevant
-  // bit set, it is set backwards and over constrained relative to x86's
-  // actual model.
-  SmallVector<SDValue, 32> VSELECTMask;
-  MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
-  MVT MaskVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements());
-  SDValue TrueVal = DAG.getConstant(-1, MaskEltVT);
-  SDValue FalseVal = DAG.getConstant(0, MaskEltVT);
+
+  unsigned BlendMask = 0;
   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
-    if (Mask[i] < 0) {
-      VSELECTMask.push_back(DAG.getUNDEF(MaskEltVT));
-    } else if (Mask[i] < Size) {
-      if (Mask[i] != i)
-        return SDValue(); // Shuffled V1 input!
-      VSELECTMask.push_back(TrueVal);
-    } else {
+    if (Mask[i] >= Size) {
       if (Mask[i] != i + Size)
-        return SDValue(); // Shuffled V2 input!;
-      VSELECTMask.push_back(FalseVal);
+        return SDValue(); // Shuffled V2 input!
+      BlendMask |= 1u << i;
+      continue;
     }
+    if (Mask[i] >= 0 && Mask[i] != i)
+      return SDValue(); // Shuffled V1 input!
   }
+  switch (VT.SimpleTy) {
+  case MVT::v2f64:
+  case MVT::v4f32:
+  case MVT::v4f64:
+  case MVT::v8f32:
+    return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
+                       DAG.getConstant(BlendMask, MVT::i8));
+
+  case MVT::v4i64:
+  case MVT::v8i32:
+    assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
+    // FALLTHROUGH
+  case MVT::v2i64:
+  case MVT::v4i32:
+    // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
+    // that instruction.
+    if (Subtarget->hasAVX2()) {
+      // Scale the blend by the number of 32-bit dwords per element.
+      int Scale =  VT.getScalarSizeInBits() / 32;
+      BlendMask = 0;
+      for (int i = 0, Size = Mask.size(); i < Size; ++i)
+        if (Mask[i] >= Size)
+          for (int j = 0; j < Scale; ++j)
+            BlendMask |= 1u << (i * Scale + j);
+
+      MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
+      V1 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V1);
+      V2 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V2);
+      return DAG.getNode(ISD::BITCAST, DL, VT,
+                         DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
+                                     DAG.getConstant(BlendMask, MVT::i8)));
+    }
+    // FALLTHROUGH
+  case MVT::v8i16: {
+    // For integer shuffles we need to expand the mask and cast the inputs to
+    // v8i16s prior to blending.
+    int Scale = 8 / VT.getVectorNumElements();
+    BlendMask = 0;
+    for (int i = 0, Size = Mask.size(); i < Size; ++i)
+      if (Mask[i] >= Size)
+        for (int j = 0; j < Scale; ++j)
+          BlendMask |= 1u << (i * Scale + j);
 
-  // We have to manually attempt to lower this via BLENDI because at this phase
-  // of legalization we may end up legalizing the BUILD_VECTOR past where it can
-  // be analyzed prior to legalizing the VSELECT.
-  // FIXME: At some point, the legalizer should work more like the DAG combiner
-  // where it evaluates replacement nodes eagerly rather than risking proceeding
-  // to their (now shared) operands.
-  SDValue Cond = DAG.getNode(ISD::BUILD_VECTOR, DL, MaskVT, VSELECTMask);
-  if (SDValue BlendI = lowerVSELECTtoBLENDI(DL, Cond, V1, V2, Subtarget, DAG))
-    return BlendI;
+    V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1);
+    V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V2);
+    return DAG.getNode(ISD::BITCAST, DL, VT,
+                       DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
+                                   DAG.getConstant(BlendMask, MVT::i8)));
+  }
+
+  case MVT::v16i16: {
+    assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
+    SmallVector<int, 8> RepeatedMask;
+    if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
+      // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
+      assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
+      BlendMask = 0;
+      for (int i = 0; i < 8; ++i)
+        if (RepeatedMask[i] >= 16)
+          BlendMask |= 1u << i;
+      return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
+                         DAG.getConstant(BlendMask, MVT::i8));
+    }
+  }
+    // FALLTHROUGH
+  case MVT::v32i8: {
+    assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
+    // Scale the blend by the number of bytes per element.
+    int Scale =  VT.getScalarSizeInBits() / 8;
+    assert(Mask.size() * Scale == 32 && "Not a 256-bit vector!");
+
+    // Compute the VSELECT mask. Note that VSELECT is really confusing in the
+    // mix of LLVM's code generator and the x86 backend. We tell the code
+    // generator that boolean values in the elements of an x86 vector register
+    // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
+    // mapping a select to operand #1, and 'false' mapping to operand #2. The
+    // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
+    // of the element (the remaining are ignored) and 0 in that high bit would
+    // mean operand #1 while 1 in the high bit would mean operand #2. So while
+    // the LLVM model for boolean values in vector elements gets the relevant
+    // bit set, it is set backwards and over constrained relative to x86's
+    // actual model.
+    SDValue VSELECTMask[32];
+    for (int i = 0, Size = Mask.size(); i < Size; ++i)
+      for (int j = 0; j < Scale; ++j)
+        VSELECTMask[Scale * i + j] =
+            Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
+                        : DAG.getConstant(Mask[i] < Size ? -1 : 0, MVT::i8);
 
-  // Otherwise fall back on the generic VSELECT lowering.
-  return DAG.getNode(ISD::VSELECT, DL, VT, Cond, V1, V2);
+    V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V1);
+    V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V2);
+    return DAG.getNode(
+        ISD::BITCAST, DL, VT,
+        DAG.getNode(ISD::VSELECT, DL, MVT::v32i8,
+                    DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, VSELECTMask),
+                    V1, V2));
+  }
+
+  default:
+    llvm_unreachable("Not a supported integer vector type!");
+  }
 }
 
 /// \brief Generic routine to lower a shuffle and blend as a decomposed set of
@@ -11807,8 +11797,88 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
   return SDValue();
 }
 
+// This function assumes its argument is a BUILD_VECTOR of constants or
+// undef SDNodes. i.e: ISD::isBuildVectorOfConstantSDNodes(BuildVector) is
+// true.
+static bool BUILD_VECTORtoBlendMask(BuildVectorSDNode *BuildVector,
+                                    unsigned &MaskValue) {
+  MaskValue = 0;
+  unsigned NumElems = BuildVector->getNumOperands();
+  // There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
+  unsigned NumLanes = (NumElems - 1) / 8 + 1;
+  unsigned NumElemsInLane = NumElems / NumLanes;
+
+  // Blend for v16i16 should be symetric for the both lanes.
+  for (unsigned i = 0; i < NumElemsInLane; ++i) {
+    SDValue EltCond = BuildVector->getOperand(i);
+    SDValue SndLaneEltCond =
+        (NumLanes == 2) ? BuildVector->getOperand(i + NumElemsInLane) : EltCond;
+
+    int Lane1Cond = -1, Lane2Cond = -1;
+    if (isa<ConstantSDNode>(EltCond))
+      Lane1Cond = !isZero(EltCond);
+    if (isa<ConstantSDNode>(SndLaneEltCond))
+      Lane2Cond = !isZero(SndLaneEltCond);
+
+    if (Lane1Cond == Lane2Cond || Lane2Cond < 0)
+      // Lane1Cond != 0, means we want the first argument.
+      // Lane1Cond == 0, means we want the second argument.
+      // The encoding of this argument is 0 for the first argument, 1
+      // for the second. Therefore, invert the condition.
+      MaskValue |= !Lane1Cond << i;
+    else if (Lane1Cond < 0)
+      MaskValue |= !Lane2Cond << i;
+    else
+      return false;
+  }
+  return true;
+}
+
+/// \brief Try to lower a VSELECT instruction to an immediate-controlled blend
+/// instruction.
+static SDValue lowerVSELECTtoBLENDI(SDValue Op, const X86Subtarget *Subtarget,
+                                    SelectionDAG &DAG) {
+  SDValue Cond = Op.getOperand(0);
+  SDValue LHS = Op.getOperand(1);
+  SDValue RHS = Op.getOperand(2);
+  SDLoc dl(Op);
+  MVT VT = Op.getSimpleValueType();
+  MVT EltVT = VT.getVectorElementType();
+  unsigned NumElems = VT.getVectorNumElements();
+
+  // There is no blend with immediate in AVX-512.
+  if (VT.is512BitVector())
+    return SDValue();
+
+  if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
+    return SDValue();
+  if (!Subtarget->hasInt256() && VT == MVT::v16i16)
+    return SDValue();
+
+  if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
+    return SDValue();
+
+  // Check the mask for BLEND and build the value.
+  unsigned MaskValue = 0;
+  if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue))
+    return SDValue();
+
+  // Convert i32 vectors to floating point if it is not AVX2.
+  // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors.
+  MVT BlendVT = VT;
+  if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) {
+    BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()),
+                               NumElems);
+    LHS = DAG.getNode(ISD::BITCAST, dl, VT, LHS);
+    RHS = DAG.getNode(ISD::BITCAST, dl, VT, RHS);
+  }
+
+  SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, LHS, RHS,
+                            DAG.getConstant(MaskValue, MVT::i32));
+  return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
+}
+
 SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
-  SDLoc DL(Op);
   // A vselect where all conditions and data are constants can be optimized into
   // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
   if (ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(0).getNode()) &&
@@ -11816,48 +11886,22 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
       ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode()))
     return SDValue();
 
-  SDValue Cond = Op.getOperand(0);
-  SDValue LHS = Op.getOperand(1);
-  SDValue RHS = Op.getOperand(2);
-  SDValue BlendOp = lowerVSELECTtoBLENDI(DL, Cond, LHS, RHS, Subtarget, DAG);
+  SDValue BlendOp = lowerVSELECTtoBLENDI(Op, Subtarget, DAG);
   if (BlendOp.getNode())
     return BlendOp;
 
-  // If the condition vector type is different from the input vector types, bail
-  // to the TD patterns. This should only happen with vNi1 conditions.
-  if (Op.getSimpleValueType() != Op->getOperand(0).getSimpleValueType())
-    return Op;
-
-  // Check for types that need to be mapped in order to lower.
+  // Some types for vselect were previously set to Expand, not Legal or
+  // Custom. Return an empty SDValue so we fall-through to Expand, after
+  // the Custom lowering phase.
   MVT VT = Op.getSimpleValueType();
   switch (VT.SimpleTy) {
   default:
     break;
-  case MVT::v4i64:
-  case MVT::v8i32:
-    // If we don't have AVX2 we don't want to drop to a v32i8 which will require
-    // splitting the vector. Instead, let the patterns for v4f64 and v8f32 lower
-    // these blends.
-    if (!Subtarget->hasAVX2())
-      break;
-    // FALL THROUGH
-
-  case MVT::v2i64:
-  case MVT::v4i32:
   case MVT::v8i16:
   case MVT::v16i16:
     if (Subtarget->hasBWI() && Subtarget->hasVLX())
       break;
-
-    // We need to phrase these as i8 blends. Bitcasting the condition is fine
-    // because true is defined as -1 which will set *all* of the bits to one.
-    MVT BlendVT = MVT::getVectorVT(MVT::i8, (VT.getScalarSizeInBits() / 8) *
-                                                VT.getVectorNumElements());
-    Cond = DAG.getNode(ISD::BITCAST, DL, BlendVT, Op->getOperand(0));
-    LHS = DAG.getNode(ISD::BITCAST, DL, BlendVT, Op->getOperand(1));
-    RHS = DAG.getNode(ISD::BITCAST, DL, BlendVT, Op->getOperand(2));
-    return DAG.getNode(ISD::BITCAST, DL, VT,
-                       DAG.getNode(ISD::VSELECT, DL, BlendVT, Cond, LHS, RHS));
+    return SDValue();
   }
 
   // We couldn't create a "Blend with immediate" node.
@@ -21635,6 +21679,57 @@ matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, SDValue RHS,
   return std::make_pair(Opc, NeedSplit);
 }
 
+static SDValue
+TransformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG,
+                                      const X86Subtarget *Subtarget) {
+  SDLoc dl(N);
+  SDValue Cond = N->getOperand(0);
+  SDValue LHS = N->getOperand(1);
+  SDValue RHS = N->getOperand(2);
+
+  if (Cond.getOpcode() == ISD::SIGN_EXTEND) {
+    SDValue CondSrc = Cond->getOperand(0);
+    if (CondSrc->getOpcode() == ISD::SIGN_EXTEND_INREG)
+      Cond = CondSrc->getOperand(0);
+  }
+
+  MVT VT = N->getSimpleValueType(0);
+  MVT EltVT = VT.getVectorElementType();
+  unsigned NumElems = VT.getVectorNumElements();
+  // There is no blend with immediate in AVX-512.
+  if (VT.is512BitVector())
+    return SDValue();
+
+  if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
+    return SDValue();
+  if (!Subtarget->hasInt256() && VT == MVT::v16i16)
+    return SDValue();
+
+  if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
+    return SDValue();
+
+  // A vselect where all conditions and data are constants can be optimized into
+  // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
+  if (ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
+      ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
+    return SDValue();
+
+  unsigned MaskValue = 0;
+  if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue))
+    return SDValue();
+
+  SmallVector<int, 8> ShuffleMask(NumElems, -1);
+  for (unsigned i = 0; i < NumElems; ++i) {
+    // Be sure we emit undef where we can.
+    if (Cond.getOperand(i)->getOpcode() == ISD::UNDEF)
+      ShuffleMask[i] = -1;
+    else
+      ShuffleMask[i] = i + NumElems * ((MaskValue >> i) & 1);
+  }
+
+  return DAG.getVectorShuffle(VT, dl, LHS, RHS, &ShuffleMask[0]);
+}
+
 /// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT
 /// nodes.
 static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
@@ -22184,6 +22279,23 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
       DCI.CommitTargetLoweringOpt(TLO);
   }
 
+  // We should generate an X86ISD::BLENDI from a vselect if its argument
+  // is a sign_extend_inreg of an any_extend of a BUILD_VECTOR of
+  // constants. This specific pattern gets generated when we split a
+  // selector for a 512 bit vector in a machine without AVX512 (but with
+  // 256-bit vectors), during legalization:
+  //
+  // (vselect (sign_extend (any_extend (BUILD_VECTOR)) i1) LHS RHS)
+  //
+  // Iff we find this pattern and the build_vectors are built from
+  // constants, we translate the vselect into a shuffle_vector that we
+  // know will be matched by LowerVECTOR_SHUFFLEtoBlend.
+  if (N->getOpcode() == ISD::VSELECT && !DCI.isBeforeLegalize()) {
+    SDValue Shuffle = TransformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget);
+    if (Shuffle.getNode())
+      return Shuffle;
+  }
+
   return SDValue();
 }