summaryrefslogtreecommitdiffstats
path: root/llvm/lib/Target/X86/X86ISelLowering.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib/Target/X86/X86ISelLowering.cpp')
-rw-r--r--llvm/lib/Target/X86/X86ISelLowering.cpp410
1 files changed, 261 insertions, 149 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index a2482f26730..ed542560742 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -7143,87 +7143,6 @@ static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
}
-/// \brief Try to lower a VSELECT instruction to an immediate-controlled blend
-/// instruction.
-static SDValue lowerVSELECTtoBLENDI(SDLoc DL, SDValue Cond, SDValue LHS,
- SDValue RHS, const X86Subtarget *Subtarget,
- SelectionDAG &DAG) {
- MVT VT = LHS.getSimpleValueType();
- MVT EltVT = VT.getVectorElementType();
-
- // There is no blend with immediate in AVX-512.
- if (VT.is512BitVector())
- return SDValue();
-
- // No blend instruction before SSE4.1.
- if (!Subtarget->hasSSE41())
- return SDValue();
- // There is no byte-blend immediate controlled instruction.
- if (EltVT == MVT::i8)
- return SDValue();
-
- if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
- return SDValue();
-
- auto *CondBV = cast<BuildVectorSDNode>(Cond);
-
- unsigned BlendMask = 0;
- MVT BlendVT = VT;
- if (VT == MVT::v16i16) {
- // v16i16 blends are completely special. We can only do them when we have
- // a repeated blend across the two 128-bit halves and we have AVX2.
- if (!Subtarget->hasAVX2())
- return SDValue();
-
- for (int i = 0; i < 8; ++i) {
- SDValue Lo = CondBV->getOperand(i);
- SDValue Hi = CondBV->getOperand(i + 8);
- bool IsLoZero = X86::isZeroNode(Lo);
- bool IsHiZero = X86::isZeroNode(Hi);
- if (Lo->getOpcode() != ISD::UNDEF && Hi->getOpcode() != ISD::UNDEF &&
- IsLoZero != IsHiZero)
- // Asymmetric blends, bail.
- return SDValue();
- BlendMask |= (unsigned)(IsLoZero || IsHiZero) << i;
- }
- } else {
- // Everything else uses a generic blend mask computation with a custom type.
- if (VT.isInteger()) {
- if (VT.is256BitVector())
- // We cast to floating point types if integer blends aren't available,
- // and we coerce integer blends when available to occur on the v8i32
- // type.
- BlendVT = Subtarget->hasAVX2()
- ? MVT::v8i32
- : MVT::getVectorVT(
- MVT::getFloatingPointVT(VT.getScalarSizeInBits()),
- VT.getVectorNumElements());
- else
- // For 128-bit vectors we do the blend on v8i16 types.
- BlendVT = MVT::v8i16;
- }
- assert(BlendVT.getVectorNumElements() <= 8 &&
- "Cannot blend more than 8 elements with an immediate!");
- // Scale the blend mask based on the number of elements in the selected
- // blend type.
- int Scale = BlendVT.getVectorNumElements() / VT.getVectorNumElements();
- for (int i = 0, e = CondBV->getNumOperands(); i < e; ++i) {
- SDValue CondElement = CondBV->getOperand(i);
- if (CondElement->getOpcode() != ISD::UNDEF &&
- X86::isZeroNode(CondElement))
- for (int j = 0; j < Scale; ++j)
- BlendMask |= 1u << (i * Scale + j);
- }
- }
-
- LHS = DAG.getNode(ISD::BITCAST, DL, BlendVT, LHS);
- RHS = DAG.getNode(ISD::BITCAST, DL, BlendVT, RHS);
-
- return DAG.getNode(ISD::BITCAST, DL, VT,
- DAG.getNode(X86ISD::BLENDI, DL, BlendVT, LHS, RHS,
- DAG.getConstant(BlendMask, MVT::i8)));
-}
-
//===----------------------------------------------------------------------===//
// Vector shuffle lowering
//
@@ -7381,48 +7300,119 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
- // Compute the VSELECT mask. Note that VSELECT is really confusing in the
- // mix of LLVM's code generator and the x86 backend. We tell the code
- // generator that boolean values in the elements of an x86 vector register
- // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
- // mapping a select to operand #1, and 'false' mapping to operand #2. The
- // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
- // of the element (the remaining are ignored) and 0 in that high bit would
- // mean operand #1 while 1 in the high bit would mean operand #2. So while
- // the LLVM model for boolean values in vector elements gets the relevant
- // bit set, it is set backwards and over constrained relative to x86's
- // actual model.
- SmallVector<SDValue, 32> VSELECTMask;
- MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
- MVT MaskVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements());
- SDValue TrueVal = DAG.getConstant(-1, MaskEltVT);
- SDValue FalseVal = DAG.getConstant(0, MaskEltVT);
+
+ unsigned BlendMask = 0;
for (int i = 0, Size = Mask.size(); i < Size; ++i) {
- if (Mask[i] < 0) {
- VSELECTMask.push_back(DAG.getUNDEF(MaskEltVT));
- } else if (Mask[i] < Size) {
- if (Mask[i] != i)
- return SDValue(); // Shuffled V1 input!
- VSELECTMask.push_back(TrueVal);
- } else {
+ if (Mask[i] >= Size) {
if (Mask[i] != i + Size)
- return SDValue(); // Shuffled V2 input!;
- VSELECTMask.push_back(FalseVal);
+ return SDValue(); // Shuffled V2 input!
+ BlendMask |= 1u << i;
+ continue;
}
+ if (Mask[i] >= 0 && Mask[i] != i)
+ return SDValue(); // Shuffled V1 input!
}
+ switch (VT.SimpleTy) {
+ case MVT::v2f64:
+ case MVT::v4f32:
+ case MVT::v4f64:
+ case MVT::v8f32:
+ return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
+ DAG.getConstant(BlendMask, MVT::i8));
+
+ case MVT::v4i64:
+ case MVT::v8i32:
+ assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
+ // FALLTHROUGH
+ case MVT::v2i64:
+ case MVT::v4i32:
+ // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
+ // that instruction.
+ if (Subtarget->hasAVX2()) {
+ // Scale the blend by the number of 32-bit dwords per element.
+ int Scale = VT.getScalarSizeInBits() / 32;
+ BlendMask = 0;
+ for (int i = 0, Size = Mask.size(); i < Size; ++i)
+ if (Mask[i] >= Size)
+ for (int j = 0; j < Scale; ++j)
+ BlendMask |= 1u << (i * Scale + j);
+
+ MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
+ V1 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V1);
+ V2 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V2);
+ return DAG.getNode(ISD::BITCAST, DL, VT,
+ DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
+ DAG.getConstant(BlendMask, MVT::i8)));
+ }
+ // FALLTHROUGH
+ case MVT::v8i16: {
+ // For integer shuffles we need to expand the mask and cast the inputs to
+ // v8i16s prior to blending.
+ int Scale = 8 / VT.getVectorNumElements();
+ BlendMask = 0;
+ for (int i = 0, Size = Mask.size(); i < Size; ++i)
+ if (Mask[i] >= Size)
+ for (int j = 0; j < Scale; ++j)
+ BlendMask |= 1u << (i * Scale + j);
- // We have to manually attempt to lower this via BLENDI because at this phase
- // of legalization we may end up legalizing the BUILD_VECTOR past where it can
- // be analyzed prior to legalizing the VSELECT.
- // FIXME: At some point, the legalizer should work more like the DAG combiner
- // where it evaluates replacement nodes eagerly rather than risking proceeding
- // to their (now shared) operands.
- SDValue Cond = DAG.getNode(ISD::BUILD_VECTOR, DL, MaskVT, VSELECTMask);
- if (SDValue BlendI = lowerVSELECTtoBLENDI(DL, Cond, V1, V2, Subtarget, DAG))
- return BlendI;
+ V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1);
+ V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V2);
+ return DAG.getNode(ISD::BITCAST, DL, VT,
+ DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
+ DAG.getConstant(BlendMask, MVT::i8)));
+ }
+
+ case MVT::v16i16: {
+ assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
+ SmallVector<int, 8> RepeatedMask;
+ if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
+ // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
+ assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
+ BlendMask = 0;
+ for (int i = 0; i < 8; ++i)
+ if (RepeatedMask[i] >= 16)
+ BlendMask |= 1u << i;
+ return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
+ DAG.getConstant(BlendMask, MVT::i8));
+ }
+ }
+ // FALLTHROUGH
+ case MVT::v32i8: {
+ assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
+ // Scale the blend by the number of bytes per element.
+ int Scale = VT.getScalarSizeInBits() / 8;
+ assert(Mask.size() * Scale == 32 && "Not a 256-bit vector!");
+
+ // Compute the VSELECT mask. Note that VSELECT is really confusing in the
+ // mix of LLVM's code generator and the x86 backend. We tell the code
+ // generator that boolean values in the elements of an x86 vector register
+ // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
+ // mapping a select to operand #1, and 'false' mapping to operand #2. The
+ // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
+ // of the element (the remaining are ignored) and 0 in that high bit would
+ // mean operand #1 while 1 in the high bit would mean operand #2. So while
+ // the LLVM model for boolean values in vector elements gets the relevant
+ // bit set, it is set backwards and over constrained relative to x86's
+ // actual model.
+ SDValue VSELECTMask[32];
+ for (int i = 0, Size = Mask.size(); i < Size; ++i)
+ for (int j = 0; j < Scale; ++j)
+ VSELECTMask[Scale * i + j] =
+ Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
+ : DAG.getConstant(Mask[i] < Size ? -1 : 0, MVT::i8);
- // Otherwise fall back on the generic VSELECT lowering.
- return DAG.getNode(ISD::VSELECT, DL, VT, Cond, V1, V2);
+ V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V1);
+ V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V2);
+ return DAG.getNode(
+ ISD::BITCAST, DL, VT,
+ DAG.getNode(ISD::VSELECT, DL, MVT::v32i8,
+ DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, VSELECTMask),
+ V1, V2));
+ }
+
+ default:
+ llvm_unreachable("Not a supported integer vector type!");
+ }
}
/// \brief Generic routine to lower a shuffle and blend as a decomposed set of
@@ -11807,8 +11797,88 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
return SDValue();
}
+/// \brief Compute the blend immediate encoded by a constant condition vector.
+///
+/// The caller must guarantee that \p BuildVector is a BUILD_VECTOR of constant
+/// or undef nodes, i.e. ISD::isBuildVectorOfConstantSDNodes(BuildVector) holds.
+/// Bit i of \p MaskValue is set when lane element i selects the second blend
+/// operand (its condition is zero). Returns false when a two-lane condition is
+/// not symmetric across its lanes; \p MaskValue is then only partially built.
+static bool BUILD_VECTORtoBlendMask(BuildVectorSDNode *BuildVector,
+                                    unsigned &MaskValue) {
+  MaskValue = 0;
+  const unsigned EltCount = BuildVector->getNumOperands();
+  // There are 2 lanes when there are more than 8 elements, 1 lane otherwise.
+  const unsigned LaneCount = (EltCount - 1) / 8 + 1;
+  const unsigned LaneWidth = EltCount / LaneCount;
+
+  // A v16i16 blend must be symmetric across both lanes, so every element is
+  // compared with its counterpart in the second lane.
+  for (unsigned Idx = 0; Idx != LaneWidth; ++Idx) {
+    SDValue LoElt = BuildVector->getOperand(Idx);
+    SDValue HiElt = LoElt;
+    if (LaneCount == 2)
+      HiElt = BuildVector->getOperand(Idx + LaneWidth);
+
+    // Tri-state condition: -1 if not a constant, 0 if zero, 1 otherwise.
+    const int LoCond = isa<ConstantSDNode>(LoElt) ? !isZero(LoElt) : -1;
+    const int HiCond = isa<ConstantSDNode>(HiElt) ? !isZero(HiElt) : -1;
+
+    // Two known conditions that differ cannot share one immediate bit.
+    if (LoCond >= 0 && HiCond >= 0 && LoCond != HiCond)
+      return false;
+
+    // A non-zero condition picks the first operand (encoded as 0) and a zero
+    // condition picks the second (encoded as 1), hence the inversion. When
+    // only one lane's condition is known, that one decides.
+    const int Known = (HiCond < 0 || HiCond == LoCond) ? LoCond : HiCond;
+    MaskValue |= !Known << Idx;
+  }
+  return true;
+}
+
+/// \brief Try to lower a VSELECT instruction to an immediate-controlled blend
+/// instruction.
+///
+/// \p Op is the VSELECT node (condition, LHS, RHS). Returns the replacement
+/// value, or an empty SDValue when no immediate blend applies: 512-bit
+/// vectors, no SSE4.1, i8 elements, v16i16 without Int256, a condition that
+/// is not a BUILD_VECTOR of constants/undefs, or a condition from which no
+/// blend mask can be built.
+static SDValue lowerVSELECTtoBLENDI(SDValue Op, const X86Subtarget *Subtarget,
+                                    SelectionDAG &DAG) {
+  SDValue Cond = Op.getOperand(0);
+  SDValue LHS = Op.getOperand(1);
+  SDValue RHS = Op.getOperand(2);
+  SDLoc dl(Op);
+  MVT VT = Op.getSimpleValueType();
+  MVT EltVT = VT.getVectorElementType();
+  unsigned NumElems = VT.getVectorNumElements();
+
+  // There is no blend with immediate in AVX-512.
+  if (VT.is512BitVector())
+    return SDValue();
+
+  if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
+    return SDValue();
+  if (!Subtarget->hasInt256() && VT == MVT::v16i16)
+    return SDValue();
+
+  if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
+    return SDValue();
+
+  // Check the mask for BLEND and build the value.
+  unsigned MaskValue = 0;
+  if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue))
+    return SDValue();
+
+  // i64 vectors, and i32 vectors when AVX2's VPBLENDD is unavailable, are
+  // blended as floating point vectors of the same shape. The operands must be
+  // bitcast to BlendVT (casting to VT would be a no-op) so that their type
+  // matches the result type of the BLENDI node.
+  MVT BlendVT = VT;
+  if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) {
+    BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()),
+                               NumElems);
+    LHS = DAG.getNode(ISD::BITCAST, dl, BlendVT, LHS);
+    RHS = DAG.getNode(ISD::BITCAST, dl, BlendVT, RHS);
+  }
+
+  // Blend in BlendVT, then cast the result back to the type of the VSELECT.
+  SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, LHS, RHS,
+                            DAG.getConstant(MaskValue, MVT::i32));
+  return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
+}
+
SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
- SDLoc DL(Op);
// A vselect where all conditions and data are constants can be optimized into
// a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
if (ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(0).getNode()) &&
@@ -11816,48 +11886,22 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode()))
return SDValue();
- SDValue Cond = Op.getOperand(0);
- SDValue LHS = Op.getOperand(1);
- SDValue RHS = Op.getOperand(2);
- SDValue BlendOp = lowerVSELECTtoBLENDI(DL, Cond, LHS, RHS, Subtarget, DAG);
+ SDValue BlendOp = lowerVSELECTtoBLENDI(Op, Subtarget, DAG);
if (BlendOp.getNode())
return BlendOp;
- // If the condition vector type is different from the input vector types, bail
- // to the TD patterns. This should only happen with vNi1 conditions.
- if (Op.getSimpleValueType() != Op->getOperand(0).getSimpleValueType())
- return Op;
-
- // Check for types that need to be mapped in order to lower.
+ // Some types for vselect were previously set to Expand, not Legal or
+ // Custom. Return an empty SDValue so we fall-through to Expand, after
+ // the Custom lowering phase.
MVT VT = Op.getSimpleValueType();
switch (VT.SimpleTy) {
default:
break;
- case MVT::v4i64:
- case MVT::v8i32:
- // If we don't have AVX2 we don't want to drop to a v32i8 which will require
- // splitting the vector. Instead, let the patterns for v4f64 and v8f32 lower
- // these blends.
- if (!Subtarget->hasAVX2())
- break;
- // FALL THROUGH
-
- case MVT::v2i64:
- case MVT::v4i32:
case MVT::v8i16:
case MVT::v16i16:
if (Subtarget->hasBWI() && Subtarget->hasVLX())
break;
-
- // We need to phrase these as i8 blends. Bitcasting the condition is fine
- // because true is defined as -1 which will set *all* of the bits to one.
- MVT BlendVT = MVT::getVectorVT(MVT::i8, (VT.getScalarSizeInBits() / 8) *
- VT.getVectorNumElements());
- Cond = DAG.getNode(ISD::BITCAST, DL, BlendVT, Op->getOperand(0));
- LHS = DAG.getNode(ISD::BITCAST, DL, BlendVT, Op->getOperand(1));
- RHS = DAG.getNode(ISD::BITCAST, DL, BlendVT, Op->getOperand(2));
- return DAG.getNode(ISD::BITCAST, DL, VT,
- DAG.getNode(ISD::VSELECT, DL, BlendVT, Cond, LHS, RHS));
+ return SDValue();
}
// We couldn't create a "Blend with immediate" node.
@@ -21635,6 +21679,57 @@ matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, SDValue RHS,
return std::make_pair(Opc, NeedSplit);
}
+/// \brief Rewrite a VSELECT with a constant condition into an equivalent
+/// blend-style VECTOR_SHUFFLE of its two value operands.
+///
+/// \p N is the VSELECT node. A condition of the form
+/// (sign_extend (sign_extend_inreg X)) is looked through to X first. Returns
+/// the shuffle, or an empty SDValue when the type has no immediate blend
+/// (512-bit vectors, no SSE4.1, i8 elements, v16i16 without Int256), the
+/// condition is not a BUILD_VECTOR of constants/undefs, both value operands
+/// are such BUILD_VECTORs, or no blend mask can be built from the condition.
+static SDValue
+TransformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG,
+                                      const X86Subtarget *Subtarget) {
+  SDLoc dl(N);
+  SDValue Cond = N->getOperand(0);
+  SDValue LHS = N->getOperand(1);
+  SDValue RHS = N->getOperand(2);
+
+  if (Cond.getOpcode() == ISD::SIGN_EXTEND) {
+    SDValue CondSrc = Cond->getOperand(0);
+    if (CondSrc->getOpcode() == ISD::SIGN_EXTEND_INREG)
+      Cond = CondSrc->getOperand(0);
+  }
+
+  MVT VT = N->getSimpleValueType(0);
+  MVT EltVT = VT.getVectorElementType();
+  unsigned NumElems = VT.getVectorNumElements();
+  // There is no blend with immediate in AVX-512.
+  if (VT.is512BitVector())
+    return SDValue();
+
+  if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
+    return SDValue();
+  if (!Subtarget->hasInt256() && VT == MVT::v16i16)
+    return SDValue();
+
+  if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
+    return SDValue();
+
+  // A vselect where all conditions and data are constants can be optimized into
+  // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
+  if (ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
+      ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
+    return SDValue();
+
+  unsigned MaskValue = 0;
+  if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue))
+    return SDValue();
+
+  // BUILD_VECTORtoBlendMask emits one bit per element of a single lane: a
+  // 16-element condition is folded into 8 bits shared by both lanes. Index the
+  // mask lane-relatively so that elements of the second lane read the bit of
+  // their first-lane counterpart rather than a bit that is never set.
+  unsigned NumLanes = (NumElems - 1) / 8 + 1;
+  unsigned NumElemsInLane = NumElems / NumLanes;
+
+  SmallVector<int, 8> ShuffleMask(NumElems, -1);
+  for (unsigned i = 0; i < NumElems; ++i) {
+    // Be sure we emit undef where we can: undef entries keep their -1.
+    if (Cond.getOperand(i)->getOpcode() == ISD::UNDEF)
+      continue;
+    // A set mask bit selects RHS, whose elements are numbered from NumElems.
+    unsigned SelectRHS = (MaskValue >> (i % NumElemsInLane)) & 1;
+    ShuffleMask[i] = i + NumElems * SelectRHS;
+  }
+
+  return DAG.getVectorShuffle(VT, dl, LHS, RHS, &ShuffleMask[0]);
+}
+
/// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT
/// nodes.
static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
@@ -22184,6 +22279,23 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
DCI.CommitTargetLoweringOpt(TLO);
}
+ // We should generate an X86ISD::BLENDI from a vselect if its argument
+ // is a sign_extend_inreg of an any_extend of a BUILD_VECTOR of
+ // constants. This specific pattern gets generated when we split a
+ // selector for a 512 bit vector in a machine without AVX512 (but with
+ // 256-bit vectors), during legalization:
+ //
+ // (vselect (sign_extend (any_extend (BUILD_VECTOR)) i1) LHS RHS)
+ //
+ // Iff we find this pattern and the build_vectors are built from
+ // constants, we translate the vselect into a shuffle_vector that we
+ // know will be matched by LowerVECTOR_SHUFFLEtoBlend.
+ if (N->getOpcode() == ISD::VSELECT && !DCI.isBeforeLegalize()) {
+ SDValue Shuffle = TransformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget);
+ if (Shuffle.getNode())
+ return Shuffle;
+ }
+
return SDValue();
}
OpenPOWER on IntegriCloud