diff options
Diffstat (limited to 'llvm/lib/Target/X86/X86ISelLowering.cpp')
| -rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.cpp | 410 |
1 files changed, 261 insertions, 149 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index a2482f26730..ed542560742 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -7143,87 +7143,6 @@ static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { } -/// \brief Try to lower a VSELECT instruction to an immediate-controlled blend -/// instruction. -static SDValue lowerVSELECTtoBLENDI(SDLoc DL, SDValue Cond, SDValue LHS, - SDValue RHS, const X86Subtarget *Subtarget, - SelectionDAG &DAG) { - MVT VT = LHS.getSimpleValueType(); - MVT EltVT = VT.getVectorElementType(); - - // There is no blend with immediate in AVX-512. - if (VT.is512BitVector()) - return SDValue(); - - // No blend instruction before SSE4.1. - if (!Subtarget->hasSSE41()) - return SDValue(); - // There is no byte-blend immediate controlled instruction. - if (EltVT == MVT::i8) - return SDValue(); - - if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) - return SDValue(); - - auto *CondBV = cast<BuildVectorSDNode>(Cond); - - unsigned BlendMask = 0; - MVT BlendVT = VT; - if (VT == MVT::v16i16) { - // v16i16 blends are completely special. We can only do them when we have - // a repeated blend across the two 128-bit halves and we have AVX2. - if (!Subtarget->hasAVX2()) - return SDValue(); - - for (int i = 0; i < 8; ++i) { - SDValue Lo = CondBV->getOperand(i); - SDValue Hi = CondBV->getOperand(i + 8); - bool IsLoZero = X86::isZeroNode(Lo); - bool IsHiZero = X86::isZeroNode(Hi); - if (Lo->getOpcode() != ISD::UNDEF && Hi->getOpcode() != ISD::UNDEF && - IsLoZero != IsHiZero) - // Asymmetric blends, bail. - return SDValue(); - BlendMask |= (unsigned)(IsLoZero || IsHiZero) << i; - } - } else { - // Everything else uses a generic blend mask computation with a custom type. - if (VT.isInteger()) { - if (VT.is256BitVector()) - // We cast to floating point types if integer blends aren't available, - // and we coerce integer blends when available to occur on the v8i32 - // type. - BlendVT = Subtarget->hasAVX2() - ? MVT::v8i32 - : MVT::getVectorVT( - MVT::getFloatingPointVT(VT.getScalarSizeInBits()), - VT.getVectorNumElements()); - else - // For 128-bit vectors we do the blend on v8i16 types. - BlendVT = MVT::v8i16; - } - assert(BlendVT.getVectorNumElements() <= 8 && - "Cannot blend more than 8 elements with an immediate!"); - // Scale the blend mask based on the number of elements in the selected - // blend type. - int Scale = BlendVT.getVectorNumElements() / VT.getVectorNumElements(); - for (int i = 0, e = CondBV->getNumOperands(); i < e; ++i) { - SDValue CondElement = CondBV->getOperand(i); - if (CondElement->getOpcode() != ISD::UNDEF && - X86::isZeroNode(CondElement)) - for (int j = 0; j < Scale; ++j) - BlendMask |= 1u << (i * Scale + j); - } - } - - LHS = DAG.getNode(ISD::BITCAST, DL, BlendVT, LHS); - RHS = DAG.getNode(ISD::BITCAST, DL, BlendVT, RHS); - - return DAG.getNode(ISD::BITCAST, DL, VT, - DAG.getNode(X86ISD::BLENDI, DL, BlendVT, LHS, RHS, - DAG.getConstant(BlendMask, MVT::i8))); -} - //===----------------------------------------------------------------------===// // Vector shuffle lowering // @@ -7381,48 +7300,119 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, const X86Subtarget *Subtarget, SelectionDAG &DAG) { - // Compute the VSELECT mask. Note that VSELECT is really confusing in the - // mix of LLVM's code generator and the x86 backend. We tell the code - // generator that boolean values in the elements of an x86 vector register - // are -1 for true and 0 for false. We then use the LLVM semantics of 'true' - // mapping a select to operand #1, and 'false' mapping to operand #2. The - // reality in x86 is that vector masks (pre-AVX-512) use only the high bit - // of the element (the remaining are ignored) and 0 in that high bit would - // mean operand #1 while 1 in the high bit would mean operand #2. So while - // the LLVM model for boolean values in vector elements gets the relevant - // bit set, it is set backwards and over constrained relative to x86's - // actual model. - SmallVector<SDValue, 32> VSELECTMask; - MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits()); - MVT MaskVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements()); - SDValue TrueVal = DAG.getConstant(-1, MaskEltVT); - SDValue FalseVal = DAG.getConstant(0, MaskEltVT); + + unsigned BlendMask = 0; for (int i = 0, Size = Mask.size(); i < Size; ++i) { - if (Mask[i] < 0) { - VSELECTMask.push_back(DAG.getUNDEF(MaskEltVT)); - } else if (Mask[i] < Size) { - if (Mask[i] != i) - return SDValue(); // Shuffled V1 input! - VSELECTMask.push_back(TrueVal); - } else { + if (Mask[i] >= Size) { if (Mask[i] != i + Size) - return SDValue(); // Shuffled V2 input!; - VSELECTMask.push_back(FalseVal); + return SDValue(); // Shuffled V2 input! + BlendMask |= 1u << i; + continue; } + if (Mask[i] >= 0 && Mask[i] != i) + return SDValue(); // Shuffled V1 input! } + switch (VT.SimpleTy) { + case MVT::v2f64: + case MVT::v4f32: + case MVT::v4f64: + case MVT::v8f32: + return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2, + DAG.getConstant(BlendMask, MVT::i8)); + + case MVT::v4i64: + case MVT::v8i32: + assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!"); + // FALLTHROUGH + case MVT::v2i64: + case MVT::v4i32: + // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into + // that instruction. + if (Subtarget->hasAVX2()) { + // Scale the blend by the number of 32-bit dwords per element. + int Scale = VT.getScalarSizeInBits() / 32; + BlendMask = 0; + for (int i = 0, Size = Mask.size(); i < Size; ++i) + if (Mask[i] >= Size) + for (int j = 0; j < Scale; ++j) + BlendMask |= 1u << (i * Scale + j); + + MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32; + V1 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V1); + V2 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V2); + return DAG.getNode(ISD::BITCAST, DL, VT, + DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2, + DAG.getConstant(BlendMask, MVT::i8))); + } + // FALLTHROUGH + case MVT::v8i16: { + // For integer shuffles we need to expand the mask and cast the inputs to + // v8i16s prior to blending. + int Scale = 8 / VT.getVectorNumElements(); + BlendMask = 0; + for (int i = 0, Size = Mask.size(); i < Size; ++i) + if (Mask[i] >= Size) + for (int j = 0; j < Scale; ++j) + BlendMask |= 1u << (i * Scale + j); - // We have to manually attempt to lower this via BLENDI because at this phase - // of legalization we may end up legalizing the BUILD_VECTOR past where it can - // be analyzed prior to legalizing the VSELECT. - // FIXME: At some point, the legalizer should work more like the DAG combiner - // where it evaluates replacement nodes eagerly rather than risking proceeding - // to their (now shared) operands. - SDValue Cond = DAG.getNode(ISD::BUILD_VECTOR, DL, MaskVT, VSELECTMask); - if (SDValue BlendI = lowerVSELECTtoBLENDI(DL, Cond, V1, V2, Subtarget, DAG)) - return BlendI; + V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1); + V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V2); + return DAG.getNode(ISD::BITCAST, DL, VT, + DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2, + DAG.getConstant(BlendMask, MVT::i8))); + } + + case MVT::v16i16: { + assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!"); + SmallVector<int, 8> RepeatedMask; + if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) { + // We can lower these with PBLENDW which is mirrored across 128-bit lanes. + assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!"); + BlendMask = 0; + for (int i = 0; i < 8; ++i) + if (RepeatedMask[i] >= 16) + BlendMask |= 1u << i; + return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2, + DAG.getConstant(BlendMask, MVT::i8)); + } + } + // FALLTHROUGH + case MVT::v32i8: { + assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!"); + // Scale the blend by the number of bytes per element. + int Scale = VT.getScalarSizeInBits() / 8; + assert(Mask.size() * Scale == 32 && "Not a 256-bit vector!"); + + // Compute the VSELECT mask. Note that VSELECT is really confusing in the + // mix of LLVM's code generator and the x86 backend. We tell the code + // generator that boolean values in the elements of an x86 vector register + // are -1 for true and 0 for false. We then use the LLVM semantics of 'true' + // mapping a select to operand #1, and 'false' mapping to operand #2. The + // reality in x86 is that vector masks (pre-AVX-512) use only the high bit + // of the element (the remaining are ignored) and 0 in that high bit would + // mean operand #1 while 1 in the high bit would mean operand #2. So while + // the LLVM model for boolean values in vector elements gets the relevant + // bit set, it is set backwards and over constrained relative to x86's + // actual model. + SDValue VSELECTMask[32]; + for (int i = 0, Size = Mask.size(); i < Size; ++i) + for (int j = 0; j < Scale; ++j) + VSELECTMask[Scale * i + j] = + Mask[i] < 0 ? DAG.getUNDEF(MVT::i8) + : DAG.getConstant(Mask[i] < Size ? -1 : 0, MVT::i8); - // Otherwise fall back on the generic VSELECT lowering. - return DAG.getNode(ISD::VSELECT, DL, VT, Cond, V1, V2); + V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V1); + V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V2); + return DAG.getNode( + ISD::BITCAST, DL, VT, + DAG.getNode(ISD::VSELECT, DL, MVT::v32i8, + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, VSELECTMask), + V1, V2)); + } + + default: + llvm_unreachable("Not a supported integer vector type!"); + } } /// \brief Generic routine to lower a shuffle and blend as a decomposed set of @@ -11807,8 +11797,88 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { return SDValue(); } +// This function assumes its argument is a BUILD_VECTOR of constants or +// undef SDNodes. i.e: ISD::isBuildVectorOfConstantSDNodes(BuildVector) is +// true. +static bool BUILD_VECTORtoBlendMask(BuildVectorSDNode *BuildVector, + unsigned &MaskValue) { + MaskValue = 0; + unsigned NumElems = BuildVector->getNumOperands(); + // There are 2 lanes if (NumElems > 8), and 1 lane otherwise. + unsigned NumLanes = (NumElems - 1) / 8 + 1; + unsigned NumElemsInLane = NumElems / NumLanes; + + // Blend for v16i16 should be symetric for the both lanes. + for (unsigned i = 0; i < NumElemsInLane; ++i) { + SDValue EltCond = BuildVector->getOperand(i); + SDValue SndLaneEltCond = + (NumLanes == 2) ? BuildVector->getOperand(i + NumElemsInLane) : EltCond; + + int Lane1Cond = -1, Lane2Cond = -1; + if (isa<ConstantSDNode>(EltCond)) + Lane1Cond = !isZero(EltCond); + if (isa<ConstantSDNode>(SndLaneEltCond)) + Lane2Cond = !isZero(SndLaneEltCond); + + if (Lane1Cond == Lane2Cond || Lane2Cond < 0) + // Lane1Cond != 0, means we want the first argument. + // Lane1Cond == 0, means we want the second argument. + // The encoding of this argument is 0 for the first argument, 1 + // for the second. Therefore, invert the condition. + MaskValue |= !Lane1Cond << i; + else if (Lane1Cond < 0) + MaskValue |= !Lane2Cond << i; + else + return false; + } + return true; +} + +/// \brief Try to lower a VSELECT instruction to an immediate-controlled blend +/// instruction. +static SDValue lowerVSELECTtoBLENDI(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDValue Cond = Op.getOperand(0); + SDValue LHS = Op.getOperand(1); + SDValue RHS = Op.getOperand(2); + SDLoc dl(Op); + MVT VT = Op.getSimpleValueType(); + MVT EltVT = VT.getVectorElementType(); + unsigned NumElems = VT.getVectorNumElements(); + + // There is no blend with immediate in AVX-512. + if (VT.is512BitVector()) + return SDValue(); + + if (!Subtarget->hasSSE41() || EltVT == MVT::i8) + return SDValue(); + if (!Subtarget->hasInt256() && VT == MVT::v16i16) + return SDValue(); + + if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) + return SDValue(); + + // Check the mask for BLEND and build the value. + unsigned MaskValue = 0; + if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue)) + return SDValue(); + + // Convert i32 vectors to floating point if it is not AVX2. + // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors. + MVT BlendVT = VT; + if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) { + BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()), + NumElems); + LHS = DAG.getNode(ISD::BITCAST, dl, VT, LHS); + RHS = DAG.getNode(ISD::BITCAST, dl, VT, RHS); + } + + SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, LHS, RHS, + DAG.getConstant(MaskValue, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Ret); +} + SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const { - SDLoc DL(Op); // A vselect where all conditions and data are constants can be optimized into // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR(). if (ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(0).getNode()) && @@ -11816,48 +11886,22 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const { ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode())) return SDValue(); - SDValue Cond = Op.getOperand(0); - SDValue LHS = Op.getOperand(1); - SDValue RHS = Op.getOperand(2); - SDValue BlendOp = lowerVSELECTtoBLENDI(DL, Cond, LHS, RHS, Subtarget, DAG); + SDValue BlendOp = lowerVSELECTtoBLENDI(Op, Subtarget, DAG); if (BlendOp.getNode()) return BlendOp; - // If the condition vector type is different from the input vector types, bail - // to the TD patterns. This should only happen with vNi1 conditions. - if (Op.getSimpleValueType() != Op->getOperand(0).getSimpleValueType()) - return Op; - - // Check for types that need to be mapped in order to lower. + // Some types for vselect were previously set to Expand, not Legal or + // Custom. Return an empty SDValue so we fall-through to Expand, after + // the Custom lowering phase. MVT VT = Op.getSimpleValueType(); switch (VT.SimpleTy) { default: break; - case MVT::v4i64: - case MVT::v8i32: - // If we don't have AVX2 we don't want to drop to a v32i8 which will require - // splitting the vector. Instead, let the patterns for v4f64 and v8f32 lower - // these blends. - if (!Subtarget->hasAVX2()) - break; - // FALL THROUGH - - case MVT::v2i64: - case MVT::v4i32: case MVT::v8i16: case MVT::v16i16: if (Subtarget->hasBWI() && Subtarget->hasVLX()) break; - - // We need to phrase these as i8 blends. Bitcasting the condition is fine - // because true is defined as -1 which will set *all* of the bits to one. - MVT BlendVT = MVT::getVectorVT(MVT::i8, (VT.getScalarSizeInBits() / 8) * - VT.getVectorNumElements()); - Cond = DAG.getNode(ISD::BITCAST, DL, BlendVT, Op->getOperand(0)); - LHS = DAG.getNode(ISD::BITCAST, DL, BlendVT, Op->getOperand(1)); - RHS = DAG.getNode(ISD::BITCAST, DL, BlendVT, Op->getOperand(2)); - return DAG.getNode(ISD::BITCAST, DL, VT, - DAG.getNode(ISD::VSELECT, DL, BlendVT, Cond, LHS, RHS)); + return SDValue(); } // We couldn't create a "Blend with immediate" node. @@ -21635,6 +21679,57 @@ matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, SDValue RHS, return std::make_pair(Opc, NeedSplit); } +static SDValue +TransformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + SDLoc dl(N); + SDValue Cond = N->getOperand(0); + SDValue LHS = N->getOperand(1); + SDValue RHS = N->getOperand(2); + + if (Cond.getOpcode() == ISD::SIGN_EXTEND) { + SDValue CondSrc = Cond->getOperand(0); + if (CondSrc->getOpcode() == ISD::SIGN_EXTEND_INREG) + Cond = CondSrc->getOperand(0); + } + + MVT VT = N->getSimpleValueType(0); + MVT EltVT = VT.getVectorElementType(); + unsigned NumElems = VT.getVectorNumElements(); + // There is no blend with immediate in AVX-512. + if (VT.is512BitVector()) + return SDValue(); + + if (!Subtarget->hasSSE41() || EltVT == MVT::i8) + return SDValue(); + if (!Subtarget->hasInt256() && VT == MVT::v16i16) + return SDValue(); + + if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) + return SDValue(); + + // A vselect where all conditions and data are constants can be optimized into + // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR(). + if (ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) && + ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) + return SDValue(); + + unsigned MaskValue = 0; + if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue)) + return SDValue(); + + SmallVector<int, 8> ShuffleMask(NumElems, -1); + for (unsigned i = 0; i < NumElems; ++i) { + // Be sure we emit undef where we can. + if (Cond.getOperand(i)->getOpcode() == ISD::UNDEF) + ShuffleMask[i] = -1; + else + ShuffleMask[i] = i + NumElems * ((MaskValue >> i) & 1); + } + + return DAG.getVectorShuffle(VT, dl, LHS, RHS, &ShuffleMask[0]); +} + /// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT /// nodes. static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, @@ -22184,6 +22279,23 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, DCI.CommitTargetLoweringOpt(TLO); } + // We should generate an X86ISD::BLENDI from a vselect if its argument + // is a sign_extend_inreg of an any_extend of a BUILD_VECTOR of + // constants. This specific pattern gets generated when we split a + // selector for a 512 bit vector in a machine without AVX512 (but with + // 256-bit vectors), during legalization: + // + // (vselect (sign_extend (any_extend (BUILD_VECTOR)) i1) LHS RHS) + // + // Iff we find this pattern and the build_vectors are built from + // constants, we translate the vselect into a shuffle_vector that we + // know will be matched by LowerVECTOR_SHUFFLEtoBlend. + if (N->getOpcode() == ISD::VSELECT && !DCI.isBeforeLegalize()) { + SDValue Shuffle = TransformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget); + if (Shuffle.getNode()) + return Shuffle; + } + return SDValue(); } |

