Diffstat (limited to 'llvm/lib/Target/X86/X86ISelLowering.cpp')
-rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.cpp | 345 |
1 file changed, 149 insertions, 196 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 8ea1790e52b..a2482f26730 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -7143,6 +7143,87 @@ static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
 }
 
+/// \brief Try to lower a VSELECT instruction to an immediate-controlled blend
+/// instruction.
+static SDValue lowerVSELECTtoBLENDI(SDLoc DL, SDValue Cond, SDValue LHS,
+                                    SDValue RHS, const X86Subtarget *Subtarget,
+                                    SelectionDAG &DAG) {
+  MVT VT = LHS.getSimpleValueType();
+  MVT EltVT = VT.getVectorElementType();
+
+  // There is no blend with immediate in AVX-512.
+  if (VT.is512BitVector())
+    return SDValue();
+
+  // No blend instruction before SSE4.1.
+  if (!Subtarget->hasSSE41())
+    return SDValue();
+  // There is no byte-blend immediate controlled instruction.
+  if (EltVT == MVT::i8)
+    return SDValue();
+
+  if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
+    return SDValue();
+
+  auto *CondBV = cast<BuildVectorSDNode>(Cond);
+
+  unsigned BlendMask = 0;
+  MVT BlendVT = VT;
+  if (VT == MVT::v16i16) {
+    // v16i16 blends are completely special. We can only do them when we have
+    // a repeated blend across the two 128-bit halves and we have AVX2.
+    if (!Subtarget->hasAVX2())
+      return SDValue();
+
+    for (int i = 0; i < 8; ++i) {
+      SDValue Lo = CondBV->getOperand(i);
+      SDValue Hi = CondBV->getOperand(i + 8);
+      bool IsLoZero = X86::isZeroNode(Lo);
+      bool IsHiZero = X86::isZeroNode(Hi);
+      if (Lo->getOpcode() != ISD::UNDEF && Hi->getOpcode() != ISD::UNDEF &&
+          IsLoZero != IsHiZero)
+        // Asymmetric blends, bail.
+        return SDValue();
+      BlendMask |= (unsigned)(IsLoZero || IsHiZero) << i;
+    }
+  } else {
+    // Everything else uses a generic blend mask computation with a custom type.
+    if (VT.isInteger()) {
+      if (VT.is256BitVector())
+        // We cast to floating point types if integer blends aren't available,
+        // and we coerce integer blends when available to occur on the v8i32
+        // type.
+        BlendVT = Subtarget->hasAVX2()
+                      ? MVT::v8i32
+                      : MVT::getVectorVT(
+                            MVT::getFloatingPointVT(VT.getScalarSizeInBits()),
+                            VT.getVectorNumElements());
+      else
+        // For 128-bit vectors we do the blend on v8i16 types.
+        BlendVT = MVT::v8i16;
+    }
+    assert(BlendVT.getVectorNumElements() <= 8 &&
+           "Cannot blend more than 8 elements with an immediate!");
+    // Scale the blend mask based on the number of elements in the selected
+    // blend type.
+    int Scale = BlendVT.getVectorNumElements() / VT.getVectorNumElements();
+    for (int i = 0, e = CondBV->getNumOperands(); i < e; ++i) {
+      SDValue CondElement = CondBV->getOperand(i);
+      if (CondElement->getOpcode() != ISD::UNDEF &&
+          X86::isZeroNode(CondElement))
+        for (int j = 0; j < Scale; ++j)
+          BlendMask |= 1u << (i * Scale + j);
+    }
+  }
+
+  LHS = DAG.getNode(ISD::BITCAST, DL, BlendVT, LHS);
+  RHS = DAG.getNode(ISD::BITCAST, DL, BlendVT, RHS);
+
+  return DAG.getNode(ISD::BITCAST, DL, VT,
+                     DAG.getNode(X86ISD::BLENDI, DL, BlendVT, LHS, RHS,
+                                 DAG.getConstant(BlendMask, MVT::i8)));
+}
+
 //===----------------------------------------------------------------------===//
 // Vector shuffle lowering
 //
@@ -7300,119 +7381,48 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
                                          SDValue V2, ArrayRef<int> Mask,
                                          const X86Subtarget *Subtarget,
                                          SelectionDAG &DAG) {
-
-  unsigned BlendMask = 0;
+  // Compute the VSELECT mask. Note that VSELECT is really confusing in the
+  // mix of LLVM's code generator and the x86 backend. We tell the code
+  // generator that boolean values in the elements of an x86 vector register
+  // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
+  // mapping a select to operand #1, and 'false' mapping to operand #2. The
+  // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
+  // of the element (the remaining are ignored) and 0 in that high bit would
+  // mean operand #1 while 1 in the high bit would mean operand #2. So while
+  // the LLVM model for boolean values in vector elements gets the relevant
+  // bit set, it is set backwards and over constrained relative to x86's
+  // actual model.
+  SmallVector<SDValue, 32> VSELECTMask;
+  MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
+  MVT MaskVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements());
+  SDValue TrueVal = DAG.getConstant(-1, MaskEltVT);
+  SDValue FalseVal = DAG.getConstant(0, MaskEltVT);
   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
-    if (Mask[i] >= Size) {
+    if (Mask[i] < 0) {
+      VSELECTMask.push_back(DAG.getUNDEF(MaskEltVT));
+    } else if (Mask[i] < Size) {
+      if (Mask[i] != i)
+        return SDValue(); // Shuffled V1 input!
+      VSELECTMask.push_back(TrueVal);
+    } else {
       if (Mask[i] != i + Size)
-        return SDValue(); // Shuffled V2 input!
-      BlendMask |= 1u << i;
-      continue;
+        return SDValue(); // Shuffled V2 input!;
+      VSELECTMask.push_back(FalseVal);
     }
-    if (Mask[i] >= 0 && Mask[i] != i)
-      return SDValue(); // Shuffled V1 input!
   }
 
-  switch (VT.SimpleTy) {
-  case MVT::v2f64:
-  case MVT::v4f32:
-  case MVT::v4f64:
-  case MVT::v8f32:
-    return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
-                       DAG.getConstant(BlendMask, MVT::i8));
-  case MVT::v4i64:
-  case MVT::v8i32:
-    assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
-    // FALLTHROUGH
-  case MVT::v2i64:
-  case MVT::v4i32:
-    // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
-    // that instruction.
-    if (Subtarget->hasAVX2()) {
-      // Scale the blend by the number of 32-bit dwords per element.
-      int Scale = VT.getScalarSizeInBits() / 32;
-      BlendMask = 0;
-      for (int i = 0, Size = Mask.size(); i < Size; ++i)
-        if (Mask[i] >= Size)
-          for (int j = 0; j < Scale; ++j)
-            BlendMask |= 1u << (i * Scale + j);
-
-      MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
-      V1 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V1);
-      V2 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V2);
-      return DAG.getNode(ISD::BITCAST, DL, VT,
-                         DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
-                                     DAG.getConstant(BlendMask, MVT::i8)));
-    }
-    // FALLTHROUGH
-  case MVT::v8i16: {
-    // For integer shuffles we need to expand the mask and cast the inputs to
-    // v8i16s prior to blending.
-    int Scale = 8 / VT.getVectorNumElements();
-    BlendMask = 0;
-    for (int i = 0, Size = Mask.size(); i < Size; ++i)
-      if (Mask[i] >= Size)
-        for (int j = 0; j < Scale; ++j)
-          BlendMask |= 1u << (i * Scale + j);
-
-    V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1);
-    V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V2);
-    return DAG.getNode(ISD::BITCAST, DL, VT,
-                       DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
-                                   DAG.getConstant(BlendMask, MVT::i8)));
-  }
-
-  case MVT::v16i16: {
-    assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
-    SmallVector<int, 8> RepeatedMask;
-    if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
-      // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
-      assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
-      BlendMask = 0;
-      for (int i = 0; i < 8; ++i)
-        if (RepeatedMask[i] >= 16)
-          BlendMask |= 1u << i;
-      return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
-                         DAG.getConstant(BlendMask, MVT::i8));
-    }
-  }
-  // FALLTHROUGH
-  case MVT::v32i8: {
-    assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
-    // Scale the blend by the number of bytes per element.
-    int Scale = VT.getScalarSizeInBits() / 8;
-    assert(Mask.size() * Scale == 32 && "Not a 256-bit vector!");
-
-    // Compute the VSELECT mask. Note that VSELECT is really confusing in the
-    // mix of LLVM's code generator and the x86 backend. We tell the code
-    // generator that boolean values in the elements of an x86 vector register
-    // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
-    // mapping a select to operand #1, and 'false' mapping to operand #2. The
-    // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
-    // of the element (the remaining are ignored) and 0 in that high bit would
-    // mean operand #1 while 1 in the high bit would mean operand #2. So while
-    // the LLVM model for boolean values in vector elements gets the relevant
-    // bit set, it is set backwards and over constrained relative to x86's
-    // actual model.
-    SDValue VSELECTMask[32];
-    for (int i = 0, Size = Mask.size(); i < Size; ++i)
-      for (int j = 0; j < Scale; ++j)
-        VSELECTMask[Scale * i + j] =
-            Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
-                        : DAG.getConstant(Mask[i] < Size ? -1 : 0, MVT::i8);
-
-    V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V1);
-    V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V2);
-    return DAG.getNode(
-        ISD::BITCAST, DL, VT,
-        DAG.getNode(ISD::VSELECT, DL, MVT::v32i8,
-                    DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, VSELECTMask),
-                    V1, V2));
-  }
+  // We have to manually attempt to lower this via BLENDI because at this phase
+  // of legalization we may end up legalizing the BUILD_VECTOR past where it can
+  // be analyzed prior to legalizing the VSELECT.
+  // FIXME: At some point, the legalizer should work more like the DAG combiner
+  // where it evaluates replacement nodes eagerly rather than risking proceeding
+  // to their (now shared) operands.
+  SDValue Cond = DAG.getNode(ISD::BUILD_VECTOR, DL, MaskVT, VSELECTMask);
+  if (SDValue BlendI = lowerVSELECTtoBLENDI(DL, Cond, V1, V2, Subtarget, DAG))
+    return BlendI;
 
-  default:
-    llvm_unreachable("Not a supported integer vector type!");
-  }
+  // Otherwise fall back on the generic VSELECT lowering.
+  return DAG.getNode(ISD::VSELECT, DL, VT, Cond, V1, V2);
 }
 
 /// \brief Generic routine to lower a shuffle and blend as a decomposed set of
@@ -11797,91 +11807,8 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
   return SDValue();
 }
 
-/// \brief Try to lower a VSELECT instruction to an immediate-controlled blend
-/// instruction.
-static SDValue lowerVSELECTtoBLENDI(SDValue Op, const X86Subtarget *Subtarget,
-                                    SelectionDAG &DAG) {
-  SDValue Cond = Op.getOperand(0);
-  SDValue LHS = Op.getOperand(1);
-  SDValue RHS = Op.getOperand(2);
-  SDLoc DL(Op);
-  MVT VT = Op.getSimpleValueType();
-  MVT EltVT = VT.getVectorElementType();
-
-  // There is no blend with immediate in AVX-512.
-  if (VT.is512BitVector())
-    return SDValue();
-
-  // No blend instruction before SSE4.1.
-  if (!Subtarget->hasSSE41())
-    return SDValue();
-  // There is no byte-blend immediate controlled instruction.
-  if (EltVT == MVT::i8)
-    return SDValue();
-
-  if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
-    return SDValue();
-
-  auto *CondBV = cast<BuildVectorSDNode>(Cond);
-
-  unsigned BlendMask = 0;
-  MVT BlendVT = VT;
-  if (VT == MVT::v16i16) {
-    // v16i16 blends are completely special. We can only do them when we have
-    // a repeated blend across the two 128-bit halves and we have AVX2.
-    if (!Subtarget->hasAVX2())
-      return SDValue();
-
-    for (int i = 0; i < 8; ++i) {
-      SDValue Lo = CondBV->getOperand(i);
-      SDValue Hi = CondBV->getOperand(i + 8);
-      bool IsLoZero = X86::isZeroNode(Lo);
-      bool IsHiZero = X86::isZeroNode(Hi);
-      if (Lo->getOpcode() != ISD::UNDEF && Hi->getOpcode() != ISD::UNDEF &&
-          IsLoZero != IsHiZero)
-        // Asymmetric blends, bail.
-        return SDValue();
-      BlendMask |= (unsigned)(IsLoZero || IsHiZero) << i;
-    }
-  } else {
-    // Everything else uses a generic blend mask computation with a custom type.
-    if (VT.isInteger()) {
-      if (VT.is256BitVector())
-        // We cast to floating point types if integer blends aren't available,
-        // and we coerce integer blends when available to occur on the v8i32
-        // type.
-        BlendVT = Subtarget->hasAVX2()
-                      ? MVT::v8i32
-                      : MVT::getVectorVT(
-                            MVT::getFloatingPointVT(VT.getScalarSizeInBits()),
-                            VT.getVectorNumElements());
-      else
-        // For 128-bit vectors we do the blend on v8i16 types.
-        BlendVT = MVT::v8i16;
-    }
-    assert(BlendVT.getVectorNumElements() <= 8 &&
-           "Cannot blend more than 8 elements with an immediate!");
-    // Scale the blend mask based on the number of elements in the selected
-    // blend type.
-    int Scale = BlendVT.getVectorNumElements() / VT.getVectorNumElements();
-    for (int i = 0, e = CondBV->getNumOperands(); i < e; ++i) {
-      SDValue CondElement = CondBV->getOperand(i);
-      if (CondElement->getOpcode() != ISD::UNDEF &&
-          X86::isZeroNode(CondElement))
-        for (int j = 0; j < Scale; ++j)
-          BlendMask |= 1u << (i * Scale + j);
-    }
-  }
-
-  LHS = DAG.getNode(ISD::BITCAST, DL, BlendVT, LHS);
-  RHS = DAG.getNode(ISD::BITCAST, DL, BlendVT, RHS);
-
-  return DAG.getNode(ISD::BITCAST, DL, VT,
-                     DAG.getNode(X86ISD::BLENDI, DL, BlendVT, LHS, RHS,
-                                 DAG.getConstant(BlendMask, MVT::i8)));
-}
-
 SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc DL(Op);
   // A vselect where all conditions and data are constants can be optimized into
   // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
   if (ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(0).getNode()) &&
@@ -11889,22 +11816,48 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
       ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode()))
     return SDValue();
 
-  SDValue BlendOp = lowerVSELECTtoBLENDI(Op, Subtarget, DAG);
+  SDValue Cond = Op.getOperand(0);
+  SDValue LHS = Op.getOperand(1);
+  SDValue RHS = Op.getOperand(2);
+  SDValue BlendOp = lowerVSELECTtoBLENDI(DL, Cond, LHS, RHS, Subtarget, DAG);
   if (BlendOp.getNode())
     return BlendOp;
 
-  // Some types for vselect were previously set to Expand, not Legal or
-  // Custom. Return an empty SDValue so we fall-through to Expand, after
-  // the Custom lowering phase.
+  // If the condition vector type is different from the input vector types, bail
+  // to the TD patterns. This should only happen with vNi1 conditions.
+  if (Op.getSimpleValueType() != Op->getOperand(0).getSimpleValueType())
+    return Op;
+
+  // Check for types that need to be mapped in order to lower.
   MVT VT = Op.getSimpleValueType();
   switch (VT.SimpleTy) {
   default:
    break;
+  case MVT::v4i64:
+  case MVT::v8i32:
+    // If we don't have AVX2 we don't want to drop to a v32i8 which will require
+    // splitting the vector. Instead, let the patterns for v4f64 and v8f32 lower
+    // these blends.
+    if (!Subtarget->hasAVX2())
+      break;
+    // FALL THROUGH
+
+  case MVT::v2i64:
+  case MVT::v4i32:
   case MVT::v8i16:
   case MVT::v16i16:
     if (Subtarget->hasBWI() && Subtarget->hasVLX())
      break;
-    return SDValue();
+
+    // We need to phrase these as i8 blends. Bitcasting the condition is fine
+    // because true is defined as -1 which will set *all* of the bits to one.
+    MVT BlendVT = MVT::getVectorVT(MVT::i8, (VT.getScalarSizeInBits() / 8) *
+                                                VT.getVectorNumElements());
+    Cond = DAG.getNode(ISD::BITCAST, DL, BlendVT, Op->getOperand(0));
+    LHS = DAG.getNode(ISD::BITCAST, DL, BlendVT, Op->getOperand(1));
+    RHS = DAG.getNode(ISD::BITCAST, DL, BlendVT, Op->getOperand(2));
+    return DAG.getNode(ISD::BITCAST, DL, VT,
+                       DAG.getNode(ISD::VSELECT, DL, BlendVT, Cond, LHS, RHS));
  }
 
   // We couldn't create a "Blend with immediate" node.
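
As an illustrative aside, not part of the patch: both lowerVSELECTtoBLENDI and the blend lowering it replaces pack the per-element "take the second operand" decisions into an 8-bit immediate, replicating each decision Scale times whenever the blend has to be phrased on a type with more, narrower elements (for example a v4i32 condition blended as v8i16, where Scale = 8 / 4 = 2). The standalone C++ sketch below mirrors that bit-replication loop; buildBlendImmediate is a hypothetical helper with no LLVM dependencies, written only to show how the immediate is formed.

// Standalone sketch, assuming nothing from LLVM: pack a "take the second
// operand" mask into a blend immediate, replicating each bit Scale times.
#include <cassert>
#include <cstdio>
#include <vector>

static unsigned buildBlendImmediate(const std::vector<bool> &TakeSecond,
                                    int Scale) {
  assert((int)TakeSecond.size() * Scale <= 8 &&
         "Cannot blend more than 8 elements with an immediate!");
  unsigned BlendMask = 0;
  for (int i = 0, e = (int)TakeSecond.size(); i < e; ++i)
    if (TakeSecond[i])
      // Each source element covers Scale elements of the blend type, so its
      // bit is duplicated Scale times, as in the patch's scaling loops.
      for (int j = 0; j < Scale; ++j)
        BlendMask |= 1u << (i * Scale + j);
  return BlendMask;
}

int main() {
  // A v4i32 blend taking dwords 1 and 3 from the second operand, phrased on
  // v8i16: Scale = 8 / 4 = 2, so bit pairs {2,3} and {6,7} are set.
  unsigned Imm = buildBlendImmediate({false, true, false, true}, /*Scale=*/2);
  std::printf("blend immediate: 0x%02x\n", Imm); // prints 0xcc
  return 0;
}

Running the sketch prints 0xcc, the word-granular immediate that selects the upper and lower halves of dwords 1 and 3 from the second operand.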