-rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.cpp        | 1305
-rw-r--r-- | llvm/lib/Target/X86/X86TargetTransformInfo.cpp |   65
2 files changed, 1227 insertions, 143 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index a519c200e49..9be4350ef98 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -65,6 +65,12 @@ using namespace llvm; STATISTIC(NumTailCalls, "Number of tail calls"); +cl::opt<bool> ExperimentalVectorWideningLegalization( + "x86-experimental-vector-widening-legalization", cl::init(true), + cl::desc("Enable an experimental vector type legalization through widening " + "rather than promotion."), + cl::Hidden); + static cl::opt<int> ExperimentalPrefLoopAlignment( "x86-experimental-pref-loop-alignment", cl::init(4), cl::desc("Sets the preferable loop alignment for experiments " @@ -816,6 +822,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::UREM, VT, Custom); } + if (!ExperimentalVectorWideningLegalization) { + setOperationAction(ISD::MUL, MVT::v2i16, Custom); + setOperationAction(ISD::MUL, MVT::v2i32, Custom); + setOperationAction(ISD::MUL, MVT::v4i16, Custom); + } + setOperationAction(ISD::MUL, MVT::v2i8, Custom); setOperationAction(ISD::MUL, MVT::v4i8, Custom); setOperationAction(ISD::MUL, MVT::v8i8, Custom); @@ -854,10 +866,29 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::UADDSAT, MVT::v2i64, Custom); setOperationAction(ISD::USUBSAT, MVT::v2i64, Custom); + if (!ExperimentalVectorWideningLegalization) { + // Use widening instead of promotion. + for (auto VT : { MVT::v8i8, MVT::v4i8, MVT::v2i8, + MVT::v4i16, MVT::v2i16 }) { + setOperationAction(ISD::UADDSAT, VT, Custom); + setOperationAction(ISD::SADDSAT, VT, Custom); + setOperationAction(ISD::USUBSAT, VT, Custom); + setOperationAction(ISD::SSUBSAT, VT, Custom); + } + } + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); + // Provide custom widening for v2f32 setcc. This is really for VLX when + // setcc result type returns v2i1/v4i1 vector for v2f32/v4f32 leading to + // type legalization changing the result type to v4i1 during widening. + // It works fine for SSE2 and is probably faster so no need to qualify with + // VLX support. + if (!ExperimentalVectorWideningLegalization) + setOperationAction(ISD::SETCC, MVT::v2i32, Custom); + for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) { setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::CTPOP, VT, Custom); @@ -877,6 +908,21 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); } + // We support custom legalizing of sext and anyext loads for specific + // memory vector types which we can load as a scalar (or sequence of + // scalars) and extend in-register to a legal 128-bit vector type. For sext + // loads these must work with a single scalar load. 
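(Editorial aside, not part of the patch.) The EXTLOAD hooks set up above exist so that a small memory vector can be fetched with one scalar load and then extended inside the register. A minimal standalone sketch of that idea for a sign-extending v4i8 to v4i32 load, written with SSE2 intrinsics rather than SelectionDAG nodes:

#include <emmintrin.h>  // SSE2
#include <cstdint>
#include <cstdio>
#include <cstring>

// One 32-bit scalar load, then in-register sign extension of the four bytes.
static __m128i sextload_v4i8_to_v4i32(const int8_t *p) {
  uint32_t bits;
  std::memcpy(&bits, p, sizeof(bits));        // the single scalar load
  __m128i v = _mm_cvtsi32_si128((int)bits);   // bytes now sit in lanes 0..3
  v = _mm_unpacklo_epi8(v, v);                // b0 b0 b1 b1 b2 b2 b3 b3
  v = _mm_unpacklo_epi16(v, v);               // each byte fills a 32-bit lane
  return _mm_srai_epi32(v, 24);               // arithmetic shift sign-extends
}

int main() {
  int8_t in[4] = {-1, 2, -3, 4};
  alignas(16) int32_t out[4];
  _mm_store_si128((__m128i *)out, sextload_v4i8_to_v4i32(in));
  std::printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);  // -1 2 -3 4
}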
+ if (!ExperimentalVectorWideningLegalization) { + for (MVT VT : MVT::integer_vector_valuetypes()) { + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom); + } + } + for (auto VT : { MVT::v2f64, MVT::v2i64 }) { setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); @@ -950,14 +996,18 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom); setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom); - setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom); + if (ExperimentalVectorWideningLegalization) { + setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom); - setOperationAction(ISD::TRUNCATE, MVT::v2i8, Custom); - setOperationAction(ISD::TRUNCATE, MVT::v2i16, Custom); - setOperationAction(ISD::TRUNCATE, MVT::v2i32, Custom); - setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom); - setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom); - setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v2i8, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v2i16, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v2i32, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom); + } else { + setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i64, Custom); + } // In the customized shift lowering, the legal v4i32/v2i64 cases // in AVX2 will be recognized. @@ -1024,10 +1074,22 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal); } + if (!ExperimentalVectorWideningLegalization) { + // Avoid narrow result types when widening. The legal types are listed + // in the next loop. + for (MVT VT : MVT::integer_vector_valuetypes()) { + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom); + } + } + // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) { setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal); setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal); + if (!ExperimentalVectorWideningLegalization) + setLoadExtAction(LoadExtOp, MVT::v2i32, MVT::v2i8, Legal); setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal); setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal); setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal); @@ -1376,10 +1438,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom); - // Need to custom widen this if we don't have AVX512BW. - setOperationAction(ISD::ANY_EXTEND, MVT::v8i8, Custom); - setOperationAction(ISD::ZERO_EXTEND, MVT::v8i8, Custom); - setOperationAction(ISD::SIGN_EXTEND, MVT::v8i8, Custom); + if (ExperimentalVectorWideningLegalization) { + // Need to custom widen this if we don't have AVX512BW. 
+ setOperationAction(ISD::ANY_EXTEND, MVT::v8i8, Custom); + setOperationAction(ISD::ZERO_EXTEND, MVT::v8i8, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v8i8, Custom); + } for (auto VT : { MVT::v16f32, MVT::v8f64 }) { setOperationAction(ISD::FFLOOR, VT, Legal); @@ -1883,7 +1947,8 @@ X86TargetLowering::getPreferredVectorAction(MVT VT) const { if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI()) return TypeSplitVector; - if (VT.getVectorNumElements() != 1 && + if (ExperimentalVectorWideningLegalization && + VT.getVectorNumElements() != 1 && VT.getVectorElementType() != MVT::i1) return TypeWidenVector; @@ -18776,7 +18841,7 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG, // Custom legalize v8i8->v8i64 on CPUs without avx512bw. if (InVT == MVT::v8i8) { - if (VT != MVT::v8i64) + if (!ExperimentalVectorWideningLegalization || VT != MVT::v8i64) return SDValue(); In = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), @@ -19110,6 +19175,22 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { SDValue Lo, Hi; std::tie(Lo, Hi) = DAG.SplitVector(In, DL); + if (!ExperimentalVectorWideningLegalization) { + // Without vector widening we need to manually construct X86 specific + // nodes and an unpcklqdq. + Lo = DAG.getNode(X86ISD::VTRUNC, DL, VT, Lo); + Hi = DAG.getNode(X86ISD::VTRUNC, DL, VT, Hi); + + // Manually concat the truncates using a shuffle. + unsigned NumElts = VT.getVectorNumElements(); + SmallVector<int, 16> ShufMask(NumElts); + for (unsigned i = 0; i != NumElts / 2; ++i) + ShufMask[i] = i; + for (unsigned i = NumElts / 2; i != NumElts; ++i) + ShufMask[i] = i + (NumElts / 2); + return DAG.getVectorShuffle(VT, DL, Lo, Hi, ShufMask); + } + EVT LoVT, HiVT; std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT); @@ -21322,7 +21403,7 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget, // Custom legalize v8i8->v8i64 on CPUs without avx512bw. if (InVT == MVT::v8i8) { - if (VT != MVT::v8i64) + if (!ExperimentalVectorWideningLegalization || VT != MVT::v8i64) return SDValue(); In = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), @@ -21469,13 +21550,14 @@ static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget, return SDValue(); } - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); assert(StoreVT.isVector() && StoreVT.getSizeInBits() == 64 && "Unexpected VT"); - assert(TLI.getTypeAction(*DAG.getContext(), StoreVT) == - TargetLowering::TypeWidenVector && "Unexpected type action!"); + if (DAG.getTargetLoweringInfo().getTypeAction(*DAG.getContext(), StoreVT) != + TargetLowering::TypeWidenVector) + return SDValue(); - EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), StoreVT); + MVT WideVT = MVT::getVectorVT(StoreVT.getVectorElementType(), + StoreVT.getVectorNumElements() * 2); StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal, DAG.getUNDEF(StoreVT)); @@ -21515,10 +21597,11 @@ static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget, LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode()); SDLoc dl(Ld); + EVT MemVT = Ld->getMemoryVT(); // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads. 
if (RegVT.getVectorElementType() == MVT::i1) { - assert(EVT(RegVT) == Ld->getMemoryVT() && "Expected non-extending load"); + assert(EVT(RegVT) == MemVT && "Expected non-extending load"); assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT"); assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() && "Expected AVX512F without AVX512DQI"); @@ -21537,7 +21620,179 @@ static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget, return DAG.getMergeValues({Val, NewLd.getValue(1)}, dl); } - return SDValue(); + if (ExperimentalVectorWideningLegalization) + return SDValue(); + + // Nothing useful we can do without SSE2 shuffles. + assert(Subtarget.hasSSE2() && "We only custom lower sext loads with SSE2."); + + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + unsigned RegSz = RegVT.getSizeInBits(); + + ISD::LoadExtType Ext = Ld->getExtensionType(); + + assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD) + && "Only anyext and sext are currently implemented."); + assert(MemVT != RegVT && "Cannot extend to the same type"); + assert(MemVT.isVector() && "Must load a vector from memory"); + + unsigned NumElems = RegVT.getVectorNumElements(); + unsigned MemSz = MemVT.getSizeInBits(); + assert(RegSz > MemSz && "Register size must be greater than the mem size"); + + if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget.hasInt256()) { + // The only way in which we have a legal 256-bit vector result but not the + // integer 256-bit operations needed to directly lower a sextload is if we + // have AVX1 but not AVX2. In that case, we can always emit a sextload to + // a 128-bit vector and a normal sign_extend to 256-bits that should get + // correctly legalized. We do this late to allow the canonical form of + // sextload to persist throughout the rest of the DAG combiner -- it wants + // to fold together any extensions it can, and so will fuse a sign_extend + // of an sextload into a sextload targeting a wider value. + SDValue Load; + if (MemSz == 128) { + // Just switch this to a normal load. + assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, " + "it must be a legal 128-bit vector " + "type!"); + Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(), + Ld->getPointerInfo(), Ld->getAlignment(), + Ld->getMemOperand()->getFlags()); + } else { + assert(MemSz < 128 && + "Can't extend a type wider than 128 bits to a 256 bit vector!"); + // Do an sext load to a 128-bit vector type. We want to use the same + // number of elements, but elements half as wide. This will end up being + // recursively lowered by this routine, but will succeed as we definitely + // have all the necessary features if we're using AVX1. + EVT HalfEltVT = + EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2); + EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems); + Load = + DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(), + Ld->getPointerInfo(), MemVT, Ld->getAlignment(), + Ld->getMemOperand()->getFlags()); + } + + // Replace chain users with the new chain. + assert(Load->getNumValues() == 2 && "Loads must carry a chain!"); + + // Finally, do a normal sign-extend to the desired register. + SDValue SExt = DAG.getSExtOrTrunc(Load, dl, RegVT); + return DAG.getMergeValues({SExt, Load.getValue(1)}, dl); + } + + // All sizes must be a power of two. + assert(isPowerOf2_32(RegSz * MemSz * NumElems) && + "Non-power-of-two elements are not custom lowered!"); + + // Attempt to load the original value using scalar loads. 
+ // Find the largest scalar type that divides the total loaded size. + MVT SclrLoadTy = MVT::i8; + for (MVT Tp : MVT::integer_valuetypes()) { + if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) { + SclrLoadTy = Tp; + } + } + + // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64. + if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 && + (64 <= MemSz)) + SclrLoadTy = MVT::f64; + + // Calculate the number of scalar loads that we need to perform + // in order to load our vector from memory. + unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits(); + + assert((Ext != ISD::SEXTLOAD || NumLoads == 1) && + "Can only lower sext loads with a single scalar load!"); + + unsigned loadRegSize = RegSz; + if (Ext == ISD::SEXTLOAD && RegSz >= 256) + loadRegSize = 128; + + // If we don't have BWI we won't be able to create the shuffle needed for + // v8i8->v8i64. + if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 && + MemVT == MVT::v8i8) + loadRegSize = 128; + + // Represent our vector as a sequence of elements which are the + // largest scalar that we can load. + EVT LoadUnitVecVT = EVT::getVectorVT( + *DAG.getContext(), SclrLoadTy, loadRegSize / SclrLoadTy.getSizeInBits()); + + // Represent the data using the same element type that is stored in + // memory. In practice, we ''widen'' MemVT. + EVT WideVecVT = + EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), + loadRegSize / MemVT.getScalarSizeInBits()); + + assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() && + "Invalid vector type"); + + // We can't shuffle using an illegal type. + assert(TLI.isTypeLegal(WideVecVT) && + "We only lower types that form legal widened vector types"); + + SmallVector<SDValue, 8> Chains; + SDValue Ptr = Ld->getBasePtr(); + unsigned OffsetInc = SclrLoadTy.getSizeInBits() / 8; + SDValue Increment = DAG.getConstant(OffsetInc, dl, + TLI.getPointerTy(DAG.getDataLayout())); + SDValue Res = DAG.getUNDEF(LoadUnitVecVT); + + unsigned Offset = 0; + for (unsigned i = 0; i < NumLoads; ++i) { + unsigned NewAlign = MinAlign(Ld->getAlignment(), Offset); + + // Perform a single load. + SDValue ScalarLoad = + DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, + Ld->getPointerInfo().getWithOffset(Offset), + NewAlign, Ld->getMemOperand()->getFlags()); + Chains.push_back(ScalarLoad.getValue(1)); + // Create the first element type using SCALAR_TO_VECTOR in order to avoid + // another round of DAGCombining. + if (i == 0) + Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad); + else + Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res, + ScalarLoad, DAG.getIntPtrConstant(i, dl)); + + Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); + Offset += OffsetInc; + } + + SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains); + + // Bitcast the loaded value to a vector of the original element type, in + // the size of the target vector type. + SDValue SlicedVec = DAG.getBitcast(WideVecVT, Res); + unsigned SizeRatio = RegSz / MemSz; + + if (Ext == ISD::SEXTLOAD) { + SDValue Sext = getExtendInVec(ISD::SIGN_EXTEND, dl, RegVT, SlicedVec, DAG); + return DAG.getMergeValues({Sext, TF}, dl); + } + + if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 && + MemVT == MVT::v8i8) { + SDValue Sext = getExtendInVec(ISD::ZERO_EXTEND, dl, RegVT, SlicedVec, DAG); + return DAG.getMergeValues({Sext, TF}, dl); + } + + // Redistribute the loaded elements into the different locations. 
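(Editorial aside, not part of the patch; the redistribution shuffle continues below.) The loop above simply keeps the widest integer type whose size divides the in-memory size, so the whole vector is fetched with as few scalar loads as possible. A hypothetical standalone helper showing the same selection, ignoring the f64 special case:

#include <cstdio>

// Widest integer load width (in bits) that evenly divides MemSzBits,
// mirroring the SclrLoadTy loop above; assumes i8/i16/i32/i64 are all legal.
static unsigned widestScalarLoadBits(unsigned MemSzBits) {
  unsigned Best = 8;
  const unsigned Widths[] = {8, 16, 32, 64};
  for (unsigned W : Widths)
    if (MemSzBits % W == 0)
      Best = W;
  return Best;
}

int main() {
  // v4i8 is 32 bits -> one i32 load; v8i8 is 64 bits -> one i64 load;
  // v2i8 is 16 bits -> one i16 load.
  std::printf("%u %u %u\n", widestScalarLoadBits(32), widestScalarLoadBits(64),
              widestScalarLoadBits(16));
}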
+ SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1); + for (unsigned i = 0; i != NumElems; ++i) + ShuffleVec[i * SizeRatio] = i; + + SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec, + DAG.getUNDEF(WideVecVT), ShuffleVec); + + // Bitcast to the requested type. + Shuff = DAG.getBitcast(RegVT, Shuff); + return DAG.getMergeValues({Shuff, TF}, dl); } /// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes @@ -26910,13 +27165,12 @@ static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget, SDValue Chain = N->getChain(); SDValue BasePtr = N->getBasePtr(); - if (VT == MVT::v2f32 || VT == MVT::v2i32) { + if (VT == MVT::v2f32) { assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type"); // If the index is v2i64 and we have VLX we can use xmm for data and index. if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) { - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); - Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Src, DAG.getUNDEF(VT)); + Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, + DAG.getUNDEF(MVT::v2f32)); SDVTList VTs = DAG.getVTList(MVT::v2i1, MVT::Other); SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale}; SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>( @@ -26926,6 +27180,30 @@ static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget, return SDValue(); } + if (VT == MVT::v2i32) { + assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type"); + Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src, + DAG.getUNDEF(MVT::v2i32)); + // If the index is v2i64 and we have VLX we can use xmm for data and index. + if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) { + SDVTList VTs = DAG.getVTList(MVT::v2i1, MVT::Other); + SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale}; + SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>( + VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand()); + return SDValue(NewScatter.getNode(), 1); + } + // Custom widen all the operands to avoid promotion. + EVT NewIndexVT = EVT::getVectorVT( + *DAG.getContext(), Index.getValueType().getVectorElementType(), 4); + Index = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewIndexVT, Index, + DAG.getUNDEF(Index.getValueType())); + Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask, + DAG.getConstant(0, dl, MVT::v2i1)); + SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale}; + return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), N->getMemoryVT(), dl, + Ops, N->getMemOperand(), N->getIndexType()); + } + MVT IndexVT = Index.getSimpleValueType(); MVT MaskVT = Mask.getSimpleValueType(); @@ -27348,22 +27626,37 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, } case ISD::MUL: { EVT VT = N->getValueType(0); - assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector && - VT.getVectorElementType() == MVT::i8 && "Unexpected VT!"); - // Pre-promote these to vXi16 to avoid op legalization thinking all 16 - // elements are needed. 
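(Editorial aside, not part of the patch.) The rewritten ISD::MUL case in this hunk promotes a 2-element multiply to a PMULUDQ pattern: any-extend each i32 into a 64-bit lane, multiply, then truncate back. A standalone SSE2 sketch of why that yields the right low 32 bits, using intrinsics in place of the DAG nodes:

#include <emmintrin.h>
#include <cstdint>
#include <cstdio>

int main() {
  uint32_t a[2] = {0xDEADBEEFu, 7u};
  uint32_t b[2] = {3u, 41u};
  __m128i zero = _mm_setzero_si128();
  // "Any-extend" v2i32 to v2i64 by interleaving with zeros.
  __m128i va = _mm_unpacklo_epi32(_mm_loadl_epi64((const __m128i *)a), zero);
  __m128i vb = _mm_unpacklo_epi32(_mm_loadl_epi64((const __m128i *)b), zero);
  __m128i prod = _mm_mul_epu32(va, vb);          // two 64-bit products (PMULUDQ)
  // "Truncate" back to v2i32: keep the low 32 bits of each 64-bit lane.
  __m128i lo = _mm_shuffle_epi32(prod, _MM_SHUFFLE(3, 1, 2, 0));
  uint32_t out[2];
  _mm_storel_epi64((__m128i *)out, lo);
  std::printf("%u %u\n", out[0], out[1]);        // the i32 products modulo 2^32
}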
- MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements()); - SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0)); - SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1)); - SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1); - Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res); - unsigned NumConcats = 16 / VT.getVectorNumElements(); - SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT)); - ConcatOps[0] = Res; - Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps); - Results.push_back(Res); + assert(VT.isVector() && "Unexpected VT"); + if (getTypeAction(*DAG.getContext(), VT) == TypePromoteInteger && + VT.getVectorNumElements() == 2) { + // Promote to a pattern that will be turned into PMULUDQ. + SDValue N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v2i64, + N->getOperand(0)); + SDValue N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v2i64, + N->getOperand(1)); + SDValue Mul = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, N0, N1); + Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, VT, Mul)); + } else if (getTypeAction(*DAG.getContext(), VT) == TypeWidenVector && + VT.getVectorElementType() == MVT::i8) { + // Pre-promote these to vXi16 to avoid op legalization thinking all 16 + // elements are needed. + MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements()); + SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0)); + SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1)); + SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1); + Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res); + unsigned NumConcats = 16 / VT.getVectorNumElements(); + SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT)); + ConcatOps[0] = Res; + Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps); + Results.push_back(Res); + } return; } + case ISD::UADDSAT: + case ISD::SADDSAT: + case ISD::USUBSAT: + case ISD::SSUBSAT: case X86ISD::VPMADDWD: case X86ISD::AVG: { // Legalize types for ISD::UADDSAT/SADDSAT/USUBSAT/SSUBSAT and @@ -27374,8 +27667,6 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, EVT InVT = N->getOperand(0).getValueType(); assert(VT.getSizeInBits() < 128 && 128 % VT.getSizeInBits() == 0 && "Expected a VT that divides into 128 bits."); - assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector && - "Unexpected type action!"); unsigned NumConcat = 128 / InVT.getSizeInBits(); EVT InWideVT = EVT::getVectorVT(*DAG.getContext(), @@ -27392,6 +27683,9 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops); SDValue Res = DAG.getNode(N->getOpcode(), dl, WideVT, InVec0, InVec1); + if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector) + Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res, + DAG.getIntPtrConstant(0, dl)); Results.push_back(Res); return; } @@ -27420,6 +27714,26 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, Results.push_back(Hi); return; } + case ISD::SETCC: { + // Widen v2i32 (setcc v2f32). This is really needed for AVX512VL when + // setCC result type is v2i1 because type legalzation will end up with + // a v4i1 setcc plus an extend. 
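(Editorial aside, not part of the patch.) The SETCC replacement that follows widens the v2f32 operands with undef, compares as a full 128-bit vector, and extracts the low two result lanes. The same widen, compare, extract shape in a standalone SSE sketch, with intrinsics standing in for the DAG nodes and one fixed condition code:

#include <emmintrin.h>
#include <cstdio>

int main() {
  float a[2] = {1.0f, 5.0f};
  float b[2] = {2.0f, 3.0f};
  // Widen v2f32 to v4f32; the upper two lanes are don't-care (undef).
  __m128 va = _mm_setr_ps(a[0], a[1], 0.0f, 0.0f);
  __m128 vb = _mm_setr_ps(b[0], b[1], 0.0f, 0.0f);
  __m128i cmp = _mm_castps_si128(_mm_cmplt_ps(va, vb));   // v4i32 mask
  // "Extract the subvector": only the low two lanes are meaningful.
  int r0 = _mm_cvtsi128_si32(cmp);
  int r1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(cmp, 1));
  std::printf("%d %d\n", r0 != 0, r1 != 0);               // 1 0
}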
+ assert(N->getValueType(0) == MVT::v2i32 && "Unexpected type"); + if (N->getOperand(0).getValueType() != MVT::v2f32 || + getTypeAction(*DAG.getContext(), MVT::v2i32) == TypeWidenVector) + return; + SDValue UNDEF = DAG.getUNDEF(MVT::v2f32); + SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, + N->getOperand(0), UNDEF); + SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, + N->getOperand(1), UNDEF); + SDValue Res = DAG.getNode(ISD::SETCC, dl, MVT::v4i32, LHS, RHS, + N->getOperand(2)); + Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, + DAG.getIntPtrConstant(0, dl)); + Results.push_back(Res); + return; + } // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32. case X86ISD::FMINC: case X86ISD::FMIN: @@ -27440,9 +27754,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, case ISD::SREM: case ISD::UREM: { EVT VT = N->getValueType(0); - if (VT.isVector()) { - assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector && - "Unexpected type action!"); + if (getTypeAction(*DAG.getContext(), VT) == TypeWidenVector) { // If this RHS is a constant splat vector we can widen this and let // division/remainder by constant optimize it. // TODO: Can we do something for non-splat? @@ -27460,6 +27772,17 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, return; } + if (VT == MVT::v2i32) { + // Legalize v2i32 div/rem by unrolling. Otherwise we promote to the + // v2i64 and unroll later. But then we create i64 scalar ops which + // might be slow in 64-bit mode or require a libcall in 32-bit mode. + Results.push_back(DAG.UnrollVectorOp(N)); + return; + } + + if (VT.isVector()) + return; + LLVM_FALLTHROUGH; } case ISD::SDIVREM: @@ -27470,8 +27793,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, } case ISD::TRUNCATE: { MVT VT = N->getSimpleValueType(0); - assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector && - "Unexpected type action!"); + if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector) + return; // The generic legalizer will try to widen the input type to the same // number of elements as the widened result type. But this isn't always @@ -27519,15 +27842,56 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, } return; } + case ISD::SIGN_EXTEND_VECTOR_INREG: { + if (ExperimentalVectorWideningLegalization) + return; + + EVT VT = N->getValueType(0); + SDValue In = N->getOperand(0); + EVT InVT = In.getValueType(); + if (!Subtarget.hasSSE41() && VT == MVT::v4i64 && + (InVT == MVT::v16i16 || InVT == MVT::v32i8)) { + // Custom split this so we can extend i8/i16->i32 invec. This is better + // since sign_extend_inreg i8/i16->i64 requires an extend to i32 using + // sra. Then extending from i32 to i64 using pcmpgt. By custom splitting + // we allow the sra from the extend to i32 to be shared by the split. + EVT ExtractVT = EVT::getVectorVT(*DAG.getContext(), + InVT.getVectorElementType(), + InVT.getVectorNumElements() / 2); + MVT ExtendVT = MVT::getVectorVT(MVT::i32, + VT.getVectorNumElements()); + In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ExtractVT, + In, DAG.getIntPtrConstant(0, dl)); + In = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, MVT::v4i32, In); + + // Fill a vector with sign bits for each element. 
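(Editorial aside, not part of the patch; the compare and unpacks continue below.) The sign-bit vector built next is just a compare against zero, and the unpacks interleave value and sign words into 64-bit lanes, which is how SSE2 sign-extends i32 to i64 without pcmpgtq or pmovsx. A standalone sketch of that exact shape:

#include <emmintrin.h>
#include <cstdint>
#include <cstdio>

int main() {
  alignas(16) int32_t in[4] = {-2, 3, -5, 7};
  __m128i v = _mm_load_si128((const __m128i *)in);
  __m128i sign = _mm_cmpgt_epi32(_mm_setzero_si128(), v);  // 0 > x ? -1 : 0
  __m128i lo = _mm_unpacklo_epi32(v, sign);                // lanes 0,1 as i64
  __m128i hi = _mm_unpackhi_epi32(v, sign);                // lanes 2,3 as i64
  alignas(16) int64_t out[4];
  _mm_store_si128((__m128i *)&out[0], lo);
  _mm_store_si128((__m128i *)&out[2], hi);
  std::printf("%lld %lld %lld %lld\n", (long long)out[0], (long long)out[1],
              (long long)out[2], (long long)out[3]);       // -2 3 -5 7
}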
+ SDValue Zero = DAG.getConstant(0, dl, ExtendVT); + SDValue SignBits = DAG.getSetCC(dl, ExtendVT, Zero, In, ISD::SETGT); + + EVT LoVT, HiVT; + std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0)); + + // Create an unpackl and unpackh to interleave the sign bits then bitcast + // to vXi64. + SDValue Lo = getUnpackl(DAG, dl, ExtendVT, In, SignBits); + Lo = DAG.getNode(ISD::BITCAST, dl, LoVT, Lo); + SDValue Hi = getUnpackh(DAG, dl, ExtendVT, In, SignBits); + Hi = DAG.getNode(ISD::BITCAST, dl, HiVT, Hi); + + SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi); + Results.push_back(Res); + return; + } + return; + } case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: { EVT VT = N->getValueType(0); SDValue In = N->getOperand(0); EVT InVT = In.getValueType(); if (!Subtarget.hasSSE41() && VT == MVT::v4i64 && - (InVT == MVT::v4i16 || InVT == MVT::v4i8)){ - assert(getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector && - "Unexpected type action!"); + (InVT == MVT::v4i16 || InVT == MVT::v4i8) && + getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector) { assert(N->getOpcode() == ISD::SIGN_EXTEND && "Unexpected opcode"); // Custom split this so we can extend i8/i16->i32 invec. This is better // since sign_extend_inreg i8/i16->i64 requires an extend to i32 using @@ -27598,9 +27962,27 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, SDValue Src = N->getOperand(0); EVT SrcVT = Src.getValueType(); + // Promote these manually to avoid over promotion to v2i64. Type + // legalization will revisit the v2i32 operation for more cleanup. + if ((VT == MVT::v2i8 || VT == MVT::v2i16) && + getTypeAction(*DAG.getContext(), VT) == TypePromoteInteger) { + // AVX512DQ provides instructions that produce a v2i64 result. + if (Subtarget.hasDQI()) + return; + + SDValue Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v2i32, Src); + Res = DAG.getNode(N->getOpcode() == ISD::FP_TO_UINT ? ISD::AssertZext + : ISD::AssertSext, + dl, MVT::v2i32, Res, + DAG.getValueType(VT.getVectorElementType())); + Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res); + Results.push_back(Res); + return; + } + if (VT.isVector() && VT.getScalarSizeInBits() < 32) { - assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector && - "Unexpected type action!"); + if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector) + return; // Try to create a 128 bit vector, but don't exceed a 32 bit element. unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U); @@ -27635,18 +28017,35 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, assert((IsSigned || Subtarget.hasAVX512()) && "Can only handle signed conversion without AVX512"); assert(Subtarget.hasSSE2() && "Requires at least SSE2!"); - assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector && - "Unexpected type action!"); + bool Widenv2i32 = + getTypeAction(*DAG.getContext(), MVT::v2i32) == TypeWidenVector; if (Src.getValueType() == MVT::v2f64) { + unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI; if (!IsSigned && !Subtarget.hasVLX()) { - // If we have VLX we can emit a target specific FP_TO_UINT node, - // otherwise we can defer to the generic legalizer which will widen - // the input as well. This will be further widened during op - // legalization to v8i32<-v8f64. - return; + // If v2i32 is widened, we can defer to the generic legalizer. + if (Widenv2i32) + return; + // Custom widen by doubling to a legal vector with. Isel will + // further widen to v8f64. 
+ Opc = ISD::FP_TO_UINT; + Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f64, + Src, DAG.getUNDEF(MVT::v2f64)); } - unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI; SDValue Res = DAG.getNode(Opc, dl, MVT::v4i32, Src); + if (!Widenv2i32) + Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, + DAG.getIntPtrConstant(0, dl)); + Results.push_back(Res); + return; + } + if (SrcVT == MVT::v2f32 && + getTypeAction(*DAG.getContext(), VT) != TypeWidenVector) { + SDValue Idx = DAG.getIntPtrConstant(0, dl); + SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, + DAG.getUNDEF(MVT::v2f32)); + Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT + : ISD::FP_TO_UINT, dl, MVT::v4i32, Res); + Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx); Results.push_back(Res); return; } @@ -27656,8 +28055,6 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, return; } - assert(!VT.isVector() && "Vectors should have been handled above!"); - if (Subtarget.hasDQI() && VT == MVT::i64 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) { assert(!Subtarget.is64Bit() && "i64 should be legal"); @@ -27942,33 +28339,42 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, return; } - if (DstVT.isVector() && SrcVT == MVT::x86mmx) { - assert(getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector && - "Unexpected type action!"); + if (DstVT.isVector() && SrcVT == MVT::x86mmx && + getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector) { EVT WideVT = getTypeToTransformTo(*DAG.getContext(), DstVT); SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, WideVT, N->getOperand(0)); Results.push_back(Res); return; } + if (SrcVT != MVT::f64 || + (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8) || + getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector) + return; + + unsigned NumElts = DstVT.getVectorNumElements(); + EVT SVT = DstVT.getVectorElementType(); + EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2); + SDValue Res; + Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, N->getOperand(0)); + Res = DAG.getBitcast(WiderVT, Res); + Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, Res, + DAG.getIntPtrConstant(0, dl)); + Results.push_back(Res); return; } case ISD::MGATHER: { EVT VT = N->getValueType(0); - if ((VT == MVT::v2f32 || VT == MVT::v2i32) && - (Subtarget.hasVLX() || !Subtarget.hasAVX512())) { + if (VT == MVT::v2f32 && (Subtarget.hasVLX() || !Subtarget.hasAVX512())) { auto *Gather = cast<MaskedGatherSDNode>(N); SDValue Index = Gather->getIndex(); if (Index.getValueType() != MVT::v2i64) return; - assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector && - "Unexpected type action!"); - EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT); SDValue Mask = Gather->getMask(); assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type"); - SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, + SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Gather->getPassThru(), - DAG.getUNDEF(VT)); + DAG.getUNDEF(MVT::v2f32)); if (!Subtarget.hasVLX()) { // We need to widen the mask, but the instruction will only use 2 // of its elements. So we can use undef. 
@@ -27979,12 +28385,67 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, SDValue Ops[] = { Gather->getChain(), PassThru, Mask, Gather->getBasePtr(), Index, Gather->getScale() }; SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>( - DAG.getVTList(WideVT, Mask.getValueType(), MVT::Other), Ops, dl, + DAG.getVTList(MVT::v4f32, Mask.getValueType(), MVT::Other), Ops, dl, Gather->getMemoryVT(), Gather->getMemOperand()); Results.push_back(Res); Results.push_back(Res.getValue(2)); return; } + if (VT == MVT::v2i32) { + auto *Gather = cast<MaskedGatherSDNode>(N); + SDValue Index = Gather->getIndex(); + SDValue Mask = Gather->getMask(); + assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type"); + SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, + Gather->getPassThru(), + DAG.getUNDEF(MVT::v2i32)); + // If the index is v2i64 we can use it directly. + if (Index.getValueType() == MVT::v2i64 && + (Subtarget.hasVLX() || !Subtarget.hasAVX512())) { + if (!Subtarget.hasVLX()) { + // We need to widen the mask, but the instruction will only use 2 + // of its elements. So we can use undef. + Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask, + DAG.getUNDEF(MVT::v2i1)); + Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask); + } + SDValue Ops[] = { Gather->getChain(), PassThru, Mask, + Gather->getBasePtr(), Index, Gather->getScale() }; + SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>( + DAG.getVTList(MVT::v4i32, Mask.getValueType(), MVT::Other), Ops, dl, + Gather->getMemoryVT(), Gather->getMemOperand()); + SDValue Chain = Res.getValue(2); + if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector) + Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, + DAG.getIntPtrConstant(0, dl)); + Results.push_back(Res); + Results.push_back(Chain); + return; + } + if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector) { + EVT IndexVT = Index.getValueType(); + EVT NewIndexVT = EVT::getVectorVT(*DAG.getContext(), + IndexVT.getScalarType(), 4); + // Otherwise we need to custom widen everything to avoid promotion. + Index = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewIndexVT, Index, + DAG.getUNDEF(IndexVT)); + Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask, + DAG.getConstant(0, dl, MVT::v2i1)); + SDValue Ops[] = { Gather->getChain(), PassThru, Mask, + Gather->getBasePtr(), Index, Gather->getScale() }; + SDValue Res = DAG.getMaskedGather(DAG.getVTList(MVT::v4i32, MVT::Other), + Gather->getMemoryVT(), dl, Ops, + Gather->getMemOperand(), + Gather->getIndexType()); + SDValue Chain = Res.getValue(1); + if (getTypeAction(*DAG.getContext(), MVT::v2i32) != TypeWidenVector) + Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, + DAG.getIntPtrConstant(0, dl)); + Results.push_back(Res); + Results.push_back(Chain); + return; + } + } return; } case ISD::LOAD: { @@ -27993,8 +28454,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, // cast since type legalization will try to use an i64 load. 
MVT VT = N->getSimpleValueType(0); assert(VT.isVector() && VT.getSizeInBits() == 64 && "Unexpected VT"); - assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector && - "Unexpected type action!"); + if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector) + return; if (!ISD::isNON_EXTLoad(N)) return; auto *Ld = cast<LoadSDNode>(N); @@ -28004,10 +28465,11 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, Ld->getPointerInfo(), Ld->getAlignment(), Ld->getMemOperand()->getFlags()); SDValue Chain = Res.getValue(1); - MVT VecVT = MVT::getVectorVT(LdVT, 2); - Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Res); - EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT); - Res = DAG.getBitcast(WideVT, Res); + MVT WideVT = MVT::getVectorVT(LdVT, 2); + Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, WideVT, Res); + MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), + VT.getVectorNumElements() * 2); + Res = DAG.getBitcast(CastVT, Res); Results.push_back(Res); Results.push_back(Chain); return; @@ -33545,6 +34007,67 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, return HAddSub; } + // During Type Legalization, when promoting illegal vector types, + // the backend might introduce new shuffle dag nodes and bitcasts. + // + // This code performs the following transformation: + // fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) -> + // (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>) + // + // We do this only if both the bitcast and the BINOP dag nodes have + // one use. Also, perform this transformation only if the new binary + // operation is legal. This is to avoid introducing dag nodes that + // potentially need to be further expanded (or custom lowered) into a + // less optimal sequence of dag nodes. + if (!ExperimentalVectorWideningLegalization && + !DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() && + N->getOpcode() == ISD::VECTOR_SHUFFLE && + N->getOperand(0).getOpcode() == ISD::BITCAST && + N->getOperand(1).isUndef() && N->getOperand(0).hasOneUse()) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + + SDValue BC0 = N0.getOperand(0); + EVT SVT = BC0.getValueType(); + unsigned Opcode = BC0.getOpcode(); + unsigned NumElts = VT.getVectorNumElements(); + + if (BC0.hasOneUse() && SVT.isVector() && + SVT.getVectorNumElements() * 2 == NumElts && + TLI.isOperationLegal(Opcode, VT)) { + bool CanFold = false; + switch (Opcode) { + default : break; + case ISD::ADD: + case ISD::SUB: + case ISD::MUL: + // isOperationLegal lies for integer ops on floating point types. + CanFold = VT.isInteger(); + break; + case ISD::FADD: + case ISD::FSUB: + case ISD::FMUL: + // isOperationLegal lies for floating point ops on integer types. + CanFold = VT.isFloatingPoint(); + break; + } + + unsigned SVTNumElts = SVT.getVectorNumElements(); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); + for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i) + CanFold = SVOp->getMaskElt(i) == (int)(i * 2); + for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i) + CanFold = SVOp->getMaskElt(i) < 0; + + if (CanFold) { + SDValue BC00 = DAG.getBitcast(VT, BC0.getOperand(0)); + SDValue BC01 = DAG.getBitcast(VT, BC0.getOperand(1)); + SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01); + return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, SVOp->getMask()); + } + } + } + // Attempt to combine into a vector load/broadcast. 
if (SDValue LD = combineToConsecutiveLoads(VT, N, dl, DAG, Subtarget, true)) return LD; @@ -33640,6 +34163,54 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, } } + + // Look for a truncating shuffle to v2i32 of a PMULUDQ where one of the + // operands is an extend from v2i32 to v2i64. Turn it into a pmulld. + // FIXME: This can probably go away once we default to widening legalization. + if (!ExperimentalVectorWideningLegalization && + Subtarget.hasSSE41() && VT == MVT::v4i32 && + N->getOpcode() == ISD::VECTOR_SHUFFLE && + N->getOperand(0).getOpcode() == ISD::BITCAST && + N->getOperand(0).getOperand(0).getOpcode() == X86ISD::PMULUDQ) { + SDValue BC = N->getOperand(0); + SDValue MULUDQ = BC.getOperand(0); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); + ArrayRef<int> Mask = SVOp->getMask(); + if (BC.hasOneUse() && MULUDQ.hasOneUse() && + Mask[0] == 0 && Mask[1] == 2 && Mask[2] == -1 && Mask[3] == -1) { + SDValue Op0 = MULUDQ.getOperand(0); + SDValue Op1 = MULUDQ.getOperand(1); + if (Op0.getOpcode() == ISD::BITCAST && + Op0.getOperand(0).getOpcode() == ISD::VECTOR_SHUFFLE && + Op0.getOperand(0).getValueType() == MVT::v4i32) { + ShuffleVectorSDNode *SVOp0 = + cast<ShuffleVectorSDNode>(Op0.getOperand(0)); + ArrayRef<int> Mask2 = SVOp0->getMask(); + if (Mask2[0] == 0 && Mask2[1] == -1 && + Mask2[2] == 1 && Mask2[3] == -1) { + Op0 = SVOp0->getOperand(0); + Op1 = DAG.getBitcast(MVT::v4i32, Op1); + Op1 = DAG.getVectorShuffle(MVT::v4i32, dl, Op1, Op1, Mask); + return DAG.getNode(ISD::MUL, dl, MVT::v4i32, Op0, Op1); + } + } + if (Op1.getOpcode() == ISD::BITCAST && + Op1.getOperand(0).getOpcode() == ISD::VECTOR_SHUFFLE && + Op1.getOperand(0).getValueType() == MVT::v4i32) { + ShuffleVectorSDNode *SVOp1 = + cast<ShuffleVectorSDNode>(Op1.getOperand(0)); + ArrayRef<int> Mask2 = SVOp1->getMask(); + if (Mask2[0] == 0 && Mask2[1] == -1 && + Mask2[2] == 1 && Mask2[3] == -1) { + Op0 = DAG.getBitcast(MVT::v4i32, Op0); + Op0 = DAG.getVectorShuffle(MVT::v4i32, dl, Op0, Op0, Mask); + Op1 = SVOp1->getOperand(0); + return DAG.getNode(ISD::MUL, dl, MVT::v4i32, Op0, Op1); + } + } + } + } + return SDValue(); } @@ -35532,7 +36103,7 @@ static SDValue combineReductionToHorizontal(SDNode *ExtElt, SelectionDAG &DAG, SDLoc DL(ExtElt); - if (VecVT == MVT::v8i8) { + if (ExperimentalVectorWideningLegalization && VecVT == MVT::v8i8) { // Pad with undef. Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Rdx, DAG.getUNDEF(VecVT)); @@ -36229,6 +36800,8 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, // Since SKX these selects have a proper lowering. if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() && CondVT.getVectorElementType() == MVT::i1 && + (ExperimentalVectorWideningLegalization || + VT.getVectorNumElements() > 4) && (VT.getVectorElementType() == MVT::i8 || VT.getVectorElementType() == MVT::i16)) { Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond); @@ -37104,45 +37677,98 @@ static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG, if ((NumElts % 2) != 0) return SDValue(); + unsigned RegSize = 128; + MVT OpsVT = MVT::getVectorVT(MVT::i16, RegSize / 16); EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts); // Shrink the operands of mul. SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0); SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1); - // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the - // lower part is needed. 
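(Editorial aside, not part of the patch; the pmullw/pmulhw code continues below.) reduceVMULWidth rebuilds full 32-bit products from a pmullw of the low halves and a pmulhw of the high halves, interleaved with unpack-style shuffles. The same construction as a standalone SSE2 sketch:

#include <emmintrin.h>
#include <cstdint>
#include <cstdio>

int main() {
  alignas(16) int16_t a[8] = {1000, -2000, 3000, 4, 5, 6, 7, 8};
  alignas(16) int16_t b[8] = {300, 400, -500, 6, 7, 8, 9, 10};
  __m128i va = _mm_load_si128((const __m128i *)a);
  __m128i vb = _mm_load_si128((const __m128i *)b);
  __m128i lo = _mm_mullo_epi16(va, vb);   // low 16 bits of each product (pmullw)
  __m128i hi = _mm_mulhi_epi16(va, vb);   // high 16 bits, signed (pmulhw)
  alignas(16) int32_t prod[8];
  _mm_store_si128((__m128i *)&prod[0], _mm_unpacklo_epi16(lo, hi));  // punpcklwd
  _mm_store_si128((__m128i *)&prod[4], _mm_unpackhi_epi16(lo, hi));  // punpckhwd
  std::printf("%d %d %d\n", prod[0], prod[1], prod[2]);  // 300000 -800000 -1500000
}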
- SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1); - if (Mode == MULU8 || Mode == MULS8) - return DAG.getNode((Mode == MULU8) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND, - DL, VT, MulLo); + if (ExperimentalVectorWideningLegalization || + NumElts >= OpsVT.getVectorNumElements()) { + // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the + // lower part is needed. + SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1); + if (Mode == MULU8 || Mode == MULS8) + return DAG.getNode((Mode == MULU8) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND, + DL, VT, MulLo); + + MVT ResVT = MVT::getVectorVT(MVT::i32, NumElts / 2); + // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16, + // the higher part is also needed. + SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL, + ReducedVT, NewN0, NewN1); + + // Repack the lower part and higher part result of mul into a wider + // result. + // Generate shuffle functioning as punpcklwd. + SmallVector<int, 16> ShuffleMask(NumElts); + for (unsigned i = 0, e = NumElts / 2; i < e; i++) { + ShuffleMask[2 * i] = i; + ShuffleMask[2 * i + 1] = i + NumElts; + } + SDValue ResLo = + DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask); + ResLo = DAG.getBitcast(ResVT, ResLo); + // Generate shuffle functioning as punpckhwd. + for (unsigned i = 0, e = NumElts / 2; i < e; i++) { + ShuffleMask[2 * i] = i + NumElts / 2; + ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2; + } + SDValue ResHi = + DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask); + ResHi = DAG.getBitcast(ResVT, ResHi); + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi); + } + + // When VT.getVectorNumElements() < OpsVT.getVectorNumElements(), we want + // to legalize the mul explicitly because implicit legalization for type + // <4 x i16> to <4 x i32> sometimes involves unnecessary unpack + // instructions which will not exist when we explicitly legalize it by + // extending <4 x i16> to <8 x i16> (concatenating the <4 x i16> val with + // <4 x i16> undef). + // + // Legalize the operands of mul. + // FIXME: We may be able to handle non-concatenated vectors by insertion. + unsigned ReducedSizeInBits = ReducedVT.getSizeInBits(); + if ((RegSize % ReducedSizeInBits) != 0) + return SDValue(); + + SmallVector<SDValue, 16> Ops(RegSize / ReducedSizeInBits, + DAG.getUNDEF(ReducedVT)); + Ops[0] = NewN0; + NewN0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops); + Ops[0] = NewN1; + NewN1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops); + + if (Mode == MULU8 || Mode == MULS8) { + // Generate lower part of mul: pmullw. For MULU8/MULS8, only the lower + // part is needed. + SDValue Mul = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1); + + // convert the type of mul result to VT. + MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32); + SDValue Res = DAG.getNode(Mode == MULU8 ? ISD::ZERO_EXTEND_VECTOR_INREG + : ISD::SIGN_EXTEND_VECTOR_INREG, + DL, ResVT, Mul); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res, + DAG.getIntPtrConstant(0, DL)); + } - MVT ResVT = MVT::getVectorVT(MVT::i32, NumElts / 2); - // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16, - // the higher part is also needed. + // Generate the lower and higher part of mul: pmulhw/pmulhuw. For + // MULU16/MULS16, both parts are needed. + SDValue MulLo = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1); SDValue MulHi = DAG.getNode(Mode == MULS16 ? 
ISD::MULHS : ISD::MULHU, DL, - ReducedVT, NewN0, NewN1); + OpsVT, NewN0, NewN1); // Repack the lower part and higher part result of mul into a wider - // result. - // Generate shuffle functioning as punpcklwd. - SmallVector<int, 16> ShuffleMask(NumElts); - for (unsigned i = 0, e = NumElts / 2; i < e; i++) { - ShuffleMask[2 * i] = i; - ShuffleMask[2 * i + 1] = i + NumElts; - } - SDValue ResLo = - DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask); - ResLo = DAG.getBitcast(ResVT, ResLo); - // Generate shuffle functioning as punpckhwd. - for (unsigned i = 0, e = NumElts / 2; i < e; i++) { - ShuffleMask[2 * i] = i + NumElts / 2; - ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2; - } - SDValue ResHi = - DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask); - ResHi = DAG.getBitcast(ResVT, ResHi); - return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi); + // result. Make sure the type of mul result is VT. + MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32); + SDValue Res = getUnpackl(DAG, DL, OpsVT, MulLo, MulHi); + Res = DAG.getBitcast(ResVT, Res); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res, + DAG.getIntPtrConstant(0, DL)); } static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG, @@ -37250,7 +37876,8 @@ static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG, // Make sure the vXi16 type is legal. This covers the AVX512 without BWI case. // Also allow v2i32 if it will be widened. MVT WVT = MVT::getVectorVT(MVT::i16, 2 * VT.getVectorNumElements()); - if (VT != MVT::v2i32 && !DAG.getTargetLoweringInfo().isTypeLegal(WVT)) + if (!((ExperimentalVectorWideningLegalization && VT == MVT::v2i32) || + DAG.getTargetLoweringInfo().isTypeLegal(WVT))) return SDValue(); SDValue N0 = N->getOperand(0); @@ -39506,7 +40133,93 @@ static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG, return Blend; } - return SDValue(); + if (ExperimentalVectorWideningLegalization) + return SDValue(); + + if (Mld->getExtensionType() != ISD::EXTLOAD) + return SDValue(); + + // Resolve extending loads. + EVT VT = Mld->getValueType(0); + unsigned NumElems = VT.getVectorNumElements(); + EVT LdVT = Mld->getMemoryVT(); + SDLoc dl(Mld); + + assert(LdVT != VT && "Cannot extend to the same type"); + unsigned ToSz = VT.getScalarSizeInBits(); + unsigned FromSz = LdVT.getScalarSizeInBits(); + // From/To sizes and ElemCount must be pow of two. + assert (isPowerOf2_32(NumElems * FromSz * ToSz) && + "Unexpected size for extending masked load"); + + unsigned SizeRatio = ToSz / FromSz; + assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits()); + + // Create a type on which we perform the shuffle. + EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), + LdVT.getScalarType(), NumElems*SizeRatio); + assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); + + // Convert PassThru value. + SDValue WidePassThru = DAG.getBitcast(WideVecVT, Mld->getPassThru()); + if (!Mld->getPassThru().isUndef()) { + SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1); + for (unsigned i = 0; i != NumElems; ++i) + ShuffleVec[i] = i * SizeRatio; + + // Can't shuffle using an illegal type. + assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) && + "WideVecVT should be legal"); + WidePassThru = DAG.getVectorShuffle(WideVecVT, dl, WidePassThru, + DAG.getUNDEF(WideVecVT), ShuffleVec); + } + + // Prepare the new mask. + SDValue NewMask; + SDValue Mask = Mld->getMask(); + if (Mask.getValueType() == VT) { + // Mask and original value have the same type. 
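(Editorial aside, not part of the patch; the mask handling continues below.) This masked-load path leans on two index patterns: one that compacts every SizeRatio-th narrow element down to the front (used to rebuild the pass-through and the mask on the widened type) and one that spreads the loaded narrow elements back out to every SizeRatio-th slot. A small sketch printing both for NumElems = 4, SizeRatio = 2; -1 marks an undef lane, and an index of NumElems * SizeRatio selects from the zero operand, as in the DAG code:

#include <cstdio>
#include <vector>

int main() {
  const unsigned NumElems = 4, SizeRatio = 2;
  std::vector<int> Compact(NumElems * SizeRatio, -1);
  for (unsigned i = 0; i != NumElems; ++i)
    Compact[i] = i * SizeRatio;                       // pick lanes 0, 2, 4, 6
  for (unsigned i = NumElems; i != NumElems * SizeRatio; ++i)
    Compact[i] = NumElems * SizeRatio;                // zero-fill the tail
  std::vector<int> Spread(NumElems * SizeRatio, -1);
  for (unsigned i = 0; i != NumElems; ++i)
    Spread[i * SizeRatio] = i;                        // lane i goes to slot 2*i
  for (int m : Compact) std::printf("%d ", m);
  std::printf("| ");
  for (int m : Spread) std::printf("%d ", m);
  std::printf("\n");   // 0 2 4 6 8 8 8 8 | 0 -1 1 -1 2 -1 3 -1
}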
+ NewMask = DAG.getBitcast(WideVecVT, Mask); + SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1); + for (unsigned i = 0; i != NumElems; ++i) + ShuffleVec[i] = i * SizeRatio; + for (unsigned i = NumElems; i != NumElems * SizeRatio; ++i) + ShuffleVec[i] = NumElems * SizeRatio; + NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask, + DAG.getConstant(0, dl, WideVecVT), + ShuffleVec); + } else { + assert(Mask.getValueType().getVectorElementType() == MVT::i1); + unsigned WidenNumElts = NumElems*SizeRatio; + unsigned MaskNumElts = VT.getVectorNumElements(); + EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, + WidenNumElts); + + unsigned NumConcat = WidenNumElts / MaskNumElts; + SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType()); + SmallVector<SDValue, 16> Ops(NumConcat, ZeroVal); + Ops[0] = Mask; + NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops); + } + + SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(), + Mld->getBasePtr(), NewMask, WidePassThru, + Mld->getMemoryVT(), Mld->getMemOperand(), + ISD::NON_EXTLOAD); + + SDValue SlicedVec = DAG.getBitcast(WideVecVT, WideLd); + SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1); + for (unsigned i = 0; i != NumElems; ++i) + ShuffleVec[i * SizeRatio] = i; + + // Can't shuffle using an illegal type. + assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) && + "WideVecVT should be legal"); + SlicedVec = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec, + DAG.getUNDEF(WideVecVT), ShuffleVec); + SlicedVec = DAG.getBitcast(VT, SlicedVec); + + return DCI.CombineTo(N, SlicedVec, WideLd.getValue(1), true); } /// If exactly one element of the mask is set for a non-truncating masked store, @@ -39544,34 +40257,111 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG, return SDValue(); EVT VT = Mst->getValue().getValueType(); + EVT StVT = Mst->getMemoryVT(); SDLoc dl(Mst); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - if (Mst->isTruncatingStore()) + if (!Mst->isTruncatingStore()) { + if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG)) + return ScalarStore; + + // If the mask value has been legalized to a non-boolean vector, try to + // simplify ops leading up to it. We only demand the MSB of each lane. + SDValue Mask = Mst->getMask(); + if (Mask.getScalarValueSizeInBits() != 1) { + APInt DemandedMask(APInt::getSignMask(VT.getScalarSizeInBits())); + if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) + return SDValue(N, 0); + } + + SDValue Value = Mst->getValue(); + if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() && + TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(), + Mst->getMemoryVT())) { + return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0), + Mst->getBasePtr(), Mask, + Mst->getMemoryVT(), Mst->getMemOperand(), true); + } + + return SDValue(); + } + + if (ExperimentalVectorWideningLegalization) return SDValue(); - if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG)) - return ScalarStore; + // Resolve truncating stores. + unsigned NumElems = VT.getVectorNumElements(); - // If the mask value has been legalized to a non-boolean vector, try to - // simplify ops leading up to it. We only demand the MSB of each lane. + assert(StVT != VT && "Cannot truncate to the same type"); + unsigned FromSz = VT.getScalarSizeInBits(); + unsigned ToSz = StVT.getScalarSizeInBits(); + + // The truncating store is legal in some cases. 
For example + // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw + // are designated for truncate store. + // In this case we don't need any further transformations. + if (TLI.isTruncStoreLegal(VT, StVT)) + return SDValue(); + + // From/To sizes and ElemCount must be pow of two. + assert (isPowerOf2_32(NumElems * FromSz * ToSz) && + "Unexpected size for truncating masked store"); + // We are going to use the original vector elt for storing. + // Accumulated smaller vector elements must be a multiple of the store size. + assert (((NumElems * FromSz) % ToSz) == 0 && + "Unexpected ratio for truncating masked store"); + + unsigned SizeRatio = FromSz / ToSz; + assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits()); + + // Create a type on which we perform the shuffle. + EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), + StVT.getScalarType(), NumElems*SizeRatio); + + assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); + + SDValue WideVec = DAG.getBitcast(WideVecVT, Mst->getValue()); + SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1); + for (unsigned i = 0; i != NumElems; ++i) + ShuffleVec[i] = i * SizeRatio; + + // Can't shuffle using an illegal type. + assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) && + "WideVecVT should be legal"); + + SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec, + DAG.getUNDEF(WideVecVT), + ShuffleVec); + + SDValue NewMask; SDValue Mask = Mst->getMask(); - if (Mask.getScalarValueSizeInBits() != 1) { - APInt DemandedMask(APInt::getSignMask(VT.getScalarSizeInBits())); - if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) - return SDValue(N, 0); - } + if (Mask.getValueType() == VT) { + // Mask and original value have the same type. + NewMask = DAG.getBitcast(WideVecVT, Mask); + for (unsigned i = 0; i != NumElems; ++i) + ShuffleVec[i] = i * SizeRatio; + for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i) + ShuffleVec[i] = NumElems*SizeRatio; + NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask, + DAG.getConstant(0, dl, WideVecVT), + ShuffleVec); + } else { + assert(Mask.getValueType().getVectorElementType() == MVT::i1); + unsigned WidenNumElts = NumElems*SizeRatio; + unsigned MaskNumElts = VT.getVectorNumElements(); + EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, + WidenNumElts); - SDValue Value = Mst->getValue(); - if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() && - TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(), - Mst->getMemoryVT())) { - return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0), - Mst->getBasePtr(), Mask, - Mst->getMemoryVT(), Mst->getMemOperand(), true); + unsigned NumConcat = WidenNumElts / MaskNumElts; + SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType()); + SmallVector<SDValue, 16> Ops(NumConcat, ZeroVal); + Ops[0] = Mask; + NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops); } - return SDValue(); + return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal, + Mst->getBasePtr(), NewMask, StVT, + Mst->getMemOperand(), false); } static SDValue combineStore(SDNode *N, SelectionDAG &DAG, @@ -39699,6 +40489,41 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, MVT::v16i8, St->getMemOperand()); } + // Look for a truncating store to a less than 128 bit vector that has been + // truncated from an any_extend_inreg from a 128 bit vector with the same + // element size. We can use a 64/32/16-bit extractelement and store that. 
+ // Disabling this when widening legalization is in effect since the trunc + // store would have been unlikely to be created in that case. Only doing this + // when truncstore is legal since it would otherwise be decomposed below and + // then combined away. + if (St->isTruncatingStore() && TLI.isTruncStoreLegal(VT, StVT) && + StoredVal.getOpcode() == ISD::ANY_EXTEND_VECTOR_INREG && + StoredVal.getValueType().is128BitVector() && + !ExperimentalVectorWideningLegalization) { + EVT OrigVT = StoredVal.getOperand(0).getValueType(); + if (OrigVT.is128BitVector() && + OrigVT.getVectorElementType() == StVT.getVectorElementType()) { + unsigned StoreSize = StVT.getSizeInBits(); + assert((128 % StoreSize == 0) && "Unexpected store size!"); + MVT IntVT = MVT::getIntegerVT(StoreSize); + MVT CastVT = MVT::getVectorVT(IntVT, 128 / StoreSize); + StoredVal = DAG.getBitcast(CastVT, StoredVal.getOperand(0)); + // Use extract_store for the 64-bit case to support 32-bit targets. + if (IntVT == MVT::i64) { + SDVTList Tys = DAG.getVTList(MVT::Other); + SDValue Ops[] = {St->getChain(), StoredVal, St->getBasePtr()}; + return DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops, + IntVT, St->getMemOperand()); + } + + // Otherwise just use an extract and store. + StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, IntVT, StoredVal, + DAG.getIntPtrConstant(0, dl)); + return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(), + St->getMemOperand()); + } + } + // Optimize trunc store (of multiple scalars) to shuffle and store. // First, pack all of the elements in one place. Next, store to memory // in fewer chunks. @@ -39725,7 +40550,85 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, St->getMemoryVT(), St->getMemOperand(), DAG); } - return SDValue(); + if (ExperimentalVectorWideningLegalization) + return SDValue(); + + unsigned NumElems = VT.getVectorNumElements(); + assert(StVT != VT && "Cannot truncate to the same type"); + unsigned FromSz = VT.getScalarSizeInBits(); + unsigned ToSz = StVT.getScalarSizeInBits(); + + // The truncating store is legal in some cases. For example + // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw + // are designated for truncate store. + // In this case we don't need any further transformations. + if (TLI.isTruncStoreLegalOrCustom(VT, StVT)) + return SDValue(); + + // From, To sizes and ElemCount must be pow of two + if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue(); + // We are going to use the original vector elt for storing. + // Accumulated smaller vector elements must be a multiple of the store size. + if (0 != (NumElems * FromSz) % ToSz) return SDValue(); + + unsigned SizeRatio = FromSz / ToSz; + + assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits()); + + // Create a type on which we perform the shuffle + EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), + StVT.getScalarType(), NumElems*SizeRatio); + + assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); + + SDValue WideVec = DAG.getBitcast(WideVecVT, St->getValue()); + SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1); + for (unsigned i = 0; i != NumElems; ++i) + ShuffleVec[i] = i * SizeRatio; + + // Can't shuffle using an illegal type. + if (!TLI.isTypeLegal(WideVecVT)) + return SDValue(); + + SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec, + DAG.getUNDEF(WideVecVT), + ShuffleVec); + // At this point all of the data is stored at the bottom of the + // register. We now need to save it to mem. 
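The extract-and-store combine above works because, when the stored value is an ANY_EXTEND_VECTOR_INREG whose source element type matches the memory element type, the bytes the truncating store would write are already the low bytes of the original 128-bit vector. A standalone little-endian sketch, with the vector contents invented for illustration:

```cpp
// Standalone sketch (invented values): why the truncating store of an
// any_extend_vector_inreg can be replaced by storing the low bits of the
// original 128-bit vector when the memory element type matches the
// original vector's element type.
#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  // Original 128-bit vector (v16i8). An ANY_EXTEND_VECTOR_INREG to v8i16
  // keeps bytes 0..7 as the low halves of the eight i16 lanes; a truncating
  // store of that v8i16 back to v8i8 therefore writes exactly bytes 0..7.
  uint8_t Orig[16] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};

  // What the hunk emits instead: view the vector as v2i64, take element 0
  // (the low 64 bits), and store that scalar. The memory image is the same.
  uint64_t Low64;
  std::memcpy(&Low64, Orig, sizeof(Low64));

  uint8_t Mem[8];
  std::memcpy(Mem, &Low64, sizeof(Mem));

  for (unsigned i = 0; i != 8; ++i)
    std::printf("byte %u: %d\n", i, Mem[i]);  // prints 1..8
  return 0;
}
```

The X86ISD::VEXTRACT_STORE form is used for the 64-bit case so, per the comment in the hunk, no legal i64 scalar store is needed on 32-bit targets.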
+ + // Find the largest store unit + MVT StoreType = MVT::i8; + for (MVT Tp : MVT::integer_valuetypes()) { + if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz) + StoreType = Tp; + } + + // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64. + if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 && + (64 <= NumElems * ToSz)) + StoreType = MVT::f64; + + // Bitcast the original vector into a vector of store-size units + EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(), + StoreType, VT.getSizeInBits()/StoreType.getSizeInBits()); + assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits()); + SDValue ShuffWide = DAG.getBitcast(StoreVecVT, Shuff); + SmallVector<SDValue, 8> Chains; + SDValue Ptr = St->getBasePtr(); + + // Perform one or more big stores into memory. + for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) { + SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, + StoreType, ShuffWide, + DAG.getIntPtrConstant(i, dl)); + SDValue Ch = + DAG.getStore(St->getChain(), dl, SubVec, Ptr, St->getPointerInfo(), + St->getAlignment(), St->getMemOperand()->getFlags()); + Ptr = DAG.getMemBasePlusOffset(Ptr, StoreType.getStoreSize(), dl); + Chains.push_back(Ch); + } + + return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains); } // Turn load->store of MMX types into GPR load/stores. This avoids clobbering @@ -39741,7 +40644,9 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat); bool F64IsLegal = !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2(); - if ((VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit()) && + if (((VT.isVector() && !VT.isFloatingPoint() && + !ExperimentalVectorWideningLegalization) || + (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit())) && isa<LoadSDNode>(St->getValue()) && !cast<LoadSDNode>(St->getValue())->isVolatile() && St->getChain().hasOneUse() && !St->isVolatile()) { @@ -40256,7 +41161,9 @@ static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL, // Only handle vXi16 types that are at least 128-bits unless they will be // widened. - if (!VT.isVector() || VT.getVectorElementType() != MVT::i16) + if (!VT.isVector() || VT.getVectorElementType() != MVT::i16 || + (!ExperimentalVectorWideningLegalization && + VT.getVectorNumElements() < 8)) return SDValue(); // Input type should be vXi32. @@ -41320,6 +42227,127 @@ combineToExtendBoolVectorInReg(SDNode *N, SelectionDAG &DAG, DAG.getConstant(EltSizeInBits - 1, DL, VT)); } +/// Convert a SEXT or ZEXT of a vector to a SIGN_EXTEND_VECTOR_INREG or +/// ZERO_EXTEND_VECTOR_INREG, this requires the splitting (or concatenating +/// with UNDEFs) of the input to vectors of the same size as the target type +/// which then extends the lowest elements. +static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { + if (ExperimentalVectorWideningLegalization) + return SDValue(); + + unsigned Opcode = N->getOpcode(); + // TODO - add ANY_EXTEND support. 
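The store-unit search just above is a straightforward maximization over the legal integer types, followed by one store per unit at increasing byte offsets. A standalone sketch of the arithmetic; the legal-type set here is an assumption (roughly x86-64 with SSE2):

```cpp
#include <cstdio>
#include <vector>

int main() {
  // Assumed example: a v8i16 value truncated to v8i8, so the packed payload
  // occupies ToSz * NumElems = 64 bits at the bottom of the register.
  const unsigned NumElems = 8, ToSz = 8;
  const unsigned TotalBits = NumElems * ToSz;

  // Find the largest legal integer store unit that fits in the payload
  // (the hunk walks MVT::integer_valuetypes(); this set is an assumption).
  const std::vector<unsigned> LegalIntBits = {8, 16, 32, 64};
  unsigned StoreBits = 8;
  for (unsigned Bits : LegalIntBits)
    if (Bits <= TotalBits)
      StoreBits = Bits;
  // On 32-bit x86 the hunk switches to f64 when a 64-bit unit is wanted but
  // i64 is not legal: same width, different register class.

  // One store per unit; TotalBits is a multiple of StoreBits by construction.
  unsigned NumStores = TotalBits / StoreBits;
  for (unsigned i = 0, Off = 0; i != NumStores; ++i, Off += StoreBits / 8)
    std::printf("store %u: %u bits at byte offset %u\n", i, StoreBits, Off);
  return 0;
}
```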
+ if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND) + return SDValue(); + if (!DCI.isBeforeLegalizeOps()) + return SDValue(); + if (!Subtarget.hasSSE2()) + return SDValue(); + + SDValue N0 = N->getOperand(0); + EVT VT = N->getValueType(0); + EVT SVT = VT.getScalarType(); + EVT InVT = N0.getValueType(); + EVT InSVT = InVT.getScalarType(); + + // FIXME: Generic DAGCombiner previously had a bug that would cause a + // sign_extend of setcc to sometimes return the original node and tricked it + // into thinking CombineTo was used which prevented the target combines from + // running. + // Earlying out here to avoid regressions like this + // (v4i32 (sext (v4i1 (setcc (v4i16))))) + // Becomes + // (v4i32 (sext_invec (v8i16 (concat (v4i16 (setcc (v4i16))), undef)))) + // Type legalized to + // (v4i32 (sext_invec (v8i16 (trunc_invec (v4i32 (setcc (v4i32))))))) + // Leading to a packssdw+pmovsxwd + // We could write a DAG combine to fix this, but really we shouldn't be + // creating sext_invec that's forcing v8i16 into the DAG. + if (N0.getOpcode() == ISD::SETCC) + return SDValue(); + + // Input type must be a vector and we must be extending legal integer types. + if (!VT.isVector() || VT.getVectorNumElements() < 2) + return SDValue(); + if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16) + return SDValue(); + if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8) + return SDValue(); + + // If the input/output types are both legal then we have at least AVX1 and + // we will be able to use SIGN_EXTEND/ZERO_EXTEND directly. + if (DAG.getTargetLoweringInfo().isTypeLegal(VT) && + DAG.getTargetLoweringInfo().isTypeLegal(InVT)) + return SDValue(); + + SDLoc DL(N); + + auto ExtendVecSize = [&DAG](const SDLoc &DL, SDValue N, unsigned Size) { + EVT SrcVT = N.getValueType(); + EVT DstVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getScalarType(), + Size / SrcVT.getScalarSizeInBits()); + SmallVector<SDValue, 8> Opnds(Size / SrcVT.getSizeInBits(), + DAG.getUNDEF(SrcVT)); + Opnds[0] = N; + return DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, Opnds); + }; + + // If target-size is less than 128-bits, extend to a type that would extend + // to 128 bits, extend that and extract the original target vector. + if (VT.getSizeInBits() < 128 && !(128 % VT.getSizeInBits())) { + unsigned Scale = 128 / VT.getSizeInBits(); + EVT ExVT = + EVT::getVectorVT(*DAG.getContext(), SVT, 128 / SVT.getSizeInBits()); + SDValue Ex = ExtendVecSize(DL, N0, Scale * InVT.getSizeInBits()); + SDValue SExt = DAG.getNode(Opcode, DL, ExVT, Ex); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SExt, + DAG.getIntPtrConstant(0, DL)); + } + + // If target-size is 128-bits (or 256-bits on AVX target), then convert to + // ISD::*_EXTEND_VECTOR_INREG which ensures lowering to X86ISD::V*EXT. + // Also use this if we don't have SSE41 to allow the legalizer do its job. 
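For result types narrower than 128 bits, the code above pads the input with undef lanes, performs the extend at 128 bits, and then extracts only the requested lanes. A plain C++ model of that dataflow for an assumed sext of v2i16 to v2i32, with undef lanes shown as zero:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  // Assumed example: sign-extend v2i16 -> v2i32 (a 64-bit result, < 128 bits).
  // Scale = 128 / 64 = 2, so the input is padded to Scale * 32 = 64 bits
  // (v4i16, upper lanes undef) and extended to the 128-bit v4i32.
  int16_t In[2] = {-5, 300};

  // ExtendVecSize: concatenate with undef lanes up to the required width.
  int16_t Padded[4] = {In[0], In[1], 0 /*undef*/, 0 /*undef*/};

  // The 128-bit extend (v4i16 -> v4i32), e.g. pmovsxwd with SSE4.1.
  int32_t Ext[4];
  for (unsigned i = 0; i != 4; ++i)
    Ext[i] = Padded[i];  // int16_t -> int32_t assignment sign-extends

  // EXTRACT_SUBVECTOR at index 0: keep only the lanes that were asked for.
  int32_t Out[2] = {Ext[0], Ext[1]};

  std::printf("%d %d\n", Out[0], Out[1]);  // -5 300
  return 0;
}
```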
+ if (!Subtarget.hasSSE41() || VT.is128BitVector() || + (VT.is256BitVector() && Subtarget.hasAVX()) || + (VT.is512BitVector() && Subtarget.useAVX512Regs())) { + SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits()); + Opcode = getOpcode_EXTEND_VECTOR_INREG(Opcode); + return DAG.getNode(Opcode, DL, VT, ExOp); + } + + auto SplitAndExtendInReg = [&](unsigned SplitSize) { + unsigned NumVecs = VT.getSizeInBits() / SplitSize; + unsigned NumSubElts = SplitSize / SVT.getSizeInBits(); + EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts); + EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts); + + unsigned IROpc = getOpcode_EXTEND_VECTOR_INREG(Opcode); + SmallVector<SDValue, 8> Opnds; + for (unsigned i = 0, Offset = 0; i != NumVecs; ++i, Offset += NumSubElts) { + SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0, + DAG.getIntPtrConstant(Offset, DL)); + SrcVec = ExtendVecSize(DL, SrcVec, SplitSize); + SrcVec = DAG.getNode(IROpc, DL, SubVT, SrcVec); + Opnds.push_back(SrcVec); + } + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds); + }; + + // On pre-AVX targets, split into 128-bit nodes of + // ISD::*_EXTEND_VECTOR_INREG. + if (!Subtarget.hasAVX() && !(VT.getSizeInBits() % 128)) + return SplitAndExtendInReg(128); + + // On pre-AVX512 targets, split into 256-bit nodes of + // ISD::*_EXTEND_VECTOR_INREG. + if (!Subtarget.useAVX512Regs() && !(VT.getSizeInBits() % 256)) + return SplitAndExtendInReg(256); + + return SDValue(); +} + // Attempt to combine a (sext/zext (setcc)) to a setcc with a xmm/ymm/zmm // result type. static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG, @@ -41390,6 +42418,9 @@ static SDValue combineSext(SDNode *N, SelectionDAG &DAG, return DAG.getNode(ISD::SUB, DL, VT, Zext, DAG.getConstant(1, DL, VT)); } + if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget)) + return V; + if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget)) return V; @@ -41562,6 +42593,9 @@ static SDValue combineZext(SDNode *N, SelectionDAG &DAG, if (SDValue V = combineExtSetcc(N, DAG, Subtarget)) return V; + if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget)) + return V; + if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget)) return V; @@ -41738,6 +42772,8 @@ static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG, // go through type promotion to a 128-bit vector. if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() && VT.getVectorElementType() == MVT::i1 && + (ExperimentalVectorWideningLegalization || + VT.getVectorNumElements() > 4) && (OpVT.getVectorElementType() == MVT::i8 || OpVT.getVectorElementType() == MVT::i16)) { SDValue Setcc = DAG.getNode(ISD::SETCC, DL, OpVT, LHS, RHS, @@ -43596,6 +44632,15 @@ static SDValue combineExtInVec(SDNode *N, SelectionDAG &DAG, } } + // Combine (ext_invec (ext_invec X)) -> (ext_invec X) + // Disabling for widening legalization for now. We can enable if we find a + // case that needs it. Otherwise it can be deleted when we switch to + // widening legalization. + if (!ExperimentalVectorWideningLegalization && + In.getOpcode() == N->getOpcode() && + TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getOperand(0).getValueType())) + return DAG.getNode(N->getOpcode(), SDLoc(N), VT, In.getOperand(0)); + // Attempt to combine as a shuffle. 
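The SplitAndExtendInReg helper above is pure bookkeeping: NumVecs result pieces of SplitSize bits, each produced by extending NumSubElts input lanes in-register, then concatenated. A standalone model of that bookkeeping for an assumed pre-AVX sext of v8i16 to v8i32 (SplitSize = 128); the lane values are invented:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  // Assumed example: sext v8i16 -> v8i32 on a pre-AVX target.
  // SplitSize = 128, NumVecs = 256 / 128 = 2, NumSubElts = 128 / 32 = 4.
  const unsigned NumVecs = 2, NumSubElts = 4;
  int16_t In[8] = {-1, 2, -3, 4, -5, 6, -7, 8};
  int32_t Out[8];

  for (unsigned v = 0, Offset = 0; v != NumVecs; ++v, Offset += NumSubElts) {
    std::printf("piece %u covers lanes %u..%u\n", v, Offset,
                Offset + NumSubElts - 1);
    // EXTRACT_SUBVECTOR: the v-th group of NumSubElts input lanes, then a
    // 128-bit *_EXTEND_VECTOR_INREG of that slice (v4i16 -> v4i32 here).
    for (unsigned i = 0; i != NumSubElts; ++i)
      Out[Offset + i] = In[Offset + i];  // assignment sign-extends
  }

  // CONCAT_VECTORS of the two v4i32 halves gives the final v8i32.
  for (unsigned i = 0; i != 8; ++i)
    std::printf("%d ", Out[i]);
  std::printf("\n");
  return 0;
}
```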
// TODO: SSE41 support if (Subtarget.hasAVX() && N->getOpcode() != ISD::SIGN_EXTEND_VECTOR_INREG) { diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index 0ff08776c07..1181181a3c5 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -50,6 +50,8 @@ using namespace llvm; #define DEBUG_TYPE "x86tti" +extern cl::opt<bool> ExperimentalVectorWideningLegalization; + //===----------------------------------------------------------------------===// // // X86 cost model. @@ -918,7 +920,8 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, // FIXME: We can use permq for 64-bit or larger extracts from 256-bit // vectors. int OrigSubElts = SubTp->getVectorNumElements(); - if (NumSubElts > OrigSubElts && + if (ExperimentalVectorWideningLegalization && + NumSubElts > OrigSubElts && (Index % OrigSubElts) == 0 && (NumSubElts % OrigSubElts) == 0 && LT.second.getVectorElementType() == SubLT.second.getVectorElementType() && @@ -1330,6 +1333,12 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and // 256-bit wide vectors. + // Used with widening legalization + static const TypeConversionCostTblEntry AVX512FConversionTblWide[] = { + { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 1 }, + { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 1 }, + }; + static const TypeConversionCostTblEntry AVX512FConversionTbl[] = { { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 1 }, { ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, 3 }, @@ -1347,8 +1356,6 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 1 }, { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 1 }, { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 1 }, - { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 1 }, - { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 1 }, { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 1 }, { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 1 }, { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, 1 }, @@ -1401,19 +1408,28 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, { ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f32, 2 }, }; + static const TypeConversionCostTblEntry AVX2ConversionTblWide[] = { + { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 1 }, + { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 1 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 1 }, + { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 1 }, + { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 1 }, + }; + static const TypeConversionCostTblEntry AVX2ConversionTbl[] = { { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 3 }, { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 3 }, { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 3 }, { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 3 }, - { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 1 }, - { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 1 }, - { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 1 }, - { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 3 }, + { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 3 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, + { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1 }, { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1 }, - { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 1 }, - { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i64, 
MVT::v4i16, 3 }, + { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 }, { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 }, { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 }, @@ -1432,18 +1448,24 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 8 }, }; + static const TypeConversionCostTblEntry AVXConversionTblWide[] = { + { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 4 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 4 }, + { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 4 }, + }; + static const TypeConversionCostTblEntry AVXConversionTbl[] = { { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 6 }, { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 4 }, { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 7 }, { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 4 }, - { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 4 }, + { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 6 }, { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 4 }, - { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 4 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 7 }, { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 4 }, { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 4 }, { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 4 }, - { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 4 }, + { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 6 }, { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 4 }, { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 4 }, @@ -1642,18 +1664,35 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, SimpleDstTy, SimpleSrcTy)) return Entry->Cost; + if (ST->hasAVX512() && ExperimentalVectorWideningLegalization) + if (const auto *Entry = ConvertCostTableLookup(AVX512FConversionTblWide, ISD, + SimpleDstTy, SimpleSrcTy)) + return Entry->Cost; + if (ST->hasAVX512()) if (const auto *Entry = ConvertCostTableLookup(AVX512FConversionTbl, ISD, SimpleDstTy, SimpleSrcTy)) return Entry->Cost; } + if (ST->hasAVX2() && ExperimentalVectorWideningLegalization) { + if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTblWide, ISD, + SimpleDstTy, SimpleSrcTy)) + return Entry->Cost; + } + if (ST->hasAVX2()) { if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD, SimpleDstTy, SimpleSrcTy)) return Entry->Cost; } + if (ST->hasAVX() && ExperimentalVectorWideningLegalization) { + if (const auto *Entry = ConvertCostTableLookup(AVXConversionTblWide, ISD, + SimpleDstTy, SimpleSrcTy)) + return Entry->Cost; + } + if (ST->hasAVX()) { if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD, SimpleDstTy, SimpleSrcTy)) @@ -2520,7 +2559,7 @@ int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy, // in the table. // FIXME: Is there a better way to do this? EVT VT = TLI->getValueType(DL, ValTy); - if (VT.isSimple()) { + if (VT.isSimple() && ExperimentalVectorWideningLegalization) { MVT MTy = VT.getSimpleVT(); if (IsPairwise) { if (ST->hasAVX()) |
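On the TargetTransformInfo side, every hunk follows the same lookup-order pattern: when the widening flag is set, the new *TblWide table is consulted first and the pre-existing table serves as the fallback. Below is a minimal sketch of that ordering; the CostEntry struct, the string keys, and the lookup helper are invented stand-ins for TypeConversionCostTblEntry and ConvertCostTableLookup, while the 1-versus-3 costs for sext v4i8 to v4i64 on AVX2 come from the tables in the diff.

```cpp
#include <cstdio>
#include <cstring>
#include <optional>
#include <vector>

// Invented stand-in for TypeConversionCostTblEntry (string-keyed for brevity).
struct CostEntry {
  const char *Op, *Dst, *Src;
  int Cost;
};

static std::optional<int> lookup(const std::vector<CostEntry> &Tbl,
                                 const char *Op, const char *Dst,
                                 const char *Src) {
  for (const CostEntry &E : Tbl)
    if (!std::strcmp(E.Op, Op) && !std::strcmp(E.Dst, Dst) &&
        !std::strcmp(E.Src, Src))
      return E.Cost;
  return std::nullopt;
}

int main() {
  bool WideningLegalization = true;  // models the cl::opt checked in the diff

  // "Wide" table: costs that only hold under widening legalization.
  const std::vector<CostEntry> AVX2Wide = {{"sext", "v4i64", "v4i8", 1}};
  // Base table: the promotion-legalization costs, used as the fallback.
  const std::vector<CostEntry> AVX2Base = {{"sext", "v4i64", "v4i8", 3}};

  std::optional<int> Cost;
  if (WideningLegalization)
    Cost = lookup(AVX2Wide, "sext", "v4i64", "v4i8");  // consulted first
  if (!Cost)
    Cost = lookup(AVX2Base, "sext", "v4i64", "v4i8");  // fallback

  std::printf("cost = %d\n", *Cost);  // 1 with widening, 3 without
  return 0;
}
```

Keeping the old entries in the base tables leaves the promotion-based costs reachable whenever the flag is off.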