Diffstat (limited to 'llvm/lib')
-rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.cpp        | 32
-rw-r--r-- | llvm/lib/Target/X86/X86InstrFragmentsSIMD.td   | 88
-rw-r--r-- | llvm/lib/Target/X86/X86InstrSSE.td             | 58
-rw-r--r-- | llvm/lib/Target/X86/X86Subtarget.h             | 21
-rw-r--r-- | llvm/lib/Target/X86/X86TargetTransformInfo.cpp | 19
5 files changed, 172 insertions, 46 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index da30be53fa1..eeae6724b50 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -970,6 +970,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::BITREVERSE, VT, Custom);
   }
 
+  // Special handling for masked gather of 2 elements
+  if (Subtarget.hasAVX2() && !Subtarget.hasAVX512())
+    setOperationAction(ISD::MGATHER, MVT::v2i64, Custom);
+
   if (!Subtarget.useSoftFloat() && Subtarget.hasFp256()) {
     bool HasInt256 = Subtarget.hasInt256();
 
@@ -24301,8 +24305,8 @@ static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
 
 static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
                             SelectionDAG &DAG) {
-  assert(Subtarget.hasAVX512() &&
-         "MGATHER/MSCATTER are supported on AVX-512 arch only");
+  assert(Subtarget.hasAVX2() &&
+         "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");
 
   MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
   SDLoc dl(Op);
@@ -24316,7 +24320,7 @@ static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
   unsigned NumElts = VT.getVectorNumElements();
   assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
 
-  if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
+  if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
       !Index.getSimpleValueType().is512BitVector()) {
     // AVX512F supports only 512-bit vectors. Or data or index should
     // be 512 bit wide. If now the both index and data are 256-bit, but
@@ -24359,7 +24363,7 @@ static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
     SDValue RetOps[] = {Extract, NewGather.getValue(1)};
     return DAG.getMergeValues(RetOps, dl);
   }
-  if (N->getMemoryVT() == MVT::v2i32 && Subtarget.hasVLX()) {
+  if (N->getMemoryVT() == MVT::v2i32) {
     // There is a special case when the return type is v2i32 is illegal and
     // the type legaizer extended it to v2i64. Without this conversion we end up
     // with VPGATHERQQ (reading q-words from the memory) instead of VPGATHERQD.
@@ -24367,16 +24371,26 @@ static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
     // with index v2i64 and value type v4i32.
     assert(VT == MVT::v2i64 && Src0.getValueType() == MVT::v2i64 &&
            "Unexpected type in masked gather");
-    Src0 = DAG.getVectorShuffle(MVT::v4i32, dl,
-                                DAG.getBitcast(MVT::v4i32, Src0),
-                                DAG.getUNDEF(MVT::v4i32), { 0, 2, -1, -1 });
+    Src0 =
+        DAG.getVectorShuffle(MVT::v4i32, dl, DAG.getBitcast(MVT::v4i32, Src0),
+                             DAG.getUNDEF(MVT::v4i32), { 0, 2, -1, -1 });
 
     // The mask should match the destination type. Extending mask with zeroes
     // is not necessary since instruction itself reads only two values from
     // memory.
+    SDVTList VTList;
+    if (Subtarget.hasVLX()) {
+      Mask = ExtendToType(Mask, MVT::v4i1, DAG, false);
+      VTList = DAG.getVTList(MVT::v4i32, MVT::v2i1, MVT::Other);
+    }
+    else {
+      Mask =
+          DAG.getVectorShuffle(MVT::v4i32, dl, DAG.getBitcast(MVT::v4i32, Mask),
+                               DAG.getUNDEF(MVT::v4i32), {0, 2, -1, -1});
+      VTList = DAG.getVTList(MVT::v4i32, MVT::Other);
+    }
     SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
     SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
-      DAG.getVTList(MVT::v4i32, MVT::v2i1, MVT::Other), Ops, dl,
-      N->getMemoryVT(), N->getMemOperand());
+        VTList, Ops, dl, N->getMemoryVT(), N->getMemOperand());
     SDValue Sext = getExtendInVec(X86ISD::VSEXT, dl, MVT::v2i64,
                                   NewGather.getValue(0), DAG);
diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
index f656b369a66..33d455739b0 100644
--- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -1101,3 +1101,91 @@ def masked_truncstore_us_vi32 : PatFrag<(ops node:$src1, node:$src2, node:$src3)
                   (X86MTruncUSStore node:$src1, node:$src2, node:$src3), [{
   return cast<MaskedTruncUSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32;
 }]>;
+
+// AVX2 special nodes
+// masked gather of AVX2 where mask elements are i32
+def avx2_x86_masked_gather_32 : SDNode<"X86ISD::MGATHER",
+    SDTypeProfile<2, 3, [
+      SDTCisVec<0>, SDTCisVec<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<1, 3>,
+      SDTCisPtrTy<4>, SDTCVecEltisVT<1, i32>, SDTCisSameNumEltsAs<0, 1>]>,
+    [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+
+def avx2_masked_gather_32 : SDNode<"ISD::MGATHER",
+    SDTypeProfile<2, 3, [
+      SDTCisVec<0>, SDTCisVec<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<1, 3>,
+      SDTCisPtrTy<4>, SDTCVecEltisVT<1, i32>, SDTCisSameNumEltsAs<0, 1>]>,
+    [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+
+// masked gather of AVX2 where mask elements are i64
+def avx2_masked_gather_64 : SDNode<"ISD::MGATHER",
+    SDTypeProfile<2, 3, [
+      SDTCisVec<0>, SDTCisVec<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<1, 3>,
+      SDTCisPtrTy<4>, SDTCVecEltisVT<1, i64>, SDTCisSameNumEltsAs<0, 1>]>,
+    [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+
+// dword gathers
+def avx2_mvpgatherdd_ps_xmm : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+                  (avx2_masked_gather_32 node:$src1, node:$src2, node:$src3) , [{
+  if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N))
+    return (Mgt->getIndex().getValueType() == MVT::v4i32 ||
+            Mgt->getBasePtr().getValueType() == MVT::v4i32);
+  return false;
+}]>;
+
+def avx2_mvpgatherqd_ps_xmm : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+                  (avx2_x86_masked_gather_32 node:$src1, node:$src2, node:$src3) , [{
+  if (X86MaskedGatherSDNode *Mgt = dyn_cast<X86MaskedGatherSDNode>(N))
+    return (Mgt->getIndex().getValueType() == MVT::v2i64 ||
+            Mgt->getBasePtr().getValueType() == MVT::v2i64);
+  return false;
+}]>;
+
+def avx2_mvpgatherdd_ps_ymm : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+                  (avx2_masked_gather_32 node:$src1, node:$src2, node:$src3) , [{
+  if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N))
+    return (Mgt->getIndex().getValueType() == MVT::v8i32 ||
+            Mgt->getBasePtr().getValueType() == MVT::v8i32);
+  return false;
+}]>;
+
+def avx2_mvpgatherqd_ps_ymm : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+                  (avx2_masked_gather_32 node:$src1, node:$src2, node:$src3) , [{
+  if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N))
+    return (Mgt->getIndex().getValueType() == MVT::v4i64 ||
+            Mgt->getBasePtr().getValueType() == MVT::v4i64);
+  return false;
+}]>;
+
+// qwords
+def avx2_mvpgatherdq_pd_xmm : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+                  (avx2_masked_gather_64 node:$src1, node:$src2, node:$src3) , [{
+  if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N))
+    return (Mgt->getIndex().getValueType() == MVT::v2i32 ||
+            Mgt->getBasePtr().getValueType() == MVT::v2i32);
+  return false;
+}]>;
+
+def avx2_mvpgatherqq_pd_xmm : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+                  (avx2_masked_gather_64 node:$src1, node:$src2, node:$src3) , [{
+  if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N))
+    return (Mgt->getIndex().getValueType() == MVT::v2i64 ||
+            Mgt->getBasePtr().getValueType() == MVT::v2i64) &&
+           Mgt->getMemoryVT().is128BitVector();
+  return false;
+}]>;
+
+def avx2_mvpgatherdq_pd_ymm : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+                  (avx2_masked_gather_64 node:$src1, node:$src2, node:$src3) , [{
+  if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N))
+    return (Mgt->getIndex().getValueType() == MVT::v4i32 ||
+            Mgt->getBasePtr().getValueType() == MVT::v4i32);
+  return false;
+}]>;
+
+def avx2_mvpgatherqq_pd_ymm : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+                  (avx2_masked_gather_64 node:$src1, node:$src2, node:$src3) , [{
+  if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N))
+    return (Mgt->getIndex().getValueType() == MVT::v4i64 ||
+            Mgt->getBasePtr().getValueType() == MVT::v4i64);
+  return false;
+}]>;
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index dc1eb3e8963..7e20951b588 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -8326,36 +8326,52 @@ let Predicates = [HasAVX2, NoVLX] in {
 //===----------------------------------------------------------------------===//
 // VGATHER - GATHER Operations
-multiclass avx2_gather<bits<8> opc, string OpcodeStr, RegisterClass RC256,
+multiclass avx2_gather<bits<8> opc, string OpcodeStr, ValueType VTx,
+                       ValueType VTy, PatFrag GatherNode128,
+                       PatFrag GatherNode256, RegisterClass RC256,
                        X86MemOperand memop128, X86MemOperand memop256> {
   def rm  : AVX28I<opc, MRMSrcMem4VOp3, (outs VR128:$dst, VR128:$mask_wb),
             (ins VR128:$src1, memop128:$src2, VR128:$mask),
             !strconcat(OpcodeStr,
               "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
-            []>, VEX;
+            [(set (VTx VR128:$dst), VR128:$mask_wb,
+                  (GatherNode128 (VTx VR128:$src1), VR128:$mask,
+                                 vectoraddr:$src2))]>, VEX;
   def Yrm : AVX28I<opc, MRMSrcMem4VOp3, (outs RC256:$dst, RC256:$mask_wb),
             (ins RC256:$src1, memop256:$src2, RC256:$mask),
             !strconcat(OpcodeStr,
              "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
-            []>, VEX, VEX_L;
-}
-
-let mayLoad = 1, hasSideEffects = 0, Constraints
-  = "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb"
-  in {
-  defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq", VR256, vx128mem, vx256mem>, VEX_W;
-  defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq", VR256, vx128mem, vy256mem>, VEX_W;
-  defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd", VR256, vx128mem, vy256mem>;
-  defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd", VR128, vx64mem, vy128mem>;
-
-  let ExeDomain = SSEPackedDouble in {
-    defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", VR256, vx128mem, vx256mem>, VEX_W;
-    defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", VR256, vx128mem, vy256mem>, VEX_W;
-  }
-
-  let ExeDomain = SSEPackedSingle in {
-    defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", VR256, vx128mem, vy256mem>;
-    defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", VR128, vx64mem, vy128mem>;
+            [(set (VTy RC256:$dst), RC256:$mask_wb,
+                  (GatherNode256 (VTy RC256:$src1), RC256:$mask,
+                                 vectoraddr:$src2))]>, VEX, VEX_L;
+}
+
+let Predicates = [UseAVX2] in {
+  let mayLoad = 1, hasSideEffects = 0, Constraints
+    = "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb"
+    in {
+    defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq", v2i64, v4i64, avx2_mvpgatherdq_pd_xmm,
+                        avx2_mvpgatherdq_pd_ymm, VR256, vx128mem, vx256mem>, VEX_W;
+    defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq", v2i64, v4i64, avx2_mvpgatherqq_pd_xmm,
+                        avx2_mvpgatherqq_pd_ymm, VR256, vx128mem, vy256mem>, VEX_W;
+    defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd", v4i32, v8i32, avx2_mvpgatherdd_ps_xmm,
+                        avx2_mvpgatherdd_ps_ymm, VR256, vx128mem, vy256mem>;
+    defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd", v4i32, v4i32, avx2_mvpgatherqd_ps_xmm,
+                        avx2_mvpgatherqd_ps_ymm, VR128, vx64mem, vy128mem>;
+
+    let ExeDomain = SSEPackedDouble in {
+      defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", v2f64, v4f64, avx2_mvpgatherdq_pd_xmm,
+                          avx2_mvpgatherdq_pd_ymm, VR256, vx128mem, vx256mem>, VEX_W;
+      defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", v2f64, v4f64, avx2_mvpgatherqq_pd_xmm,
+                          avx2_mvpgatherqq_pd_ymm, VR256, vx128mem, vy256mem>, VEX_W;
+    }
+
+    let ExeDomain = SSEPackedSingle in {
+      defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", v4f32, v8f32, avx2_mvpgatherdd_ps_xmm,
+                          avx2_mvpgatherdd_ps_ymm, VR256, vx128mem, vy256mem>;
+      defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", v4f32, v4f32, avx2_mvpgatherqd_ps_xmm,
+                          avx2_mvpgatherqd_ps_ymm, VR128, vx64mem, vy128mem>;
+    }
   }
 }
diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h
index 56587bc45fb..0b1eddcb631 100644
--- a/llvm/lib/Target/X86/X86Subtarget.h
+++ b/llvm/lib/Target/X86/X86Subtarget.h
@@ -51,17 +51,9 @@ enum Style {
 } // end namespace PICStyles
 
 class X86Subtarget final : public X86GenSubtargetInfo {
-protected:
-  enum X86SSEEnum {
-    NoSSE, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2, AVX512F
-  };
-
-  enum X863DNowEnum {
-    NoThreeDNow, MMX, ThreeDNow, ThreeDNowA
-  };
-
+public:
   enum X86ProcFamilyEnum {
-    Others,
+    Others,
     IntelAtom,
     IntelSLM,
     IntelGLM,
@@ -74,6 +66,15 @@ protected:
     IntelIcelake,
   };
 
+protected:
+  enum X86SSEEnum {
+    NoSSE, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2, AVX512F
+  };
+
+  enum X863DNowEnum {
+    NoThreeDNow, MMX, ThreeDNow, ThreeDNowA
+  };
+
   /// X86 processor family: Intel Atom, and others
   X86ProcFamilyEnum X86ProcFamily;
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index e4505b29e6c..9328afc93e2 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -2368,8 +2368,9 @@ int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, Value *Ptr,
 
   // Trying to reduce IndexSize to 32 bits for vector 16.
   // By default the IndexSize is equal to pointer size.
-  unsigned IndexSize = (VF >= 16) ? getIndexSizeInBits(Ptr, DL) :
-    DL.getPointerSizeInBits();
+  unsigned IndexSize = (ST->hasAVX512() && VF >= 16)
+                           ? getIndexSizeInBits(Ptr, DL)
+                           : DL.getPointerSizeInBits();
 
   Type *IndexVTy = VectorType::get(IntegerType::get(SrcVTy->getContext(),
                                                     IndexSize), VF);
@@ -2385,7 +2386,9 @@ int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, Value *Ptr,
 
   // The gather / scatter cost is given by Intel architects. It is a rough
   // number since we are looking at one instruction in a time.
-  const int GSOverhead = 2;
+  const int GSOverhead = (Opcode == Instruction::Load)
+                             ? ST->getGatherOverhead()
+                             : ST->getScatterOverhead();
   return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
                                            Alignment, AddressSpace);
 }
@@ -2456,7 +2459,7 @@ int X86TTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *SrcVTy,
   // the mask vector will add more instructions. Right now we give the scalar
   // cost of vector-4 for KNL. TODO: Check, maybe the gather/scatter instruction
   // is better in the VariableMask case.
-  if (VF == 2 || (VF == 4 && !ST->hasVLX()))
+  if (ST->hasAVX512() && (VF == 2 || (VF == 4 && !ST->hasVLX())))
     Scalarize = true;
 
   if (Scalarize)
@@ -2515,11 +2518,15 @@ bool X86TTIImpl::isLegalMaskedGather(Type *DataTy) {
   int DataWidth = isa<PointerType>(ScalarTy) ?
     DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits();
 
-  // AVX-512 allows gather and scatter
-  return (DataWidth == 32 || DataWidth == 64) && ST->hasAVX512();
+  // AVX-512 and Skylake AVX2 allows gather and scatter
+  return (DataWidth == 32 || DataWidth == 64) && (ST->hasAVX512() ||
+    ST->getProcFamily() == X86Subtarget::IntelSkylake);
 }
 
 bool X86TTIImpl::isLegalMaskedScatter(Type *DataType) {
+  // AVX2 doesn't support scatter
+  if (!ST->hasAVX512())
+    return false;
   return isLegalMaskedGather(DataType);
 }
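For orientation, the change is exercised from IR through the llvm.masked.gather intrinsic. The sketch below is illustrative and not part of the commit; the intrinsic name mangling and the exact pipeline behavior vary between LLVM versions, and the file name gather.ll is made up for the example.

; gather.ll -- a 4-element masked gather of i32 values.
; On x86-64 the pointers are 64-bit, so the index vector is v4i64 and the
; QD form of the gather (vpgatherqd) is the one the new patterns match.
declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>)

define <4 x i32> @gather4(<4 x i32*> %ptrs, <4 x i1> %mask, <4 x i32> %passthru) {
  %g = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4,
                                                        <4 x i1> %mask,
                                                        <4 x i32> %passthru)
  ret <4 x i32> %g
}

With this patch applied, compiling the above with something like llc -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake should be able to select a vpgatherqd instead of scalarizing the access. Note that a bare -mattr=+avx2 target still scalarizes the intrinsic, because isLegalMaskedGather() only reports gathers as legal on AVX-512 targets or on the IntelSkylake processor family, and isLegalMaskedScatter() continues to require AVX-512.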