author | Elena Demikhovsky <elena.demikhovsky@intel.com> | 2017-01-02 10:37:52 +0000
---|---|---
committer | Elena Demikhovsky <elena.demikhovsky@intel.com> | 2017-01-02 10:37:52 +0000
commit | 21706cbd248804520cb093ec17d296e48a89a65b (patch) |
tree | 044c374e508b92a6e1d3f469bcf1078e4b485ae0 | /llvm/lib
parent | f7d84ee6ff35c6bfe950bbf451c3e84ef23c194f (diff) |
download | bcm5719-llvm-21706cbd248804520cb093ec17d296e48a89a65b.tar.gz | bcm5719-llvm-21706cbd248804520cb093ec17d296e48a89a65b.zip
AVX-512 Loop Vectorizer: Cost calculation for interleave load/store patterns.
X86 target does not provide any target-specific cost calculation for interleave patterns. It uses the common target-independent calculation, which gives very high numbers. As a result, the scalar version is chosen in many cases. The overestimate is even worse on AVX-512, whose 3-source shuffles significantly reduce the real cost of these patterns.
In this patch I calculate the cost on AVX-512. It will allow comparing the interleave pattern with gather/scatter and choosing the better solution (PR31426).
* Shuffle-broadcast cost will be changed in Simon's upcoming patch.
Differential Revision: https://reviews.llvm.org/D28118
llvm-svn: 290810
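To make the new formulas concrete, here is a minimal standalone sketch (plain C++, not the LLVM TTI API) that hand-evaluates what `getInterleavedMemoryOpCostAVX512` in the diff below computes for the `<12 x i32>` example from its own comments (VF = 4, Factor = 3, i32 elements). The hardcoded sub-costs — one v16i32 memory operation and vpermd/vpermt2d shuffles of cost 1 — and the assumption that v12i32 widens to a single v16i32 register during type legalization are simplifications taken from the patch's tables, not values queried from the real cost model.

```cpp
#include <algorithm>
#include <cstdio>

// Interleaved LOAD of <12 x i32>, Factor = 3: one v16i32 load feeds three
// single-source shuffles (vpermd), one per extracted <4 x i32> result.
int interleavedLoadCostSketch() {
  unsigned Factor = 3;
  unsigned NumOfMemOps = (48 + 64 - 1) / 64;            // 48-byte type, 64-byte regs -> 1
  unsigned MemOpCost = 1;                               // one v16i32 load (assumed)
  unsigned ShuffleCost = 1;                             // vpermd (assumed table cost)
  unsigned NumOfResults = 1 * Factor;                   // <4 x i32> is one legal part -> 3
  unsigned NumOfUnfoldedLoads =
      NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2; // 1
  unsigned NumOfShufflesPerResult = std::max(1u, NumOfMemOps - 1); // 1
  unsigned NumOfMoves = 0;                              // single-src shuffle: no copies
  return NumOfResults * NumOfShufflesPerResult * ShuffleCost +
         NumOfUnfoldedLoads * MemOpCost + NumOfMoves;   // 3 + 1 + 0 = 4
}

// Interleaved STORE of <12 x i32>, Factor = 3: each memory op is preceded by
// Factor - 1 two-source shuffles (vpermt2d) plus moves to preserve sources.
int interleavedStoreCostSketch() {
  unsigned Factor = 3, NumOfMemOps = 1, MemOpCost = 1, ShuffleCost = 1;
  unsigned NumOfShufflesPerStore = Factor - 1;                   // 2
  unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2; // 1
  return NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
         NumOfMoves;                                    // 1 * (1 + 2) + 1 = 4
}

int main() {
  std::printf("load: %d  store: %d\n", interleavedLoadCostSketch(),
              interleavedStoreCostSketch());            // load: 4  store: 4
}
```

The generic BaseT estimate for the same access pattern roughly scalarizes the shuffles into per-element extract/insert pairs, which is why the interleaved version previously looked far more expensive than it actually is on AVX-512.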
Diffstat (limited to 'llvm/lib')
-rw-r--r-- | llvm/lib/Analysis/CostModel.cpp | 32
-rw-r--r-- | llvm/lib/Target/X86/X86TargetTransformInfo.cpp | 252
-rw-r--r-- | llvm/lib/Target/X86/X86TargetTransformInfo.h | 7
3 files changed, 282 insertions, 9 deletions
diff --git a/llvm/lib/Analysis/CostModel.cpp b/llvm/lib/Analysis/CostModel.cpp
index 870c446b24a..67d1773f081 100644
--- a/llvm/lib/Analysis/CostModel.cpp
+++ b/llvm/lib/Analysis/CostModel.cpp
@@ -97,6 +97,27 @@ static bool isReverseVectorMask(ArrayRef<int> Mask) {
   return true;
 }
 
+static bool isSingleSourceVectorMask(ArrayRef<int> Mask) {
+  bool Vec0 = false;
+  bool Vec1 = false;
+  for (unsigned i = 0, NumVecElts = Mask.size(); i < NumVecElts; ++i) {
+    if (Mask[i] >= 0) {
+      if ((unsigned)Mask[i] >= NumVecElts)
+        Vec1 = true;
+      else
+        Vec0 = true;
+    }
+  }
+  return !(Vec0 && Vec1);
+}
+
+static bool isZeroEltBroadcastVectorMask(ArrayRef<int> Mask) {
+  for (unsigned i = 0; i < Mask.size(); ++i)
+    if (Mask[i] > 0)
+      return false;
+  return true;
+}
+
 static bool isAlternateVectorMask(ArrayRef<int> Mask) {
   bool isAlternate = true;
   unsigned MaskSize = Mask.size();
@@ -501,6 +522,17 @@ unsigned CostModelAnalysis::getInstructionCost(const Instruction *I) const {
       if (isAlternateVectorMask(Mask))
         return TTI->getShuffleCost(TargetTransformInfo::SK_Alternate,
                                    VecTypOp0, 0, nullptr);
+
+      if (isZeroEltBroadcastVectorMask(Mask))
+        return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast,
+                                   VecTypOp0, 0, nullptr);
+
+      if (isSingleSourceVectorMask(Mask))
+        return TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
+                                   VecTypOp0, 0, nullptr);
+
+      return TTI->getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc,
+                                 VecTypOp0, 0, nullptr);
     }
 
     return -1;
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index db563c08632..5c021302d8d 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -598,9 +598,6 @@ int X86TTIImpl::getArithmeticInstrCost(
 
 int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
                                Type *SubTp) {
-  // We only estimate the cost of reverse and alternate shuffles.
-  if (Kind != TTI::SK_Reverse && Kind != TTI::SK_Alternate)
-    return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
 
   if (Kind == TTI::SK_Reverse) {
     std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
@@ -700,9 +697,8 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
     if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, ISD::VECTOR_SHUFFLE,
                                             LT.second))
       return LT.first * Entry->Cost;
-  }
 
-  if (Kind == TTI::SK_Alternate) {
+  } else if (Kind == TTI::SK_Alternate) {
     // 64-bit packed float vectors (v2f32) are widened to type v4f32.
     // 64-bit packed integer vectors (v2i32) are promoted to type v2i64.
     std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
@@ -792,7 +788,132 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
     if (const auto *Entry = CostTableLookup(SSEAltShuffleTbl, ISD::VECTOR_SHUFFLE,
                                             LT.second))
       return LT.first * Entry->Cost;
-    return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
+
+  } else if (Kind == TTI::SK_PermuteTwoSrc) {
+    // We assume that source and destination have the same vector type.
+    std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
+    int NumOfDests = LT.first;
+    int NumOfShufflesPerDest = LT.first * 2 - 1;
+    int NumOfShuffles = NumOfDests * NumOfShufflesPerDest;
+
+    static const CostTblEntry AVX512VBMIShuffleTbl[] = {
+        {ISD::VECTOR_SHUFFLE, MVT::v64i8, 1}, // vpermt2b
+        {ISD::VECTOR_SHUFFLE, MVT::v32i8, 1}, // vpermt2b
+        {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1}  // vpermt2b
+    };
+
+    if (ST->hasVBMI())
+      if (const auto *Entry = CostTableLookup(AVX512VBMIShuffleTbl,
+                                              ISD::VECTOR_SHUFFLE, LT.second))
+        return NumOfShuffles * Entry->Cost;
+
+    static const CostTblEntry AVX512BWShuffleTbl[] = {
+        {ISD::VECTOR_SHUFFLE, MVT::v32i16, 1}, // vpermt2w
+        {ISD::VECTOR_SHUFFLE, MVT::v16i16, 1}, // vpermt2w
+        {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},  // vpermt2w
+        {ISD::VECTOR_SHUFFLE, MVT::v32i8, 3},  // zext + vpermt2w + trunc
+        {ISD::VECTOR_SHUFFLE, MVT::v64i8, 19}, // 6 * v32i8 + 1
+        {ISD::VECTOR_SHUFFLE, MVT::v16i8, 3}   // zext + vpermt2w + trunc
+    };
+
+    if (ST->hasBWI())
+      if (const auto *Entry = CostTableLookup(AVX512BWShuffleTbl,
+                                              ISD::VECTOR_SHUFFLE, LT.second))
+        return NumOfShuffles * Entry->Cost;
+
+    static const CostTblEntry AVX512ShuffleTbl[] = {
+        {ISD::VECTOR_SHUFFLE, MVT::v8f64, 1},  // vpermt2pd
+        {ISD::VECTOR_SHUFFLE, MVT::v16f32, 1}, // vpermt2ps
+        {ISD::VECTOR_SHUFFLE, MVT::v8i64, 1},  // vpermt2q
+        {ISD::VECTOR_SHUFFLE, MVT::v16i32, 1}, // vpermt2d
+        {ISD::VECTOR_SHUFFLE, MVT::v4f64, 1},  // vpermt2pd
+        {ISD::VECTOR_SHUFFLE, MVT::v8f32, 1},  // vpermt2ps
+        {ISD::VECTOR_SHUFFLE, MVT::v4i64, 1},  // vpermt2q
+        {ISD::VECTOR_SHUFFLE, MVT::v8i32, 1},  // vpermt2d
+        {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},  // vpermt2pd
+        {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},  // vpermt2ps
+        {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},  // vpermt2q
+        {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1}   // vpermt2d
+    };
+
+    if (ST->hasAVX512())
+      if (const auto *Entry =
+              CostTableLookup(AVX512ShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
+        return NumOfShuffles * Entry->Cost;
+
+  } else if (Kind == TTI::SK_PermuteSingleSrc) {
+    std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
+    if (LT.first == 1) {
+
+      static const CostTblEntry AVX512VBMIShuffleTbl[] = {
+          {ISD::VECTOR_SHUFFLE, MVT::v64i8, 1}, // vpermb
+          {ISD::VECTOR_SHUFFLE, MVT::v32i8, 1}  // vpermb
+      };
+
+      if (ST->hasVBMI())
+        if (const auto *Entry = CostTableLookup(AVX512VBMIShuffleTbl,
+                                                ISD::VECTOR_SHUFFLE, LT.second))
+          return Entry->Cost;
+
+      static const CostTblEntry AVX512BWShuffleTbl[] = {
+          {ISD::VECTOR_SHUFFLE, MVT::v32i16, 1}, // vpermw
+          {ISD::VECTOR_SHUFFLE, MVT::v16i16, 1}, // vpermw
+          {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},  // vpermw
+          {ISD::VECTOR_SHUFFLE, MVT::v64i8, 8},  // extend to v32i16
+          {ISD::VECTOR_SHUFFLE, MVT::v32i8, 3}   // vpermw + zext/trunc
+      };
+
+      if (ST->hasBWI())
+        if (const auto *Entry = CostTableLookup(AVX512BWShuffleTbl,
+                                                ISD::VECTOR_SHUFFLE, LT.second))
+          return Entry->Cost;
+
+      static const CostTblEntry AVX512ShuffleTbl[] = {
+          {ISD::VECTOR_SHUFFLE, MVT::v8f64, 1},  // vpermpd
+          {ISD::VECTOR_SHUFFLE, MVT::v4f64, 1},  // vpermpd
+          {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},  // vpermpd
+          {ISD::VECTOR_SHUFFLE, MVT::v16f32, 1}, // vpermps
+          {ISD::VECTOR_SHUFFLE, MVT::v8f32, 1},  // vpermps
+          {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},  // vpermps
+          {ISD::VECTOR_SHUFFLE, MVT::v8i64, 1},  // vpermq
+          {ISD::VECTOR_SHUFFLE, MVT::v4i64, 1},  // vpermq
+          {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},  // vpermq
+          {ISD::VECTOR_SHUFFLE, MVT::v16i32, 1}, // vpermd
+          {ISD::VECTOR_SHUFFLE, MVT::v8i32, 1},  // vpermd
+          {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},  // vpermd
+          {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1}   // pshufb
+      };
+
+      if (ST->hasAVX512())
+        if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl,
+                                                ISD::VECTOR_SHUFFLE, LT.second))
+          return Entry->Cost;
+
+    } else {
+      // We are going to permute multiple sources and the result will be in
+      // multiple destinations. We provide an accurate cost only for splits
+      // where the element type remains the same.
+
+      MVT LegalVT = LT.second;
+      if (LegalVT.getVectorElementType().getSizeInBits() ==
+              Tp->getVectorElementType()->getPrimitiveSizeInBits() &&
+          LegalVT.getVectorNumElements() < Tp->getVectorNumElements()) {
+
+        unsigned VecTySize = DL.getTypeStoreSize(Tp);
+        unsigned LegalVTSize = LegalVT.getStoreSize();
+        // Number of source vectors after legalization:
+        unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
+        // Number of destination vectors after legalization:
+        unsigned NumOfDests = LT.first;
+
+        Type *SingleOpTy = VectorType::get(Tp->getVectorElementType(),
+                                           LegalVT.getVectorNumElements());
+
+        unsigned NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
+        return NumOfShuffles *
+               getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, 0, nullptr);
+      }
+    }
   }
 
   return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
@@ -1942,13 +2063,14 @@ int X86TTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *SrcVTy,
   // Vector-4 of gather/scatter instruction does not exist on KNL.
   // We can extend it to 8 elements, but zeroing upper bits of
   // the mask vector will add more instructions. Right now we give the scalar
-  // cost of vector-4 for KNL. TODO: Check, maybe the gather/scatter instruction is
-  // better in the VariableMask case.
+  // cost of vector-4 for KNL. TODO: Check, maybe the gather/scatter
+  // instruction is better in the VariableMask case.
   if (VF == 2 || (VF == 4 && !ST->hasVLX()))
     Scalarize = true;
 
   if (Scalarize)
-    return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment, AddressSpace);
+    return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment,
+                           AddressSpace);
 
   return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace);
 }
@@ -2013,3 +2135,115 @@ bool X86TTIImpl::enableInterleavedAccessVectorization() {
   // As a temporary solution, disable on Atom.
   return !(ST->isAtom() || ST->isSLM());
 }
+
+// Get an estimate for interleaved load/store operations and strided loads.
+// \p Indices contains the indices for a strided load.
+// \p Factor - the factor of interleaving.
+// AVX-512 provides 3-source shuffles that significantly reduce the cost.
+int X86TTIImpl::getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy,
+                                                 unsigned Factor,
+                                                 ArrayRef<unsigned> Indices,
+                                                 unsigned Alignment,
+                                                 unsigned AddressSpace) {
+
+  // VecTy for an interleaved memop is <VF*Factor x Elt>.
+  // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
+  // VecTy = <12 x i32>.
+
+  // Calculate the number of memory operations (NumOfMemOps) required
+  // to load/store the VecTy.
+  MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
+  unsigned VecTySize = DL.getTypeStoreSize(VecTy);
+  unsigned LegalVTSize = LegalVT.getStoreSize();
+  unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
+
+  // Get the cost of one memory operation.
+  Type *SingleMemOpTy = VectorType::get(VecTy->getVectorElementType(),
+                                        LegalVT.getVectorNumElements());
+  unsigned MemOpCost =
+      getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace);
+
+  if (Opcode == Instruction::Load) {
+    // The kind of shuffle depends on the number of loaded values.
+    // If we load the entire data in one register, we can use a 1-src shuffle.
+    // Otherwise, we'll merge 2 sources in each operation.
+    TTI::ShuffleKind ShuffleKind =
+        (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc;
+
+    unsigned ShuffleCost =
+        getShuffleCost(ShuffleKind, SingleMemOpTy, 0, nullptr);
+
+    unsigned NumOfLoadsInInterleaveGrp =
+        Indices.size() ? Indices.size() : Factor;
+    Type *ResultTy = VectorType::get(VecTy->getVectorElementType(),
+                                     VecTy->getVectorNumElements() / Factor);
+    unsigned NumOfResults =
+        getTLI()->getTypeLegalizationCost(DL, ResultTy).first *
+        NumOfLoadsInInterleaveGrp;
+
+    // About half of the loads may be folded into the shuffles when we have
+    // only one result. If we have more than one result, we do not fold loads
+    // at all.
+    unsigned NumOfUnfoldedLoads =
+        NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2;
+
+    // Get the number of shuffle operations per result.
+    unsigned NumOfShufflesPerResult =
+        std::max((unsigned)1, (unsigned)(NumOfMemOps - 1));
+
+    // The SK_PermuteTwoSrc shuffle clobbers one of its source operands.
+    // When we have more than one destination, we need additional instructions
+    // to keep sources.
+    unsigned NumOfMoves = 0;
+    if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc)
+      NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;
+
+    int Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
+               NumOfUnfoldedLoads * MemOpCost + NumOfMoves;
+
+    return Cost;
+  }
+
+  // Store.
+  assert(Opcode == Instruction::Store &&
+         "Expected Store Instruction at this point");
+
+  // There are no strided stores at the moment, and a store cannot be folded
+  // into a shuffle.
+  unsigned NumOfSources = Factor; // The number of values to be merged.
+  unsigned ShuffleCost =
+      getShuffleCost(TTI::SK_PermuteTwoSrc, SingleMemOpTy, 0, nullptr);
+  unsigned NumOfShufflesPerStore = NumOfSources - 1;
+
+  // The SK_PermuteTwoSrc shuffle clobbers one of its source operands.
+  // We need additional instructions to keep sources.
+  unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
+  int Cost = NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
+             NumOfMoves;
+  return Cost;
+}
+
+int X86TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
+                                           unsigned Factor,
+                                           ArrayRef<unsigned> Indices,
+                                           unsigned Alignment,
+                                           unsigned AddressSpace) {
+  auto isSupportedOnAVX512 = [](Type *VecTy, bool &RequiresBW) {
+    RequiresBW = false;
+    Type *EltTy = VecTy->getVectorElementType();
+    if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
+        EltTy->isIntegerTy(32) || EltTy->isPointerTy())
+      return true;
+    if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8)) {
+      RequiresBW = true;
+      return true;
+    }
+    return false;
+  };
+  bool RequiresBW;
+  bool HasAVX512Solution = isSupportedOnAVX512(VecTy, RequiresBW);
+  if (ST->hasAVX512() && HasAVX512Solution && (!RequiresBW || ST->hasBWI()))
+    return getInterleavedMemoryOpCostAVX512(Opcode, VecTy, Factor, Indices,
+                                            Alignment, AddressSpace);
+  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+                                           Alignment, AddressSpace);
+}
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index 4c256630e92..f6bcb9f569e 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -80,6 +80,13 @@ public:
   int getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm);
 
+  int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
+                                 unsigned Factor, ArrayRef<unsigned> Indices,
+                                 unsigned Alignment, unsigned AddressSpace);
+  int getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy,
+                                       unsigned Factor, ArrayRef<unsigned> Indices,
+                                       unsigned Alignment, unsigned AddressSpace);
+
   int getIntImmCost(int64_t);
   int getIntImmCost(const APInt &Imm, Type *Ty);
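For reference, the two mask classifiers added to CostModel.cpp above can be exercised in isolation. The sketch below is a standalone re-implementation (std::vector in place of ArrayRef, so it builds without LLVM headers) showing which shuffle kind typical <4 x i32> masks map to; indices 0-3 select from the first source vector, 4-7 from the second, and negative entries mean undef.

```cpp
#include <cstdio>
#include <vector>

// Standalone copies of the CostModel.cpp helpers above.
static bool isSingleSourceVectorMask(const std::vector<int> &Mask) {
  bool Vec0 = false, Vec1 = false;
  unsigned NumVecElts = Mask.size();
  for (unsigned i = 0; i < NumVecElts; ++i) {
    if (Mask[i] >= 0) {
      if ((unsigned)Mask[i] >= NumVecElts)
        Vec1 = true;  // element taken from the second source
      else
        Vec0 = true;  // element taken from the first source
    }
  }
  return !(Vec0 && Vec1);
}

static bool isZeroEltBroadcastVectorMask(const std::vector<int> &Mask) {
  for (unsigned i = 0; i < Mask.size(); ++i)
    if (Mask[i] > 0)
      return false;
  return true;
}

int main() {
  std::vector<int> Broadcast = {0, 0, 0, 0};  // SK_Broadcast
  std::vector<int> OneSrc    = {2, 3, 0, 1};  // SK_PermuteSingleSrc
  std::vector<int> TwoSrc    = {0, 4, 1, 5};  // SK_PermuteTwoSrc (interleave)
  std::printf("%d %d\n", isZeroEltBroadcastVectorMask(Broadcast),
              isSingleSourceVectorMask(Broadcast));  // 1 1
  std::printf("%d %d\n", isZeroEltBroadcastVectorMask(OneSrc),
              isSingleSourceVectorMask(OneSrc));     // 0 1
  std::printf("%d %d\n", isZeroEltBroadcastVectorMask(TwoSrc),
              isSingleSourceVectorMask(TwoSrc));     // 0 0
}
```

Note that getInstructionCost tries the broadcast check before the single-source check, so a mask that satisfies both (such as the all-zero mask) is costed as SK_Broadcast.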
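Similarly, the SK_PermuteTwoSrc branch of getShuffleCost charges extra shuffles when legalization splits the type. A hand-evaluated sketch for a hypothetical Tp = <32 x i32>, assuming AVX-512 legalizes it into two v16i32 halves (LT.first == 2) and that vpermt2d costs 1 as in the patch's table:

```cpp
// Hand-evaluated SK_PermuteTwoSrc cost for a type that splits in two,
// e.g. <32 x i32> -> 2 x v16i32 on AVX-512 (assumed LT.first == 2).
int permuteTwoSrcSplitCostSketch() {
  int NumOfDests = 2;                                     // LT.first
  int NumOfShufflesPerDest = 2 * 2 - 1;                   // LT.first * 2 - 1 = 3
  int NumOfShuffles = NumOfDests * NumOfShufflesPerDest;  // 2 * 3 = 6
  return NumOfShuffles * 1;                               // vpermt2d cost 1 -> 6
}
```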