author    Elena Demikhovsky <elena.demikhovsky@intel.com>  2017-01-02 10:37:52 +0000
committer Elena Demikhovsky <elena.demikhovsky@intel.com>  2017-01-02 10:37:52 +0000
commit    21706cbd248804520cb093ec17d296e48a89a65b (patch)
tree      044c374e508b92a6e1d3f469bcf1078e4b485ae0 /llvm/lib
parent    f7d84ee6ff35c6bfe950bbf451c3e84ef23c194f (diff)
AVX-512 Loop Vectorizer: Cost calculation for interleave load/store patterns.
The X86 target does not provide any target-specific cost calculation for interleave patterns. It uses the common target-independent calculation, which gives very high numbers. As a result, the scalar version is chosen in many cases. The situation on AVX-512 is even worse, since we have 3-src shuffles that significantly reduce the cost.

In this patch I calculate the cost on AVX-512. It will allow comparing the interleave pattern with gather/scatter and choosing the better solution (PR31426).

* Shuffle-broadcast cost will be changed in Simon's upcoming patch.

Differential Revision: https://reviews.llvm.org/D28118

llvm-svn: 290810
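For context (an illustration added here, not part of the commit message): an interleave group is a set of memory accesses with a common stride; the vectorizer can replace them with wide vector loads/stores plus de-interleaving shuffles. A hypothetical factor-3 load group in C++ might look like this:

// Hypothetical factor-3 interleaved loads: x, y and z are consecutive in
// memory, so P[i].x, P[i].y, P[i].z form a stride-3 interleave group. The
// vectorizer can widen them into wide loads followed by shuffles, and this
// patch makes those shuffles cheap to model on AVX-512 (vpermt2*).
struct Point { float x, y, z; };

float sumComponents(const Point *P, int N) {
  float S = 0.0f;
  for (int i = 0; i < N; ++i)
    S += P[i].x + P[i].y + P[i].z;
  return S;
}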
Diffstat (limited to 'llvm/lib')
-rw-r--r--  llvm/lib/Analysis/CostModel.cpp                |  32
-rw-r--r--  llvm/lib/Target/X86/X86TargetTransformInfo.cpp | 252
-rw-r--r--  llvm/lib/Target/X86/X86TargetTransformInfo.h   |   7
3 files changed, 282 insertions, 9 deletions
diff --git a/llvm/lib/Analysis/CostModel.cpp b/llvm/lib/Analysis/CostModel.cpp
index 870c446b24a..67d1773f081 100644
--- a/llvm/lib/Analysis/CostModel.cpp
+++ b/llvm/lib/Analysis/CostModel.cpp
@@ -97,6 +97,27 @@ static bool isReverseVectorMask(ArrayRef<int> Mask) {
return true;
}
+static bool isSingleSourceVectorMask(ArrayRef<int> Mask) {
+ bool Vec0 = false;
+ bool Vec1 = false;
+ for (unsigned i = 0, NumVecElts = Mask.size(); i < NumVecElts; ++i) {
+ if (Mask[i] >= 0) {
+ if ((unsigned)Mask[i] >= NumVecElts)
+ Vec1 = true;
+ else
+ Vec0 = true;
+ }
+ }
+ return !(Vec0 && Vec1);
+}
+
+static bool isZeroEltBroadcastVectorMask(ArrayRef<int> Mask) {
+ for (unsigned i = 0; i < Mask.size(); ++i)
+ if (Mask[i] > 0)
+ return false;
+ return true;
+}
+
static bool isAlternateVectorMask(ArrayRef<int> Mask) {
bool isAlternate = true;
unsigned MaskSize = Mask.size();
@@ -501,6 +522,17 @@ unsigned CostModelAnalysis::getInstructionCost(const Instruction *I) const {
if (isAlternateVectorMask(Mask))
return TTI->getShuffleCost(TargetTransformInfo::SK_Alternate,
VecTypOp0, 0, nullptr);
+
+ if (isZeroEltBroadcastVectorMask(Mask))
+ return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast,
+ VecTypOp0, 0, nullptr);
+
+ if (isSingleSourceVectorMask(Mask))
+ return TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
+ VecTypOp0, 0, nullptr);
+
+ return TTI->getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc,
+ VecTypOp0, 0, nullptr);
}
return -1;
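For readers unfamiliar with shuffle-mask encoding, here is a minimal standalone sketch of the two new classifiers above (the masks are illustrative examples, not from the patch):

#include <cstdio>
#include <vector>

// Standalone mirror of the helpers above. In the mask of a two-input
// shuffle over N-element vectors, index i in [0, N) selects a lane from the
// first source, i in [N, 2N) from the second, and -1 means undef.
static bool isSingleSource(const std::vector<int> &Mask) {
  bool Vec0 = false, Vec1 = false;
  for (unsigned i = 0, N = Mask.size(); i < N; ++i) {
    if (Mask[i] < 0)
      continue; // undef lanes constrain nothing
    if ((unsigned)Mask[i] >= N)
      Vec1 = true;
    else
      Vec0 = true;
  }
  return !(Vec0 && Vec1);
}

static bool isZeroEltBroadcast(const std::vector<int> &Mask) {
  for (int M : Mask)
    if (M > 0) // only element 0 of the first source (or undef) is allowed
      return false;
  return true;
}

int main() {
  // {0,0,0,0}  -> SK_Broadcast (checked before the single-source test)
  // {3,2,-1,0} -> SK_PermuteSingleSrc
  // {0,4,1,5}  -> SK_PermuteTwoSrc (a 2-way interleave of two sources)
  std::printf("%d %d %d\n",
              isZeroEltBroadcast({0, 0, 0, 0}),
              isSingleSource({3, 2, -1, 0}),
              !isSingleSource({0, 4, 1, 5}));
}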
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index db563c08632..5c021302d8d 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -598,9 +598,6 @@ int X86TTIImpl::getArithmeticInstrCost(
int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
Type *SubTp) {
- // We only estimate the cost of reverse and alternate shuffles.
- if (Kind != TTI::SK_Reverse && Kind != TTI::SK_Alternate)
- return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
if (Kind == TTI::SK_Reverse) {
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
@@ -700,9 +697,8 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
if (const auto *Entry =
CostTableLookup(SSE1ShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
return LT.first * Entry->Cost;
- }
- if (Kind == TTI::SK_Alternate) {
+ } else if (Kind == TTI::SK_Alternate) {
// 64-bit packed float vectors (v2f32) are widened to type v4f32.
// 64-bit packed integer vectors (v2i32) are promoted to type v2i64.
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
@@ -792,7 +788,132 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
if (const auto *Entry = CostTableLookup(SSEAltShuffleTbl,
ISD::VECTOR_SHUFFLE, LT.second))
return LT.first * Entry->Cost;
- return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
+
+ } else if (Kind == TTI::SK_PermuteTwoSrc) {
+ // We assume that source and destination have the same vector type.
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
+ int NumOfDests = LT.first;
+ int NumOfShufflesPerDest = LT.first * 2 - 1;
+ int NumOfShuffles = NumOfDests * NumOfShufflesPerDest;
+
+ static const CostTblEntry AVX512VBMIShuffleTbl[] = {
+ {ISD::VECTOR_SHUFFLE, MVT::v64i8, 1}, // vpermt2b
+ {ISD::VECTOR_SHUFFLE, MVT::v32i8, 1}, // vpermt2b
+ {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1} // vpermt2b
+ };
+
+ if (ST->hasVBMI())
+ if (const auto *Entry = CostTableLookup(AVX512VBMIShuffleTbl,
+ ISD::VECTOR_SHUFFLE, LT.second))
+ return NumOfShuffles * Entry->Cost;
+
+ static const CostTblEntry AVX512BWShuffleTbl[] = {
+ {ISD::VECTOR_SHUFFLE, MVT::v32i16, 1}, // vpermt2w
+ {ISD::VECTOR_SHUFFLE, MVT::v16i16, 1}, // vpermt2w
+ {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1}, // vpermt2w
+ {ISD::VECTOR_SHUFFLE, MVT::v32i8, 3}, // zext + vpermt2w + trunc
+ {ISD::VECTOR_SHUFFLE, MVT::v64i8, 19}, // 6 * v32i8 + 1
+ {ISD::VECTOR_SHUFFLE, MVT::v16i8, 3} // zext + vpermt2w + trunc
+ };
+
+ if (ST->hasBWI())
+ if (const auto *Entry = CostTableLookup(AVX512BWShuffleTbl,
+ ISD::VECTOR_SHUFFLE, LT.second))
+ return NumOfShuffles * Entry->Cost;
+
+ static const CostTblEntry AVX512ShuffleTbl[] = {
+ {ISD::VECTOR_SHUFFLE, MVT::v8f64, 1}, // vpermt2pd
+ {ISD::VECTOR_SHUFFLE, MVT::v16f32, 1}, // vpermt2ps
+ {ISD::VECTOR_SHUFFLE, MVT::v8i64, 1}, // vpermt2q
+ {ISD::VECTOR_SHUFFLE, MVT::v16i32, 1}, // vpermt2d
+ {ISD::VECTOR_SHUFFLE, MVT::v4f64, 1}, // vpermt2pd
+ {ISD::VECTOR_SHUFFLE, MVT::v8f32, 1}, // vpermt2ps
+ {ISD::VECTOR_SHUFFLE, MVT::v4i64, 1}, // vpermt2q
+ {ISD::VECTOR_SHUFFLE, MVT::v8i32, 1}, // vpermt2d
+ {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, // vpermt2pd
+ {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1}, // vpermt2ps
+ {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, // vpermt2q
+ {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1} // vpermt2d
+ };
+
+ if (ST->hasAVX512())
+ if (const auto *Entry =
+ CostTableLookup(AVX512ShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
+ return NumOfShuffles * Entry->Cost;
+
+ } else if (Kind == TTI::SK_PermuteSingleSrc) {
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
+ if (LT.first == 1) {
+
+ static const CostTblEntry AVX512VBMIShuffleTbl[] = {
+ {ISD::VECTOR_SHUFFLE, MVT::v64i8, 1}, // vpermb
+ {ISD::VECTOR_SHUFFLE, MVT::v32i8, 1} // vpermb
+ };
+
+ if (ST->hasVBMI())
+ if (const auto *Entry = CostTableLookup(AVX512VBMIShuffleTbl,
+ ISD::VECTOR_SHUFFLE, LT.second))
+ return Entry->Cost;
+
+ static const CostTblEntry AVX512BWShuffleTbl[] = {
+ {ISD::VECTOR_SHUFFLE, MVT::v32i16, 1}, // vpermw
+ {ISD::VECTOR_SHUFFLE, MVT::v16i16, 1}, // vpermw
+ {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1}, // vpermw
+ {ISD::VECTOR_SHUFFLE, MVT::v64i8, 8}, // extend to v32i16
+ {ISD::VECTOR_SHUFFLE, MVT::v32i8, 3} // vpermw + zext/trunc
+ };
+
+ if (ST->hasBWI())
+ if (const auto *Entry = CostTableLookup(AVX512BWShuffleTbl,
+ ISD::VECTOR_SHUFFLE, LT.second))
+ return Entry->Cost;
+
+ static const CostTblEntry AVX512ShuffleTbl[] = {
+ {ISD::VECTOR_SHUFFLE, MVT::v8f64, 1}, // vpermpd
+ {ISD::VECTOR_SHUFFLE, MVT::v4f64, 1}, // vpermpd
+ {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, // vpermpd
+ {ISD::VECTOR_SHUFFLE, MVT::v16f32, 1}, // vpermps
+ {ISD::VECTOR_SHUFFLE, MVT::v8f32, 1}, // vpermps
+ {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1}, // vpermps
+ {ISD::VECTOR_SHUFFLE, MVT::v8i64, 1}, // vpermq
+ {ISD::VECTOR_SHUFFLE, MVT::v4i64, 1}, // vpermq
+ {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, // vpermq
+ {ISD::VECTOR_SHUFFLE, MVT::v16i32, 1}, // vpermd
+ {ISD::VECTOR_SHUFFLE, MVT::v8i32, 1}, // vpermd
+ {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1}, // vpermd
+ {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1} // pshufb
+ };
+
+ if (ST->hasAVX512())
+ if (const auto *Entry =
+ CostTableLookup(AVX512ShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
+ return Entry->Cost;
+
+ } else {
+ // We are going to permute multiple sources and the result will be in
+ // multiple destinations. We provide an accurate cost only for splits
+ // where the element type remains the same.
+
+ MVT LegalVT = LT.second;
+ if (LegalVT.getVectorElementType().getSizeInBits() ==
+ Tp->getVectorElementType()->getPrimitiveSizeInBits() &&
+ LegalVT.getVectorNumElements() < Tp->getVectorNumElements()) {
+
+ unsigned VecTySize = DL.getTypeStoreSize(Tp);
+ unsigned LegalVTSize = LegalVT.getStoreSize();
+ // Number of source vectors after legalization:
+ unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
+ // Number of destination vectors after legalization:
+ unsigned NumOfDests = LT.first;
+
+ Type *SingleOpTy = VectorType::get(Tp->getVectorElementType(),
+ LegalVT.getVectorNumElements());
+
+ unsigned NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
+ return NumOfShuffles *
+ getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, 0, nullptr);
+ }
+ }
}
return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
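As a sanity check on the permute costs above, one hand-evaluated case (my own arithmetic, using the table entries above and an assumed AVX-512 subtarget):

// SK_PermuteSingleSrc on Tp = <32 x i32> (AVX-512):
//   LegalVT = v16i32, LT.first = 2  -> the split ("else") path is taken.
//   VecTySize = 128 bytes, LegalVTSize = 64 bytes
//   NumOfSrcs = (128 + 64 - 1) / 64 = 2,  NumOfDests = 2
//   NumOfShuffles = (2 - 1) * 2 = 2
//   Cost = 2 * Cost(SK_PermuteTwoSrc, v16i32) = 2 * 1 (vpermt2d) = 2
//
// SK_PermuteTwoSrc on Tp = <32 x i32> (AVX-512):
//   NumOfDests = LT.first = 2, NumOfShufflesPerDest = 2 * 2 - 1 = 3
//   Cost = 2 * 3 * 1 (vpermt2d) = 6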
@@ -1942,13 +2063,14 @@ int X86TTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *SrcVTy,
// Vector-4 of gather/scatter instruction does not exist on KNL.
// We can extend it to 8 elements, but zeroing upper bits of
// the mask vector will add more instructions. Right now we give the scalar
- // cost of vector-4 for KNL. TODO: Check, maybe the gather/scatter instruction is
- // better in the VariableMask case.
+ // cost of vector-4 for KNL. TODO: Check, maybe the gather/scatter instruction
+ // is better in the VariableMask case.
if (VF == 2 || (VF == 4 && !ST->hasVLX()))
Scalarize = true;
if (Scalarize)
- return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment, AddressSpace);
+ return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment,
+ AddressSpace);
return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace);
}
@@ -2013,3 +2135,115 @@ bool X86TTIImpl::enableInterleavedAccessVectorization() {
// As a temporary solution, disable on Atom.
return !(ST->isAtom() || ST->isSLM());
}
+
+// Get an estimate for interleaved load/store operations and strided loads.
+// \p Indices contains the indices of the strided loads.
+// \p Factor is the interleaving factor.
+// AVX-512 provides 3-src shuffles that significantly reduce the cost.
+int X86TTIImpl::getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy,
+ unsigned Factor,
+ ArrayRef<unsigned> Indices,
+ unsigned Alignment,
+ unsigned AddressSpace) {
+
+ // VecTy for interleave memop is <VF*Factor x Elt>.
+ // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
+ // VecTy = <12 x i32>.
+
+ // Calculate the number of memory operations (NumOfMemOps), required
+ // for load/store the VecTy.
+ MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
+ unsigned VecTySize = DL.getTypeStoreSize(VecTy);
+ unsigned LegalVTSize = LegalVT.getStoreSize();
+ unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
+
+ // Get the cost of one memory operation.
+ Type *SingleMemOpTy = VectorType::get(VecTy->getVectorElementType(),
+ LegalVT.getVectorNumElements());
+ unsigned MemOpCost =
+ getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace);
+
+ if (Opcode == Instruction::Load) {
+ // The kind of shuffle depends on the number of loaded values.
+ // If we load all of the data in one register, we can use a 1-src shuffle.
+ // Otherwise, we'll merge 2 sources in each operation.
+ TTI::ShuffleKind ShuffleKind =
+ (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc;
+
+ unsigned ShuffleCost =
+ getShuffleCost(ShuffleKind, SingleMemOpTy, 0, nullptr);
+
+ unsigned NumOfLoadsInInterleaveGrp =
+ Indices.size() ? Indices.size() : Factor;
+ Type *ResultTy = VectorType::get(VecTy->getVectorElementType(),
+ VecTy->getVectorNumElements() / Factor);
+ unsigned NumOfResults =
+ getTLI()->getTypeLegalizationCost(DL, ResultTy).first *
+ NumOfLoadsInInterleaveGrp;
+
+ // About half of the loads may be folded into shuffles when we have only
+ // one result. If we have more than one result, we do not fold loads at all.
+ unsigned NumOfUnfoldedLoads =
+ NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2;
+
+ // Get a number of shuffle operations per result.
+ unsigned NumOfShufflesPerResult =
+ std::max((unsigned)1, (unsigned)(NumOfMemOps - 1));
+
+ // The SK_PermuteTwoSrc shuffle clobbers one of its source operands.
+ // When we have more than one destination, we need additional instructions
+ // to preserve the sources.
+ unsigned NumOfMoves = 0;
+ if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc)
+ NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;
+
+ int Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
+ NumOfUnfoldedLoads * MemOpCost + NumOfMoves;
+
+ return Cost;
+ }
+
+ // Store.
+ assert(Opcode == Instruction::Store &&
+ "Expected Store Instruction at this point");
+
+ // There are no strided stores at the moment, and a store can't be folded
+ // into a shuffle.
+ unsigned NumOfSources = Factor; // The number of values to be merged.
+ unsigned ShuffleCost =
+ getShuffleCost(TTI::SK_PermuteTwoSrc, SingleMemOpTy, 0, nullptr);
+ unsigned NumOfShufflesPerStore = NumOfSources - 1;
+
+ // The SK_PermuteTwoSrc shuffle clobbers one of its source operands.
+ // We need additional instructions to preserve the sources.
+ unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
+ int Cost = NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
+ NumOfMoves;
+ return Cost;
+}
+
+int X86TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
+ unsigned Factor,
+ ArrayRef<unsigned> Indices,
+ unsigned Alignment,
+ unsigned AddressSpace) {
+ auto isSupportedOnAVX512 = [](Type *VecTy, bool &RequiresBW) {
+ RequiresBW = false;
+ Type *EltTy = VecTy->getVectorElementType();
+ if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
+ EltTy->isIntegerTy(32) || EltTy->isPointerTy())
+ return true;
+ if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8)) {
+ RequiresBW = true;
+ return true;
+ }
+ return false;
+ };
+ bool RequiresBW;
+ bool HasAVX512Solution = isSupportedOnAVX512(VecTy, RequiresBW);
+ if (ST->hasAVX512() && HasAVX512Solution && (!RequiresBW || ST->hasBWI()))
+ return getInterleavedMemoryOpCostAVX512(Opcode, VecTy, Factor, Indices,
+ Alignment, AddressSpace);
+ return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+ Alignment, AddressSpace);
+}
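To make the load-cost formula concrete, one hand-evaluated case (my own arithmetic; it assumes an aligned v16i32 load has MemOpCost = 1):

// Opcode = Load, VecTy = <32 x i32> (VF = 16, Factor = 2), AVX-512,
// Indices empty (the whole interleave group is used):
//   LegalVT = v16i32, NumOfMemOps = (128 + 64 - 1) / 64 = 2
//   ShuffleKind = SK_PermuteTwoSrc (NumOfMemOps > 1), ShuffleCost = 1
//   NumOfLoadsInInterleaveGrp = Factor = 2
//   ResultTy = <16 x i32>  -> NumOfResults = 1 * 2 = 2
//   NumOfUnfoldedLoads = 2 (more than one result, so no folding)
//   NumOfShufflesPerResult = max(1, 2 - 1) = 1
//   NumOfMoves = 2 * 1 / 2 = 1
//   Cost = 2 * 1 * 1 (shuffles) + 2 * 1 (loads) + 1 (moves) = 5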
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index 4c256630e92..f6bcb9f569e 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -80,6 +80,13 @@ public:
int getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm);
+ int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
+ unsigned Factor, ArrayRef<unsigned> Indices,
+ unsigned Alignment, unsigned AddressSpace);
+ int getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy,
+ unsigned Factor, ArrayRef<unsigned> Indices,
+ unsigned Alignment, unsigned AddressSpace);
+
int getIntImmCost(int64_t);
int getIntImmCost(const APInt &Imm, Type *Ty);
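Clients reach the new hook through the generic TargetTransformInfo wrapper. A minimal sketch of such a query (the helper and its parameters are hypothetical; obtaining the TargetTransformInfo analysis is assumed to happen elsewhere):

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"

using namespace llvm;

// Hypothetical helper: cost of loading a factor-2 interleave group where
// each member is a <16 x i32> vector. With AVX-512 available this dispatches
// to getInterleavedMemoryOpCostAVX512 above; otherwise to the base model.
static int interleavedLoadCost(TargetTransformInfo &TTI, LLVMContext &Ctx) {
  unsigned VF = 16, Factor = 2;
  Type *VecTy = VectorType::get(Type::getInt32Ty(Ctx), VF * Factor);
  unsigned Indices[] = {0, 1}; // both members of the group are used
  return TTI.getInterleavedMemoryOpCost(Instruction::Load, VecTy, Factor,
                                        Indices, /*Alignment=*/4,
                                        /*AddressSpace=*/0);
}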