diff options
Diffstat (limited to 'llvm/lib/Target/X86')
-rw-r--r-- | llvm/lib/Target/X86/X86.td | 24 | ||||
-rw-r--r-- | llvm/lib/Target/X86/X86Subtarget.cpp | 12 | ||||
-rw-r--r-- | llvm/lib/Target/X86/X86Subtarget.h | 5 | ||||
-rw-r--r-- | llvm/lib/Target/X86/X86TargetTransformInfo.cpp | 8 |
4 files changed, 35 insertions, 14 deletions
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td index f409459047a..56a6d57c195 100644 --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -308,6 +308,14 @@ def FeatureMacroFusion : SubtargetFeature<"macrofusion", "HasMacroFusion", "true", "Various instructions can be fused with conditional branches">; +// Gather is available since Haswell (AVX2 set). So technically, we can +// generate Gathers on all AVX2 processors. But the overhead on HSW is high. +// Skylake Client processor has faster Gathers than HSW and performance is +// similar to Skylake Server (AVX-512). +def FeatureHasFastGather + : SubtargetFeature<"fast-gather", "HasFastGather", "true", + "Indicates if gather is reasonably fast.">; + //===----------------------------------------------------------------------===// // X86 processors supported. //===----------------------------------------------------------------------===// @@ -613,7 +621,8 @@ def SKLFeatures : ProcessorFeatures<BDWFeatures.Value, [ class SkylakeClientProc<string Name> : ProcModel<Name, SkylakeClientModel, SKLFeatures.Value, [ - ProcIntelSKL + ProcIntelSKL, + FeatureHasFastGather ]>; def : SkylakeClientProc<"skylake">; @@ -637,7 +646,8 @@ class KnightsLandingProc<string Name> : ProcModel<Name, HaswellModel, KNLFeatures.Value, [ ProcIntelKNL, FeatureSlowTwoMemOps, - FeatureFastPartialYMMorZMMWrite + FeatureFastPartialYMMorZMMWrite, + FeatureHasFastGather ]>; def : KnightsLandingProc<"knl">; @@ -646,6 +656,7 @@ class KnightsMillProc<string Name> : ProcModel<Name, HaswellModel, ProcIntelKNL, FeatureSlowTwoMemOps, FeatureFastPartialYMMorZMMWrite, + FeatureHasFastGather, FeatureVPOPCNTDQ ]>; def : KnightsMillProc<"knm">; // TODO Add AVX5124FMAPS/AVX5124VNNIW features @@ -662,7 +673,8 @@ def SKXFeatures : ProcessorFeatures<SKLFeatures.Value, [ class SkylakeServerProc<string Name> : ProcModel<Name, SkylakeServerModel, SKXFeatures.Value, [ - ProcIntelSKX + ProcIntelSKX, + FeatureHasFastGather ]>; def : 
SkylakeServerProc<"skylake-avx512">; def : SkylakeServerProc<"skx">; // Legacy alias. @@ -675,7 +687,8 @@ def CNLFeatures : ProcessorFeatures<SKXFeatures.Value, [ class CannonlakeProc<string Name> : ProcModel<Name, SkylakeServerModel, CNLFeatures.Value, [ - ProcIntelCNL + ProcIntelCNL, + FeatureHasFastGather ]>; def : CannonlakeProc<"cannonlake">; @@ -691,7 +704,8 @@ def ICLFeatures : ProcessorFeatures<CNLFeatures.Value, [ class IcelakeProc<string Name> : ProcModel<Name, SkylakeServerModel, ICLFeatures.Value, [ - ProcIntelICL + ProcIntelICL, + FeatureHasFastGather ]>; def : IcelakeProc<"icelake">; diff --git a/llvm/lib/Target/X86/X86Subtarget.cpp b/llvm/lib/Target/X86/X86Subtarget.cpp index 8543d189cdb..0f995404618 100644 --- a/llvm/lib/Target/X86/X86Subtarget.cpp +++ b/llvm/lib/Target/X86/X86Subtarget.cpp @@ -270,14 +270,13 @@ void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { isTargetKFreeBSD() || In64BitMode) stackAlignment = 16; - // Gather is available since Haswell (AVX2 set). So technically, we can - // generate Gathers on all AVX2 processors. But the overhead on HSW is high. - // Skylake Client processor has faster Gathers than HSW and performance is - // similar to Skylake Server (AVX-512). The specified overhead is relative to - // the Load operation. "2" is the number provided by Intel architects. This + // Some CPUs have more overhead for gather. The specified overhead is relative + // to the Load operation. "2" is the number provided by Intel architects. This // parameter is used for cost estimation of Gather Op and comparison with // other alternatives. - if (X86ProcFamily == IntelSkylake || hasAVX512()) + // TODO: Remove the explicit hasAVX512()? That would mean we would only + // enable gather with a -march. 
+ if (hasAVX512() || (hasAVX2() && hasFastGather())) GatherOverhead = 2; if (hasAVX512()) ScatterOverhead = 2; @@ -345,6 +344,7 @@ void X86Subtarget::initializeEnvironment() { HasCmpxchg16b = false; UseLeaForSP = false; HasFastPartialYMMorZMMWrite = false; + HasFastGather = false; HasFastScalarFSQRT = false; HasFastVectorFSQRT = false; HasFastLZCNT = false; diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h index c7fc5617831..faf88aba177 100644 --- a/llvm/lib/Target/X86/X86Subtarget.h +++ b/llvm/lib/Target/X86/X86Subtarget.h @@ -229,6 +229,10 @@ protected: /// of a YMM or ZMM register without clearing the upper part. bool HasFastPartialYMMorZMMWrite; + /// True if gather is reasonably fast. This is true for Skylake client and + /// all AVX-512 CPUs. + bool HasFastGather; + /// True if hardware SQRTSS instruction is at least as fast (latency) as /// RSQRTSS followed by a Newton-Raphson iteration. bool HasFastScalarFSQRT; @@ -514,6 +518,7 @@ public: bool hasFastPartialYMMorZMMWrite() const { return HasFastPartialYMMorZMMWrite; } + bool hasFastGather() const { return HasFastGather; } bool hasFastScalarFSQRT() const { return HasFastScalarFSQRT; } bool hasFastVectorFSQRT() const { return HasFastVectorFSQRT; } bool hasFastLZCNT() const { return HasFastLZCNT; } diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index f878f050e4e..d06d6a5d180 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -2518,9 +2518,11 @@ bool X86TTIImpl::isLegalMaskedGather(Type *DataTy) { int DataWidth = isa<PointerType>(ScalarTy) ? DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits(); - // AVX-512 and Skylake AVX2 allows gather and scatter - return (DataWidth == 32 || DataWidth == 64) && (ST->hasAVX512() || - ST->getProcFamily() == X86Subtarget::IntelSkylake); + // Some CPUs have better gather performance than others. 
+ // TODO: Remove the explicit ST->hasAVX512()? That would mean we would only + // enable gather with a -march. + return (DataWidth == 32 || DataWidth == 64) && + (ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2())); } bool X86TTIImpl::isLegalMaskedScatter(Type *DataType) { |