diff options
Diffstat (limited to 'llvm/lib')
| -rw-r--r-- | llvm/lib/Analysis/TargetTransformInfo.cpp | 5 | ||||
| -rw-r--r-- | llvm/lib/CodeGen/CodeGenPrepare.cpp | 51 | ||||
| -rw-r--r-- | llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp | 14 | ||||
| -rw-r--r-- | llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h | 3 | ||||
| -rw-r--r-- | llvm/lib/Target/X86/X86TargetTransformInfo.cpp | 33 | ||||
| -rw-r--r-- | llvm/lib/Target/X86/X86TargetTransformInfo.h | 3 | ||||
| -rw-r--r-- | llvm/lib/Transforms/Scalar/MergeICmps.cpp | 3 |
7 files changed, 74 insertions, 38 deletions
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index c3185bf2bbd..53bedfe3f63 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -250,8 +250,9 @@ bool TargetTransformInfo::enableAggressiveInterleaving(bool LoopHasReductions) c return TTIImpl->enableAggressiveInterleaving(LoopHasReductions); } -bool TargetTransformInfo::enableMemCmpExpansion(unsigned &MaxLoadSize) const { - return TTIImpl->enableMemCmpExpansion(MaxLoadSize); +const TargetTransformInfo::MemCmpExpansionOptions * +TargetTransformInfo::enableMemCmpExpansion(bool IsZeroCmp) const { + return TTIImpl->enableMemCmpExpansion(IsZeroCmp); } bool TargetTransformInfo::enableInterleavedAccessVectorization() const { diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index 1fbe57820c4..0a417e34084 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -1758,9 +1758,10 @@ class MemCmpExpansion { Value *getMemCmpOneBlock(); public: - MemCmpExpansion(CallInst *CI, uint64_t Size, unsigned MaxLoadSize, - unsigned MaxNumLoads, unsigned NumLoadsPerBlock, - const DataLayout &DL); + MemCmpExpansion(CallInst *CI, uint64_t Size, + const TargetTransformInfo::MemCmpExpansionOptions &Options, + unsigned MaxNumLoads, const bool IsUsedForZeroCmp, + unsigned NumLoadsPerBlock, const DataLayout &DL); unsigned getNumBlocks(); uint64_t getNumLoads() const { return LoadSequence.size(); } @@ -1778,29 +1779,32 @@ class MemCmpExpansion { // return from. // 3. ResultBlock, block to branch to for early exit when a // LoadCmpBlock finds a difference. -MemCmpExpansion::MemCmpExpansion(CallInst *const CI, uint64_t Size, - const unsigned MaxLoadSize, - const unsigned MaxNumLoads, - const unsigned LoadsPerBlock, - const DataLayout &TheDataLayout) +MemCmpExpansion::MemCmpExpansion( + CallInst *const CI, uint64_t Size, + const TargetTransformInfo::MemCmpExpansionOptions &Options, + const unsigned MaxNumLoads, const bool IsUsedForZeroCmp, + const unsigned NumLoadsPerBlock, const DataLayout &TheDataLayout) : CI(CI), Size(Size), - MaxLoadSize(MaxLoadSize), + MaxLoadSize(0), NumLoadsNonOneByte(0), - NumLoadsPerBlock(LoadsPerBlock), - IsUsedForZeroCmp(isOnlyUsedInZeroEqualityComparison(CI)), + NumLoadsPerBlock(NumLoadsPerBlock), + IsUsedForZeroCmp(IsUsedForZeroCmp), DL(TheDataLayout), Builder(CI) { assert(Size > 0 && "zero blocks"); // Scale the max size down if the target can load more bytes than we need. - while (this->MaxLoadSize > Size) { - this->MaxLoadSize /= 2; + size_t LoadSizeIndex = 0; + while (LoadSizeIndex < Options.LoadSizes.size() && + Options.LoadSizes[LoadSizeIndex] > Size) { + ++LoadSizeIndex; } + this->MaxLoadSize = Options.LoadSizes[LoadSizeIndex]; // Compute the decomposition. - unsigned LoadSize = this->MaxLoadSize; uint64_t CurSize = Size; uint64_t Offset = 0; - while (CurSize) { + while (CurSize && LoadSizeIndex < Options.LoadSizes.size()) { + const unsigned LoadSize = Options.LoadSizes[LoadSizeIndex]; assert(LoadSize > 0 && "zero load size"); const uint64_t NumLoadsForThisSize = CurSize / LoadSize; if (LoadSequence.size() + NumLoadsForThisSize > MaxNumLoads) { @@ -1821,11 +1825,7 @@ MemCmpExpansion::MemCmpExpansion(CallInst *const CI, uint64_t Size, } CurSize = CurSize % LoadSize; } - // FIXME: This can result in a non-native load size (e.g. X86-32+SSE can - // load 16 and 4 but not 8), which throws the load count off (e.g. in the - // aforementioned case, 16 bytes will count for 2 loads but will generate - // 4). - LoadSize /= 2; + ++LoadSizeIndex; } assert(LoadSequence.size() <= MaxNumLoads && "broken invariant"); } @@ -2362,15 +2362,16 @@ static bool expandMemCmp(CallInst *CI, const TargetTransformInfo *TTI, } // TTI call to check if target would like to expand memcmp. Also, get the - // max LoadSize. - unsigned MaxLoadSize; - if (!TTI->enableMemCmpExpansion(MaxLoadSize)) return false; + // available load sizes. + const bool IsUsedForZeroCmp = isOnlyUsedInZeroEqualityComparison(CI); + const auto *const Options = TTI->enableMemCmpExpansion(IsUsedForZeroCmp); + if (!Options) return false; const unsigned MaxNumLoads = TLI->getMaxExpandSizeMemcmp(CI->getFunction()->optForSize()); - MemCmpExpansion Expansion(CI, SizeVal, MaxLoadSize, MaxNumLoads, - MemCmpNumLoadsPerBlock, *DL); + MemCmpExpansion Expansion(CI, SizeVal, *Options, MaxNumLoads, + IsUsedForZeroCmp, MemCmpNumLoadsPerBlock, *DL); // Don't expand if this will require more loads than desired by the target. if (Expansion.getNumLoads() == 0) { diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp index 52c5b688d35..43b9892fc5a 100644 --- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -226,9 +226,17 @@ bool PPCTTIImpl::enableAggressiveInterleaving(bool LoopHasReductions) { return LoopHasReductions; } -bool PPCTTIImpl::enableMemCmpExpansion(unsigned &MaxLoadSize) { - MaxLoadSize = 8; - return true; +const PPCTTIImpl::TTI::MemCmpExpansionOptions * +PPCTTIImpl::enableMemCmpExpansion(bool IsZeroCmp) const { + static const auto Options = []() { + TTI::MemCmpExpansionOptions Options; + Options.LoadSizes.push_back(8); + Options.LoadSizes.push_back(4); + Options.LoadSizes.push_back(2); + Options.LoadSizes.push_back(1); + return Options; + }(); + return &Options; } bool PPCTTIImpl::enableInterleavedAccessVectorization() { diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h index 60dea0b0226..acf5066bc6d 100644 --- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h +++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h @@ -63,7 +63,8 @@ public: /// @{ bool enableAggressiveInterleaving(bool LoopHasReductions); - bool enableMemCmpExpansion(unsigned &MaxLoadSize); + const TTI::MemCmpExpansionOptions *enableMemCmpExpansion( + bool IsZeroCmp) const; bool enableInterleavedAccessVectorization(); unsigned getNumberOfRegisters(bool Vector); unsigned getRegisterBitWidth(bool Vector) const; diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index f54728d4482..effbd07fa31 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -2536,10 +2536,35 @@ bool X86TTIImpl::areInlineCompatible(const Function *Caller, return (CallerBits & CalleeBits) == CalleeBits; } -bool X86TTIImpl::enableMemCmpExpansion(unsigned &MaxLoadSize) { - // TODO: We can increase these based on available vector ops. - MaxLoadSize = ST->is64Bit() ? 8 : 4; - return true; +const X86TTIImpl::TTI::MemCmpExpansionOptions * +X86TTIImpl::enableMemCmpExpansion(bool IsZeroCmp) const { + // Only enable vector loads for equality comparison. + // Right now the vector version is not as fast, see #33329. + static const auto ThreeWayOptions = [this]() { + TTI::MemCmpExpansionOptions Options; + if (ST->is64Bit()) { + Options.LoadSizes.push_back(8); + } + Options.LoadSizes.push_back(4); + Options.LoadSizes.push_back(2); + Options.LoadSizes.push_back(1); + return Options; + }(); + static const auto EqZeroOptions = [this]() { + TTI::MemCmpExpansionOptions Options; + // TODO: enable AVX512 when the DAG is ready. + // if (ST->hasAVX512()) Options.LoadSizes.push_back(64); + if (ST->hasAVX2()) Options.LoadSizes.push_back(32); + if (ST->hasSSE2()) Options.LoadSizes.push_back(16); + if (ST->is64Bit()) { + Options.LoadSizes.push_back(8); + } + Options.LoadSizes.push_back(4); + Options.LoadSizes.push_back(2); + Options.LoadSizes.push_back(1); + return Options; + }(); + return IsZeroCmp ? &EqZeroOptions : &ThreeWayOptions; } bool X86TTIImpl::enableInterleavedAccessVectorization() { diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h index 0d2c90dc58b..5cb5c0cc298 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.h +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h @@ -127,7 +127,8 @@ public: bool hasDivRemOp(Type *DataType, bool IsSigned); bool areInlineCompatible(const Function *Caller, const Function *Callee) const; - bool enableMemCmpExpansion(unsigned &MaxLoadSize); + const TTI::MemCmpExpansionOptions *enableMemCmpExpansion( + bool IsZeroCmp) const; bool enableInterleavedAccessVectorization(); private: int getGSScalarCost(unsigned Opcode, Type *DataTy, bool VariableMask, diff --git a/llvm/lib/Transforms/Scalar/MergeICmps.cpp b/llvm/lib/Transforms/Scalar/MergeICmps.cpp index 1244a9776fa..f4de036059e 100644 --- a/llvm/lib/Transforms/Scalar/MergeICmps.cpp +++ b/llvm/lib/Transforms/Scalar/MergeICmps.cpp @@ -625,8 +625,7 @@ PreservedAnalyses MergeICmps::runImpl(Function &F, const TargetLibraryInfo *TLI, // We only try merging comparisons if the target wants to expand memcmp later. // The rationale is to avoid turning small chains into memcmp calls. - unsigned MaxLoadSize; - if (!TTI->enableMemCmpExpansion(MaxLoadSize)) return PreservedAnalyses::all(); + if (!TTI->enableMemCmpExpansion(true)) return PreservedAnalyses::all(); bool MadeChange = false; |

