summaryrefslogtreecommitdiffstats
path: root/llvm
diff options
context:
space:
mode:
authorZaara Syeda <syzaara@ca.ibm.com>2017-05-31 17:12:38 +0000
committerZaara Syeda <syzaara@ca.ibm.com>2017-05-31 17:12:38 +0000
commit3a7578c6589b910f9a04bae7f7f121dfe3281578 (patch)
treed9717d369740af26067db3f584bf73a07d6167bd /llvm
parent6ad77845e29dd0b85a9d21dce8f06bf569999a5d (diff)
downloadbcm5719-llvm-3a7578c6589b910f9a04bae7f7f121dfe3281578.tar.gz
bcm5719-llvm-3a7578c6589b910f9a04bae7f7f121dfe3281578.zip
[PPC] Inline expansion of memcmp
This patch does an inline expansion of memcmp. It changes the memcmp library call into an inline expansion when the size is known at compile time and is under a target specified threshold. This expansion is implemented in CodeGenPrepare and expands into straight line code. The target specifies a maximum load size and the expansion works by using this size to load the two sources, compare, and exit early if a difference is found. It also has a special case when the memcmp result is used in a compare to zero equality. Differential Revision: https://reviews.llvm.org/D28637 llvm-svn: 304313
Diffstat (limited to 'llvm')
-rw-r--r--llvm/include/llvm/Analysis/TargetTransformInfo.h7
-rw-r--r--llvm/include/llvm/Analysis/TargetTransformInfoImpl.h2
-rw-r--r--llvm/include/llvm/Analysis/ValueTracking.h2
-rw-r--r--llvm/include/llvm/Target/TargetLowering.h12
-rw-r--r--llvm/lib/Analysis/TargetTransformInfo.cpp4
-rw-r--r--llvm/lib/Analysis/ValueTracking.cpp12
-rw-r--r--llvm/lib/CodeGen/CodeGenPrepare.cpp612
-rw-r--r--llvm/lib/CodeGen/TargetLoweringBase.cpp7
-rw-r--r--llvm/lib/Target/PowerPC/PPCISelLowering.cpp4
-rw-r--r--llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp5
-rw-r--r--llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h1
-rw-r--r--llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp14
-rw-r--r--llvm/test/CodeGen/PowerPC/memCmpUsedInZeroEqualityComparison.ll121
-rw-r--r--llvm/test/CodeGen/PowerPC/memcmp.ll87
-rw-r--r--llvm/test/CodeGen/PowerPC/memcmpIR.ll194
15 files changed, 1066 insertions, 18 deletions
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 6cbe3a1f515..7211508e975 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -454,6 +454,9 @@ public:
/// \brief Don't restrict interleaved unrolling to small loops.
bool enableAggressiveInterleaving(bool LoopHasReductions) const;
+ /// \brief Enable inline expansion of memcmp
+ bool expandMemCmp(Instruction *I, unsigned &MaxLoadSize) const;
+
/// \brief Enable matching of interleaved access groups.
bool enableInterleavedAccessVectorization() const;
@@ -828,6 +831,7 @@ public:
unsigned VF) = 0;
virtual bool supportsEfficientVectorElementLoadStore() = 0;
virtual bool enableAggressiveInterleaving(bool LoopHasReductions) = 0;
+ virtual bool expandMemCmp(Instruction *I, unsigned &MaxLoadSize) = 0;
virtual bool enableInterleavedAccessVectorization() = 0;
virtual bool isFPVectorizationPotentiallyUnsafe() = 0;
virtual bool allowsMisalignedMemoryAccesses(LLVMContext &Context,
@@ -1047,6 +1051,9 @@ public:
bool enableAggressiveInterleaving(bool LoopHasReductions) override {
return Impl.enableAggressiveInterleaving(LoopHasReductions);
}
+ bool expandMemCmp(Instruction *I, unsigned &MaxLoadSize) override {
+ return Impl.expandMemCmp(I, MaxLoadSize);
+ }
bool enableInterleavedAccessVectorization() override {
return Impl.enableInterleavedAccessVectorization();
}
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index ad1a7cb748f..d73a60eba85 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -274,6 +274,8 @@ public:
bool enableAggressiveInterleaving(bool LoopHasReductions) { return false; }
+ bool expandMemCmp(Instruction *I, unsigned &MaxLoadSize) { return false; }
+
bool enableInterleavedAccessVectorization() { return false; }
bool isFPVectorizationPotentiallyUnsafe() { return false; }
diff --git a/llvm/include/llvm/Analysis/ValueTracking.h b/llvm/include/llvm/Analysis/ValueTracking.h
index b1ee76159c4..612779b1ce8 100644
--- a/llvm/include/llvm/Analysis/ValueTracking.h
+++ b/llvm/include/llvm/Analysis/ValueTracking.h
@@ -85,6 +85,8 @@ template <typename T> class ArrayRef;
const Instruction *CxtI = nullptr,
const DominatorTree *DT = nullptr);
+ bool isOnlyUsedInZeroEqualityComparison(const Instruction *CxtI);
+
/// Return true if the given value is known to be non-zero when defined. For
/// vectors, return true if every element is known to be non-zero when
/// defined. For pointers, if the context instruction and dominator tree are
diff --git a/llvm/include/llvm/Target/TargetLowering.h b/llvm/include/llvm/Target/TargetLowering.h
index 17182b958ec..7258a5cc2d8 100644
--- a/llvm/include/llvm/Target/TargetLowering.h
+++ b/llvm/include/llvm/Target/TargetLowering.h
@@ -1143,6 +1143,16 @@ public:
return OptSize ? MaxStoresPerMemcpyOptSize : MaxStoresPerMemcpy;
}
+ /// Get maximum # of load operations permitted for memcmp
+ ///
+ /// This function returns the maximum number of load operations permitted
+ /// to replace a call to memcmp. The value is set by the target at the
+ /// performance threshold for such a replacement. If OptSize is true,
+ /// return the limit for functions that have OptSize attribute.
+ unsigned getMaxExpandSizeMemcmp(bool OptSize) const {
+ return OptSize ? MaxLoadsPerMemcmpOptSize : MaxLoadsPerMemcmp;
+ }
+
/// \brief Get maximum # of store operations permitted for llvm.memmove
///
/// This function returns the maximum number of store operations permitted
@@ -2330,6 +2340,8 @@ protected:
/// Maximum number of store operations that may be substituted for a call to
/// memcpy, used for functions with OptSize attribute.
unsigned MaxStoresPerMemcpyOptSize;
+ unsigned MaxLoadsPerMemcmp;
+ unsigned MaxLoadsPerMemcmpOptSize;
/// \brief Specify maximum bytes of store instructions per memmove call.
///
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 7a8d4f3be24..ac646716476 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -215,6 +215,10 @@ bool TargetTransformInfo::enableAggressiveInterleaving(bool LoopHasReductions) c
return TTIImpl->enableAggressiveInterleaving(LoopHasReductions);
}
+bool TargetTransformInfo::expandMemCmp(Instruction *I, unsigned &MaxLoadSize) const {
+ return TTIImpl->expandMemCmp(I, MaxLoadSize);
+}
+
bool TargetTransformInfo::enableInterleavedAccessVectorization() const {
return TTIImpl->enableInterleavedAccessVectorization();
}
diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index bd79cd56a18..4bf889c5a54 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -172,6 +172,18 @@ bool llvm::haveNoCommonBitsSet(const Value *LHS, const Value *RHS,
}
+bool llvm::isOnlyUsedInZeroEqualityComparison(const Instruction *CxtI) {
+ for (const User *U : CxtI->users()) {
+ if (const ICmpInst *IC = dyn_cast<ICmpInst>(U))
+ if (IC->isEquality())
+ if (Constant *C = dyn_cast<Constant>(IC->getOperand(1)))
+ if (C->isNullValue())
+ continue;
+ return false;
+ }
+ return true;
+}
+
static bool isKnownToBeAPowerOfTwo(const Value *V, bool OrZero, unsigned Depth,
const Query &Q);
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 4e85708efaf..568b278dd47 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -24,12 +24,13 @@
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
@@ -60,6 +61,7 @@
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/SimplifyLibCalls.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
+
using namespace llvm;
using namespace llvm::PatternMatch;
@@ -84,6 +86,12 @@ STATISTIC(NumDbgValueMoved, "Number of debug value instructions moved");
STATISTIC(NumSelectsExpanded, "Number of selects turned into branches");
STATISTIC(NumStoreExtractExposed, "Number of store(extractelement) exposed");
+STATISTIC(NumMemCmpCalls, "Number of memcmp calls");
+STATISTIC(NumMemCmpNotConstant, "Number of memcmp calls without constant size");
+STATISTIC(NumMemCmpGreaterThanMax,
+ "Number of memcmp calls with size greater than max size");
+STATISTIC(NumMemCmpInlined, "Number of inlined memcmp calls");
+
static cl::opt<bool> DisableBranchOpts(
"disable-cgp-branch-opts", cl::Hidden, cl::init(false),
cl::desc("Disable branch optimizations in CodeGenPrepare"));
@@ -144,6 +152,11 @@ EnableTypePromotionMerge("cgp-type-promotion-merge", cl::Hidden,
cl::desc("Enable merging of redundant sexts when one is dominating"
" the other."), cl::init(true));
+static cl::opt<unsigned> MemCmpNumLoadsPerBlock(
+ "memcmp-num-loads-per-block", cl::Hidden, cl::init(1),
+ cl::desc("The number of loads per basic block for inline expansion of "
+ "memcmp that is only being compared against zero."));
+
namespace {
typedef SmallPtrSet<Instruction *, 16> SetOfInstrs;
typedef PointerIntPair<Type *, 1, bool> TypeIsSExt;
@@ -1629,6 +1642,593 @@ static bool despeculateCountZeros(IntrinsicInst *CountZeros,
return true;
}
+// This class provides helper functions to expand a memcmp library call into an
+// inline expansion.
+class MemCmpExpansion {
+ struct ResultBlock {
+ BasicBlock *BB;
+ PHINode *PhiSrc1;
+ PHINode *PhiSrc2;
+ ResultBlock();
+ };
+
+ CallInst *CI;
+ ResultBlock ResBlock;
+ unsigned MaxLoadSize;
+ unsigned NumBlocks;
+ unsigned NumBlocksNonOneByte;
+ unsigned NumLoadsPerBlock;
+ std::vector<BasicBlock *> LoadCmpBlocks;
+ BasicBlock *EndBlock;
+ PHINode *PhiRes;
+ bool IsUsedForZeroCmp;
+ int calculateNumBlocks(unsigned Size);
+ void createLoadCmpBlocks();
+ void createResultBlock();
+ void setupResultBlockPHINodes();
+ void setupEndBlockPHINodes();
+ void emitLoadCompareBlock(unsigned Index, int LoadSize, int GEPIndex,
+ bool IsLittleEndian);
+ void emitLoadCompareBlockMultipleLoads(unsigned Index, unsigned Size,
+ unsigned &NumBytesProcessed);
+ void emitLoadCompareByteBlock(unsigned Index, int GEPIndex);
+ void emitMemCmpResultBlock(bool IsLittleEndian);
+ Value *getMemCmpExpansionZeroCase(unsigned Size, bool IsLittleEndian);
+ unsigned getLoadSize(unsigned Size);
+ unsigned getNumLoads(unsigned Size);
+
+public:
+ MemCmpExpansion(CallInst *CI, unsigned MaxLoadSize,
+ unsigned NumLoadsPerBlock);
+ Value *getMemCmpExpansion(bool IsLittleEndian);
+};
+
+MemCmpExpansion::ResultBlock::ResultBlock()
+ : BB(nullptr), PhiSrc1(nullptr), PhiSrc2(nullptr) {}
+
+// Initialize the basic block structure required for expansion of memcmp call
+// with given maximum load size and memcmp size parameter.
+// This structure includes:
+// 1. A list of load compare blocks - LoadCmpBlocks.
+// 2. An EndBlock, split from original instruction point, which is the block to
+// return from.
+// 3. ResultBlock, block to branch to for early exit when a
+// LoadCmpBlock finds a difference.
+MemCmpExpansion::MemCmpExpansion(CallInst *CI, unsigned MaxLoadSize,
+ unsigned NumLoadsPerBlock)
+ : CI(CI), MaxLoadSize(MaxLoadSize), NumLoadsPerBlock(NumLoadsPerBlock) {
+
+ IRBuilder<> Builder(CI->getContext());
+
+ BasicBlock *StartBlock = CI->getParent();
+ EndBlock = StartBlock->splitBasicBlock(CI, "endblock");
+ setupEndBlockPHINodes();
+ IsUsedForZeroCmp = isOnlyUsedInZeroEqualityComparison(CI);
+
+ ConstantInt *SizeCast = dyn_cast<ConstantInt>(CI->getArgOperand(2));
+ uint64_t Size = SizeCast->getZExtValue();
+
+ // Calculate how many load compare blocks are required for an expansion of
+ // given Size.
+ NumBlocks = calculateNumBlocks(Size);
+ createResultBlock();
+
+ // If return value of memcmp is not used in a zero equality, we need to
+ // calculate which source was larger. The calculation requires the
+ // two loaded source values of each load compare block.
+ // These will be saved in the phi nodes created by setupResultBlockPHINodes.
+ if (!IsUsedForZeroCmp)
+ setupResultBlockPHINodes();
+
+ // Create the number of required load compare basic blocks.
+ createLoadCmpBlocks();
+
+ // Update the terminator added by splitBasicBlock to branch to the first
+ // LoadCmpBlock.
+ Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+ StartBlock->getTerminator()->setSuccessor(0, LoadCmpBlocks[0]);
+}
+
+void MemCmpExpansion::createLoadCmpBlocks() {
+ for (unsigned i = 0; i < NumBlocks; i++) {
+ BasicBlock *BB = BasicBlock::Create(CI->getContext(), "loadbb",
+ EndBlock->getParent(), EndBlock);
+ LoadCmpBlocks.push_back(BB);
+ }
+}
+
+void MemCmpExpansion::createResultBlock() {
+ ResBlock.BB = BasicBlock::Create(CI->getContext(), "res_block",
+ EndBlock->getParent(), EndBlock);
+}
+
+// This function creates the IR instructions for loading and comparing 1 byte.
+// It loads 1 byte from each source of the memcmp paramters with the given
+// GEPIndex. It then subtracts the two loaded values and adds this result to the
+// final phi node for selecting the memcmp result.
+void MemCmpExpansion::emitLoadCompareByteBlock(unsigned Index, int GEPIndex) {
+ IRBuilder<> Builder(CI->getContext());
+
+ Value *Source1 = CI->getArgOperand(0);
+ Value *Source2 = CI->getArgOperand(1);
+
+ Builder.SetInsertPoint(LoadCmpBlocks[Index]);
+ Type *LoadSizeType = Type::getInt8Ty(CI->getContext());
+ // Cast source to LoadSizeType*
+ if (Source1->getType() != LoadSizeType)
+ Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo());
+ if (Source2->getType() != LoadSizeType)
+ Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo());
+
+ // Get the base address using the GEPIndex
+ if (GEPIndex != 0) {
+ Source1 = Builder.CreateGEP(LoadSizeType, Source1,
+ ConstantInt::get(LoadSizeType, GEPIndex));
+ Source2 = Builder.CreateGEP(LoadSizeType, Source2,
+ ConstantInt::get(LoadSizeType, GEPIndex));
+ }
+
+ Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1);
+ Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2);
+
+ LoadSrc1 = Builder.CreateZExt(LoadSrc1, Type::getInt32Ty(CI->getContext()));
+ LoadSrc2 = Builder.CreateZExt(LoadSrc2, Type::getInt32Ty(CI->getContext()));
+ Value *Diff = Builder.CreateSub(LoadSrc1, LoadSrc2);
+
+ PhiRes->addIncoming(Diff, LoadCmpBlocks[Index]);
+
+ if (Index < (LoadCmpBlocks.size() - 1)) {
+ // Early exit branch if difference found to EndBlock, otherwise continue to
+ // next LoadCmpBlock
+
+ Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_NE, Diff,
+ ConstantInt::get(Diff->getType(), 0));
+ BranchInst *CmpBr =
+ BranchInst::Create(EndBlock, LoadCmpBlocks[Index + 1], Cmp);
+ Builder.Insert(CmpBr);
+ } else {
+ // The last block has an unconditional branch to EndBlock
+ BranchInst *CmpBr = BranchInst::Create(EndBlock);
+ Builder.Insert(CmpBr);
+ }
+}
+
+unsigned MemCmpExpansion::getNumLoads(unsigned Size) {
+ return (Size / MaxLoadSize) + countPopulation(Size % MaxLoadSize);
+}
+
+unsigned MemCmpExpansion::getLoadSize(unsigned Size) {
+ return MinAlign(PowerOf2Floor(Size), MaxLoadSize);
+}
+
+void MemCmpExpansion::emitLoadCompareBlockMultipleLoads(
+ unsigned Index, unsigned Size, unsigned &NumBytesProcessed) {
+
+ IRBuilder<> Builder(CI->getContext());
+
+ std::vector<Value *> XorList, OrList;
+ Value *Diff;
+
+ unsigned RemainingBytes = Size - NumBytesProcessed;
+ unsigned NumLoadsRemaining = getNumLoads(RemainingBytes);
+ unsigned NumLoads = std::min(NumLoadsRemaining, NumLoadsPerBlock);
+
+ Builder.SetInsertPoint(LoadCmpBlocks[Index]);
+
+ for (unsigned i = 0; i < NumLoads; ++i) {
+ unsigned LoadSize = getLoadSize(RemainingBytes);
+ unsigned GEPIndex = NumBytesProcessed / LoadSize;
+ NumBytesProcessed += LoadSize;
+ RemainingBytes -= LoadSize;
+
+ Type *LoadSizeType = IntegerType::get(CI->getContext(), LoadSize * 8);
+ Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8);
+
+ Value *Source1 = CI->getArgOperand(0);
+ Value *Source2 = CI->getArgOperand(1);
+
+ // Cast source to LoadSizeType*
+ if (Source1->getType() != LoadSizeType)
+ Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo());
+ if (Source2->getType() != LoadSizeType)
+ Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo());
+
+ // Get the base address using the GEPIndex
+ if (GEPIndex != 0) {
+ Source1 = Builder.CreateGEP(LoadSizeType, Source1,
+ ConstantInt::get(LoadSizeType, GEPIndex));
+ Source2 = Builder.CreateGEP(LoadSizeType, Source2,
+ ConstantInt::get(LoadSizeType, GEPIndex));
+ }
+
+ // Load LoadSizeType from the base address
+ Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1);
+ Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2);
+ if (LoadSizeType != MaxLoadType) {
+ LoadSrc1 = Builder.CreateZExtOrTrunc(LoadSrc1, MaxLoadType);
+ LoadSrc2 = Builder.CreateZExtOrTrunc(LoadSrc2, MaxLoadType);
+ }
+ Diff = Builder.CreateXor(LoadSrc1, LoadSrc2);
+ Diff = Builder.CreateZExtOrTrunc(Diff, MaxLoadType);
+ XorList.push_back(Diff);
+ }
+
+ auto pairWiseOr = [&](std::vector<Value *> &InList) -> std::vector<Value *> {
+ std::vector<Value *> OutList;
+ for (unsigned i = 0; i < InList.size() - 1; i = i + 2) {
+ Value *Or = Builder.CreateOr(InList[i], InList[i + 1]);
+ OutList.push_back(Or);
+ }
+ if (InList.size() % 2 != 0)
+ OutList.push_back(InList.back());
+ return OutList;
+ };
+
+ // Pair wise OR the XOR results
+ OrList = pairWiseOr(XorList);
+
+ // Pair wise OR the OR results until one result left
+ while (OrList.size() != 1) {
+ OrList = pairWiseOr(OrList);
+ }
+
+ Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_NE, OrList[0],
+ ConstantInt::get(Diff->getType(), 0));
+ BasicBlock *NextBB = (Index == (LoadCmpBlocks.size() - 1))
+ ? EndBlock
+ : LoadCmpBlocks[Index + 1];
+ // Early exit branch if difference found to ResultBlock, otherwise continue to
+ // next LoadCmpBlock or EndBlock.
+ BranchInst *CmpBr = BranchInst::Create(ResBlock.BB, NextBB, Cmp);
+ Builder.Insert(CmpBr);
+
+ // Add a phi edge for the last LoadCmpBlock to Endblock with a value of 0
+ // since early exit to ResultBlock was not taken (no difference was found in
+ // any of the bytes)
+ if (Index == LoadCmpBlocks.size() - 1) {
+ Value *Zero = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 0);
+ PhiRes->addIncoming(Zero, LoadCmpBlocks[Index]);
+ }
+}
+
+// This function creates the IR intructions for loading and comparing using the
+// given LoadSize. It loads the number of bytes specified by LoadSize from each
+// source of the memcmp parameters. It then does a subtract to see if there was
+// a difference in the loaded values. If a difference is found, it branches
+// with an early exit to the ResultBlock for calculating which source was
+// larger. Otherwise, it falls through to the either the next LoadCmpBlock or
+// the EndBlock if this is the last LoadCmpBlock. Loading 1 byte is handled with
+// a special case through emitLoadCompareByteBlock. The special handling can
+// simply subtract the loaded values and add it to the result phi node.
+void MemCmpExpansion::emitLoadCompareBlock(unsigned Index, int LoadSize,
+ int GEPIndex, bool IsLittleEndian) {
+ if (LoadSize == 1) {
+ MemCmpExpansion::emitLoadCompareByteBlock(Index, GEPIndex);
+ return;
+ }
+
+ IRBuilder<> Builder(CI->getContext());
+
+ Type *LoadSizeType = IntegerType::get(CI->getContext(), LoadSize * 8);
+ Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8);
+
+ Value *Source1 = CI->getArgOperand(0);
+ Value *Source2 = CI->getArgOperand(1);
+
+ Builder.SetInsertPoint(LoadCmpBlocks[Index]);
+ // Cast source to LoadSizeType*
+ if (Source1->getType() != LoadSizeType)
+ Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo());
+ if (Source2->getType() != LoadSizeType)
+ Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo());
+
+ // Get the base address using the GEPIndex
+ if (GEPIndex != 0) {
+ Source1 = Builder.CreateGEP(LoadSizeType, Source1,
+ ConstantInt::get(LoadSizeType, GEPIndex));
+ Source2 = Builder.CreateGEP(LoadSizeType, Source2,
+ ConstantInt::get(LoadSizeType, GEPIndex));
+ }
+
+ // Load LoadSizeType from the base address
+ Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1);
+ Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2);
+
+ if (IsLittleEndian) {
+ Function *F = LoadCmpBlocks[Index]->getParent();
+
+ Function *Bswap = Intrinsic::getDeclaration(F->getParent(),
+ Intrinsic::bswap, LoadSizeType);
+ LoadSrc1 = Builder.CreateCall(Bswap, LoadSrc1);
+ LoadSrc2 = Builder.CreateCall(Bswap, LoadSrc2);
+ }
+
+ if (LoadSizeType != MaxLoadType) {
+ LoadSrc1 = Builder.CreateZExtOrTrunc(LoadSrc1, MaxLoadType);
+ LoadSrc2 = Builder.CreateZExtOrTrunc(LoadSrc2, MaxLoadType);
+ }
+
+ // Add the loaded values to the phi nodes for calculating memcmp result only
+ // if result is not used in a zero equality.
+ if (!IsUsedForZeroCmp) {
+ ResBlock.PhiSrc1->addIncoming(LoadSrc1, LoadCmpBlocks[Index]);
+ ResBlock.PhiSrc2->addIncoming(LoadSrc2, LoadCmpBlocks[Index]);
+ }
+
+ Value *Diff = Builder.CreateSub(LoadSrc1, LoadSrc2);
+
+ Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_NE, Diff,
+ ConstantInt::get(Diff->getType(), 0));
+ BasicBlock *NextBB = (Index == (LoadCmpBlocks.size() - 1))
+ ? EndBlock
+ : LoadCmpBlocks[Index + 1];
+ // Early exit branch if difference found to ResultBlock, otherwise continue to
+ // next LoadCmpBlock or EndBlock.
+ BranchInst *CmpBr = BranchInst::Create(ResBlock.BB, NextBB, Cmp);
+ Builder.Insert(CmpBr);
+
+ // Add a phi edge for the last LoadCmpBlock to Endblock with a value of 0
+ // since early exit to ResultBlock was not taken (no difference was found in
+ // any of the bytes)
+ if (Index == LoadCmpBlocks.size() - 1) {
+ Value *Zero = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 0);
+ PhiRes->addIncoming(Zero, LoadCmpBlocks[Index]);
+ }
+}
+
+// This function populates the ResultBlock with a sequence to calculate the
+// memcmp result. It compares the two loaded source values and returns -1 if
+// src1 < src2 and 1 if src1 > src2.
+void MemCmpExpansion::emitMemCmpResultBlock(bool IsLittleEndian) {
+ IRBuilder<> Builder(CI->getContext());
+
+ // Special case: if memcmp result is used in a zero equality, result does not
+ // need to be calculated and can simply return 1.
+ if (IsUsedForZeroCmp) {
+ BasicBlock::iterator InsertPt = ResBlock.BB->getFirstInsertionPt();
+ Builder.SetInsertPoint(ResBlock.BB, InsertPt);
+ Value *Res = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 1);
+ PhiRes->addIncoming(Res, ResBlock.BB);
+ BranchInst *NewBr = BranchInst::Create(EndBlock);
+ Builder.Insert(NewBr);
+ return;
+ }
+ BasicBlock::iterator InsertPt = ResBlock.BB->getFirstInsertionPt();
+ Builder.SetInsertPoint(ResBlock.BB, InsertPt);
+
+ Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_ULT, ResBlock.PhiSrc1,
+ ResBlock.PhiSrc2);
+
+ Value *Res =
+ Builder.CreateSelect(Cmp, ConstantInt::get(Builder.getInt32Ty(), -1),
+ ConstantInt::get(Builder.getInt32Ty(), 1));
+
+ BranchInst *NewBr = BranchInst::Create(EndBlock);
+ Builder.Insert(NewBr);
+ PhiRes->addIncoming(Res, ResBlock.BB);
+}
+
+int MemCmpExpansion::calculateNumBlocks(unsigned Size) {
+ int NumBlocks = 0;
+ bool haveOneByteLoad = false;
+ unsigned RemainingSize = Size;
+ unsigned LoadSize = MaxLoadSize;
+ while (RemainingSize) {
+ if (LoadSize == 1)
+ haveOneByteLoad = true;
+ NumBlocks += RemainingSize / LoadSize;
+ RemainingSize = RemainingSize % LoadSize;
+ LoadSize = LoadSize / 2;
+ }
+ NumBlocksNonOneByte = haveOneByteLoad ? (NumBlocks - 1) : NumBlocks;
+
+ if (IsUsedForZeroCmp)
+ NumBlocks = NumBlocks / NumLoadsPerBlock +
+ (NumBlocks % NumLoadsPerBlock != 0 ? 1 : 0);
+
+ return NumBlocks;
+}
+
+void MemCmpExpansion::setupResultBlockPHINodes() {
+ IRBuilder<> Builder(CI->getContext());
+ Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8);
+ Builder.SetInsertPoint(ResBlock.BB);
+ ResBlock.PhiSrc1 =
+ Builder.CreatePHI(MaxLoadType, NumBlocksNonOneByte, "phi.src1");
+ ResBlock.PhiSrc2 =
+ Builder.CreatePHI(MaxLoadType, NumBlocksNonOneByte, "phi.src2");
+}
+
+void MemCmpExpansion::setupEndBlockPHINodes() {
+ IRBuilder<> Builder(CI->getContext());
+
+ Builder.SetInsertPoint(&EndBlock->front());
+ PhiRes = Builder.CreatePHI(Type::getInt32Ty(CI->getContext()), 2, "phi.res");
+}
+
+Value *MemCmpExpansion::getMemCmpExpansionZeroCase(unsigned Size,
+ bool IsLittleEndian) {
+ unsigned NumBytesProcessed = 0;
+ // This loop populates each of the LoadCmpBlocks with IR sequence to handle
+ // multiple loads per block
+ for (unsigned i = 0; i < NumBlocks; ++i) {
+ emitLoadCompareBlockMultipleLoads(i, Size, NumBytesProcessed);
+ }
+
+ emitMemCmpResultBlock(IsLittleEndian);
+ return PhiRes;
+}
+
+// This function expands the memcmp call into an inline expansion and returns
+// the memcmp result.
+Value *MemCmpExpansion::getMemCmpExpansion(bool IsLittleEndian) {
+
+ ConstantInt *SizeCast = dyn_cast<ConstantInt>(CI->getArgOperand(2));
+ uint64_t Size = SizeCast->getZExtValue();
+
+ int LoadSize = MaxLoadSize;
+ int NumBytesToBeProcessed = Size;
+
+ if (IsUsedForZeroCmp) {
+ return getMemCmpExpansionZeroCase(Size, IsLittleEndian);
+ }
+
+ unsigned Index = 0;
+ // This loop calls emitLoadCompareBlock for comparing SizeVal bytes of the two
+ // memcmp source. It starts with loading using the maximum load size set by
+ // the target. It processes any remaining bytes using a load size which is the
+ // next smallest power of 2.
+ while (NumBytesToBeProcessed) {
+ // Calculate how many blocks we can create with the current load size
+ int NumBlocks = NumBytesToBeProcessed / LoadSize;
+ int GEPIndex = (Size - NumBytesToBeProcessed) / LoadSize;
+ NumBytesToBeProcessed = NumBytesToBeProcessed % LoadSize;
+
+ // For each NumBlocks, populate the instruction sequence for loading and
+ // comparing LoadSize bytes
+ while (NumBlocks--) {
+ emitLoadCompareBlock(Index, LoadSize, GEPIndex, IsLittleEndian);
+ Index++;
+ GEPIndex++;
+ }
+ // Get the next LoadSize to use
+ LoadSize = LoadSize / 2;
+ }
+
+ emitMemCmpResultBlock(IsLittleEndian);
+ return PhiRes;
+}
+
+// This function checks to see if an expansion of memcmp can be generated.
+// It checks for constant compare size that is less than the max inline size.
+// If an expansion cannot occur, returns false to leave as a library call.
+// Otherwise, the library call is replaced wtih new IR instruction sequence.
+/// We want to transform:
+/// %call = call signext i32 @memcmp(i8* %0, i8* %1, i64 15)
+/// To:
+/// loadbb:
+/// %0 = bitcast i32* %buffer2 to i8*
+/// %1 = bitcast i32* %buffer1 to i8*
+/// %2 = bitcast i8* %1 to i64*
+/// %3 = bitcast i8* %0 to i64*
+/// %4 = load i64, i64* %2
+/// %5 = load i64, i64* %3
+/// %6 = call i64 @llvm.bswap.i64(i64 %4)
+/// %7 = call i64 @llvm.bswap.i64(i64 %5)
+/// %8 = sub i64 %6, %7
+/// %9 = icmp ne i64 %8, 0
+/// br i1 %9, label %res_block, label %loadbb1
+/// res_block: ; preds = %loadbb2,
+/// %loadbb1, %loadbb
+/// %phi.src1 = phi i64 [ %6, %loadbb ], [ %22, %loadbb1 ], [ %36, %loadbb2 ]
+/// %phi.src2 = phi i64 [ %7, %loadbb ], [ %23, %loadbb1 ], [ %37, %loadbb2 ]
+/// %10 = icmp ult i64 %phi.src1, %phi.src2
+/// %11 = select i1 %10, i32 -1, i32 1
+/// br label %endblock
+/// loadbb1: ; preds = %loadbb
+/// %12 = bitcast i32* %buffer2 to i8*
+/// %13 = bitcast i32* %buffer1 to i8*
+/// %14 = bitcast i8* %13 to i32*
+/// %15 = bitcast i8* %12 to i32*
+/// %16 = getelementptr i32, i32* %14, i32 2
+/// %17 = getelementptr i32, i32* %15, i32 2
+/// %18 = load i32, i32* %16
+/// %19 = load i32, i32* %17
+/// %20 = call i32 @llvm.bswap.i32(i32 %18)
+/// %21 = call i32 @llvm.bswap.i32(i32 %19)
+/// %22 = zext i32 %20 to i64
+/// %23 = zext i32 %21 to i64
+/// %24 = sub i64 %22, %23
+/// %25 = icmp ne i64 %24, 0
+/// br i1 %25, label %res_block, label %loadbb2
+/// loadbb2: ; preds = %loadbb1
+/// %26 = bitcast i32* %buffer2 to i8*
+/// %27 = bitcast i32* %buffer1 to i8*
+/// %28 = bitcast i8* %27 to i16*
+/// %29 = bitcast i8* %26 to i16*
+/// %30 = getelementptr i16, i16* %28, i16 6
+/// %31 = getelementptr i16, i16* %29, i16 6
+/// %32 = load i16, i16* %30
+/// %33 = load i16, i16* %31
+/// %34 = call i16 @llvm.bswap.i16(i16 %32)
+/// %35 = call i16 @llvm.bswap.i16(i16 %33)
+/// %36 = zext i16 %34 to i64
+/// %37 = zext i16 %35 to i64
+/// %38 = sub i64 %36, %37
+/// %39 = icmp ne i64 %38, 0
+/// br i1 %39, label %res_block, label %loadbb3
+/// loadbb3: ; preds = %loadbb2
+/// %40 = bitcast i32* %buffer2 to i8*
+/// %41 = bitcast i32* %buffer1 to i8*
+/// %42 = getelementptr i8, i8* %41, i8 14
+/// %43 = getelementptr i8, i8* %40, i8 14
+/// %44 = load i8, i8* %42
+/// %45 = load i8, i8* %43
+/// %46 = zext i8 %44 to i32
+/// %47 = zext i8 %45 to i32
+/// %48 = sub i32 %46, %47
+/// br label %endblock
+/// endblock: ; preds = %res_block,
+/// %loadbb3
+/// %phi.res = phi i32 [ %48, %loadbb3 ], [ %11, %res_block ]
+/// ret i32 %phi.res
+static bool expandMemCmp(CallInst *CI, const TargetTransformInfo *TTI,
+ const TargetLowering *TLI, const DataLayout *DL) {
+ NumMemCmpCalls++;
+ IRBuilder<> Builder(CI->getContext());
+
+ // TTI call to check if target would like to expand memcmp and get the
+ // MaxLoadSize
+ unsigned MaxLoadSize;
+ if (!TTI->expandMemCmp(CI, MaxLoadSize))
+ return false;
+
+ // Early exit from expansion if -Oz
+ if (CI->getParent()->getParent()->optForMinSize()) {
+ return false;
+ }
+
+ // Early exit from expansion if size is not a constant
+ ConstantInt *SizeCast = dyn_cast<ConstantInt>(CI->getArgOperand(2));
+ if (!SizeCast) {
+ NumMemCmpNotConstant++;
+ return false;
+ }
+
+ // Early exit from expansion if size greater than max bytes to load
+ uint64_t SizeVal = SizeCast->getZExtValue();
+
+ unsigned NumLoads = 0;
+ unsigned RemainingSize = SizeVal;
+ unsigned LoadSize = MaxLoadSize;
+ while (RemainingSize) {
+ NumLoads += RemainingSize / LoadSize;
+ RemainingSize = RemainingSize % LoadSize;
+ LoadSize = LoadSize / 2;
+ }
+
+ if (NumLoads >
+ TLI->getMaxExpandSizeMemcmp(CI->getParent()->getParent()->optForSize())) {
+ NumMemCmpGreaterThanMax++;
+ return false;
+ }
+
+ NumMemCmpInlined++;
+
+ // MemCmpHelper object, creates and sets up basic blocks required for
+ // expanding memcmp with size SizeVal
+ unsigned NumLoadsPerBlock = MemCmpNumLoadsPerBlock;
+ MemCmpExpansion MemCmpHelper(CI, MaxLoadSize, NumLoadsPerBlock);
+
+ Value *Res = MemCmpHelper.getMemCmpExpansion(DL->isLittleEndian());
+
+ // Replace call with result of expansion and erarse call.
+ CI->replaceAllUsesWith(Res);
+ CI->eraseFromParent();
+
+ return true;
+}
+
bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool& ModifiedDT) {
BasicBlock *BB = CI->getParent();
@@ -1780,6 +2380,15 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool& ModifiedDT) {
CI->eraseFromParent();
return true;
}
+
+ LibFunc Func;
+ if (TLInfo->getLibFunc(*CI->getCalledFunction(), Func) &&
+ Func == LibFunc_memcmp) {
+ if (expandMemCmp(CI, TTI, TLI, DL)) {
+ ModifiedDT = true;
+ return true;
+ }
+ }
return false;
}
@@ -4927,6 +5536,7 @@ bool CodeGenPrepare::optimizeSwitchInst(SwitchInst *SI) {
return true;
}
+
namespace {
/// \brief Helper class to promote a scalar operation to a vector one.
/// This class is used to move downward extractelement transition.
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index 0def5ae6d0d..3fc6bbc123f 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -842,9 +842,10 @@ TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm) : TM(tm) {
initActions();
// Perform these initializations only once.
- MaxStoresPerMemset = MaxStoresPerMemcpy = MaxStoresPerMemmove = 8;
- MaxStoresPerMemsetOptSize = MaxStoresPerMemcpyOptSize
- = MaxStoresPerMemmoveOptSize = 4;
+ MaxStoresPerMemset = MaxStoresPerMemcpy = MaxStoresPerMemmove =
+ MaxLoadsPerMemcmp = 8;
+ MaxStoresPerMemsetOptSize = MaxStoresPerMemcpyOptSize =
+ MaxStoresPerMemmoveOptSize = MaxLoadsPerMemcmpOptSize = 4;
UseUnderscoreSetJmp = false;
UseUnderscoreLongJmp = false;
HasMultipleConditionRegisters = false;
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 91356fcec45..41ff9d903aa 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -1041,6 +1041,10 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
MaxStoresPerMemset = 128;
MaxStoresPerMemcpy = 128;
MaxStoresPerMemmove = 128;
+ MaxLoadsPerMemcmp = 128;
+ } else {
+ MaxLoadsPerMemcmp = 8;
+ MaxLoadsPerMemcmpOptSize = 4;
}
}
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index 7ee1317bf72..83768da3a9a 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -215,6 +215,11 @@ bool PPCTTIImpl::enableAggressiveInterleaving(bool LoopHasReductions) {
return LoopHasReductions;
}
+bool PPCTTIImpl::expandMemCmp(Instruction *I, unsigned &MaxLoadSize) {
+ MaxLoadSize = 8;
+ return true;
+}
+
bool PPCTTIImpl::enableInterleavedAccessVectorization() {
return true;
}
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
index 6ce70fbd877..2e0116fee04 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -60,6 +60,7 @@ public:
/// @{
bool enableAggressiveInterleaving(bool LoopHasReductions);
+ bool expandMemCmp(Instruction *I, unsigned &MaxLoadSize);
bool enableInterleavedAccessVectorization();
unsigned getNumberOfRegisters(bool Vector);
unsigned getRegisterBitWidth(bool Vector);
diff --git a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
index 49effda5d83..cc6c47e8f97 100644
--- a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -85,20 +85,6 @@ static bool isCallingConvCCompatible(CallInst *CI) {
return false;
}
-/// Return true if it only matters that the value is equal or not-equal to zero.
-static bool isOnlyUsedInZeroEqualityComparison(Value *V) {
- for (User *U : V->users()) {
- if (ICmpInst *IC = dyn_cast<ICmpInst>(U))
- if (IC->isEquality())
- if (Constant *C = dyn_cast<Constant>(IC->getOperand(1)))
- if (C->isNullValue())
- continue;
- // Unknown instruction.
- return false;
- }
- return true;
-}
-
/// Return true if it is only used in equality comparisons with With.
static bool isOnlyUsedInEqualityComparison(Value *V, Value *With) {
for (User *U : V->users()) {
diff --git a/llvm/test/CodeGen/PowerPC/memCmpUsedInZeroEqualityComparison.ll b/llvm/test/CodeGen/PowerPC/memCmpUsedInZeroEqualityComparison.ll
new file mode 100644
index 00000000000..3095429758f
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/memCmpUsedInZeroEqualityComparison.ll
@@ -0,0 +1,121 @@
+; RUN: llc -verify-machineinstrs -mcpu=pwr8 < %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-unknown-linux-gnu"
+
+@zeroEqualityTest01.buffer1 = private unnamed_addr constant [3 x i32] [i32 1, i32 2, i32 4], align 4
+@zeroEqualityTest01.buffer2 = private unnamed_addr constant [3 x i32] [i32 1, i32 2, i32 3], align 4
+@zeroEqualityTest02.buffer1 = private unnamed_addr constant [4 x i32] [i32 4, i32 0, i32 0, i32 0], align 4
+@zeroEqualityTest02.buffer2 = private unnamed_addr constant [4 x i32] [i32 3, i32 0, i32 0, i32 0], align 4
+@zeroEqualityTest03.buffer1 = private unnamed_addr constant [4 x i32] [i32 0, i32 0, i32 0, i32 3], align 4
+@zeroEqualityTest03.buffer2 = private unnamed_addr constant [4 x i32] [i32 0, i32 0, i32 0, i32 4], align 4
+@zeroEqualityTest04.buffer1 = private unnamed_addr constant [15 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14], align 4
+@zeroEqualityTest04.buffer2 = private unnamed_addr constant [15 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 13], align 4
+
+; Function Attrs: nounwind readonly
+declare signext i32 @memcmp(i8* nocapture, i8* nocapture, i64) local_unnamed_addr #1
+
+; Validate with if(memcmp())
+; Function Attrs: nounwind readonly
+define signext i32 @zeroEqualityTest01() local_unnamed_addr #0 {
+entry:
+ %call = tail call signext i32 @memcmp(i8* bitcast ([3 x i32]* @zeroEqualityTest01.buffer1 to i8*), i8* bitcast ([3 x i32]* @zeroEqualityTest01.buffer2 to i8*), i64 16)
+ %not.tobool = icmp ne i32 %call, 0
+ %. = zext i1 %not.tobool to i32
+ ret i32 %.
+
+ ; CHECK-LABEL: @zeroEqualityTest01
+ ; CHECK-LABEL: %res_block
+ ; CHECK: li 3, 1
+ ; CHECK-NEXT: clrldi
+ ; CHECK-NEXT: blr
+ ; CHECK: li 3, 0
+ ; CHECK-NEXT: clrldi
+ ; CHECK-NEXT: blr
+}
+
+; Validate with if(memcmp() == 0)
+; Function Attrs: nounwind readonly
+define signext i32 @zeroEqualityTest02() local_unnamed_addr #0 {
+entry:
+ %call = tail call signext i32 @memcmp(i8* bitcast ([4 x i32]* @zeroEqualityTest02.buffer1 to i8*), i8* bitcast ([4 x i32]* @zeroEqualityTest02.buffer2 to i8*), i64 16)
+ %not.cmp = icmp ne i32 %call, 0
+ %. = zext i1 %not.cmp to i32
+ ret i32 %.
+
+ ; CHECK-LABEL: @zeroEqualityTest02
+ ; CHECK-LABEL: %res_block
+ ; CHECK: li 3, 1
+ ; CHECK-NEXT: clrldi
+ ; CHECK-NEXT: blr
+ ; CHECK: li 3, 0
+ ; CHECK-NEXT: clrldi
+ ; CHECK-NEXT: blr
+}
+
+; Validate with > 0
+; Function Attrs: nounwind readonly
+define signext i32 @zeroEqualityTest03() local_unnamed_addr #0 {
+entry:
+ %call = tail call signext i32 @memcmp(i8* bitcast ([4 x i32]* @zeroEqualityTest02.buffer1 to i8*), i8* bitcast ([4 x i32]* @zeroEqualityTest02.buffer2 to i8*), i64 16)
+ %not.cmp = icmp slt i32 %call, 1
+ %. = zext i1 %not.cmp to i32
+ ret i32 %.
+
+ ; CHECK-LABEL: @zeroEqualityTest03
+ ; CHECK-LABEL: %res_block
+ ; CHECK: cmpld
+ ; CHECK-NEXT: li [[LI:[0-9]+]], 1
+ ; CHECK-NEXT: li [[LI2:[0-9]+]], -1
+ ; CHECK-NEXT: isel [[ISEL:[0-9]+]], [[LI2]], [[LI]], 0
+}
+
+; Validate with < 0
+; Function Attrs: nounwind readonly
+define signext i32 @zeroEqualityTest04() local_unnamed_addr #0 {
+entry:
+ %call = tail call signext i32 @memcmp(i8* bitcast ([4 x i32]* @zeroEqualityTest03.buffer1 to i8*), i8* bitcast ([4 x i32]* @zeroEqualityTest03.buffer2 to i8*), i64 16)
+ %call.lobit = lshr i32 %call, 31
+ %call.lobit.not = xor i32 %call.lobit, 1
+ ret i32 %call.lobit.not
+
+ ; CHECK-LABEL: @zeroEqualityTest04
+ ; CHECK-LABEL: %res_block
+ ; CHECK: cmpld
+ ; CHECK-NEXT: li [[LI:[0-9]+]], 1
+ ; CHECK-NEXT: li [[LI2:[0-9]+]], -1
+ ; CHECK-NEXT: isel [[ISEL:[0-9]+]], [[LI2]], [[LI]], 0
+}
+
+; Validate with memcmp()?:
+; Function Attrs: nounwind readonly
+define signext i32 @zeroEqualityTest05() local_unnamed_addr #0 {
+entry:
+ %call = tail call signext i32 @memcmp(i8* bitcast ([15 x i32]* @zeroEqualityTest04.buffer1 to i8*), i8* bitcast ([15 x i32]* @zeroEqualityTest04.buffer2 to i8*), i64 16)
+ %not.tobool = icmp eq i32 %call, 0
+ %cond = zext i1 %not.tobool to i32
+ ret i32 %cond
+
+ ; CHECK-LABEL: @zeroEqualityTest05
+ ; CHECK-LABEL: %res_block
+ ; CHECK: li 3, 1
+ ; CHECK: li 3, 0
+}
+
+; Validate with !memcmp()?:
+; Function Attrs: nounwind readonly
+define signext i32 @zeroEqualityTest06() local_unnamed_addr #0 {
+entry:
+ %call = tail call signext i32 @memcmp(i8* bitcast ([15 x i32]* @zeroEqualityTest04.buffer1 to i8*), i8* bitcast ([15 x i32]* @zeroEqualityTest04.buffer2 to i8*), i64 16)
+ %not.lnot = icmp ne i32 %call, 0
+ %cond = zext i1 %not.lnot to i32
+ ret i32 %cond
+
+ ; CHECK-LABEL: @zeroEqualityTest06
+ ; CHECK-LABEL: %res_block
+ ; CHECK: li 3, 1
+ ; CHECK-NEXT: clrldi
+ ; CHECK-NEXT: blr
+ ; CHECK: li 3, 0
+ ; CHECK-NEXT: clrldi
+ ; CHECK-NEXT: blr
+}
diff --git a/llvm/test/CodeGen/PowerPC/memcmp.ll b/llvm/test/CodeGen/PowerPC/memcmp.ll
new file mode 100644
index 00000000000..bae713cb207
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/memcmp.ll
@@ -0,0 +1,87 @@
+; RUN: llc -verify-machineinstrs -mcpu=pwr8 -mtriple=powerpc64le-unknown-gnu-linux < %s | FileCheck %s -check-prefix=CHECK
+
+; Check size 8
+; Function Attrs: nounwind readonly
+define signext i32 @test1(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) local_unnamed_addr #0 {
+entry:
+ %0 = bitcast i32* %buffer1 to i8*
+ %1 = bitcast i32* %buffer2 to i8*
+ %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 8) #2
+ ret i32 %call
+
+; CHECK-LABEL: @test1
+; CHECK: ldbrx [[LOAD1:[0-9]+]]
+; CHECK-NEXT: ldbrx [[LOAD2:[0-9]+]]
+; CHECK-NEXT: li [[LI:[0-9]+]], 1
+; CHECK-NEXT: cmpld [[CMPLD:[0-9]+]], [[LOAD1]], [[LOAD2]]
+; CHECK-NEXT: subf. [[SUB:[0-9]+]], [[LOAD2]], [[LOAD1]]
+; CHECK-NEXT: li [[LI2:[0-9]+]], -1
+; CHECK-NEXT: isel [[ISEL:[0-9]+]], [[LI2]], [[LI]], 4
+; CHECK-NEXT: isel [[ISEL2:[0-9]+]], 0, [[ISEL]], 2
+; CHECK-NEXT: extsw 3, [[ISEL2]]
+; CHECK-NEXT: blr
+}
+
+; Check size 4
+; Function Attrs: nounwind readonly
+define signext i32 @test2(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) local_unnamed_addr #0 {
+entry:
+ %0 = bitcast i32* %buffer1 to i8*
+ %1 = bitcast i32* %buffer2 to i8*
+ %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 4) #2
+ ret i32 %call
+
+; CHECK-LABEL: @test2
+; CHECK: lwbrx [[LOAD1:[0-9]+]]
+; CHECK-NEXT: lwbrx [[LOAD2:[0-9]+]]
+; CHECK-NEXT: li [[LI:[0-9]+]], 1
+; CHECK-NEXT: cmpld [[CMPLD:[0-9]+]], [[LOAD1]], [[LOAD2]]
+; CHECK-NEXT: subf. [[SUB:[0-9]+]], [[LOAD2]], [[LOAD1]]
+; CHECK-NEXT: li [[LI2:[0-9]+]], -1
+; CHECK-NEXT: isel [[ISEL:[0-9]+]], [[LI2]], [[LI]], 4
+; CHECK-NEXT: isel [[ISEL2:[0-9]+]], 0, [[ISEL]], 2
+; CHECK-NEXT: extsw 3, [[ISEL2]]
+; CHECK-NEXT: blr
+}
+
+; Check size 2
+; Function Attrs: nounwind readonly
+define signext i32 @test3(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) local_unnamed_addr #0 {
+entry:
+ %0 = bitcast i32* %buffer1 to i8*
+ %1 = bitcast i32* %buffer2 to i8*
+ %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 2) #2
+ ret i32 %call
+
+; CHECK-LABEL: @test3
+; CHECK: lhbrx [[LOAD1:[0-9]+]]
+; CHECK-NEXT: lhbrx [[LOAD2:[0-9]+]]
+; CHECK-NEXT: li [[LI:[0-9]+]], 1
+; CHECK-NEXT: cmpld [[CMPLD:[0-9]+]], [[LOAD1]], [[LOAD2]]
+; CHECK-NEXT: subf. [[SUB:[0-9]+]], [[LOAD2]], [[LOAD1]]
+; CHECK-NEXT: li [[LI2:[0-9]+]], -1
+; CHECK-NEXT: isel [[ISEL:[0-9]+]], [[LI2]], [[LI]], 4
+; CHECK-NEXT: isel [[ISEL2:[0-9]+]], 0, [[ISEL]], 2
+; CHECK-NEXT: extsw 3, [[ISEL2]]
+; CHECK-NEXT: blr
+}
+
+; Check size 1
+; Function Attrs: nounwind readonly
+define signext i32 @test4(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) local_unnamed_addr #0 {
+entry:
+ %0 = bitcast i32* %buffer1 to i8*
+ %1 = bitcast i32* %buffer2 to i8*
+ %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 1) #2
+ ret i32 %call
+
+; CHECK-LABEL: @test4
+; CHECK: lbz [[LOAD1:[0-9]+]]
+; CHECK-NEXT: lbz [[LOAD2:[0-9]+]]
+; CHECK-NEXT: subf [[SUB:[0-9]+]], [[LOAD2]], [[LOAD1]]
+; CHECK-NEXT: extsw 3, [[SUB]]
+; CHECK-NEXT: blr
+}
+
+; Function Attrs: nounwind readonly
+declare signext i32 @memcmp(i8*, i8*, i64) #1
diff --git a/llvm/test/CodeGen/PowerPC/memcmpIR.ll b/llvm/test/CodeGen/PowerPC/memcmpIR.ll
new file mode 100644
index 00000000000..f052cc258df
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/memcmpIR.ll
@@ -0,0 +1,194 @@
+; RUN: llc -o - -mtriple=powerpc64le-unknown-gnu-linux -stop-after codegenprepare %s | FileCheck %s
+; RUN: llc -o - -mtriple=powerpc64-unknown-gnu-linux -stop-after codegenprepare %s | FileCheck %s --check-prefix=CHECK-BE
+
+define signext i32 @test1(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) {
+entry:
+ ; CHECK: [[LOAD1:%[0-9]+]] = load i64, i64*
+ ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i64, i64*
+ ; CHECK-NEXT: [[BSWAP1:%[0-9]+]] = call i64 @llvm.bswap.i64(i64 [[LOAD1]])
+ ; CHECK-NEXT: [[BSWAP2:%[0-9]+]] = call i64 @llvm.bswap.i64(i64 [[LOAD2]])
+ ; CHECK-NEXT: [[SUB:%[0-9]+]] = sub i64 [[BSWAP1]], [[BSWAP2]]
+ ; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0
+ ; CHECK-NEXT: br i1 [[ICMP]], label %res_block, label
+
+ ; CHECK-LABEL: res_block:{{.*}}
+ ; CHECK: [[ICMP2:%[0-9]+]] = icmp ult i64
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]] = select i1 [[ICMP2]], i32 -1, i32 1
+ ; CHECK-NEXT: br label %endblock
+
+ ; CHECK: [[GEP1:%[0-9]+]] = getelementptr i64, i64* {{.*}}, i64 1
+ ; CHECK-NEXT: [[GEP2:%[0-9]+]] = getelementptr i64, i64* {{.*}}, i64 1
+ ; CHECK-NEXT: [[LOAD1:%[0-9]+]] = load i64, i64* [[GEP1]]
+ ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i64, i64* [[GEP2]]
+ ; CHECK-NEXT: [[BSWAP1:%[0-9]+]] = call i64 @llvm.bswap.i64(i64 [[LOAD1]])
+ ; CHECK-NEXT: [[BSWAP2:%[0-9]+]] = call i64 @llvm.bswap.i64(i64 [[LOAD2]])
+ ; CHECK-NEXT: [[SUB:%[0-9]+]] = sub i64 [[BSWAP1]], [[BSWAP2]]
+ ; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0
+ ; CHECK-NEXT: br i1 [[ICMP]], label %res_block, label %endblock
+
+
+ ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i64, i64*
+ ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i64, i64*
+ ; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i64 [[LOAD1]], [[LOAD2]]
+ ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0
+ ; CHECK-BE-NEXT: br i1 [[ICMP]], label %res_block, label
+
+ ; CHECK-BE-LABEL: res_block:{{.*}}
+ ; CHECK-BE: [[ICMP2:%[0-9]+]] = icmp ult i64
+ ; CHECK-BE-NEXT: [[SELECT:%[0-9]+]] = select i1 [[ICMP2]], i32 -1, i32 1
+ ; CHECK-BE-NEXT: br label %endblock
+
+ ; CHECK-BE: [[GEP1:%[0-9]+]] = getelementptr i64, i64* {{.*}}, i64 1
+ ; CHECK-BE-NEXT: [[GEP2:%[0-9]+]] = getelementptr i64, i64* {{.*}}, i64 1
+ ; CHECK-BE-NEXT: [[LOAD1:%[0-9]+]] = load i64, i64* [[GEP1]]
+ ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i64, i64* [[GEP2]]
+ ; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i64 [[LOAD1]], [[LOAD2]]
+ ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0
+ ; CHECK-BE-NEXT: br i1 [[ICMP]], label %res_block, label %endblock
+
+ %0 = bitcast i32* %buffer1 to i8*
+ %1 = bitcast i32* %buffer2 to i8*
+ %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 16)
+ ret i32 %call
+}
+
+declare signext i32 @memcmp(i8* nocapture, i8* nocapture, i64) local_unnamed_addr #1
+
+define signext i32 @test2(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) {
+ ; CHECK: [[LOAD1:%[0-9]+]] = load i32, i32*
+ ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i32, i32*
+ ; CHECK-NEXT: [[BSWAP1:%[0-9]+]] = call i32 @llvm.bswap.i32(i32 [[LOAD1]])
+ ; CHECK-NEXT: [[BSWAP2:%[0-9]+]] = call i32 @llvm.bswap.i32(i32 [[LOAD2]])
+ ; CHECK-NEXT: [[ZEXT1:%[0-9]+]] = zext i32 [[BSWAP1]] to i64
+ ; CHECK-NEXT: [[ZEXT2:%[0-9]+]] = zext i32 [[BSWAP2]] to i64
+ ; CHECK-NEXT: [[SUB:%[0-9]+]] = sub i64 [[ZEXT1]], [[ZEXT2]]
+ ; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0
+ ; CHECK-NEXT: br i1 [[ICMP]], label %res_block, label %endblock
+
+ ; CHECK-LABEL: res_block:{{.*}}
+ ; CHECK: [[ICMP2:%[0-9]+]] = icmp ult i64
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]] = select i1 [[ICMP2]], i32 -1, i32 1
+ ; CHECK-NEXT: br label %endblock
+
+ ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i32, i32*
+ ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i32, i32*
+ ; CHECK-BE-NEXT: [[ZEXT1:%[0-9]+]] = zext i32 [[LOAD1]] to i64
+ ; CHECK-BE-NEXT: [[ZEXT2:%[0-9]+]] = zext i32 [[LOAD2]] to i64
+ ; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i64 [[ZEXT1]], [[ZEXT2]]
+ ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0
+ ; CHECK-BE-NEXT: br i1 [[ICMP]], label %res_block, label %endblock
+
+ ; CHECK-BE-LABEL: res_block:{{.*}}
+ ; CHECK-BE: [[ICMP2:%[0-9]+]] = icmp ult i64
+ ; CHECK-BE-NEXT: [[SELECT:%[0-9]+]] = select i1 [[ICMP2]], i32 -1, i32 1
+ ; CHECK-BE-NEXT: br label %endblock
+
+entry:
+ %0 = bitcast i32* %buffer1 to i8*
+ %1 = bitcast i32* %buffer2 to i8*
+ %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 4)
+ ret i32 %call
+}
+
+define signext i32 @test3(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) {
+ ; CHECK: [[LOAD1:%[0-9]+]] = load i64, i64*
+ ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i64, i64*
+ ; CHECK-NEXT: [[BSWAP1:%[0-9]+]] = call i64 @llvm.bswap.i64(i64 [[LOAD1]])
+ ; CHECK-NEXT: [[BSWAP2:%[0-9]+]] = call i64 @llvm.bswap.i64(i64 [[LOAD2]])
+ ; CHECK-NEXT: [[SUB:%[0-9]+]] = sub i64 [[BSWAP1]], [[BSWAP2]]
+ ; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0
+ ; CHECK-NEXT: br i1 [[ICMP]], label %res_block, label
+
+ ; CHECK-LABEL: res_block:{{.*}}
+ ; CHECK: [[ICMP2:%[0-9]+]] = icmp ult i64
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]] = select i1 [[ICMP2]], i32 -1, i32 1
+ ; CHECK-NEXT: br label %endblock
+
+ ; CHECK: [[LOAD1:%[0-9]+]] = load i32, i32*
+ ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i32, i32*
+ ; CHECK-NEXT: [[BSWAP1:%[0-9]+]] = call i32 @llvm.bswap.i32(i32 [[LOAD1]])
+ ; CHECK-NEXT: [[BSWAP2:%[0-9]+]] = call i32 @llvm.bswap.i32(i32 [[LOAD2]])
+ ; CHECK-NEXT: [[ZEXT1:%[0-9]+]] = zext i32 [[BSWAP1]] to i64
+ ; CHECK-NEXT: [[ZEXT2:%[0-9]+]] = zext i32 [[BSWAP2]] to i64
+ ; CHECK-NEXT: [[SUB:%[0-9]+]] = sub i64 [[ZEXT1]], [[ZEXT2]]
+ ; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0
+ ; CHECK-NEXT: br i1 [[ICMP]], label %res_block, label
+
+ ; CHECK: [[LOAD1:%[0-9]+]] = load i16, i16*
+ ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i16, i16*
+ ; CHECK-NEXT: [[BSWAP1:%[0-9]+]] = call i16 @llvm.bswap.i16(i16 [[LOAD1]])
+ ; CHECK-NEXT: [[BSWAP2:%[0-9]+]] = call i16 @llvm.bswap.i16(i16 [[LOAD2]])
+ ; CHECK-NEXT: [[ZEXT1:%[0-9]+]] = zext i16 [[BSWAP1]] to i64
+ ; CHECK-NEXT: [[ZEXT2:%[0-9]+]] = zext i16 [[BSWAP2]] to i64
+ ; CHECK-NEXT: [[SUB:%[0-9]+]] = sub i64 [[ZEXT1]], [[ZEXT2]]
+ ; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0
+ ; CHECK-NEXT: br i1 [[ICMP]], label %res_block, label
+
+ ; CHECK: [[LOAD1:%[0-9]+]] = load i8, i8*
+ ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i8, i8*
+ ; CHECK-NEXT: [[ZEXT1:%[0-9]+]] = zext i8 [[LOAD1]] to i32
+ ; CHECK-NEXT: [[ZEXT2:%[0-9]+]] = zext i8 [[LOAD2]] to i32
+ ; CHECK-NEXT: [[SUB:%[0-9]+]] = sub i32 [[ZEXT1]], [[ZEXT2]]
+ ; CHECK-NEXT: br label %endblock
+
+ ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i64, i64*
+ ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i64, i64*
+ ; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i64 [[LOAD1]], [[LOAD2]]
+ ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0
+ ; CHECK-BE-NEXT: br i1 [[ICMP]], label %res_block, label
+
+ ; CHECK-BE-LABEL: res_block:{{.*}}
+ ; CHECK-BE: [[ICMP2:%[0-9]+]] = icmp ult i64
+ ; CHECK-BE-NEXT: [[SELECT:%[0-9]+]] = select i1 [[ICMP2]], i32 -1, i32 1
+ ; CHECK-BE-NEXT: br label %endblock
+
+ ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i32, i32*
+ ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i32, i32*
+ ; CHECK-BE-NEXT: [[ZEXT1:%[0-9]+]] = zext i32 [[LOAD1]] to i64
+ ; CHECK-BE-NEXT: [[ZEXT2:%[0-9]+]] = zext i32 [[LOAD2]] to i64
+ ; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i64 [[ZEXT1]], [[ZEXT2]]
+ ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0
+ ; CHECK-BE-NEXT: br i1 [[ICMP]], label %res_block, label
+
+ ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i16, i16*
+ ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i16, i16*
+ ; CHECK-BE-NEXT: [[ZEXT1:%[0-9]+]] = zext i16 [[LOAD1]] to i64
+ ; CHECK-BE-NEXT: [[ZEXT2:%[0-9]+]] = zext i16 [[LOAD2]] to i64
+ ; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i64 [[ZEXT1]], [[ZEXT2]]
+ ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0
+ ; CHECK-BE-NEXT: br i1 [[ICMP]], label %res_block, label
+
+ ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i8, i8*
+ ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i8, i8*
+ ; CHECK-BE-NEXT: [[ZEXT1:%[0-9]+]] = zext i8 [[LOAD1]] to i32
+ ; CHECK-BE-NEXT: [[ZEXT2:%[0-9]+]] = zext i8 [[LOAD2]] to i32
+ ; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i32 [[ZEXT1]], [[ZEXT2]]
+ ; CHECK-BE-NEXT: br label %endblock
+
+entry:
+ %0 = bitcast i32* %buffer1 to i8*
+ %1 = bitcast i32* %buffer2 to i8*
+ %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 15)
+ ret i32 %call
+}
+ ; CHECK: call = tail call signext i32 @memcmp
+ ; CHECK-BE: call = tail call signext i32 @memcmp
+define signext i32 @test4(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) {
+
+entry:
+ %0 = bitcast i32* %buffer1 to i8*
+ %1 = bitcast i32* %buffer2 to i8*
+ %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 65)
+ ret i32 %call
+}
+
+define signext i32 @test5(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2, i32 signext %SIZE) {
+ ; CHECK: call = tail call signext i32 @memcmp
+ ; CHECK-BE: call = tail call signext i32 @memcmp
+entry:
+ %0 = bitcast i32* %buffer1 to i8*
+ %1 = bitcast i32* %buffer2 to i8*
+ %conv = sext i32 %SIZE to i64
+ %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 %conv)
+ ret i32 %call
+}
OpenPOWER on IntegriCloud