author     Mohammed Agabaria <mohammed.agabaria@intel.com>   2017-01-11 08:23:37 +0000
committer  Mohammed Agabaria <mohammed.agabaria@intel.com>   2017-01-11 08:23:37 +0000
commit     2c96c433881bc65a6808386fbe213ea8227b09f2 (patch)
tree       318006563301f84c03e96ad3363732f3f2e971fa /llvm
parent     babd4476b2fa93ce2e746b9d8ca165c6fc3449d5 (diff)
download   bcm5719-llvm-2c96c433881bc65a6808386fbe213ea8227b09f2.tar.gz
           bcm5719-llvm-2c96c433881bc65a6808386fbe213ea8227b09f2.zip
[X86] updating TTI costs for arithmetic instructions on X86\SLM arch.
Updated instructions: pmulld, pmullw, pmulhw, mulsd, mulps, mulpd, divss, divps, divsd, divpd, addpd and subpd.
Also adds a special optimization case that replaces pmulld with a pmullw/pmulhw/pshuf sequence when the real operand bit width is <= 16.

Differential Revision: https://reviews.llvm.org/D28104

llvm-svn: 291657
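Before reading the diff, the decision this patch adds for v4i32 multiplies on SLM can be summarized by a small standalone C++ sketch. The sketch is illustrative only and is not part of the patch: it does not use the LLVM API, the function name and sample bit widths are hypothetical, and the per-type legalization scaling (LT.first in the real code) is left out.

// Standalone model of the SLM v4i32 multiply cost heuristic added by this
// patch. Hypothetical sketch: it mirrors the decision logic only and omits
// the LT.first legalization factor applied in X86TTIImpl.
#include <algorithm>
#include <cstdio>

// Cost of a v4i32 multiply on SLM, given the minimum number of bits
// (excluding the sign bit) each operand really needs and whether either
// operand is signed, as computed by minRequiredElementSize in the patch.
int slmV4i32MulCost(unsigned Op1MinSize, bool Op1Signed,
                    unsigned Op2MinSize, bool Op2Signed) {
  bool SignedMode = Op1Signed || Op2Signed;
  unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);

  if (OpMinSize <= 7)
    return 3;                 // pmullw + sign-extend of the result
  if (!SignedMode && OpMinSize <= 8)
    return 3;                 // pmullw + zero-extend of the result
  if (OpMinSize <= 15)
    return 5;                 // pmullw/pmulhw/pshuf sequence
  if (!SignedMode && OpMinSize <= 16)
    return 5;                 // pmullw/pmulhw/pshuf sequence
  return 11;                  // plain pmulld (SLM cost table entry)
}

int main() {
  // Both operands zero-extended from i8: the cheap pmullw/zext path.
  std::printf("zext i8  * zext i8  -> %d\n", slmV4i32MulCost(8, false, 8, false));
  // Both operands sign-extended from i16: pmullw/pmulhw/pshuf path.
  std::printf("sext i16 * sext i16 -> %d\n", slmV4i32MulCost(15, true, 15, true));
  // Full 32-bit operands: fall back to pmulld.
  std::printf("i32      * i32      -> %d\n", slmV4i32MulCost(32, false, 32, false));
  return 0;
}

These cost values match the expectations encoded in the slm-arith-costs.ll and mul_slm_16bit.ll tests added below.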
Diffstat (limited to 'llvm')
-rw-r--r--llvm/include/llvm/Analysis/TargetTransformInfo.h14
-rw-r--r--llvm/include/llvm/Analysis/TargetTransformInfoImpl.h60
-rw-r--r--llvm/include/llvm/CodeGen/BasicTTIImpl.h3
-rw-r--r--llvm/lib/Analysis/CostModel.cpp5
-rw-r--r--llvm/lib/Analysis/TargetTransformInfo.cpp5
-rw-r--r--llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp2
-rw-r--r--llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h3
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp2
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h3
-rw-r--r--llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp3
-rw-r--r--llvm/lib/Target/ARM/ARMTargetTransformInfo.h3
-rw-r--r--llvm/lib/Target/Lanai/LanaiTargetTransformInfo.h3
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp2
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h3
-rw-r--r--llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp2
-rw-r--r--llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h3
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp2
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h3
-rw-r--r--llvm/lib/Target/X86/X86TargetTransformInfo.cpp53
-rw-r--r--llvm/lib/Target/X86/X86TargetTransformInfo.h3
-rw-r--r--llvm/lib/Transforms/Vectorize/LoopVectorize.cpp7
-rw-r--r--llvm/test/Analysis/CostModel/X86/slm-arith-costs.ll317
-rw-r--r--llvm/test/Transforms/LoopVectorize/X86/mul_slm_16bit.ll144
23 files changed, 616 insertions, 29 deletions
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index b4a6c5c2fae..b63e8a22ce2 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -518,11 +518,15 @@ public:
unsigned getMaxInterleaveFactor(unsigned VF) const;
/// \return The expected cost of arithmetic ops, such as mul, xor, fsub, etc.
+ /// \p Args is an optional argument which holds the instruction operand
+ /// values, so the TTI can analyze those values searching for special
+ /// cases/optimizations based on those values.
int getArithmeticInstrCost(
unsigned Opcode, Type *Ty, OperandValueKind Opd1Info = OK_AnyValue,
OperandValueKind Opd2Info = OK_AnyValue,
OperandValueProperties Opd1PropInfo = OP_None,
- OperandValueProperties Opd2PropInfo = OP_None) const;
+ OperandValueProperties Opd2PropInfo = OP_None,
+ ArrayRef<const Value *> Args = ArrayRef<const Value *>()) const;
/// \return The cost of a shuffle instruction of kind Kind and of type Tp.
/// The index and subtype parameters are used by the subvector insertion and
@@ -763,7 +767,8 @@ public:
getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueKind Opd1Info,
OperandValueKind Opd2Info,
OperandValueProperties Opd1PropInfo,
- OperandValueProperties Opd2PropInfo) = 0;
+ OperandValueProperties Opd2PropInfo,
+ ArrayRef<const Value *> Args) = 0;
virtual int getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
Type *SubTp) = 0;
virtual int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) = 0;
@@ -984,9 +989,10 @@ public:
getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueKind Opd1Info,
OperandValueKind Opd2Info,
OperandValueProperties Opd1PropInfo,
- OperandValueProperties Opd2PropInfo) override {
+ OperandValueProperties Opd2PropInfo,
+ ArrayRef<const Value *> Args) override {
return Impl.getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
- Opd1PropInfo, Opd2PropInfo);
+ Opd1PropInfo, Opd2PropInfo, Args);
}
int getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
Type *SubTp) override {
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 1d7edbaf7df..cafc40723c9 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -306,7 +306,8 @@ public:
TTI::OperandValueKind Opd1Info,
TTI::OperandValueKind Opd2Info,
TTI::OperandValueProperties Opd1PropInfo,
- TTI::OperandValueProperties Opd2PropInfo) {
+ TTI::OperandValueProperties Opd2PropInfo,
+ ArrayRef<const Value *> Args) {
return 1;
}
@@ -427,6 +428,63 @@ public:
return VF;
}
protected:
+ // Obtain the minimum required size to hold the value (without the sign).
+ // In case of a vector, it returns the min required size for one element.
+ unsigned minRequiredElementSize(const Value* Val, bool &isSigned) {
+ if (isa<ConstantDataVector>(Val) || isa<ConstantVector>(Val)) {
+ const auto* VectorValue = cast<Constant>(Val);
+
+ // In case of a vector, we need to pick the max of the min
+ // required sizes of the elements.
+ auto *VT = cast<VectorType>(Val->getType());
+
+ // Assume unsigned elements
+ isSigned = false;
+
+ // The max required size is the total vector width divided by num
+ // of elements in the vector
+ unsigned MaxRequiredSize = VT->getBitWidth() / VT->getNumElements();
+
+ unsigned MinRequiredSize = 0;
+ for(unsigned i = 0, e = VT->getNumElements(); i < e; ++i) {
+ if (auto* IntElement =
+ dyn_cast<ConstantInt>(VectorValue->getAggregateElement(i))) {
+ bool signedElement = IntElement->getValue().isNegative();
+ // Get the element min required size.
+ unsigned ElementMinRequiredSize =
+ IntElement->getValue().getMinSignedBits() - 1;
+ // If one element is signed then the whole vector is signed.
+ isSigned |= signedElement;
+ // Save the max required bit size between all the elements.
+ MinRequiredSize = std::max(MinRequiredSize, ElementMinRequiredSize);
+ }
+ else {
+ // not an int constant element
+ return MaxRequiredSize;
+ }
+ }
+ return MinRequiredSize;
+ }
+
+ if (const auto* CI = dyn_cast<ConstantInt>(Val)) {
+ isSigned = CI->getValue().isNegative();
+ return CI->getValue().getMinSignedBits() - 1;
+ }
+
+ if (const auto* Cast = dyn_cast<SExtInst>(Val)) {
+ isSigned = true;
+ return Cast->getSrcTy()->getScalarSizeInBits() - 1;
+ }
+
+ if (const auto* Cast = dyn_cast<ZExtInst>(Val)) {
+ isSigned = false;
+ return Cast->getSrcTy()->getScalarSizeInBits();
+ }
+
+ isSigned = false;
+ return Val->getType()->getScalarSizeInBits();
+ }
+
bool isStridedAccess(const SCEV *Ptr) {
return Ptr && isa<SCEVAddRecExpr>(Ptr);
}
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 8e96336b981..7efdbcccdef 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -308,7 +308,8 @@ public:
TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
- TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None) {
+ TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
+ ArrayRef<const Value *> Args = ArrayRef<const Value *>()) {
// Check if any of the operands are vector operands.
const TargetLoweringBase *TLI = getTLI();
int ISD = TLI->InstructionOpcodeToISD(Opcode);
diff --git a/llvm/lib/Analysis/CostModel.cpp b/llvm/lib/Analysis/CostModel.cpp
index 67d1773f081..6b77397956c 100644
--- a/llvm/lib/Analysis/CostModel.cpp
+++ b/llvm/lib/Analysis/CostModel.cpp
@@ -438,8 +438,11 @@ unsigned CostModelAnalysis::getInstructionCost(const Instruction *I) const {
getOperandInfo(I->getOperand(0));
TargetTransformInfo::OperandValueKind Op2VK =
getOperandInfo(I->getOperand(1));
+ SmallVector<const Value*, 2> Operands(I->operand_values());
return TTI->getArithmeticInstrCost(I->getOpcode(), I->getType(), Op1VK,
- Op2VK);
+ Op2VK, TargetTransformInfo::OP_None,
+ TargetTransformInfo::OP_None,
+ Operands);
}
case Instruction::Select: {
const SelectInst *SI = cast<SelectInst>(I);
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index cd8c24630df..5c0d1aac1b9 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -277,9 +277,10 @@ unsigned TargetTransformInfo::getMaxInterleaveFactor(unsigned VF) const {
int TargetTransformInfo::getArithmeticInstrCost(
unsigned Opcode, Type *Ty, OperandValueKind Opd1Info,
OperandValueKind Opd2Info, OperandValueProperties Opd1PropInfo,
- OperandValueProperties Opd2PropInfo) const {
+ OperandValueProperties Opd2PropInfo,
+ ArrayRef<const Value *> Args) const {
int Cost = TTIImpl->getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
- Opd1PropInfo, Opd2PropInfo);
+ Opd1PropInfo, Opd2PropInfo, Args);
assert(Cost >= 0 && "TTI should not produce negative costs!");
return Cost;
}
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 1b8eb6fd7e0..b8833e5a555 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -374,7 +374,7 @@ int AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
int AArch64TTIImpl::getArithmeticInstrCost(
unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
- TTI::OperandValueProperties Opd2PropInfo) {
+ TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args) {
// Legalize the type.
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 849fd3d9b44..18287ed6653 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -102,7 +102,8 @@ public:
TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
- TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None);
+ TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
+ ArrayRef<const Value *> Args = ArrayRef<const Value *>());
int getAddressComputationCost(Type *Ty, ScalarEvolution *SE, const SCEV *Ptr);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index a1a35264224..e9048706599 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -110,7 +110,7 @@ unsigned AMDGPUTTIImpl::getMaxInterleaveFactor(unsigned VF) {
int AMDGPUTTIImpl::getArithmeticInstrCost(
unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
- TTI::OperandValueProperties Opd2PropInfo) {
+ TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args) {
EVT OrigTy = TLI->getValueType(DL, Ty);
if (!OrigTy.isSimple()) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index 1177007644f..0d83b2a585b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -83,7 +83,8 @@ public:
TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
- TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None);
+ TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
+ ArrayRef<const Value *> Args = ArrayRef<const Value *>());
unsigned getCFInstrCost(unsigned Opcode);
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index cc001b59678..2b6b36bc3e6 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -433,7 +433,8 @@ int ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
int ARMTTIImpl::getArithmeticInstrCost(
unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info,
TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo,
- TTI::OperandValueProperties Opd2PropInfo) {
+ TTI::OperandValueProperties Opd2PropInfo,
+ ArrayRef<const Value *> Args) {
int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
index 731a5adf3d7..3c83cd92a61 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -114,7 +114,8 @@ public:
TTI::OperandValueKind Op1Info = TTI::OK_AnyValue,
TTI::OperandValueKind Op2Info = TTI::OK_AnyValue,
TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
- TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None);
+ TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
+ ArrayRef<const Value *> Args = ArrayRef<const Value *>());
int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
unsigned AddressSpace);
diff --git a/llvm/lib/Target/Lanai/LanaiTargetTransformInfo.h b/llvm/lib/Target/Lanai/LanaiTargetTransformInfo.h
index 7fcb3ce45bb..d95c16fc3ca 100644
--- a/llvm/lib/Target/Lanai/LanaiTargetTransformInfo.h
+++ b/llvm/lib/Target/Lanai/LanaiTargetTransformInfo.h
@@ -54,7 +54,8 @@ public:
TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
- TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None) {
+ TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
+ ArrayRef<const Value *> Args = ArrayRef<const Value *>()) {
int ISD = TLI->InstructionOpcodeToISD(Opcode);
switch (ISD) {
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
index 48928ee2d54..dd770708494 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
@@ -115,7 +115,7 @@ bool NVPTXTTIImpl::isSourceOfDivergence(const Value *V) {
int NVPTXTTIImpl::getArithmeticInstrCost(
unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
- TTI::OperandValueProperties Opd2PropInfo) {
+ TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args) {
// Legalize the type.
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
index d953aa8a719..b6c271ae4cb 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
@@ -54,7 +54,8 @@ public:
TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
- TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None);
+ TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
+ ArrayRef<const Value *> Args = ArrayRef<const Value *>());
void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP);
};
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index f7785342b36..f94d1eab097 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -281,7 +281,7 @@ unsigned PPCTTIImpl::getMaxInterleaveFactor(unsigned VF) {
int PPCTTIImpl::getArithmeticInstrCost(
unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info,
TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo,
- TTI::OperandValueProperties Opd2PropInfo) {
+ TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args) {
assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");
// Fallback to the default implementation.
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
index 8308086ccfa..30ee2814aba 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -71,7 +71,8 @@ public:
TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
- TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None);
+ TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
+ ArrayRef<const Value *> Args = ArrayRef<const Value *>());
int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp);
int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src);
int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy);
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
index bf546dab5fb..47aadf99e86 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
@@ -46,7 +46,7 @@ unsigned WebAssemblyTTIImpl::getRegisterBitWidth(bool Vector) {
unsigned WebAssemblyTTIImpl::getArithmeticInstrCost(
unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
- TTI::OperandValueProperties Opd2PropInfo) {
+ TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args) {
unsigned Cost = BasicTTIImplBase<WebAssemblyTTIImpl>::getArithmeticInstrCost(
Opcode, Ty, Opd1Info, Opd2Info, Opd1PropInfo, Opd2PropInfo);
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
index 2a2e3941f82..f658609f893 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
@@ -61,7 +61,8 @@ public:
TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
- TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None);
+ TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
+ ArrayRef<const Value *> Args = ArrayRef<const Value *>());
unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
/// @}
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 107ed935937..586786d29e9 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -114,15 +114,62 @@ unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) {
}
int X86TTIImpl::getArithmeticInstrCost(
- unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info,
- TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo,
- TTI::OperandValueProperties Opd2PropInfo) {
+ unsigned Opcode, Type *Ty,
+ TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
+ TTI::OperandValueProperties Opd1PropInfo,
+ TTI::OperandValueProperties Opd2PropInfo,
+ ArrayRef<const Value *> Args) {
// Legalize the type.
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
+ static const CostTblEntry SLMCostTable[] = {
+ { ISD::MUL, MVT::v4i32, 11 }, // pmulld
+ { ISD::MUL, MVT::v8i16, 2 }, // pmullw
+ { ISD::MUL, MVT::v16i8, 14 }, // extend/pmullw/trunc sequence.
+ { ISD::FMUL, MVT::f64, 2 }, // mulsd
+ { ISD::FMUL, MVT::v2f64, 4 }, // mulpd
+ { ISD::FMUL, MVT::v4f32, 2 }, // mulps
+ { ISD::FDIV, MVT::f32, 17 }, // divss
+ { ISD::FDIV, MVT::v4f32, 39 }, // divps
+ { ISD::FDIV, MVT::f64, 32 }, // divsd
+ { ISD::FDIV, MVT::v2f64, 69 }, // divpd
+ { ISD::FADD, MVT::v2f64, 2 }, // addpd
+ { ISD::FSUB, MVT::v2f64, 2 }, // subpd
+ // v2i64/v4i64 mul is custom lowered as a series of long
+ // multiplies (3), shifts (3) and adds (2).
+ // The SLM muldq version's throughput is 2.
+ { ISD::MUL, MVT::v2i64, 11 },
+ };
+
+ if (ST->isSLM()) {
+ if (Args.size() == 2 && ISD == ISD::MUL && LT.second == MVT::v4i32) {
+ // Check if the operands can be shrunk into a smaller data type.
+ bool Op1Signed = false;
+ unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
+ bool Op2Signed = false;
+ unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);
+
+ bool signedMode = Op1Signed | Op2Signed;
+ unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);
+
+ if (OpMinSize <= 7)
+ return LT.first * 3; // pmullw/sext
+ if (!signedMode && OpMinSize <= 8)
+ return LT.first * 3; // pmullw/zext
+ if (OpMinSize <= 15)
+ return LT.first * 5; // pmullw/pmulhw/pshuf
+ if (!signedMode && OpMinSize <= 16)
+ return LT.first * 5; // pmullw/pmulhw/pshuf
+ }
+ if (const auto *Entry = CostTableLookup(SLMCostTable, ISD,
+ LT.second)) {
+ return LT.first * Entry->Cost;
+ }
+ }
+
if (ISD == ISD::SDIV &&
Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index c013805f432..ecaaf951cff 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -60,7 +60,8 @@ public:
TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
- TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None);
+ TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
+ ArrayRef<const Value *> Args = ArrayRef<const Value *>());
int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp);
int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src);
int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy);
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index c9819a8e270..1b1f86f8efd 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -80,6 +80,7 @@
#include "llvm/IR/Module.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
@@ -6949,9 +6950,9 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
} else if (Legal->isUniform(Op2)) {
Op2VK = TargetTransformInfo::OK_UniformValue;
}
-
- return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, Op2VK,
- Op1VP, Op2VP);
+ SmallVector<const Value *, 4> Operands(I->operand_values());
+ return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK,
+ Op2VK, Op1VP, Op2VP, Operands);
}
case Instruction::Select: {
SelectInst *SI = cast<SelectInst>(I);
diff --git a/llvm/test/Analysis/CostModel/X86/slm-arith-costs.ll b/llvm/test/Analysis/CostModel/X86/slm-arith-costs.ll
new file mode 100644
index 00000000000..3673a5d9e06
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/X86/slm-arith-costs.ll
@@ -0,0 +1,317 @@
+; RUN: opt < %s -cost-model -analyze -mcpu=slm | FileCheck %s --check-prefix=SLM
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; 8bit mul
+define i8 @slm-costs_8_scalar_mul(i8 %a, i8 %b) {
+entry:
+; SLM: cost of 1 {{.*}} mul nsw i8
+ %res = mul nsw i8 %a, %b
+ ret i8 %res
+}
+
+define <2 x i8> @slm-costs_8_v2_mul(<2 x i8> %a, <2 x i8> %b) {
+entry:
+; SLM: cost of 11 {{.*}} mul nsw <2 x i8>
+ %res = mul nsw <2 x i8> %a, %b
+ ret <2 x i8> %res
+}
+
+define <4 x i8> @slm-costs_8_v4_mul(<4 x i8> %a, <4 x i8> %b) {
+entry:
+; SLM: cost of 3 {{.*}} mul nsw <4 x i8>
+ %res = mul nsw <4 x i8> %a, %b
+ ret <4 x i8> %res
+}
+
+define <4 x i32> @slm-costs_8_v4_zext_mul(<4 x i8> %a) {
+entry:
+; SLM: cost of 3 {{.*}} mul nsw <4 x i32>
+ %zext = zext <4 x i8> %a to <4 x i32>
+ %res = mul nsw <4 x i32> %zext, <i32 255, i32 255, i32 255, i32 255>
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @slm-costs_8_v4_zext_mul_fail(<4 x i8> %a) {
+entry:
+; SLM: cost of 5 {{.*}} mul nsw <4 x i32>
+ %zext = zext <4 x i8> %a to <4 x i32>
+ %res = mul nsw <4 x i32> %zext, <i32 255, i32 255, i32 -1, i32 255>
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @slm-costs_8_v4_zext_mul_fail_2(<4 x i8> %a) {
+entry:
+; SLM: cost of 5 {{.*}} mul nsw <4 x i32>
+ %zext = zext <4 x i8> %a to <4 x i32>
+ %res = mul nsw <4 x i32> %zext, <i32 255, i32 256, i32 255, i32 255>
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @slm-costs_8_v4_sext_mul(<4 x i8> %a) {
+entry:
+; SLM: cost of 3 {{.*}} mul nsw <4 x i32>
+ %sext = sext <4 x i8> %a to <4 x i32>
+ %res = mul nsw <4 x i32> %sext, <i32 127, i32 -128, i32 127, i32 -128>
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @slm-costs_8_v4_sext_mul_fail(<4 x i8> %a) {
+entry:
+; SLM: cost of 5 {{.*}} mul nsw <4 x i32>
+ %sext = sext <4 x i8> %a to <4 x i32>
+ %res = mul nsw <4 x i32> %sext, <i32 127, i32 -128, i32 128, i32 -128>
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @slm-costs_8_v4_sext_mul_fail_2(<4 x i8> %a) {
+entry:
+; SLM: cost of 5 {{.*}} mul nsw <4 x i32>
+ %sext = sext <4 x i8> %a to <4 x i32>
+ %res = mul nsw <4 x i32> %sext, <i32 127, i32 -129, i32 127, i32 -128>
+ ret <4 x i32> %res
+}
+
+define <8 x i8> @slm-costs_8_v8_mul(<8 x i8> %a, <8 x i8> %b) {
+entry:
+; SLM: cost of 2 {{.*}} mul nsw <8 x i8>
+ %res = mul nsw <8 x i8> %a, %b
+ ret <8 x i8> %res
+}
+
+define <16 x i8> @slm-costs_8_v16_mul(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; SLM: cost of 14 {{.*}} mul nsw <16 x i8>
+ %res = mul nsw <16 x i8> %a, %b
+ ret <16 x i8> %res
+}
+
+; 16bit mul
+define i16 @slm-costs_16_scalar_mul(i16 %a, i16 %b) {
+entry:
+; SLM: cost of 1 {{.*}} mul nsw i16
+ %res = mul nsw i16 %a, %b
+ ret i16 %res
+}
+
+define <2 x i16> @slm-costs_16_v2_mul(<2 x i16> %a, <2 x i16> %b) {
+entry:
+; SLM: cost of 11 {{.*}} mul nsw <2 x i16>
+ %res = mul nsw <2 x i16> %a, %b
+ ret <2 x i16> %res
+}
+
+define <4 x i16> @slm-costs_16_v4_mul(<4 x i16> %a, <4 x i16> %b) {
+entry:
+; SLM: cost of 5 {{.*}} mul nsw <4 x i16>
+ %res = mul nsw <4 x i16> %a, %b
+ ret <4 x i16> %res
+}
+
+define <4 x i32> @slm-costs_16_v4_zext_mul(<4 x i16> %a) {
+entry:
+; SLM: cost of 5 {{.*}} mul nsw <4 x i32>
+ %zext = zext <4 x i16> %a to <4 x i32>
+ %res = mul nsw <4 x i32> %zext, <i32 65535, i32 65535, i32 65535, i32 65535>
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @slm-costs_16_v4_zext_mul_fail(<4 x i16> %a) {
+entry:
+; SLM: cost of 11 {{.*}} mul nsw <4 x i32>
+ %zext = zext <4 x i16> %a to <4 x i32>
+ %res = mul nsw <4 x i32> %zext, <i32 -1, i32 65535, i32 65535, i32 65535>
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @slm-costs_16_v4_zext_mul_fail_2(<4 x i16> %a) {
+entry:
+; SLM: cost of 11 {{.*}} mul nsw <4 x i32>
+ %zext = zext <4 x i16> %a to <4 x i32>
+ %res = mul nsw <4 x i32> %zext, <i32 65536, i32 65535, i32 65535, i32 65535>
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @slm-costs_16_v4_sext_mul(<4 x i16> %a) {
+entry:
+; SLM: cost of 5 {{.*}} mul nsw <4 x i32>
+ %sext = sext <4 x i16> %a to <4 x i32>
+ %res = mul nsw <4 x i32> %sext, <i32 32767, i32 -32768, i32 32767, i32 -32768>
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @slm-costs_16_v4_sext_mul_fail(<4 x i16> %a) {
+entry:
+; SLM: cost of 11 {{.*}} mul nsw <4 x i32>
+ %sext = sext <4 x i16> %a to <4 x i32>
+ %res = mul nsw <4 x i32> %sext, <i32 32767, i32 -32768, i32 32768, i32 -32768>
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @slm-costs_16_v4_sext_mul_fail_2(<4 x i16> %a) {
+entry:
+; SLM: cost of 11 {{.*}} mul nsw <4 x i32>
+ %sext = sext <4 x i16> %a to <4 x i32>
+ %res = mul nsw <4 x i32> %sext, <i32 32767, i32 -32768, i32 32767, i32 -32769>
+ ret <4 x i32> %res
+}
+
+define <8 x i16> @slm-costs_16_v8_mul(<8 x i16> %a, <8 x i16> %b) {
+entry:
+; SLM: cost of 2 {{.*}} mul nsw <8 x i16>
+ %res = mul nsw <8 x i16> %a, %b
+ ret <8 x i16> %res
+}
+
+define <16 x i16> @slm-costs_16_v16_mul(<16 x i16> %a, <16 x i16> %b) {
+entry:
+; SLM: cost of 4 {{.*}} mul nsw <16 x i16>
+ %res = mul nsw <16 x i16> %a, %b
+ ret <16 x i16> %res
+}
+
+; 32bit mul
+define i32 @slm-costs_32_scalar_mul(i32 %a, i32 %b) {
+entry:
+; SLM: cost of 1 {{.*}} mul nsw i32
+ %res = mul nsw i32 %a, %b
+ ret i32 %res
+}
+
+define <2 x i32> @slm-costs_32_v2_mul(<2 x i32> %a, <2 x i32> %b) {
+entry:
+; SLM: cost of 11 {{.*}} mul nsw <2 x i32>
+ %res = mul nsw <2 x i32> %a, %b
+ ret <2 x i32> %res
+}
+
+define <4 x i32> @slm-costs_32_v4_mul(<4 x i32> %a, <4 x i32> %b) {
+entry:
+; SLM: cost of 11 {{.*}} mul nsw <4 x i32>
+ %res = mul nsw <4 x i32> %a, %b
+ ret <4 x i32> %res
+}
+
+define <8 x i32> @slm-costs_32_v8_mul(<8 x i32> %a, <8 x i32> %b) {
+entry:
+; SLM: cost of 22 {{.*}} mul nsw <8 x i32>
+ %res = mul nsw <8 x i32> %a, %b
+ ret <8 x i32> %res
+}
+
+define <16 x i32> @slm-costs_32_v16_mul(<16 x i32> %a, <16 x i32> %b) {
+entry:
+; SLM: cost of 44 {{.*}} mul nsw <16 x i32>
+ %res = mul nsw <16 x i32> %a, %b
+ ret <16 x i32> %res
+}
+
+; 64bit mul
+define i64 @slm-costs_64_scalar_mul(i64 %a, i64 %b) {
+entry:
+; SLM: cost of 1 {{.*}} mul nsw i64
+ %res = mul nsw i64 %a, %b
+ ret i64 %res
+}
+
+define <2 x i64> @slm-costs_64_v2_mul(<2 x i64> %a, <2 x i64> %b) {
+entry:
+; SLM: cost of 11 {{.*}} mul nsw <2 x i64>
+ %res = mul nsw <2 x i64> %a, %b
+ ret <2 x i64> %res
+}
+
+define <4 x i64> @slm-costs_64_v4_mul(<4 x i64> %a, <4 x i64> %b) {
+entry:
+; SLM: cost of 22 {{.*}} mul nsw <4 x i64>
+ %res = mul nsw <4 x i64> %a, %b
+ ret <4 x i64> %res
+}
+
+define <8 x i64> @slm-costs_64_v8_mul(<8 x i64> %a, <8 x i64> %b) {
+entry:
+; SLM: cost of 44 {{.*}} mul nsw <8 x i64>
+ %res = mul nsw <8 x i64> %a, %b
+ ret <8 x i64> %res
+}
+
+define <16 x i64> @slm-costs_64_v16_mul(<16 x i64> %a, <16 x i64> %b) {
+entry:
+; SLM: cost of 88 {{.*}} mul nsw <16 x i64>
+ %res = mul nsw <16 x i64> %a, %b
+ ret <16 x i64> %res
+}
+
+; mulsd
+define double @slm-costs_mulsd(double %a, double %b) {
+entry:
+; SLM: cost of 2 {{.*}} fmul double
+ %res = fmul double %a, %b
+ ret double %res
+}
+
+; mulpd
+define <2 x double> @slm-costs_mulpd(<2 x double> %a, <2 x double> %b) {
+entry:
+; SLM: cost of 4 {{.*}} fmul <2 x double>
+ %res = fmul <2 x double> %a, %b
+ ret <2 x double> %res
+}
+
+; mulps
+define <4 x float> @slm-costs_mulps(<4 x float> %a, <4 x float> %b) {
+entry:
+; SLM: cost of 2 {{.*}} fmul <4 x float>
+ %res = fmul <4 x float> %a, %b
+ ret <4 x float> %res
+}
+
+; divss
+define float @slm-costs_divss(float %a, float %b) {
+entry:
+; SLM: cost of 17 {{.*}} fdiv float
+ %res = fdiv float %a, %b
+ ret float %res
+}
+
+; divps
+define <4 x float> @slm-costs_divps(<4 x float> %a, <4 x float> %b) {
+entry:
+; SLM: cost of 39 {{.*}} fdiv <4 x float>
+ %res = fdiv <4 x float> %a, %b
+ ret <4 x float> %res
+}
+
+; divsd
+define double @slm-costs_divsd(double %a, double %b) {
+entry:
+; SLM: cost of 32 {{.*}} fdiv double
+ %res = fdiv double %a, %b
+ ret double %res
+}
+
+; divpd
+define <2 x double> @slm-costs_divpd(<2 x double> %a, <2 x double> %b) {
+entry:
+; SLM: cost of 69 {{.*}} fdiv <2 x double>
+ %res = fdiv <2 x double> %a, %b
+ ret <2 x double> %res
+}
+
+; addpd
+define <2 x double> @slm-costs_addpd(<2 x double> %a, <2 x double> %b) {
+entry:
+; SLM: cost of 2 {{.*}} fadd <2 x double>
+ %res = fadd <2 x double> %a, %b
+ ret <2 x double> %res
+}
+
+; subpd
+define <2 x double> @slm-costs_subpd(<2 x double> %a, <2 x double> %b) {
+entry:
+; SLM: cost of 2 {{.*}} fsub <2 x double>
+ %res = fsub <2 x double> %a, %b
+ ret <2 x double> %res
+}
+
diff --git a/llvm/test/Transforms/LoopVectorize/X86/mul_slm_16bit.ll b/llvm/test/Transforms/LoopVectorize/X86/mul_slm_16bit.ll
new file mode 100644
index 00000000000..8cbe97dbb34
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/mul_slm_16bit.ll
@@ -0,0 +1,144 @@
+; RUN: opt < %s -S -debug -loop-vectorize -mcpu=slm 2>&1 | FileCheck %s --check-prefix=SLM
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define i8 @mul_i8(i8* %dataA, i8* %dataB, i32 %N) {
+entry:
+ %cmp12 = icmp eq i32 %N, 0
+ br i1 %cmp12, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %N to i64
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.body
+ %phitmp = trunc i32 %add4 to i8
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ %acc.0.lcssa = phi i8 [ 0, %entry ], [ %phitmp, %for.cond.cleanup.loopexit ]
+ ret i8 %acc.0.lcssa
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+ %acc.013 = phi i32 [ %add4, %for.body ], [ 0, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i8, i8* %dataA, i64 %indvars.iv
+ %0 = load i8, i8* %arrayidx, align 1
+ %conv = sext i8 %0 to i32
+ %arrayidx2 = getelementptr inbounds i8, i8* %dataB, i64 %indvars.iv
+ %1 = load i8, i8* %arrayidx2, align 1
+ %conv3 = sext i8 %1 to i32
+; sources of the mul are sext/sext from i8
+; use pmullw/sext seq.
+; SLM: cost of 3 for VF 4 {{.*}} mul nsw i32
+ %mul = mul nsw i32 %conv3, %conv
+; sources of the mul are zext/sext from i8
+; use pmulhw/pmullw/pshuf
+; SLM: cost of 5 for VF 4 {{.*}} mul nsw i32
+ %conv4 = zext i8 %1 to i32
+ %mul2 = mul nsw i32 %conv4, %conv
+ %sum0 = add i32 %mul, %mul2
+; sources of the mul are zext/zext from i8
+; use pmullw/zext
+; SLM: cost of 3 for VF 4 {{.*}} mul nsw i32
+ %conv5 = zext i8 %0 to i32
+ %mul3 = mul nsw i32 %conv5, %conv4
+ %sum1 = add i32 %sum0, %mul3
+; sources of the mul are sext/-120
+; use pmullw/sext
+; SLM: cost of 3 for VF 4 {{.*}} mul nsw i32
+ %mul4 = mul nsw i32 -120, %conv3
+ %sum2 = add i32 %sum1, %mul4
+; sources of the mul are sext/250
+; use pmulhw/pmullw/pshuf
+; SLM: cost of 5 for VF 4 {{.*}} mul nsw i32
+ %mul5 = mul nsw i32 250, %conv3
+ %sum3 = add i32 %sum2, %mul5
+; sources of the mul are zext/-120
+; use pmulhw/pmullw/pshuf
+; SLM: cost of 5 for VF 4 {{.*}} mul nsw i32
+ %mul6 = mul nsw i32 -120, %conv4
+ %sum4 = add i32 %sum3, %mul6
+; sources of the mul are zext/250
+; use pmullw/zext
+; SLM: cost of 3 for VF 4 {{.*}} mul nsw i32
+ %mul7 = mul nsw i32 250, %conv4
+ %sum5 = add i32 %sum4, %mul7
+ %add = add i32 %acc.013, 5
+ %add4 = add i32 %add, %sum5
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+define i16 @mul_i16(i16* %dataA, i16* %dataB, i32 %N) {
+entry:
+ %cmp12 = icmp eq i32 %N, 0
+ br i1 %cmp12, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %N to i64
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.body
+ %phitmp = trunc i32 %add4 to i16
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ %acc.0.lcssa = phi i16 [ 0, %entry ], [ %phitmp, %for.cond.cleanup.loopexit ]
+ ret i16 %acc.0.lcssa
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+ %acc.013 = phi i32 [ %add4, %for.body ], [ 0, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i16, i16* %dataA, i64 %indvars.iv
+ %0 = load i16, i16* %arrayidx, align 1
+ %conv = sext i16 %0 to i32
+ %arrayidx2 = getelementptr inbounds i16, i16* %dataB, i64 %indvars.iv
+ %1 = load i16, i16* %arrayidx2, align 1
+ %conv3 = sext i16 %1 to i32
+; sources of the mul are sext/sext from i16
+; use pmulhw/pmullw/pshuf seq.
+; SLM: cost of 5 for VF 4 {{.*}} mul nsw i32
+ %mul = mul nsw i32 %conv3, %conv
+; sources of the mul are zext/sext from i16
+; use pmulld
+; SLM: cost of 11 for VF 4 {{.*}} mul nsw i32
+ %conv4 = zext i16 %1 to i32
+ %mul2 = mul nsw i32 %conv4, %conv
+ %sum0 = add i32 %mul, %mul2
+; sources of the mul are zext/zext from i16
+; use pmulhw/pmullw/zext
+; SLM: cost of 5 for VF 4 {{.*}} mul nsw i32
+ %conv5 = zext i16 %0 to i32
+ %mul3 = mul nsw i32 %conv5, %conv4
+ %sum1 = add i32 %sum0, %mul3
+; sources of the mul are sext/-32000
+; use pmulhw/pmullw/sext
+; SLM: cost of 5 for VF 4 {{.*}} mul nsw i32
+ %mul4 = mul nsw i32 -32000, %conv3
+ %sum2 = add i32 %sum1, %mul4
+; sources of the mul are sext/64000
+; use pmulld
+; SLM: cost of 11 for VF 4 {{.*}} mul nsw i32
+ %mul5 = mul nsw i32 64000, %conv3
+ %sum3 = add i32 %sum2, %mul5
+; sources of the mul are zext/-32000
+; use pmulld
+; SLM: cost of 11 for VF 4 {{.*}} mul nsw i32
+ %mul6 = mul nsw i32 -32000, %conv4
+ %sum4 = add i32 %sum3, %mul6
+; sources of the mul are zext/64000
+; use pmulhw/pmullw/zext
+; SLM: cost of 5 for VF 4 {{.*}} mul nsw i32
+ %mul7 = mul nsw i32 250, %conv4
+ %sum5 = add i32 %sum4, %mul7
+ %add = add i32 %acc.013, 5
+ %add4 = add i32 %add, %sum5
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+