summaryrefslogtreecommitdiffstats
path: root/llvm/lib
diff options
context:
space:
mode:
authorJonas Paulsson <paulsson@linux.vnet.ibm.com>2017-04-12 11:49:08 +0000
committerJonas Paulsson <paulsson@linux.vnet.ibm.com>2017-04-12 11:49:08 +0000
commitfccc7d66c3baf6e80a80b40bf7af640b500ef112 (patch)
tree810267aff8342234b15a2f182b899c32f716a997 /llvm/lib
parent4ed589d8d6c00dd8e2b05ea4405ae948ac86477b (diff)
downloadbcm5719-llvm-fccc7d66c3baf6e80a80b40bf7af640b500ef112.tar.gz
bcm5719-llvm-fccc7d66c3baf6e80a80b40bf7af640b500ef112.zip
[SystemZ] TargetTransformInfo cost functions implemented.
getArithmeticInstrCost(), getShuffleCost(), getCastInstrCost(), getCmpSelInstrCost(), getVectorInstrCost(), getMemoryOpCost(), getInterleavedMemoryOpCost() implemented. Interleaved access vectorization enabled. BasicTTIImpl::getCastInstrCost() improved to check for legal extending loads, in which case the cost of the z/sext instruction becomes 0. Review: Ulrich Weigand, Renato Golin. https://reviews.llvm.org/D29631 llvm-svn: 300052
Diffstat (limited to 'llvm/lib')
-rw-r--r--llvm/lib/Analysis/CostModel.cpp14
-rw-r--r--llvm/lib/Analysis/TargetTransformInfo.cpp19
-rw-r--r--llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp10
-rw-r--r--llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h8
-rw-r--r--llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp10
-rw-r--r--llvm/lib/Target/ARM/ARMTargetTransformInfo.h8
-rw-r--r--llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp10
-rw-r--r--llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h8
-rw-r--r--llvm/lib/Target/SystemZ/SystemZISelLowering.cpp4
-rw-r--r--llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp549
-rw-r--r--llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h27
-rw-r--r--llvm/lib/Target/X86/X86TargetTransformInfo.cpp10
-rw-r--r--llvm/lib/Target/X86/X86TargetTransformInfo.h8
-rw-r--r--llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp2
-rw-r--r--llvm/lib/Transforms/Vectorize/BBVectorize.cpp17
-rw-r--r--llvm/lib/Transforms/Vectorize/LoopVectorize.cpp14
-rw-r--r--llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp16
17 files changed, 665 insertions, 69 deletions
diff --git a/llvm/lib/Analysis/CostModel.cpp b/llvm/lib/Analysis/CostModel.cpp
index 757a1e50284..32bfea58bf9 100644
--- a/llvm/lib/Analysis/CostModel.cpp
+++ b/llvm/lib/Analysis/CostModel.cpp
@@ -447,25 +447,25 @@ unsigned CostModelAnalysis::getInstructionCost(const Instruction *I) const {
case Instruction::Select: {
const SelectInst *SI = cast<SelectInst>(I);
Type *CondTy = SI->getCondition()->getType();
- return TTI->getCmpSelInstrCost(I->getOpcode(), I->getType(), CondTy);
+ return TTI->getCmpSelInstrCost(I->getOpcode(), I->getType(), CondTy, I);
}
case Instruction::ICmp:
case Instruction::FCmp: {
Type *ValTy = I->getOperand(0)->getType();
- return TTI->getCmpSelInstrCost(I->getOpcode(), ValTy);
+ return TTI->getCmpSelInstrCost(I->getOpcode(), ValTy, I->getType(), I);
}
case Instruction::Store: {
const StoreInst *SI = cast<StoreInst>(I);
Type *ValTy = SI->getValueOperand()->getType();
return TTI->getMemoryOpCost(I->getOpcode(), ValTy,
- SI->getAlignment(),
- SI->getPointerAddressSpace());
+ SI->getAlignment(),
+ SI->getPointerAddressSpace(), I);
}
case Instruction::Load: {
const LoadInst *LI = cast<LoadInst>(I);
return TTI->getMemoryOpCost(I->getOpcode(), I->getType(),
- LI->getAlignment(),
- LI->getPointerAddressSpace());
+ LI->getAlignment(),
+ LI->getPointerAddressSpace(), I);
}
case Instruction::ZExt:
case Instruction::SExt:
@@ -481,7 +481,7 @@ unsigned CostModelAnalysis::getInstructionCost(const Instruction *I) const {
case Instruction::BitCast:
case Instruction::AddrSpaceCast: {
Type *SrcTy = I->getOperand(0)->getType();
- return TTI->getCastInstrCost(I->getOpcode(), I->getType(), SrcTy);
+ return TTI->getCastInstrCost(I->getOpcode(), I->getType(), SrcTy, I);
}
case Instruction::ExtractElement: {
const ExtractElementInst * EEI = cast<ExtractElementInst>(I);
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 0771eea85e0..c8b8740bd85 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -314,8 +314,10 @@ int TargetTransformInfo::getShuffleCost(ShuffleKind Kind, Type *Ty, int Index,
}
int TargetTransformInfo::getCastInstrCost(unsigned Opcode, Type *Dst,
- Type *Src) const {
- int Cost = TTIImpl->getCastInstrCost(Opcode, Dst, Src);
+ Type *Src, const Instruction *I) const {
+ assert ((I == nullptr || I->getOpcode() == Opcode) &&
+ "Opcode should reflect passed instruction.");
+ int Cost = TTIImpl->getCastInstrCost(Opcode, Dst, Src, I);
assert(Cost >= 0 && "TTI should not produce negative costs!");
return Cost;
}
@@ -335,8 +337,10 @@ int TargetTransformInfo::getCFInstrCost(unsigned Opcode) const {
}
int TargetTransformInfo::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
- Type *CondTy) const {
- int Cost = TTIImpl->getCmpSelInstrCost(Opcode, ValTy, CondTy);
+ Type *CondTy, const Instruction *I) const {
+ assert ((I == nullptr || I->getOpcode() == Opcode) &&
+ "Opcode should reflect passed instruction.");
+ int Cost = TTIImpl->getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
assert(Cost >= 0 && "TTI should not produce negative costs!");
return Cost;
}
@@ -350,8 +354,11 @@ int TargetTransformInfo::getVectorInstrCost(unsigned Opcode, Type *Val,
int TargetTransformInfo::getMemoryOpCost(unsigned Opcode, Type *Src,
unsigned Alignment,
- unsigned AddressSpace) const {
- int Cost = TTIImpl->getMemoryOpCost(Opcode, Src, Alignment, AddressSpace);
+ unsigned AddressSpace,
+ const Instruction *I) const {
+ assert ((I == nullptr || I->getOpcode() == Opcode) &&
+ "Opcode should reflect passed instruction.");
+ int Cost = TTIImpl->getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, I);
assert(Cost >= 0 && "TTI should not produce negative costs!");
return Cost;
}
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 531cb9790d3..4d59da0c646 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -176,7 +176,8 @@ AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) {
return TTI::PSK_Software;
}
-int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
+int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+ const Instruction *I) {
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
@@ -436,7 +437,7 @@ int AArch64TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
}
int AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
- Type *CondTy) {
+ Type *CondTy, const Instruction *I) {
int ISD = TLI->InstructionOpcodeToISD(Opcode);
// We don't lower some vector selects well that are wider than the register
@@ -463,11 +464,12 @@ int AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
return Entry->Cost;
}
}
- return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy);
+ return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
}
int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
- unsigned Alignment, unsigned AddressSpace) {
+ unsigned Alignment, unsigned AddressSpace,
+ const Instruction *I) {
auto LT = TLI->getTypeLegalizationCost(DL, Ty);
if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index db017258c83..e37c003e064 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -86,7 +86,8 @@ public:
unsigned getMaxInterleaveFactor(unsigned VF);
- int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src);
+ int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+ const Instruction *I = nullptr);
int getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy,
unsigned Index);
@@ -103,10 +104,11 @@ public:
int getAddressComputationCost(Type *Ty, ScalarEvolution *SE, const SCEV *Ptr);
- int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy);
+ int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+ const Instruction *I = nullptr);
int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
- unsigned AddressSpace);
+ unsigned AddressSpace, const Instruction *I = nullptr);
int getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys);
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index f2662a68b18..8eb9dbf5f9d 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -92,7 +92,8 @@ int ARMTTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
}
-int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
+int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+ const Instruction *I) {
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
@@ -310,7 +311,8 @@ int ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
}
-int ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) {
+int ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+ const Instruction *I) {
int ISD = TLI->InstructionOpcodeToISD(Opcode);
// On NEON a a vector select gets lowered to vbsl.
@@ -335,7 +337,7 @@ int ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) {
return LT.first;
}
- return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy);
+ return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
}
int ARMTTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
@@ -504,7 +506,7 @@ int ARMTTIImpl::getArithmeticInstrCost(
}
int ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
- unsigned AddressSpace) {
+ unsigned AddressSpace, const Instruction *I) {
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
if (Src->isVectorTy() && Alignment != 16 &&
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
index 5ee1f7c4b0e..7de0543dfa5 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -94,9 +94,11 @@ public:
int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp);
- int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src);
+ int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+ const Instruction *I = nullptr);
- int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy);
+ int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+ const Instruction *I = nullptr);
int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
@@ -114,7 +116,7 @@ public:
ArrayRef<const Value *> Args = ArrayRef<const Value *>());
int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
- unsigned AddressSpace);
+ unsigned AddressSpace, const Instruction *I = nullptr);
int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
ArrayRef<unsigned> Indices, unsigned Alignment,
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index 2f1bceaa481..7ee1317bf72 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -302,14 +302,16 @@ int PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
return LT.first;
}
-int PPCTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
+int PPCTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+ const Instruction *I) {
assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");
return BaseT::getCastInstrCost(Opcode, Dst, Src);
}
-int PPCTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) {
- return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy);
+int PPCTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+ const Instruction *I) {
+ return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
}
int PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
@@ -352,7 +354,7 @@ int PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
}
int PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
- unsigned AddressSpace) {
+ unsigned AddressSpace, const Instruction *I) {
// Legalize the type.
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
index 30ee2814aba..6ce70fbd877 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -74,11 +74,13 @@ public:
TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
ArrayRef<const Value *> Args = ArrayRef<const Value *>());
int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp);
- int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src);
- int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy);
+ int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+ const Instruction *I = nullptr);
+ int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+ const Instruction *I = nullptr);
int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
- unsigned AddressSpace);
+ unsigned AddressSpace, const Instruction *I = nullptr);
int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
unsigned Factor,
ArrayRef<unsigned> Indices,
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index 140ee29e5f1..84d3c7bed50 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -347,9 +347,13 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
// There should be no need to check for float types other than v2f64
// since <2 x f32> isn't a legal type.
setOperationAction(ISD::FP_TO_SINT, MVT::v2i64, Legal);
+ setOperationAction(ISD::FP_TO_SINT, MVT::v2f64, Legal);
setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Legal);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v2f64, Legal);
setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Legal);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v2f64, Legal);
setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Legal);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v2f64, Legal);
}
// Handle floating-point types.
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index b10c0e09a0d..e74c9a80515 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -259,11 +259,8 @@ void SystemZTTIImpl::getUnrollingPreferences(Loop *L,
}
}
if (isa<StoreInst>(&I)) {
- NumStores++;
Type *MemAccessTy = I.getOperand(0)->getType();
- if((MemAccessTy->isIntegerTy() || MemAccessTy->isFloatingPointTy()) &&
- (getDataLayout().getTypeSizeInBits(MemAccessTy) == 128))
- NumStores++; // 128 bit fp/int stores get split.
+ NumStores += getMemoryOpCost(Instruction::Store, MemAccessTy, 0, 0);
}
}
@@ -313,3 +310,547 @@ unsigned SystemZTTIImpl::getRegisterBitWidth(bool Vector) {
return 0;
}
+int SystemZTTIImpl::getArithmeticInstrCost(
+ unsigned Opcode, Type *Ty,
+ TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
+ TTI::OperandValueProperties Opd1PropInfo,
+ TTI::OperandValueProperties Opd2PropInfo,
+ ArrayRef<const Value *> Args) {
+
+ // TODO: return a good value for BB-VECTORIZER that includes the
+ // immediate loads, which we do not want to count for the loop
+ // vectorizer, since they are hopefully hoisted out of the loop. This
+ // would require a new parameter 'InLoop', but not sure if constant
+ // args are common enough to motivate this.
+
+ unsigned ScalarBits = Ty->getScalarSizeInBits();
+
+ if (Ty->isVectorTy()) {
+ assert (ST->hasVector() && "getArithmeticInstrCost() called with vector type.");
+ unsigned VF = Ty->getVectorNumElements();
+ unsigned NumVectors = getNumberOfParts(Ty);
+
+ // These vector operations are custom handled, but are still supported
+ // with one instruction per vector, regardless of element size.
+ if (Opcode == Instruction::Shl || Opcode == Instruction::LShr ||
+ Opcode == Instruction::AShr) {
+ return NumVectors;
+ }
+
+ // These FP operations are supported with a single vector instruction for
+ // double (base implementation assumes float generally costs 2). For
+ // FP128, the scalar cost is 1, and there is no overhead since the values
+ // are already in scalar registers.
+ if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub ||
+ Opcode == Instruction::FMul || Opcode == Instruction::FDiv) {
+ switch (ScalarBits) {
+ case 32: {
+ // Return the cost of multiple scalar invocation plus the cost of
+ // inserting and extracting the values.
+ unsigned ScalarCost = getArithmeticInstrCost(Opcode, Ty->getScalarType());
+ unsigned Cost = (VF * ScalarCost) + getScalarizationOverhead(Ty, Args);
+ // FIXME: VF 2 for these FP operations are currently just as
+ // expensive as for VF 4.
+ if (VF == 2)
+ Cost *= 2;
+ return Cost;
+ }
+ case 64:
+ case 128:
+ return NumVectors;
+ default:
+ break;
+ }
+ }
+
+ // There is no native support for FRem.
+ if (Opcode == Instruction::FRem) {
+ unsigned Cost = (VF * LIBCALL_COST) + getScalarizationOverhead(Ty, Args);
+ // FIXME: VF 2 for float is currently just as expensive as for VF 4.
+ if (VF == 2 && ScalarBits == 32)
+ Cost *= 2;
+ return Cost;
+ }
+ }
+ else { // Scalar:
+ // These FP operations are supported with a dedicated instruction for
+ // float, double and fp128 (base implementation assumes float generally
+ // costs 2).
+ if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub ||
+ Opcode == Instruction::FMul || Opcode == Instruction::FDiv)
+ return 1;
+
+ // There is no native support for FRem.
+ if (Opcode == Instruction::FRem)
+ return LIBCALL_COST;
+
+ if (Opcode == Instruction::LShr || Opcode == Instruction::AShr)
+ return (ScalarBits >= 32 ? 1 : 2 /*ext*/);
+
+ // Or requires one instruction, although it has custom handling for i64.
+ if (Opcode == Instruction::Or)
+ return 1;
+
+ if (Opcode == Instruction::Xor && ScalarBits == 1)
+ // 2 * ipm sequences ; xor ; shift ; compare
+ return 7;
+
+ // An extra extension for narrow types is needed.
+ if ((Opcode == Instruction::SDiv || Opcode == Instruction::SRem))
+ // sext of op(s) for narrow types
+ return (ScalarBits < 32 ? 4 : (ScalarBits == 32 ? 2 : 1));
+
+ if (Opcode == Instruction::UDiv || Opcode == Instruction::URem)
+ // Clearing of low 64 bit reg + sext of op(s) for narrow types + dl[g]r
+ return (ScalarBits < 32 ? 4 : 2);
+ }
+
+ // Fallback to the default implementation.
+ return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info,
+ Opd1PropInfo, Opd2PropInfo, Args);
+}
+
+
+int SystemZTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
+ Type *SubTp) {
+ assert (Tp->isVectorTy());
+ assert (ST->hasVector() && "getShuffleCost() called.");
+ unsigned NumVectors = getNumberOfParts(Tp);
+
+ // TODO: Since fp32 is expanded, the shuffle cost should always be 0.
+
+ // FP128 values are always in scalar registers, so there is no work
+ // involved with a shuffle, except for broadcast. In that case register
+ // moves are done with a single instruction per element.
+ if (Tp->getScalarType()->isFP128Ty())
+ return (Kind == TargetTransformInfo::SK_Broadcast ? NumVectors - 1 : 0);
+
+ switch (Kind) {
+ case TargetTransformInfo::SK_ExtractSubvector:
+ // ExtractSubvector Index indicates start offset.
+
+ // Extracting a subvector from first index is a noop.
+ return (Index == 0 ? 0 : NumVectors);
+
+ case TargetTransformInfo::SK_Broadcast:
+ // Loop vectorizer calls here to figure out the extra cost of
+ // broadcasting a loaded value to all elements of a vector. Since vlrep
+ // loads and replicates with a single instruction, adjust the returned
+ // value.
+ return NumVectors - 1;
+
+ default:
+
+ // SystemZ supports single instruction permutation / replication.
+ return NumVectors;
+ }
+
+ return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
+}
+
+// Return the log2 difference of the element sizes of the two vector types.
+static unsigned getElSizeLog2Diff(Type *Ty0, Type *Ty1) {
+ unsigned Bits0 = Ty0->getScalarSizeInBits();
+ unsigned Bits1 = Ty1->getScalarSizeInBits();
+
+ if (Bits1 > Bits0)
+ return (Log2_32(Bits1) - Log2_32(Bits0));
+
+ return (Log2_32(Bits0) - Log2_32(Bits1));
+}
+
+// Return the number of instructions needed to truncate SrcTy to DstTy.
+unsigned SystemZTTIImpl::
+getVectorTruncCost(Type *SrcTy, Type *DstTy) {
+ assert (SrcTy->isVectorTy() && DstTy->isVectorTy());
+ assert (SrcTy->getPrimitiveSizeInBits() > DstTy->getPrimitiveSizeInBits() &&
+ "Packing must reduce size of vector type.");
+ assert (SrcTy->getVectorNumElements() == DstTy->getVectorNumElements() &&
+ "Packing should not change number of elements.");
+
+ // TODO: Since fp32 is expanded, the extract cost should always be 0.
+
+ unsigned NumParts = getNumberOfParts(SrcTy);
+ if (NumParts <= 2)
+ // Up to 2 vector registers can be truncated efficiently with pack or
+ // permute. The latter requires an immediate mask to be loaded, which
+ // typically gets hoisted out of a loop. TODO: return a good value for
+ // BB-VECTORIZER that includes the immediate loads, which we do not want
+ // to count for the loop vectorizer.
+ return 1;
+
+ unsigned Cost = 0;
+ unsigned Log2Diff = getElSizeLog2Diff(SrcTy, DstTy);
+ unsigned VF = SrcTy->getVectorNumElements();
+ for (unsigned P = 0; P < Log2Diff; ++P) {
+ if (NumParts > 1)
+ NumParts /= 2;
+ Cost += NumParts;
+ }
+
+ // Currently, a general mix of permutes and pack instructions is output by
+ // isel, which follow the cost computation above except for this case which
+ // is one instruction less:
+ if (VF == 8 && SrcTy->getScalarSizeInBits() == 64 &&
+ DstTy->getScalarSizeInBits() == 8)
+ Cost--;
+
+ return Cost;
+}
+
+// Return the cost of converting a vector bitmask produced by a compare
+// (SrcTy), to the type of the select or extend instruction (DstTy).
+unsigned SystemZTTIImpl::
+getVectorBitmaskConversionCost(Type *SrcTy, Type *DstTy) {
+ assert (SrcTy->isVectorTy() && DstTy->isVectorTy() &&
+ "Should only be called with vector types.");
+
+ unsigned PackCost = 0;
+ unsigned SrcScalarBits = SrcTy->getScalarSizeInBits();
+ unsigned DstScalarBits = DstTy->getScalarSizeInBits();
+ unsigned Log2Diff = getElSizeLog2Diff(SrcTy, DstTy);
+ if (SrcScalarBits > DstScalarBits)
+ // The bitmask will be truncated.
+ PackCost = getVectorTruncCost(SrcTy, DstTy);
+ else if (SrcScalarBits < DstScalarBits) {
+ unsigned DstNumParts = getNumberOfParts(DstTy);
+ // Each vector select needs its part of the bitmask unpacked.
+ PackCost = Log2Diff * DstNumParts;
+ // Extra cost for moving part of mask before unpacking.
+ PackCost += DstNumParts - 1;
+ }
+
+ return PackCost;
+}
+
+// Return the type of the compared operands. This is needed to compute the
+// cost for a Select / ZExt or SExt instruction.
+static Type *getCmpOpsType(const Instruction *I, unsigned VF = 1) {
+ Type *OpTy = nullptr;
+ if (CmpInst *CI = dyn_cast<CmpInst>(I->getOperand(0)))
+ OpTy = CI->getOperand(0)->getType();
+ else if (Instruction *LogicI = dyn_cast<Instruction>(I->getOperand(0)))
+ if (CmpInst *CI0 = dyn_cast<CmpInst>(LogicI->getOperand(0)))
+ if (isa<CmpInst>(LogicI->getOperand(1)))
+ OpTy = CI0->getOperand(0)->getType();
+
+ if (OpTy != nullptr) {
+ if (VF == 1) {
+ assert (!OpTy->isVectorTy() && "Expected scalar type");
+ return OpTy;
+ }
+ // Return the potentially vectorized type based on 'I' and 'VF'. 'I' may
+ // be either scalar or already vectorized with a same or lesser VF.
+ Type *ElTy = OpTy->getScalarType();
+ return VectorType::get(ElTy, VF);
+ }
+
+ return nullptr;
+}
+
+int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+ const Instruction *I) {
+ unsigned DstScalarBits = Dst->getScalarSizeInBits();
+ unsigned SrcScalarBits = Src->getScalarSizeInBits();
+
+ if (Src->isVectorTy()) {
+ assert (ST->hasVector() && "getCastInstrCost() called with vector type.");
+ assert (Dst->isVectorTy());
+ unsigned VF = Src->getVectorNumElements();
+ unsigned NumDstVectors = getNumberOfParts(Dst);
+ unsigned NumSrcVectors = getNumberOfParts(Src);
+
+ if (Opcode == Instruction::Trunc) {
+ if (Src->getScalarSizeInBits() == Dst->getScalarSizeInBits())
+ return 0; // Check for NOOP conversions.
+ return getVectorTruncCost(Src, Dst);
+ }
+
+ if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
+ if (SrcScalarBits >= 8) {
+ // ZExt/SExt will be handled with one unpack per doubling of width.
+ unsigned NumUnpacks = getElSizeLog2Diff(Src, Dst);
+
+ // For types that spans multiple vector registers, some additional
+ // instructions are used to setup the unpacking.
+ unsigned NumSrcVectorOps =
+ (NumUnpacks > 1 ? (NumDstVectors - NumSrcVectors)
+ : (NumDstVectors / 2));
+
+ return (NumUnpacks * NumDstVectors) + NumSrcVectorOps;
+ }
+ else if (SrcScalarBits == 1) {
+ // This should be extension of a compare i1 result.
+ // If we know what the widths of the compared operands, get the
+ // cost of converting it to Dst. Otherwise assume same widths.
+ unsigned Cost = 0;
+ Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I, VF) : nullptr);
+ if (CmpOpTy != nullptr)
+ Cost = getVectorBitmaskConversionCost(CmpOpTy, Dst);
+ if (Opcode == Instruction::ZExt)
+ // One 'vn' per dst vector with an immediate mask.
+ Cost += NumDstVectors;
+ return Cost;
+ }
+ }
+
+ if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP ||
+ Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI) {
+ // TODO: Fix base implementation which could simplify things a bit here
+ // (seems to miss on differentiating on scalar/vector types).
+
+ // Only 64 bit vector conversions are natively supported.
+ if (SrcScalarBits == 64 && DstScalarBits == 64)
+ return NumDstVectors;
+
+ // Return the cost of multiple scalar invocation plus the cost of
+ // inserting and extracting the values. Base implementation does not
+ // realize float->int gets scalarized.
+ unsigned ScalarCost = getCastInstrCost(Opcode, Dst->getScalarType(),
+ Src->getScalarType());
+ unsigned TotCost = VF * ScalarCost;
+ bool NeedsInserts = true, NeedsExtracts = true;
+ // FP128 registers do not get inserted or extracted.
+ if (DstScalarBits == 128 &&
+ (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP))
+ NeedsInserts = false;
+ if (SrcScalarBits == 128 &&
+ (Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI))
+ NeedsExtracts = false;
+
+ TotCost += getScalarizationOverhead(Dst, NeedsInserts, NeedsExtracts);
+
+ // FIXME: VF 2 for float<->i32 is currently just as expensive as for VF 4.
+ if (VF == 2 && SrcScalarBits == 32 && DstScalarBits == 32)
+ TotCost *= 2;
+
+ return TotCost;
+ }
+
+ if (Opcode == Instruction::FPTrunc) {
+ if (SrcScalarBits == 128) // fp128 -> double/float + inserts of elements.
+ return VF /*ldxbr/lexbr*/ + getScalarizationOverhead(Dst, true, false);
+ else // double -> float
+ return VF / 2 /*vledb*/ + std::max(1U, VF / 4 /*vperm*/);
+ }
+
+ if (Opcode == Instruction::FPExt) {
+ if (SrcScalarBits == 32 && DstScalarBits == 64) {
+ // float -> double is very rare and currently unoptimized. Instead of
+ // using vldeb, which can do two at a time, all conversions are
+ // scalarized.
+ return VF * 2;
+ }
+ // -> fp128. VF * lxdb/lxeb + extraction of elements.
+ return VF + getScalarizationOverhead(Src, false, true);
+ }
+ }
+ else { // Scalar
+ assert (!Dst->isVectorTy());
+
+ if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP)
+ return (SrcScalarBits >= 32 ? 1 : 2 /*i8/i16 extend*/);
+
+ if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt) &&
+ Src->isIntegerTy(1)) {
+ // This should be extension of a compare i1 result, which is done with
+ // ipm and a varying sequence of instructions.
+ unsigned Cost = 0;
+ if (Opcode == Instruction::SExt)
+ Cost = (DstScalarBits < 64 ? 3 : 4);
+ if (Opcode == Instruction::ZExt)
+ Cost = 3;
+ Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I) : nullptr);
+ if (CmpOpTy != nullptr && CmpOpTy->isFloatingPointTy())
+ // If operands of an fp-type was compared, this costs +1.
+ Cost++;
+
+ return Cost;
+ }
+ }
+
+ return BaseT::getCastInstrCost(Opcode, Dst, Src, I);
+}
+
+int SystemZTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+ const Instruction *I) {
+ if (ValTy->isVectorTy()) {
+ assert (ST->hasVector() && "getCmpSelInstrCost() called with vector type.");
+ assert (CondTy == nullptr || CondTy->isVectorTy());
+ unsigned VF = ValTy->getVectorNumElements();
+
+ // Called with a compare instruction.
+ if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) {
+ unsigned PredicateExtraCost = 0;
+ if (I != nullptr) {
+ // Some predicates cost one or two extra instructions.
+ switch (dyn_cast<CmpInst>(I)->getPredicate()) {
+ case CmpInst::Predicate::ICMP_NE:
+ case CmpInst::Predicate::ICMP_UGE:
+ case CmpInst::Predicate::ICMP_ULE:
+ case CmpInst::Predicate::ICMP_SGE:
+ case CmpInst::Predicate::ICMP_SLE:
+ PredicateExtraCost = 1;
+ break;
+ case CmpInst::Predicate::FCMP_ONE:
+ case CmpInst::Predicate::FCMP_ORD:
+ case CmpInst::Predicate::FCMP_UEQ:
+ case CmpInst::Predicate::FCMP_UNO:
+ PredicateExtraCost = 2;
+ break;
+ default:
+ break;
+ }
+ }
+
+ // Float is handled with 2*vmr[lh]f + 2*vldeb + vfchdb for each pair of
+ // floats. FIXME: <2 x float> generates same code as <4 x float>.
+ unsigned CmpCostPerVector = (ValTy->getScalarType()->isFloatTy() ? 10 : 1);
+ unsigned NumVecs_cmp = getNumberOfParts(ValTy);
+
+ unsigned Cost = (NumVecs_cmp * (CmpCostPerVector + PredicateExtraCost));
+ return Cost;
+ }
+ else { // Called with a select instruction.
+ assert (Opcode == Instruction::Select);
+
+ // We can figure out the extra cost of packing / unpacking if the
+ // instruction was passed and the compare instruction is found.
+ unsigned PackCost = 0;
+ Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I, VF) : nullptr);
+ if (CmpOpTy != nullptr)
+ PackCost =
+ getVectorBitmaskConversionCost(CmpOpTy, ValTy);
+
+ return getNumberOfParts(ValTy) /*vsel*/ + PackCost;
+ }
+ }
+ else { // Scalar
+ switch (Opcode) {
+ case Instruction::ICmp: {
+ unsigned Cost = 1;
+ if (ValTy->isIntegerTy() && ValTy->getScalarSizeInBits() <= 16)
+ Cost += 2; // extend both operands
+ return Cost;
+ }
+ case Instruction::Select:
+ if (ValTy->isFloatingPointTy())
+ return 4; // No load on condition for FP, so this costs a conditional jump.
+ return 1; // Load On Condition.
+ }
+ }
+
+ return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, nullptr);
+}
+
+int SystemZTTIImpl::
+getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
+ // vlvgp will insert two grs into a vector register, so only count half the
+ // number of instructions.
+ if (Opcode == Instruction::InsertElement &&
+ Val->getScalarType()->isIntegerTy(64))
+ return ((Index % 2 == 0) ? 1 : 0);
+
+ if (Opcode == Instruction::ExtractElement) {
+ int Cost = ((Val->getScalarSizeInBits() == 1) ? 2 /*+test-under-mask*/ : 1);
+
+ // Give a slight penalty for moving out of vector pipeline to FXU unit.
+ if (Index == 0 && Val->getScalarType()->isIntegerTy())
+ Cost += 1;
+
+ return Cost;
+ }
+
+ return BaseT::getVectorInstrCost(Opcode, Val, Index);
+}
+
+int SystemZTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
+ unsigned Alignment, unsigned AddressSpace,
+ const Instruction *I) {
+ assert(!Src->isVoidTy() && "Invalid type");
+
+ if (!Src->isVectorTy() && Opcode == Instruction::Load &&
+ I != nullptr && I->hasOneUse()) {
+ const Instruction *UserI = cast<Instruction>(*I->user_begin());
+ unsigned Bits = Src->getScalarSizeInBits();
+ bool FoldsLoad = false;
+ switch (UserI->getOpcode()) {
+ case Instruction::ICmp:
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Mul:
+ case Instruction::SDiv:
+ case Instruction::UDiv:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ // This also makes sense for float operations, but disabled for now due
+ // to regressions.
+ // case Instruction::FCmp:
+ // case Instruction::FAdd:
+ // case Instruction::FSub:
+ // case Instruction::FMul:
+ // case Instruction::FDiv:
+ FoldsLoad = (Bits == 32 || Bits == 64);
+ break;
+ }
+
+ if (FoldsLoad) {
+ assert (UserI->getNumOperands() == 2 &&
+ "Expected to only handle binops.");
+
+ // UserI can't fold two loads, so in that case return 0 cost only
+ // half of the time.
+ for (unsigned i = 0; i < 2; ++i) {
+ if (UserI->getOperand(i) == I)
+ continue;
+ if (LoadInst *LI = dyn_cast<LoadInst>(UserI->getOperand(i))) {
+ if (LI->hasOneUse())
+ return i == 0;
+ }
+ }
+
+ return 0;
+ }
+ }
+
+ unsigned NumOps = getNumberOfParts(Src);
+
+ if (Src->getScalarSizeInBits() == 128)
+ // 128 bit scalars are held in a pair of two 64 bit registers.
+ NumOps *= 2;
+
+ return NumOps;
+}
+
+int SystemZTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
+ unsigned Factor,
+ ArrayRef<unsigned> Indices,
+ unsigned Alignment,
+ unsigned AddressSpace) {
+ assert(isa<VectorType>(VecTy) &&
+ "Expect a vector type for interleaved memory op");
+
+ unsigned WideBits = (VecTy->isPtrOrPtrVectorTy() ?
+ (64U * VecTy->getVectorNumElements()) : VecTy->getPrimitiveSizeInBits());
+ assert (WideBits > 0 && "Could not compute size of vector");
+ int NumWideParts =
+ ((WideBits % 128U) ? ((WideBits / 128U) + 1) : (WideBits / 128U));
+
+ // How many source vectors are handled to produce a vectorized operand?
+ int NumElsPerVector = (VecTy->getVectorNumElements() / NumWideParts);
+ int NumSrcParts =
+ ((NumWideParts > NumElsPerVector) ? NumElsPerVector : NumWideParts);
+
+ // A Load group may have gaps.
+ unsigned NumOperands =
+ ((Opcode == Instruction::Load) ? Indices.size() : Factor);
+
+ // Each needed permute takes two vectors as input.
+ if (NumSrcParts > 1)
+ NumSrcParts--;
+ int NumPermutes = NumSrcParts * NumOperands;
+
+ // Cost of load/store operations and the permutations needed.
+ return NumWideParts + NumPermutes;
+}
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
index f7d2d827f11..d2639cb271d 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -27,6 +27,8 @@ class SystemZTTIImpl : public BasicTTIImplBase<SystemZTTIImpl> {
const SystemZSubtarget *getST() const { return ST; }
const SystemZTargetLowering *getTLI() const { return TLI; }
+ unsigned const LIBCALL_COST = 30;
+
public:
explicit SystemZTTIImpl(const SystemZTargetMachine *TM, const Function &F)
: BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
@@ -53,6 +55,31 @@ public:
unsigned getNumberOfRegisters(bool Vector);
unsigned getRegisterBitWidth(bool Vector);
+ bool enableInterleavedAccessVectorization() { return true; }
+
+ int getArithmeticInstrCost(
+ unsigned Opcode, Type *Ty,
+ TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
+ TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
+ TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
+ TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
+ ArrayRef<const Value *> Args = ArrayRef<const Value *>());
+ int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp);
+ unsigned getVectorTruncCost(Type *SrcTy, Type *DstTy);
+ unsigned getVectorBitmaskConversionCost(Type *SrcTy, Type *DstTy);
+ int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+ const Instruction *I = nullptr);
+ int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+ const Instruction *I = nullptr);
+ int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
+ int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+ unsigned AddressSpace, const Instruction *I = nullptr);
+
+ int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
+ unsigned Factor,
+ ArrayRef<unsigned> Indices,
+ unsigned Alignment,
+ unsigned AddressSpace);
/// @}
};
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index ea8aa5cb61e..b742fb47237 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -938,7 +938,8 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
}
-int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
+int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+ const Instruction *I) {
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
@@ -1304,7 +1305,8 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
return BaseT::getCastInstrCost(Opcode, Dst, Src);
}
-int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) {
+int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+ const Instruction *I) {
// Legalize the type.
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
@@ -1370,7 +1372,7 @@ int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) {
if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
return LT.first * Entry->Cost;
- return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy);
+ return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
}
int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
@@ -1615,7 +1617,7 @@ int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
}
int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
- unsigned AddressSpace) {
+ unsigned AddressSpace, const Instruction *I) {
// Handle non-power-of-two vectors such as <3 x float>
if (VectorType *VTy = dyn_cast<VectorType>(Src)) {
unsigned NumElem = VTy->getVectorNumElements();
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index 2aa94fdc3c2..9bef9e80c39 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -62,11 +62,13 @@ public:
TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
ArrayRef<const Value *> Args = ArrayRef<const Value *>());
int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp);
- int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src);
- int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy);
+ int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+ const Instruction *I = nullptr);
+ int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+ const Instruction *I = nullptr);
int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
- unsigned AddressSpace);
+ unsigned AddressSpace, const Instruction *I = nullptr);
int getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
unsigned AddressSpace);
int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr,
diff --git a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
index 7223370a717..f344eb15146 100644
--- a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
+++ b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
@@ -1882,7 +1882,7 @@ chainToBasePointerCost(SmallVectorImpl<Instruction*> &Chain,
"non noop cast is found during rematerialization");
Type *SrcTy = CI->getOperand(0)->getType();
- Cost += TTI.getCastInstrCost(CI->getOpcode(), CI->getType(), SrcTy);
+ Cost += TTI.getCastInstrCost(CI->getOpcode(), CI->getType(), SrcTy, CI);
} else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Instr)) {
// Cost of the address calculation
diff --git a/llvm/lib/Transforms/Vectorize/BBVectorize.cpp b/llvm/lib/Transforms/Vectorize/BBVectorize.cpp
index 9e97d9a65c8..c83b3f7b225 100644
--- a/llvm/lib/Transforms/Vectorize/BBVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/BBVectorize.cpp
@@ -550,7 +550,8 @@ namespace {
TargetTransformInfo::OperandValueKind Op1VK =
TargetTransformInfo::OK_AnyValue,
TargetTransformInfo::OperandValueKind Op2VK =
- TargetTransformInfo::OK_AnyValue) {
+ TargetTransformInfo::OK_AnyValue,
+ const Instruction *I = nullptr) {
switch (Opcode) {
default: break;
case Instruction::GetElementPtr:
@@ -584,7 +585,7 @@ namespace {
case Instruction::Select:
case Instruction::ICmp:
case Instruction::FCmp:
- return TTI->getCmpSelInstrCost(Opcode, T1, T2);
+ return TTI->getCmpSelInstrCost(Opcode, T1, T2, I);
case Instruction::ZExt:
case Instruction::SExt:
case Instruction::FPToUI:
@@ -598,7 +599,7 @@ namespace {
case Instruction::FPTrunc:
case Instruction::BitCast:
case Instruction::ShuffleVector:
- return TTI->getCastInstrCost(Opcode, T1, T2);
+ return TTI->getCastInstrCost(Opcode, T1, T2, I);
}
return 1;
@@ -1044,14 +1045,14 @@ namespace {
return false;
}
} else if (TTI) {
- unsigned ICost = getInstrCost(I->getOpcode(), IT1, IT2);
- unsigned JCost = getInstrCost(J->getOpcode(), JT1, JT2);
- Type *VT1 = getVecTypeForPair(IT1, JT1),
- *VT2 = getVecTypeForPair(IT2, JT2);
TargetTransformInfo::OperandValueKind Op1VK =
TargetTransformInfo::OK_AnyValue;
TargetTransformInfo::OperandValueKind Op2VK =
TargetTransformInfo::OK_AnyValue;
+ unsigned ICost = getInstrCost(I->getOpcode(), IT1, IT2, Op1VK, Op2VK, I);
+ unsigned JCost = getInstrCost(J->getOpcode(), JT1, JT2, Op1VK, Op2VK, J);
+ Type *VT1 = getVecTypeForPair(IT1, JT1),
+ *VT2 = getVecTypeForPair(IT2, JT2);
// On some targets (example X86) the cost of a vector shift may vary
// depending on whether the second operand is a Uniform or
@@ -1090,7 +1091,7 @@ namespace {
// but this cost is ignored (because insert and extract element
// instructions are assigned a zero depth factor and are not really
// fused in general).
- unsigned VCost = getInstrCost(I->getOpcode(), VT1, VT2, Op1VK, Op2VK);
+ unsigned VCost = getInstrCost(I->getOpcode(), VT1, VT2, Op1VK, Op2VK, I);
if (VCost > ICost + JCost)
return false;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index fff1e29ecbb..f891cd9d62c 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7048,7 +7048,7 @@ unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
Cost += VF *
TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
- AS);
+ AS, I);
// Get the overhead of the extractelement and insertelement instructions
// we might create due to scalarization.
@@ -7078,7 +7078,7 @@ unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
if (Legal->isMaskRequired(I))
Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
else
- Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
+ Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I);
bool Reverse = ConsecutiveStride < 0;
if (Reverse)
@@ -7154,7 +7154,7 @@ unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
unsigned AS = getMemInstAlignment(I);
return TTI.getAddressComputationCost(ValTy) +
- TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS);
+ TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I);
}
return getWideningCost(I, VF);
}
@@ -7369,7 +7369,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
if (!ScalarCond)
CondTy = VectorType::get(CondTy, VF);
- return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy);
+ return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, I);
}
case Instruction::ICmp:
case Instruction::FCmp: {
@@ -7378,7 +7378,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
VectorTy = ToVectorTy(ValTy, VF);
- return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy);
+ return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, I);
}
case Instruction::Store:
case Instruction::Load: {
@@ -7403,7 +7403,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
if (isOptimizableIVTruncate(I, VF)) {
auto *Trunc = cast<TruncInst>(I);
return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
- Trunc->getSrcTy());
+ Trunc->getSrcTy(), Trunc);
}
Type *SrcScalarTy = I->getOperand(0)->getType();
@@ -7427,7 +7427,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
}
}
- return TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy);
+ return TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I);
}
case Instruction::Call: {
bool NeedToScalarize;
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index b9df89e3eec..df7dc2ac67f 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1762,10 +1762,10 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
// Calculate the cost of this instruction.
int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(),
- VL0->getType(), SrcTy);
+ VL0->getType(), SrcTy, VL0);
VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
- int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy);
+ int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy, VL0);
return VecCost - ScalarCost;
}
case Instruction::FCmp:
@@ -1774,8 +1774,8 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
// Calculate the cost of this instruction.
VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
int ScalarCost = VecTy->getNumElements() *
- TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty());
- int VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy);
+ TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty(), VL0);
+ int VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy, VL0);
return VecCost - ScalarCost;
}
case Instruction::Add:
@@ -1858,18 +1858,18 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
// Cost of wide load - cost of scalar loads.
unsigned alignment = dyn_cast<LoadInst>(VL0)->getAlignment();
int ScalarLdCost = VecTy->getNumElements() *
- TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0);
+ TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0, VL0);
int VecLdCost = TTI->getMemoryOpCost(Instruction::Load,
- VecTy, alignment, 0);
+ VecTy, alignment, 0, VL0);
return VecLdCost - ScalarLdCost;
}
case Instruction::Store: {
// We know that we can merge the stores. Calculate the cost.
unsigned alignment = dyn_cast<StoreInst>(VL0)->getAlignment();
int ScalarStCost = VecTy->getNumElements() *
- TTI->getMemoryOpCost(Instruction::Store, ScalarTy, alignment, 0);
+ TTI->getMemoryOpCost(Instruction::Store, ScalarTy, alignment, 0, VL0);
int VecStCost = TTI->getMemoryOpCost(Instruction::Store,
- VecTy, alignment, 0);
+ VecTy, alignment, 0, VL0);
return VecStCost - ScalarStCost;
}
case Instruction::Call: {
OpenPOWER on IntegriCloud