author    | Jonas Paulsson <paulsson@linux.vnet.ibm.com> | 2017-04-12 11:49:08 +0000
committer | Jonas Paulsson <paulsson@linux.vnet.ibm.com> | 2017-04-12 11:49:08 +0000
commit    | fccc7d66c3baf6e80a80b40bf7af640b500ef112 (patch)
tree      | 810267aff8342234b15a2f182b899c32f716a997 /llvm/lib
parent    | 4ed589d8d6c00dd8e2b05ea4405ae948ac86477b (diff)
[SystemZ] TargetTransformInfo cost functions implemented.
getArithmeticInstrCost(), getShuffleCost(), getCastInstrCost(),
getCmpSelInstrCost(), getVectorInstrCost(), getMemoryOpCost(), and
getInterleavedMemoryOpCost() are implemented.
Interleaved access vectorization is enabled.
BasicTTIImpl::getCastInstrCost() is improved to check for legal extending
loads, in which case the cost of the zext/sext instruction becomes 0.
Review: Ulrich Weigand, Renato Golin.
https://reviews.llvm.org/D29631
llvm-svn: 300052
Diffstat (limited to 'llvm/lib')
17 files changed, 665 insertions, 69 deletions
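The diff below threads a new `const Instruction *I` parameter (defaulting to nullptr) through the TTI cost hooks, so a target can inspect the IR context of the instruction being priced. The following is a minimal sketch of what a caller looks like after this change — it is not code from the patch, and the helper name estimateSelectCost is hypothetical; only the getCmpSelInstrCost signature is taken from the diff.

```cpp
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

// Hypothetical helper: estimate the cost of a select by handing the select
// instruction itself to TTI as context.
static int estimateSelectCost(const TargetTransformInfo &TTI,
                              const SelectInst *SI) {
  Type *CondTy = SI->getCondition()->getType();
  // With SI available, a target such as SystemZ can look through to the
  // compare that feeds the condition and price any bitmask packing/unpacking
  // it implies; passing nullptr keeps the old context-free behavior.
  return TTI.getCmpSelInstrCost(SI->getOpcode(), SI->getType(), CondTy, SI);
}
```

The asserts added in TargetTransformInfo.cpp below check that, when an instruction is passed, its opcode matches the Opcode argument.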
diff --git a/llvm/lib/Analysis/CostModel.cpp b/llvm/lib/Analysis/CostModel.cpp index 757a1e50284..32bfea58bf9 100644 --- a/llvm/lib/Analysis/CostModel.cpp +++ b/llvm/lib/Analysis/CostModel.cpp @@ -447,25 +447,25 @@ unsigned CostModelAnalysis::getInstructionCost(const Instruction *I) const { case Instruction::Select: { const SelectInst *SI = cast<SelectInst>(I); Type *CondTy = SI->getCondition()->getType(); - return TTI->getCmpSelInstrCost(I->getOpcode(), I->getType(), CondTy); + return TTI->getCmpSelInstrCost(I->getOpcode(), I->getType(), CondTy, I); } case Instruction::ICmp: case Instruction::FCmp: { Type *ValTy = I->getOperand(0)->getType(); - return TTI->getCmpSelInstrCost(I->getOpcode(), ValTy); + return TTI->getCmpSelInstrCost(I->getOpcode(), ValTy, I->getType(), I); } case Instruction::Store: { const StoreInst *SI = cast<StoreInst>(I); Type *ValTy = SI->getValueOperand()->getType(); return TTI->getMemoryOpCost(I->getOpcode(), ValTy, - SI->getAlignment(), - SI->getPointerAddressSpace()); + SI->getAlignment(), + SI->getPointerAddressSpace(), I); } case Instruction::Load: { const LoadInst *LI = cast<LoadInst>(I); return TTI->getMemoryOpCost(I->getOpcode(), I->getType(), - LI->getAlignment(), - LI->getPointerAddressSpace()); + LI->getAlignment(), + LI->getPointerAddressSpace(), I); } case Instruction::ZExt: case Instruction::SExt: @@ -481,7 +481,7 @@ unsigned CostModelAnalysis::getInstructionCost(const Instruction *I) const { case Instruction::BitCast: case Instruction::AddrSpaceCast: { Type *SrcTy = I->getOperand(0)->getType(); - return TTI->getCastInstrCost(I->getOpcode(), I->getType(), SrcTy); + return TTI->getCastInstrCost(I->getOpcode(), I->getType(), SrcTy, I); } case Instruction::ExtractElement: { const ExtractElementInst * EEI = cast<ExtractElementInst>(I); diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index 0771eea85e0..c8b8740bd85 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -314,8 +314,10 @@ int TargetTransformInfo::getShuffleCost(ShuffleKind Kind, Type *Ty, int Index, } int TargetTransformInfo::getCastInstrCost(unsigned Opcode, Type *Dst, - Type *Src) const { - int Cost = TTIImpl->getCastInstrCost(Opcode, Dst, Src); + Type *Src, const Instruction *I) const { + assert ((I == nullptr || I->getOpcode() == Opcode) && + "Opcode should reflect passed instruction."); + int Cost = TTIImpl->getCastInstrCost(Opcode, Dst, Src, I); assert(Cost >= 0 && "TTI should not produce negative costs!"); return Cost; } @@ -335,8 +337,10 @@ int TargetTransformInfo::getCFInstrCost(unsigned Opcode) const { } int TargetTransformInfo::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, - Type *CondTy) const { - int Cost = TTIImpl->getCmpSelInstrCost(Opcode, ValTy, CondTy); + Type *CondTy, const Instruction *I) const { + assert ((I == nullptr || I->getOpcode() == Opcode) && + "Opcode should reflect passed instruction."); + int Cost = TTIImpl->getCmpSelInstrCost(Opcode, ValTy, CondTy, I); assert(Cost >= 0 && "TTI should not produce negative costs!"); return Cost; } @@ -350,8 +354,11 @@ int TargetTransformInfo::getVectorInstrCost(unsigned Opcode, Type *Val, int TargetTransformInfo::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, - unsigned AddressSpace) const { - int Cost = TTIImpl->getMemoryOpCost(Opcode, Src, Alignment, AddressSpace); + unsigned AddressSpace, + const Instruction *I) const { + assert ((I == nullptr || I->getOpcode() == Opcode) && + "Opcode should reflect passed 
instruction."); + int Cost = TTIImpl->getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, I); assert(Cost >= 0 && "TTI should not produce negative costs!"); return Cost; } diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 531cb9790d3..4d59da0c646 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -176,7 +176,8 @@ AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) { return TTI::PSK_Software; } -int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { +int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, + const Instruction *I) { int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); @@ -436,7 +437,7 @@ int AArch64TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE, } int AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, - Type *CondTy) { + Type *CondTy, const Instruction *I) { int ISD = TLI->InstructionOpcodeToISD(Opcode); // We don't lower some vector selects well that are wider than the register @@ -463,11 +464,12 @@ int AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, return Entry->Cost; } } - return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy); + return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I); } int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty, - unsigned Alignment, unsigned AddressSpace) { + unsigned Alignment, unsigned AddressSpace, + const Instruction *I) { auto LT = TLI->getTypeLegalizationCost(DL, Ty); if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store && diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index db017258c83..e37c003e064 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -86,7 +86,8 @@ public: unsigned getMaxInterleaveFactor(unsigned VF); - int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src); + int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, + const Instruction *I = nullptr); int getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index); @@ -103,10 +104,11 @@ public: int getAddressComputationCost(Type *Ty, ScalarEvolution *SE, const SCEV *Ptr); - int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy); + int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, + const Instruction *I = nullptr); int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, - unsigned AddressSpace); + unsigned AddressSpace, const Instruction *I = nullptr); int getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys); diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp index f2662a68b18..8eb9dbf5f9d 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -92,7 +92,8 @@ int ARMTTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, } -int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { +int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, + const Instruction *I) { int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); @@ -310,7 +311,8 @@ int ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, return BaseT::getVectorInstrCost(Opcode, ValTy, Index); } -int 
ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) { +int ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, + const Instruction *I) { int ISD = TLI->InstructionOpcodeToISD(Opcode); // On NEON a a vector select gets lowered to vbsl. @@ -335,7 +337,7 @@ int ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) { return LT.first; } - return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy); + return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I); } int ARMTTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE, @@ -504,7 +506,7 @@ int ARMTTIImpl::getArithmeticInstrCost( } int ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, - unsigned AddressSpace) { + unsigned AddressSpace, const Instruction *I) { std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src); if (Src->isVectorTy() && Alignment != 16 && diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h index 5ee1f7c4b0e..7de0543dfa5 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h @@ -94,9 +94,11 @@ public: int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp); - int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src); + int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, + const Instruction *I = nullptr); - int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy); + int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, + const Instruction *I = nullptr); int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); @@ -114,7 +116,7 @@ public: ArrayRef<const Value *> Args = ArrayRef<const Value *>()); int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, - unsigned AddressSpace); + unsigned AddressSpace, const Instruction *I = nullptr); int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, unsigned Alignment, diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp index 2f1bceaa481..7ee1317bf72 100644 --- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -302,14 +302,16 @@ int PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, return LT.first; } -int PPCTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { +int PPCTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, + const Instruction *I) { assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode"); return BaseT::getCastInstrCost(Opcode, Dst, Src); } -int PPCTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) { - return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy); +int PPCTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, + const Instruction *I) { + return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I); } int PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) { @@ -352,7 +354,7 @@ int PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) { } int PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, - unsigned AddressSpace) { + unsigned AddressSpace, const Instruction *I) { // Legalize the type. 
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src); assert((Opcode == Instruction::Load || Opcode == Instruction::Store) && diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h index 30ee2814aba..6ce70fbd877 100644 --- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h +++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h @@ -74,11 +74,13 @@ public: TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, ArrayRef<const Value *> Args = ArrayRef<const Value *>()); int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp); - int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src); - int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy); + int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, + const Instruction *I = nullptr); + int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, + const Instruction *I = nullptr); int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, - unsigned AddressSpace); + unsigned AddressSpace, const Instruction *I = nullptr); int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp index 140ee29e5f1..84d3c7bed50 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -347,9 +347,13 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM, // There should be no need to check for float types other than v2f64 // since <2 x f32> isn't a legal type. setOperationAction(ISD::FP_TO_SINT, MVT::v2i64, Legal); + setOperationAction(ISD::FP_TO_SINT, MVT::v2f64, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Legal); + setOperationAction(ISD::FP_TO_UINT, MVT::v2f64, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Legal); + setOperationAction(ISD::SINT_TO_FP, MVT::v2f64, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::v2f64, Legal); } // Handle floating-point types. diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp index b10c0e09a0d..e74c9a80515 100644 --- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp +++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp @@ -259,11 +259,8 @@ void SystemZTTIImpl::getUnrollingPreferences(Loop *L, } } if (isa<StoreInst>(&I)) { - NumStores++; Type *MemAccessTy = I.getOperand(0)->getType(); - if((MemAccessTy->isIntegerTy() || MemAccessTy->isFloatingPointTy()) && - (getDataLayout().getTypeSizeInBits(MemAccessTy) == 128)) - NumStores++; // 128 bit fp/int stores get split. + NumStores += getMemoryOpCost(Instruction::Store, MemAccessTy, 0, 0); } } @@ -313,3 +310,547 @@ unsigned SystemZTTIImpl::getRegisterBitWidth(bool Vector) { return 0; } +int SystemZTTIImpl::getArithmeticInstrCost( + unsigned Opcode, Type *Ty, + TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info, + TTI::OperandValueProperties Opd1PropInfo, + TTI::OperandValueProperties Opd2PropInfo, + ArrayRef<const Value *> Args) { + + // TODO: return a good value for BB-VECTORIZER that includes the + // immediate loads, which we do not want to count for the loop + // vectorizer, since they are hopefully hoisted out of the loop. 
This + // would require a new parameter 'InLoop', but not sure if constant + // args are common enough to motivate this. + + unsigned ScalarBits = Ty->getScalarSizeInBits(); + + if (Ty->isVectorTy()) { + assert (ST->hasVector() && "getArithmeticInstrCost() called with vector type."); + unsigned VF = Ty->getVectorNumElements(); + unsigned NumVectors = getNumberOfParts(Ty); + + // These vector operations are custom handled, but are still supported + // with one instruction per vector, regardless of element size. + if (Opcode == Instruction::Shl || Opcode == Instruction::LShr || + Opcode == Instruction::AShr) { + return NumVectors; + } + + // These FP operations are supported with a single vector instruction for + // double (base implementation assumes float generally costs 2). For + // FP128, the scalar cost is 1, and there is no overhead since the values + // are already in scalar registers. + if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub || + Opcode == Instruction::FMul || Opcode == Instruction::FDiv) { + switch (ScalarBits) { + case 32: { + // Return the cost of multiple scalar invocation plus the cost of + // inserting and extracting the values. + unsigned ScalarCost = getArithmeticInstrCost(Opcode, Ty->getScalarType()); + unsigned Cost = (VF * ScalarCost) + getScalarizationOverhead(Ty, Args); + // FIXME: VF 2 for these FP operations are currently just as + // expensive as for VF 4. + if (VF == 2) + Cost *= 2; + return Cost; + } + case 64: + case 128: + return NumVectors; + default: + break; + } + } + + // There is no native support for FRem. + if (Opcode == Instruction::FRem) { + unsigned Cost = (VF * LIBCALL_COST) + getScalarizationOverhead(Ty, Args); + // FIXME: VF 2 for float is currently just as expensive as for VF 4. + if (VF == 2 && ScalarBits == 32) + Cost *= 2; + return Cost; + } + } + else { // Scalar: + // These FP operations are supported with a dedicated instruction for + // float, double and fp128 (base implementation assumes float generally + // costs 2). + if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub || + Opcode == Instruction::FMul || Opcode == Instruction::FDiv) + return 1; + + // There is no native support for FRem. + if (Opcode == Instruction::FRem) + return LIBCALL_COST; + + if (Opcode == Instruction::LShr || Opcode == Instruction::AShr) + return (ScalarBits >= 32 ? 1 : 2 /*ext*/); + + // Or requires one instruction, although it has custom handling for i64. + if (Opcode == Instruction::Or) + return 1; + + if (Opcode == Instruction::Xor && ScalarBits == 1) + // 2 * ipm sequences ; xor ; shift ; compare + return 7; + + // An extra extension for narrow types is needed. + if ((Opcode == Instruction::SDiv || Opcode == Instruction::SRem)) + // sext of op(s) for narrow types + return (ScalarBits < 32 ? 4 : (ScalarBits == 32 ? 2 : 1)); + + if (Opcode == Instruction::UDiv || Opcode == Instruction::URem) + // Clearing of low 64 bit reg + sext of op(s) for narrow types + dl[g]r + return (ScalarBits < 32 ? 4 : 2); + } + + // Fallback to the default implementation. + return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info, + Opd1PropInfo, Opd2PropInfo, Args); +} + + +int SystemZTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, + Type *SubTp) { + assert (Tp->isVectorTy()); + assert (ST->hasVector() && "getShuffleCost() called."); + unsigned NumVectors = getNumberOfParts(Tp); + + // TODO: Since fp32 is expanded, the shuffle cost should always be 0. 
+ + // FP128 values are always in scalar registers, so there is no work + // involved with a shuffle, except for broadcast. In that case register + // moves are done with a single instruction per element. + if (Tp->getScalarType()->isFP128Ty()) + return (Kind == TargetTransformInfo::SK_Broadcast ? NumVectors - 1 : 0); + + switch (Kind) { + case TargetTransformInfo::SK_ExtractSubvector: + // ExtractSubvector Index indicates start offset. + + // Extracting a subvector from first index is a noop. + return (Index == 0 ? 0 : NumVectors); + + case TargetTransformInfo::SK_Broadcast: + // Loop vectorizer calls here to figure out the extra cost of + // broadcasting a loaded value to all elements of a vector. Since vlrep + // loads and replicates with a single instruction, adjust the returned + // value. + return NumVectors - 1; + + default: + + // SystemZ supports single instruction permutation / replication. + return NumVectors; + } + + return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); +} + +// Return the log2 difference of the element sizes of the two vector types. +static unsigned getElSizeLog2Diff(Type *Ty0, Type *Ty1) { + unsigned Bits0 = Ty0->getScalarSizeInBits(); + unsigned Bits1 = Ty1->getScalarSizeInBits(); + + if (Bits1 > Bits0) + return (Log2_32(Bits1) - Log2_32(Bits0)); + + return (Log2_32(Bits0) - Log2_32(Bits1)); +} + +// Return the number of instructions needed to truncate SrcTy to DstTy. +unsigned SystemZTTIImpl:: +getVectorTruncCost(Type *SrcTy, Type *DstTy) { + assert (SrcTy->isVectorTy() && DstTy->isVectorTy()); + assert (SrcTy->getPrimitiveSizeInBits() > DstTy->getPrimitiveSizeInBits() && + "Packing must reduce size of vector type."); + assert (SrcTy->getVectorNumElements() == DstTy->getVectorNumElements() && + "Packing should not change number of elements."); + + // TODO: Since fp32 is expanded, the extract cost should always be 0. + + unsigned NumParts = getNumberOfParts(SrcTy); + if (NumParts <= 2) + // Up to 2 vector registers can be truncated efficiently with pack or + // permute. The latter requires an immediate mask to be loaded, which + // typically gets hoisted out of a loop. TODO: return a good value for + // BB-VECTORIZER that includes the immediate loads, which we do not want + // to count for the loop vectorizer. + return 1; + + unsigned Cost = 0; + unsigned Log2Diff = getElSizeLog2Diff(SrcTy, DstTy); + unsigned VF = SrcTy->getVectorNumElements(); + for (unsigned P = 0; P < Log2Diff; ++P) { + if (NumParts > 1) + NumParts /= 2; + Cost += NumParts; + } + + // Currently, a general mix of permutes and pack instructions is output by + // isel, which follow the cost computation above except for this case which + // is one instruction less: + if (VF == 8 && SrcTy->getScalarSizeInBits() == 64 && + DstTy->getScalarSizeInBits() == 8) + Cost--; + + return Cost; +} + +// Return the cost of converting a vector bitmask produced by a compare +// (SrcTy), to the type of the select or extend instruction (DstTy). +unsigned SystemZTTIImpl:: +getVectorBitmaskConversionCost(Type *SrcTy, Type *DstTy) { + assert (SrcTy->isVectorTy() && DstTy->isVectorTy() && + "Should only be called with vector types."); + + unsigned PackCost = 0; + unsigned SrcScalarBits = SrcTy->getScalarSizeInBits(); + unsigned DstScalarBits = DstTy->getScalarSizeInBits(); + unsigned Log2Diff = getElSizeLog2Diff(SrcTy, DstTy); + if (SrcScalarBits > DstScalarBits) + // The bitmask will be truncated. 
+ PackCost = getVectorTruncCost(SrcTy, DstTy); + else if (SrcScalarBits < DstScalarBits) { + unsigned DstNumParts = getNumberOfParts(DstTy); + // Each vector select needs its part of the bitmask unpacked. + PackCost = Log2Diff * DstNumParts; + // Extra cost for moving part of mask before unpacking. + PackCost += DstNumParts - 1; + } + + return PackCost; +} + +// Return the type of the compared operands. This is needed to compute the +// cost for a Select / ZExt or SExt instruction. +static Type *getCmpOpsType(const Instruction *I, unsigned VF = 1) { + Type *OpTy = nullptr; + if (CmpInst *CI = dyn_cast<CmpInst>(I->getOperand(0))) + OpTy = CI->getOperand(0)->getType(); + else if (Instruction *LogicI = dyn_cast<Instruction>(I->getOperand(0))) + if (CmpInst *CI0 = dyn_cast<CmpInst>(LogicI->getOperand(0))) + if (isa<CmpInst>(LogicI->getOperand(1))) + OpTy = CI0->getOperand(0)->getType(); + + if (OpTy != nullptr) { + if (VF == 1) { + assert (!OpTy->isVectorTy() && "Expected scalar type"); + return OpTy; + } + // Return the potentially vectorized type based on 'I' and 'VF'. 'I' may + // be either scalar or already vectorized with a same or lesser VF. + Type *ElTy = OpTy->getScalarType(); + return VectorType::get(ElTy, VF); + } + + return nullptr; +} + +int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, + const Instruction *I) { + unsigned DstScalarBits = Dst->getScalarSizeInBits(); + unsigned SrcScalarBits = Src->getScalarSizeInBits(); + + if (Src->isVectorTy()) { + assert (ST->hasVector() && "getCastInstrCost() called with vector type."); + assert (Dst->isVectorTy()); + unsigned VF = Src->getVectorNumElements(); + unsigned NumDstVectors = getNumberOfParts(Dst); + unsigned NumSrcVectors = getNumberOfParts(Src); + + if (Opcode == Instruction::Trunc) { + if (Src->getScalarSizeInBits() == Dst->getScalarSizeInBits()) + return 0; // Check for NOOP conversions. + return getVectorTruncCost(Src, Dst); + } + + if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { + if (SrcScalarBits >= 8) { + // ZExt/SExt will be handled with one unpack per doubling of width. + unsigned NumUnpacks = getElSizeLog2Diff(Src, Dst); + + // For types that spans multiple vector registers, some additional + // instructions are used to setup the unpacking. + unsigned NumSrcVectorOps = + (NumUnpacks > 1 ? (NumDstVectors - NumSrcVectors) + : (NumDstVectors / 2)); + + return (NumUnpacks * NumDstVectors) + NumSrcVectorOps; + } + else if (SrcScalarBits == 1) { + // This should be extension of a compare i1 result. + // If we know what the widths of the compared operands, get the + // cost of converting it to Dst. Otherwise assume same widths. + unsigned Cost = 0; + Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I, VF) : nullptr); + if (CmpOpTy != nullptr) + Cost = getVectorBitmaskConversionCost(CmpOpTy, Dst); + if (Opcode == Instruction::ZExt) + // One 'vn' per dst vector with an immediate mask. + Cost += NumDstVectors; + return Cost; + } + } + + if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP || + Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI) { + // TODO: Fix base implementation which could simplify things a bit here + // (seems to miss on differentiating on scalar/vector types). + + // Only 64 bit vector conversions are natively supported. + if (SrcScalarBits == 64 && DstScalarBits == 64) + return NumDstVectors; + + // Return the cost of multiple scalar invocation plus the cost of + // inserting and extracting the values. 
Base implementation does not + // realize float->int gets scalarized. + unsigned ScalarCost = getCastInstrCost(Opcode, Dst->getScalarType(), + Src->getScalarType()); + unsigned TotCost = VF * ScalarCost; + bool NeedsInserts = true, NeedsExtracts = true; + // FP128 registers do not get inserted or extracted. + if (DstScalarBits == 128 && + (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP)) + NeedsInserts = false; + if (SrcScalarBits == 128 && + (Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI)) + NeedsExtracts = false; + + TotCost += getScalarizationOverhead(Dst, NeedsInserts, NeedsExtracts); + + // FIXME: VF 2 for float<->i32 is currently just as expensive as for VF 4. + if (VF == 2 && SrcScalarBits == 32 && DstScalarBits == 32) + TotCost *= 2; + + return TotCost; + } + + if (Opcode == Instruction::FPTrunc) { + if (SrcScalarBits == 128) // fp128 -> double/float + inserts of elements. + return VF /*ldxbr/lexbr*/ + getScalarizationOverhead(Dst, true, false); + else // double -> float + return VF / 2 /*vledb*/ + std::max(1U, VF / 4 /*vperm*/); + } + + if (Opcode == Instruction::FPExt) { + if (SrcScalarBits == 32 && DstScalarBits == 64) { + // float -> double is very rare and currently unoptimized. Instead of + // using vldeb, which can do two at a time, all conversions are + // scalarized. + return VF * 2; + } + // -> fp128. VF * lxdb/lxeb + extraction of elements. + return VF + getScalarizationOverhead(Src, false, true); + } + } + else { // Scalar + assert (!Dst->isVectorTy()); + + if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP) + return (SrcScalarBits >= 32 ? 1 : 2 /*i8/i16 extend*/); + + if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt) && + Src->isIntegerTy(1)) { + // This should be extension of a compare i1 result, which is done with + // ipm and a varying sequence of instructions. + unsigned Cost = 0; + if (Opcode == Instruction::SExt) + Cost = (DstScalarBits < 64 ? 3 : 4); + if (Opcode == Instruction::ZExt) + Cost = 3; + Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I) : nullptr); + if (CmpOpTy != nullptr && CmpOpTy->isFloatingPointTy()) + // If operands of an fp-type was compared, this costs +1. + Cost++; + + return Cost; + } + } + + return BaseT::getCastInstrCost(Opcode, Dst, Src, I); +} + +int SystemZTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, + const Instruction *I) { + if (ValTy->isVectorTy()) { + assert (ST->hasVector() && "getCmpSelInstrCost() called with vector type."); + assert (CondTy == nullptr || CondTy->isVectorTy()); + unsigned VF = ValTy->getVectorNumElements(); + + // Called with a compare instruction. + if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) { + unsigned PredicateExtraCost = 0; + if (I != nullptr) { + // Some predicates cost one or two extra instructions. + switch (dyn_cast<CmpInst>(I)->getPredicate()) { + case CmpInst::Predicate::ICMP_NE: + case CmpInst::Predicate::ICMP_UGE: + case CmpInst::Predicate::ICMP_ULE: + case CmpInst::Predicate::ICMP_SGE: + case CmpInst::Predicate::ICMP_SLE: + PredicateExtraCost = 1; + break; + case CmpInst::Predicate::FCMP_ONE: + case CmpInst::Predicate::FCMP_ORD: + case CmpInst::Predicate::FCMP_UEQ: + case CmpInst::Predicate::FCMP_UNO: + PredicateExtraCost = 2; + break; + default: + break; + } + } + + // Float is handled with 2*vmr[lh]f + 2*vldeb + vfchdb for each pair of + // floats. FIXME: <2 x float> generates same code as <4 x float>. + unsigned CmpCostPerVector = (ValTy->getScalarType()->isFloatTy() ? 
10 : 1); + unsigned NumVecs_cmp = getNumberOfParts(ValTy); + + unsigned Cost = (NumVecs_cmp * (CmpCostPerVector + PredicateExtraCost)); + return Cost; + } + else { // Called with a select instruction. + assert (Opcode == Instruction::Select); + + // We can figure out the extra cost of packing / unpacking if the + // instruction was passed and the compare instruction is found. + unsigned PackCost = 0; + Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I, VF) : nullptr); + if (CmpOpTy != nullptr) + PackCost = + getVectorBitmaskConversionCost(CmpOpTy, ValTy); + + return getNumberOfParts(ValTy) /*vsel*/ + PackCost; + } + } + else { // Scalar + switch (Opcode) { + case Instruction::ICmp: { + unsigned Cost = 1; + if (ValTy->isIntegerTy() && ValTy->getScalarSizeInBits() <= 16) + Cost += 2; // extend both operands + return Cost; + } + case Instruction::Select: + if (ValTy->isFloatingPointTy()) + return 4; // No load on condition for FP, so this costs a conditional jump. + return 1; // Load On Condition. + } + } + + return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, nullptr); +} + +int SystemZTTIImpl:: +getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) { + // vlvgp will insert two grs into a vector register, so only count half the + // number of instructions. + if (Opcode == Instruction::InsertElement && + Val->getScalarType()->isIntegerTy(64)) + return ((Index % 2 == 0) ? 1 : 0); + + if (Opcode == Instruction::ExtractElement) { + int Cost = ((Val->getScalarSizeInBits() == 1) ? 2 /*+test-under-mask*/ : 1); + + // Give a slight penalty for moving out of vector pipeline to FXU unit. + if (Index == 0 && Val->getScalarType()->isIntegerTy()) + Cost += 1; + + return Cost; + } + + return BaseT::getVectorInstrCost(Opcode, Val, Index); +} + +int SystemZTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, + unsigned Alignment, unsigned AddressSpace, + const Instruction *I) { + assert(!Src->isVoidTy() && "Invalid type"); + + if (!Src->isVectorTy() && Opcode == Instruction::Load && + I != nullptr && I->hasOneUse()) { + const Instruction *UserI = cast<Instruction>(*I->user_begin()); + unsigned Bits = Src->getScalarSizeInBits(); + bool FoldsLoad = false; + switch (UserI->getOpcode()) { + case Instruction::ICmp: + case Instruction::Add: + case Instruction::Sub: + case Instruction::Mul: + case Instruction::SDiv: + case Instruction::UDiv: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + // This also makes sense for float operations, but disabled for now due + // to regressions. + // case Instruction::FCmp: + // case Instruction::FAdd: + // case Instruction::FSub: + // case Instruction::FMul: + // case Instruction::FDiv: + FoldsLoad = (Bits == 32 || Bits == 64); + break; + } + + if (FoldsLoad) { + assert (UserI->getNumOperands() == 2 && + "Expected to only handle binops."); + + // UserI can't fold two loads, so in that case return 0 cost only + // half of the time. + for (unsigned i = 0; i < 2; ++i) { + if (UserI->getOperand(i) == I) + continue; + if (LoadInst *LI = dyn_cast<LoadInst>(UserI->getOperand(i))) { + if (LI->hasOneUse()) + return i == 0; + } + } + + return 0; + } + } + + unsigned NumOps = getNumberOfParts(Src); + + if (Src->getScalarSizeInBits() == 128) + // 128 bit scalars are held in a pair of two 64 bit registers. 
+ NumOps *= 2; + + return NumOps; +} + +int SystemZTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, + unsigned Factor, + ArrayRef<unsigned> Indices, + unsigned Alignment, + unsigned AddressSpace) { + assert(isa<VectorType>(VecTy) && + "Expect a vector type for interleaved memory op"); + + unsigned WideBits = (VecTy->isPtrOrPtrVectorTy() ? + (64U * VecTy->getVectorNumElements()) : VecTy->getPrimitiveSizeInBits()); + assert (WideBits > 0 && "Could not compute size of vector"); + int NumWideParts = + ((WideBits % 128U) ? ((WideBits / 128U) + 1) : (WideBits / 128U)); + + // How many source vectors are handled to produce a vectorized operand? + int NumElsPerVector = (VecTy->getVectorNumElements() / NumWideParts); + int NumSrcParts = + ((NumWideParts > NumElsPerVector) ? NumElsPerVector : NumWideParts); + + // A Load group may have gaps. + unsigned NumOperands = + ((Opcode == Instruction::Load) ? Indices.size() : Factor); + + // Each needed permute takes two vectors as input. + if (NumSrcParts > 1) + NumSrcParts--; + int NumPermutes = NumSrcParts * NumOperands; + + // Cost of load/store operations and the permutations needed. + return NumWideParts + NumPermutes; +} diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h index f7d2d827f11..d2639cb271d 100644 --- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h +++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h @@ -27,6 +27,8 @@ class SystemZTTIImpl : public BasicTTIImplBase<SystemZTTIImpl> { const SystemZSubtarget *getST() const { return ST; } const SystemZTargetLowering *getTLI() const { return TLI; } + unsigned const LIBCALL_COST = 30; + public: explicit SystemZTTIImpl(const SystemZTargetMachine *TM, const Function &F) : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)), @@ -53,6 +55,31 @@ public: unsigned getNumberOfRegisters(bool Vector); unsigned getRegisterBitWidth(bool Vector); + bool enableInterleavedAccessVectorization() { return true; } + + int getArithmeticInstrCost( + unsigned Opcode, Type *Ty, + TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, + TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, + TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, + TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, + ArrayRef<const Value *> Args = ArrayRef<const Value *>()); + int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp); + unsigned getVectorTruncCost(Type *SrcTy, Type *DstTy); + unsigned getVectorBitmaskConversionCost(Type *SrcTy, Type *DstTy); + int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, + const Instruction *I = nullptr); + int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, + const Instruction *I = nullptr); + int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); + int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, + unsigned AddressSpace, const Instruction *I = nullptr); + + int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, + unsigned Factor, + ArrayRef<unsigned> Indices, + unsigned Alignment, + unsigned AddressSpace); /// @} }; diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index ea8aa5cb61e..b742fb47237 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -938,7 +938,8 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, return BaseT::getShuffleCost(Kind, Tp, 
Index, SubTp); } -int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { +int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, + const Instruction *I) { int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); @@ -1304,7 +1305,8 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { return BaseT::getCastInstrCost(Opcode, Dst, Src); } -int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) { +int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, + const Instruction *I) { // Legalize the type. std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); @@ -1370,7 +1372,7 @@ int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) { if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy)) return LT.first * Entry->Cost; - return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy); + return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I); } int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, @@ -1615,7 +1617,7 @@ int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) { } int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, - unsigned AddressSpace) { + unsigned AddressSpace, const Instruction *I) { // Handle non-power-of-two vectors such as <3 x float> if (VectorType *VTy = dyn_cast<VectorType>(Src)) { unsigned NumElem = VTy->getVectorNumElements(); diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h index 2aa94fdc3c2..9bef9e80c39 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.h +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h @@ -62,11 +62,13 @@ public: TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, ArrayRef<const Value *> Args = ArrayRef<const Value *>()); int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp); - int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src); - int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy); + int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, + const Instruction *I = nullptr); + int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, + const Instruction *I = nullptr); int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, - unsigned AddressSpace); + unsigned AddressSpace, const Instruction *I = nullptr); int getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace); int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr, diff --git a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp index 7223370a717..f344eb15146 100644 --- a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp +++ b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp @@ -1882,7 +1882,7 @@ chainToBasePointerCost(SmallVectorImpl<Instruction*> &Chain, "non noop cast is found during rematerialization"); Type *SrcTy = CI->getOperand(0)->getType(); - Cost += TTI.getCastInstrCost(CI->getOpcode(), CI->getType(), SrcTy); + Cost += TTI.getCastInstrCost(CI->getOpcode(), CI->getType(), SrcTy, CI); } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Instr)) { // Cost of the address calculation diff --git a/llvm/lib/Transforms/Vectorize/BBVectorize.cpp b/llvm/lib/Transforms/Vectorize/BBVectorize.cpp index 9e97d9a65c8..c83b3f7b225 100644 
--- a/llvm/lib/Transforms/Vectorize/BBVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/BBVectorize.cpp @@ -550,7 +550,8 @@ namespace { TargetTransformInfo::OperandValueKind Op1VK = TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OperandValueKind Op2VK = - TargetTransformInfo::OK_AnyValue) { + TargetTransformInfo::OK_AnyValue, + const Instruction *I = nullptr) { switch (Opcode) { default: break; case Instruction::GetElementPtr: @@ -584,7 +585,7 @@ namespace { case Instruction::Select: case Instruction::ICmp: case Instruction::FCmp: - return TTI->getCmpSelInstrCost(Opcode, T1, T2); + return TTI->getCmpSelInstrCost(Opcode, T1, T2, I); case Instruction::ZExt: case Instruction::SExt: case Instruction::FPToUI: @@ -598,7 +599,7 @@ namespace { case Instruction::FPTrunc: case Instruction::BitCast: case Instruction::ShuffleVector: - return TTI->getCastInstrCost(Opcode, T1, T2); + return TTI->getCastInstrCost(Opcode, T1, T2, I); } return 1; @@ -1044,14 +1045,14 @@ namespace { return false; } } else if (TTI) { - unsigned ICost = getInstrCost(I->getOpcode(), IT1, IT2); - unsigned JCost = getInstrCost(J->getOpcode(), JT1, JT2); - Type *VT1 = getVecTypeForPair(IT1, JT1), - *VT2 = getVecTypeForPair(IT2, JT2); TargetTransformInfo::OperandValueKind Op1VK = TargetTransformInfo::OK_AnyValue; TargetTransformInfo::OperandValueKind Op2VK = TargetTransformInfo::OK_AnyValue; + unsigned ICost = getInstrCost(I->getOpcode(), IT1, IT2, Op1VK, Op2VK, I); + unsigned JCost = getInstrCost(J->getOpcode(), JT1, JT2, Op1VK, Op2VK, J); + Type *VT1 = getVecTypeForPair(IT1, JT1), + *VT2 = getVecTypeForPair(IT2, JT2); // On some targets (example X86) the cost of a vector shift may vary // depending on whether the second operand is a Uniform or @@ -1090,7 +1091,7 @@ namespace { // but this cost is ignored (because insert and extract element // instructions are assigned a zero depth factor and are not really // fused in general). - unsigned VCost = getInstrCost(I->getOpcode(), VT1, VT2, Op1VK, Op2VK); + unsigned VCost = getInstrCost(I->getOpcode(), VT1, VT2, Op1VK, Op2VK, I); if (VCost > ICost + JCost) return false; diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index fff1e29ecbb..f891cd9d62c 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7048,7 +7048,7 @@ unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, - AS); + AS, I); // Get the overhead of the extractelement and insertelement instructions // we might create due to scalarization. 
@@ -7078,7 +7078,7 @@ unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, if (Legal->isMaskRequired(I)) Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS); else - Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS); + Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I); bool Reverse = ConsecutiveStride < 0; if (Reverse) @@ -7154,7 +7154,7 @@ unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, unsigned AS = getMemInstAlignment(I); return TTI.getAddressComputationCost(ValTy) + - TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS); + TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I); } return getWideningCost(I, VF); } @@ -7369,7 +7369,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, if (!ScalarCond) CondTy = VectorType::get(CondTy, VF); - return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy); + return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, I); } case Instruction::ICmp: case Instruction::FCmp: { @@ -7378,7 +7378,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); VectorTy = ToVectorTy(ValTy, VF); - return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy); + return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, I); } case Instruction::Store: case Instruction::Load: { @@ -7403,7 +7403,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, if (isOptimizableIVTruncate(I, VF)) { auto *Trunc = cast<TruncInst>(I); return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), - Trunc->getSrcTy()); + Trunc->getSrcTy(), Trunc); } Type *SrcScalarTy = I->getOperand(0)->getType(); @@ -7427,7 +7427,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, } } - return TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy); + return TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I); } case Instruction::Call: { bool NeedToScalarize; diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index b9df89e3eec..df7dc2ac67f 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -1762,10 +1762,10 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { // Calculate the cost of this instruction. int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(), - VL0->getType(), SrcTy); + VL0->getType(), SrcTy, VL0); VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size()); - int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy); + int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy, VL0); return VecCost - ScalarCost; } case Instruction::FCmp: @@ -1774,8 +1774,8 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { // Calculate the cost of this instruction. 
VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size()); int ScalarCost = VecTy->getNumElements() * - TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty()); - int VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy); + TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty(), VL0); + int VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy, VL0); return VecCost - ScalarCost; } case Instruction::Add: @@ -1858,18 +1858,18 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { // Cost of wide load - cost of scalar loads. unsigned alignment = dyn_cast<LoadInst>(VL0)->getAlignment(); int ScalarLdCost = VecTy->getNumElements() * - TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0); + TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0, VL0); int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, - VecTy, alignment, 0); + VecTy, alignment, 0, VL0); return VecLdCost - ScalarLdCost; } case Instruction::Store: { // We know that we can merge the stores. Calculate the cost. unsigned alignment = dyn_cast<StoreInst>(VL0)->getAlignment(); int ScalarStCost = VecTy->getNumElements() * - TTI->getMemoryOpCost(Instruction::Store, ScalarTy, alignment, 0); + TTI->getMemoryOpCost(Instruction::Store, ScalarTy, alignment, 0, VL0); int VecStCost = TTI->getMemoryOpCost(Instruction::Store, - VecTy, alignment, 0); + VecTy, alignment, 0, VL0); return VecStCost - ScalarStCost; } case Instruction::Call: { |
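The commit message also notes that BasicTTIImpl::getCastInstrCost() now recognizes legal extending loads; that change lives under llvm/include and is therefore not visible in this llvm/lib-limited diff. Below is a rough sketch of the idea, written as a standalone helper rather than the actual BasicTTIImpl code — the function name, baseline cost, and exact guarding conditions are illustrative assumptions.

```cpp
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Target/TargetLowering.h"

using namespace llvm;

// If a zext/sext extends a single-use loaded value and the target has a legal
// extending load for that (wide, narrow) type pair, the extension folds into
// the load and should be costed as free.
static int castCostSketch(const TargetLoweringBase &TLI, const DataLayout &DL,
                          unsigned Opcode, Type *Dst, Type *Src,
                          const Instruction *I) {
  if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt) && I &&
      isa<LoadInst>(I->getOperand(0)) && I->getOperand(0)->hasOneUse()) {
    EVT LoadVT = TLI.getValueType(DL, Src);
    EVT ExtVT = TLI.getValueType(DL, Dst);
    unsigned ExtType =
        Opcode == Instruction::ZExt ? ISD::ZEXTLOAD : ISD::SEXTLOAD;
    if (TLI.isLoadExtLegal(ExtType, ExtVT, LoadVT))
      return 0; // The zext/sext is subsumed by an extending load.
  }
  return 1; // Illustrative baseline; the real code falls back to the base cost.
}
```

On SystemZ, for instance, a sext of a single-use i16 load can then be modeled as one sign-extending load (lh/lgh) rather than a load plus a separate extension, which is what makes the new target-specific cost functions above consistent with the code isel actually emits.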