Diffstat (limited to 'llvm/lib')
 llvm/lib/Analysis/TargetTransformInfo.cpp               |  5
 llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp  |  7
 llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h    |  2
 llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp          |  7
 llvm/lib/Target/ARM/ARMTargetTransformInfo.h            |  3
 llvm/lib/Target/X86/X86TargetTransformInfo.cpp          | 20
 llvm/lib/Target/X86/X86TargetTransformInfo.h            |  3
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp         | 59
8 files changed, 51 insertions, 55 deletions
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 2a15b9b264e..cd8c24630df 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -389,8 +389,9 @@ unsigned TargetTransformInfo::getNumberOfParts(Type *Tp) const {
 }

 int TargetTransformInfo::getAddressComputationCost(Type *Tp,
-                                                   bool IsComplex) const {
-  int Cost = TTIImpl->getAddressComputationCost(Tp, IsComplex);
+                                                   ScalarEvolution *SE,
+                                                   const SCEV *Ptr) const {
+  int Cost = TTIImpl->getAddressComputationCost(Tp, SE, Ptr);
   assert(Cost >= 0 && "TTI should not produce negative costs!");
   return Cost;
 }
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 88c98865bbc..1a17691fc58 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -417,14 +417,17 @@ int AArch64TTIImpl::getArithmeticInstrCost(
   }
 }

-int AArch64TTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) {
+int AArch64TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
+                                              const SCEV *Ptr) {
   // Address computations in vectorized code with non-consecutive addresses will
   // likely result in more instructions compared to scalar code where the
   // computation can more often be merged into the index mode. The resulting
   // extra micro-ops can significantly decrease throughput.
   unsigned NumVectorInstToHideOverhead = 10;
+  int MaxMergeDistance = 64;

-  if (Ty->isVectorTy() && IsComplex)
+  if (Ty->isVectorTy() && SE &&
+      !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
     return NumVectorInstToHideOverhead;

   // In many cases the address computation is not merged into the instruction
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 24642cb1698..849fd3d9b44 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -104,7 +104,7 @@ public:
       TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
       TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None);

-  int getAddressComputationCost(Type *Ty, bool IsComplex);
+  int getAddressComputationCost(Type *Ty, ScalarEvolution *SE, const SCEV *Ptr);

   int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy);

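The AArch64 hunk above and the ARM and X86 hunks below lean on new stride helpers reached through BaseT. Those helpers are declared outside this diff (the diffstat is limited to llvm/lib, and the BasicTTIImpl base class lives under llvm/include), so they are not shown here. The following is a rough sketch of what they have to do, reconstructed from their call sites and from the stride check this patch deletes from LoopVectorize.cpp below; the bodies and placement are inferred, not quoted from the commit:

    // Sketch only: inferred from the call sites in this diff and from the
    // code removed from isLikelyComplexAddressComputation below.
    #include "llvm/Analysis/ScalarEvolutionExpressions.h"
    using namespace llvm;

    static bool isStridedAccess(const SCEV *Ptr) {
      // A strided access shows up in SCEV as an add-recurrence {base,+,step}.
      return Ptr && isa<SCEVAddRecExpr>(Ptr);
    }

    static const SCEVConstant *getConstantStrideStep(ScalarEvolution *SE,
                                                     const SCEV *Ptr) {
      if (!isStridedAccess(Ptr))
        return nullptr;
      // Non-null only when the step is a compile-time constant.
      return dyn_cast<SCEVConstant>(
          cast<SCEVAddRecExpr>(Ptr)->getStepRecurrence(*SE));
    }

    static bool isConstantStridedAccessLessThan(ScalarEvolution *SE,
                                                const SCEV *Ptr,
                                                int64_t MergeDistance) {
      const SCEVConstant *Step = getConstantStrideStep(SE, Ptr);
      if (!Step)
        return false;
      const APInt &StrideVal = Step->getAPInt();
      // Huge step value - give up (mirrors the removed LoopVectorize check).
      if (StrideVal.getBitWidth() > 64)
        return false;
      return StrideVal.getSExtValue() < MergeDistance;
    }

Note that with MergeDistance = MaxMergeDistance + 1, a constant stride of at most 64 keeps the cheap path, which matches the StepVal > MaxMergeDistance test being deleted from the vectorizer.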
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 10e6297ef1e..cc001b59678 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -338,14 +338,17 @@ int ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) {
   return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy);
 }

-int ARMTTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) {
+int ARMTTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
+                                          const SCEV *Ptr) {
   // Address computations in vectorized code with non-consecutive addresses will
   // likely result in more instructions compared to scalar code where the
   // computation can more often be merged into the index mode. The resulting
   // extra micro-ops can significantly decrease throughput.
   unsigned NumVectorInstToHideOverhead = 10;
+  int MaxMergeDistance = 64;

-  if (Ty->isVectorTy() && IsComplex)
+  if (Ty->isVectorTy() && SE &&
+      !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
     return NumVectorInstToHideOverhead;

   // In many cases the address computation is not merged into the instruction
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
index d83228afb0a..731a5adf3d7 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -104,7 +104,8 @@ public:
   int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);

-  int getAddressComputationCost(Type *Val, bool IsComplex);
+  int getAddressComputationCost(Type *Val, ScalarEvolution *SE,
+                                const SCEV *Ptr);

   int getFPOpCost(Type *Ty);

diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 84ec14285d9..c2f542fd4b1 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -1626,17 +1626,29 @@ int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
   return Cost+LT.first;
 }

-int X86TTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) {
+int X86TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
+                                          const SCEV *Ptr) {
   // Address computations in vectorized code with non-consecutive addresses will
   // likely result in more instructions compared to scalar code where the
   // computation can more often be merged into the index mode. The resulting
   // extra micro-ops can significantly decrease throughput.
   unsigned NumVectorInstToHideOverhead = 10;

-  if (Ty->isVectorTy() && IsComplex)
-    return NumVectorInstToHideOverhead;
+  // The cost of a strided access computation is hidden by X86's indexing
+  // modes regardless of the stride value. We don't believe there is a
+  // difference between a constant strided access in general and one whose
+  // constant stride value is less than or equal to 64.
+  // Even when a (loop-invariant) stride value is not known at compile time,
+  // the address computation will not incur more than one extra ADD
+  // instruction.
+  if (Ty->isVectorTy() && SE) {
+    if (!BaseT::isStridedAccess(Ptr))
+      return NumVectorInstToHideOverhead;
+    if (!BaseT::getConstantStrideStep(SE, Ptr))
+      return 1;
+  }

-  return BaseT::getAddressComputationCost(Ty, IsComplex);
+  return BaseT::getAddressComputationCost(Ty, SE, Ptr);
 }

 int X86TTIImpl::getReductionCost(unsigned Opcode, Type *ValTy,
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index f6bcb9f569e..c013805f432 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -71,7 +71,8 @@ public:
                             unsigned AddressSpace);
   int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr,
                              bool VariableMask, unsigned Alignment);
-  int getAddressComputationCost(Type *PtrTy, bool IsComplex);
+  int getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE,
+                                const SCEV *Ptr);

   int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
                             ArrayRef<Type *> Tys, FastMathFlags FMF);
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 8cde0c4cd60..31daba2248a 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -6785,22 +6785,19 @@ LoopVectorizationCostModel::expectedCost(unsigned VF) {
   return Cost;
 }

-/// \brief Check whether the address computation for a non-consecutive memory
-/// access looks like an unlikely candidate for being merged into the indexing
-/// mode.
+/// \brief Gets the address access SCEV after verifying that the access
+/// pattern is loop invariant except for the induction variable dependence.
 ///
-/// We look for a GEP which has one index that is an induction variable and all
-/// other indices are loop invariant. If the stride of this access is also
-/// within a small bound we decide that this address computation can likely be
-/// merged into the addressing mode.
-/// In all other cases, we identify the address computation as complex.
-static bool isLikelyComplexAddressComputation(Value *Ptr,
-                                              LoopVectorizationLegality *Legal,
-                                              ScalarEvolution *SE,
-                                              const Loop *TheLoop) {
+/// This SCEV can be sent to the target in order to estimate the address
+/// calculation cost.
+static const SCEV *getAddressAccessSCEV(
+    Value *Ptr,
+    LoopVectorizationLegality *Legal,
+    ScalarEvolution *SE,
+    const Loop *TheLoop) {
   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
   if (!Gep)
-    return true;
+    return nullptr;

   // We are looking for a gep with all loop invariant indices except for one
   // which should be an induction variable.
@@ -6809,33 +6806,11 @@ static bool isLikelyComplexAddressComputation(Value *Ptr,
     Value *Opd = Gep->getOperand(i);
     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
         !Legal->isInductionVariable(Opd))
-      return true;
+      return nullptr;
   }

-  // Now we know we have a GEP ptr, %inv, %ind, %inv. Make sure that the step
-  // can likely be merged into the address computation.
-  unsigned MaxMergeDistance = 64;
-
-  const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Ptr));
-  if (!AddRec)
-    return true;
-
-  // Check the step is constant.
-  const SCEV *Step = AddRec->getStepRecurrence(*SE);
-  // Calculate the pointer stride and check if it is consecutive.
-  const auto *C = dyn_cast<SCEVConstant>(Step);
-  if (!C)
-    return true;
-
-  const APInt &APStepVal = C->getAPInt();
-
-  // Huge step value - give up.
-  if (APStepVal.getBitWidth() > 64)
-    return true;
-
-  int64_t StepVal = APStepVal.getSExtValue();
-
-  return StepVal > MaxMergeDistance;
+  // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the pointer SCEV.
+  return SE->getSCEV(Ptr);
 }

 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
@@ -7063,12 +7038,12 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
     unsigned Cost = 0;
     Type *PtrTy = ToVectorTy(Ptr->getType(), VF);

-    // True if the memory instruction's address computation is complex.
-    bool IsComplexComputation =
-        isLikelyComplexAddressComputation(Ptr, Legal, SE, TheLoop);
+    // Figure out whether the access is strided and get the stride value
+    // if it's known at compile time.
+    const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, SE, TheLoop);

     // Get the cost of the scalar memory instruction and address computation.
-    Cost += VF * TTI.getAddressComputationCost(PtrTy, IsComplexComputation);
+    Cost += VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
     Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
                                      Alignment, AS);
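Net effect, inferred from the hunks above (the SCEV spellings below are illustrative): the single IsComplex bit is replaced by a pointer SCEV, which lets each target price three situations separately instead of two.

    Ptr SCEV passed to the target        X86                 AArch64/ARM
    nullptr (no analyzable GEP)          10 (hide overhead)  10 (hide overhead)
    {%A,+,%s} (stride known only         1 (one extra ADD)   10 (hide overhead)
      at run time)
    {%A,+,C} (constant stride C)         falls through to    falls through if
                                         the default for     C <= 64, else 10
                                         any C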