Diffstat (limited to 'llvm/lib/Target')

 llvm/lib/Target/ARM/ARM.td                     |  9
 llvm/lib/Target/ARM/ARMSubtarget.cpp           |  4
 llvm/lib/Target/ARM/ARMSubtarget.h             |  7
 llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp | 88
 4 files changed, 87 insertions(+), 21 deletions(-)
diff --git a/llvm/lib/Target/ARM/ARM.td b/llvm/lib/Target/ARM/ARM.td
index cac14dbaa59..8dcddd25429 100644
--- a/llvm/lib/Target/ARM/ARM.td
+++ b/llvm/lib/Target/ARM/ARM.td
@@ -305,6 +305,15 @@ def FeaturePref32BitThumb : SubtargetFeature<"32bit", "Pref32BitThumb", "true",
 def FeaturePrefLoopAlign32 : SubtargetFeature<"loop-align", "PrefLoopAlignment","2",
                                               "Prefer 32-bit alignment for loops">;
 
+def FeatureMVEVectorCostFactor1 : SubtargetFeature<"mve1beat", "MVEVectorCostFactor", "4",
+  "Model MVE instructions as a 1 beat per tick architecture">;
+
+def FeatureMVEVectorCostFactor2 : SubtargetFeature<"mve2beat", "MVEVectorCostFactor", "2",
+  "Model MVE instructions as a 2 beats per tick architecture">;
+
+def FeatureMVEVectorCostFactor4 : SubtargetFeature<"mve4beat", "MVEVectorCostFactor", "1",
+  "Model MVE instructions as a 4 beats per tick architecture">;
+
 /// Some instructions update CPSR partially, which can add false dependency for
 /// out-of-order implementation, e.g. Cortex-A9, unless each individual bit is
 /// mapped to a separate physical register. Avoid partial CPSR update for these
diff --git a/llvm/lib/Target/ARM/ARMSubtarget.cpp b/llvm/lib/Target/ARM/ARMSubtarget.cpp
index 1773d17e1ac..54443d2126f 100644
--- a/llvm/lib/Target/ARM/ARMSubtarget.cpp
+++ b/llvm/lib/Target/ARM/ARMSubtarget.cpp
@@ -253,6 +253,10 @@ void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
   if (isRWPI())
     ReserveR9 = true;
 
+  // If MVEVectorCostFactor is still 0 (has not been set to anything else), default it to 2
+  if (MVEVectorCostFactor == 0)
+    MVEVectorCostFactor = 2;
+
   // FIXME: Teach TableGen to deal with these instead of doing it manually here.
   switch (ARMProcFamily) {
   case Others:
diff --git a/llvm/lib/Target/ARM/ARMSubtarget.h b/llvm/lib/Target/ARM/ARMSubtarget.h
index 637501214b1..dde9dcbdb1c 100644
--- a/llvm/lib/Target/ARM/ARMSubtarget.h
+++ b/llvm/lib/Target/ARM/ARMSubtarget.h
@@ -472,6 +472,11 @@ protected:
   /// What alignment is preferred for loop bodies, in log2(bytes).
   unsigned PrefLoopAlignment = 0;
 
+  /// The cost factor for MVE instructions, representing the multiple beats an
+  /// instruction can take. The default is 2, set in initSubtargetFeatures so
+  /// that subtarget features can set it to values less than 2.
+  unsigned MVEVectorCostFactor = 0;
+
  /// OptMinSize - True if we're optimising for minimum code size, equal to
  /// the function attribute.
  bool OptMinSize = false;
@@ -858,6 +863,8 @@ public:
     return PrefLoopAlignment;
   }
 
+  unsigned getMVEVectorCostFactor() const { return MVEVectorCostFactor; }
+
   bool ignoreCSRForAllocationOrder(const MachineFunction &MF,
                                    unsigned PhysReg) const override;
   unsigned getGPRAllocationOrder(const MachineFunction &MF) const;
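The three mveNbeat features above are ordinary subtarget features, so the beat model can be selected from the command line like any other attribute, e.g. llc -mattr=+mve,+mve1beat. The mapping from feature name to cost factor follows from MVE executing each 128-bit operation as four 32-bit beats: a core that retires BeatsPerTick beats per cycle occupies the vector unit for 4 / BeatsPerTick cycles. A standalone sketch of that arithmetic (the helper name is illustrative, not part of the patch):

    #include <cassert>
    #include <cstdio>

    // Derive the factor the mveNbeat features hard-code: four beats of work
    // divided by the beats the core can retire each tick.
    static unsigned costFactorForBeats(unsigned BeatsPerTick) {
      assert(BeatsPerTick == 1 || BeatsPerTick == 2 || BeatsPerTick == 4);
      return 4 / BeatsPerTick;
    }

    int main() {
      std::printf("mve1beat -> %u\n", costFactorForBeats(1)); // 4
      std::printf("mve2beat -> %u\n", costFactorForBeats(2)); // 2 (the default)
      std::printf("mve4beat -> %u\n", costFactorForBeats(4)); // 1
    }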
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index b1868cd833c..a8d04eef8c5 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -353,7 +353,10 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
       return Entry->Cost;
   }
 
-  return BaseT::getCastInstrCost(Opcode, Dst, Src);
+  int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
+                     ? ST->getMVEVectorCostFactor()
+                     : 1;
+  return BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src);
 }
 
 int ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
@@ -378,6 +381,17 @@ int ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
     return std::max(BaseT::getVectorInstrCost(Opcode, ValTy, Index), 2U);
   }
 
+  if (ST->hasMVEIntegerOps() && (Opcode == Instruction::InsertElement ||
+                                 Opcode == Instruction::ExtractElement)) {
+    // We say MVE moves cost at least the MVEVectorCostFactor, even though
+    // they are scalar instructions. This helps prevent mixing scalar and
+    // vector, to prevent vectorising where we end up just scalarising the
+    // result anyway.
+    return std::max(BaseT::getVectorInstrCost(Opcode, ValTy, Index),
+                    ST->getMVEVectorCostFactor()) *
+           ValTy->getVectorNumElements() / 2;
+  }
+
   return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
 }
 
@@ -406,7 +420,10 @@ int ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
     return LT.first;
   }
 
-  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
+  int BaseCost = ST->hasMVEIntegerOps() && ValTy->isVectorTy()
+                     ? ST->getMVEVectorCostFactor()
+                     : 1;
+  return BaseCost * BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
 }
 
 int ARMTTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
@@ -549,10 +566,13 @@ int ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
 
       if (const auto *Entry = CostTableLookup(MVEDupTbl, ISD::VECTOR_SHUFFLE,
                                               LT.second))
-        return LT.first * Entry->Cost;
+        return LT.first * Entry->Cost * ST->getMVEVectorCostFactor();
     }
   }
-  return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
+  int BaseCost = ST->hasMVEIntegerOps() && Tp->isVectorTy()
+                     ? ST->getMVEVectorCostFactor()
+                     : 1;
+  return BaseCost * BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
 }
 
 int ARMTTIImpl::getArithmeticInstrCost(
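To make the insert/extract heuristic above concrete, here is a standalone sketch (not LLVM code) of the value it computes: a single lane move is charged at least the vector cost factor, then scaled by half the lane count, so code that would vectorise only to scalarise its results back out looks appropriately expensive:

    #include <algorithm>
    #include <cstdio>

    // Mirrors the new MVE InsertElement/ExtractElement cost, assuming a base
    // per-lane move cost of 1.
    static unsigned mveLaneMoveCost(unsigned BaseCost, unsigned CostFactor,
                                    unsigned NumElements) {
      return std::max(BaseCost, CostFactor) * NumElements / 2;
    }

    int main() {
      // v4i32 on the default 2-beat model: max(1, 2) * 4 / 2 = 4.
      std::printf("v4i32: %u\n", mveLaneMoveCost(1, 2, 4));
      // v8i16 on a 1-beat (factor 4) model: max(1, 4) * 8 / 2 = 16.
      std::printf("v8i16: %u\n", mveLaneMoveCost(1, 4, 8));
    }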
@@ -606,25 +626,48 @@ int ARMTTIImpl::getArithmeticInstrCost(
     // Multiplication.
   };
 
-  if (ST->hasNEON())
+  if (ST->hasNEON()) {
     if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second))
       return LT.first * Entry->Cost;
 
-  int Cost = BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info,
-                                           Opd1PropInfo, Opd2PropInfo);
-
-  // This is somewhat of a hack. The problem that we are facing is that SROA
-  // creates a sequence of shift, and, or instructions to construct values.
-  // These sequences are recognized by the ISel and have zero-cost. Not so for
-  // the vectorized code. Because we have support for v2i64 but not i64 those
-  // sequences look particularly beneficial to vectorize.
-  // To work around this we increase the cost of v2i64 operations to make them
-  // seem less beneficial.
-  if (LT.second == MVT::v2i64 &&
-      Op2Info == TargetTransformInfo::OK_UniformConstantValue)
-    Cost += 4;
-
-  return Cost;
+    int Cost = BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info,
+                                             Opd1PropInfo, Opd2PropInfo);
+
+    // This is somewhat of a hack. The problem that we are facing is that SROA
+    // creates a sequence of shift, and, or instructions to construct values.
+    // These sequences are recognized by the ISel and have zero-cost. Not so for
+    // the vectorized code. Because we have support for v2i64 but not i64 those
+    // sequences look particularly beneficial to vectorize.
+    // To work around this we increase the cost of v2i64 operations to make them
+    // seem less beneficial.
+    if (LT.second == MVT::v2i64 &&
+        Op2Info == TargetTransformInfo::OK_UniformConstantValue)
+      Cost += 4;
+
+    return Cost;
+  }
+
+  int BaseCost = ST->hasMVEIntegerOps() && Ty->isVectorTy()
+                     ? ST->getMVEVectorCostFactor()
+                     : 1;
+
+  // The rest of this mostly follows what is done in
+  // BaseT::getArithmeticInstrCost, without treating floats as more expensive
+  // than scalars or increasing the costs for custom operations. The result is
+  // also multiplied by the MVEVectorCostFactor where appropriate.
+  if (TLI->isOperationLegalOrCustomOrPromote(ISDOpcode, LT.second))
+    return LT.first * BaseCost;
+
+  // Else this is expand, assume that we need to scalarize this op.
+  if (Ty->isVectorTy()) {
+    unsigned Num = Ty->getVectorNumElements();
+    unsigned Cost = getArithmeticInstrCost(Opcode, Ty->getScalarType());
+    // Return the cost of multiple scalar invocation plus the cost of
+    // inserting and extracting the values.
+    return BaseT::getScalarizationOverhead(Ty, Args) + Num * Cost;
+  }
+
+  return BaseCost;
 }
 
 int ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
@@ -637,7 +680,10 @@ int ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
     // We need 4 uops for vst.1/vld.1 vs 1uop for vldr/vstr.
     return LT.first * 4;
   }
-  return LT.first;
+  int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
+                     ? ST->getMVEVectorCostFactor()
+                     : 1;
+  return BaseCost * LT.first;
 }
 
 int ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
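The net effect of the BaseCost multiplier is easiest to see for a legal vector operation, where the reported cost is simply LT.first scaled by the factor. A toy calculation under the three beat models (the helper name is illustrative, not from the patch):

    #include <cstdio>

    // Illustrative only: a legal or custom MVE op now costs
    // LT.first * MVEVectorCostFactor instead of LT.first.
    static unsigned legalMVEOpCost(unsigned NumLegalizedOps,
                                   unsigned CostFactor) {
      return NumLegalizedOps * CostFactor;
    }

    int main() {
      // One legal v4i32 add (LT.first == 1) under each beat model:
      std::printf("mve1beat: %u\n", legalMVEOpCost(1, 4)); // slowest model
      std::printf("mve2beat: %u\n", legalMVEOpCost(1, 2)); // the default
      std::printf("mve4beat: %u\n", legalMVEOpCost(1, 1)); // fastest model
    }

Since BaseCost stays 1 whenever the subtarget lacks MVE integer ops or the type is scalar, non-MVE targets and scalar code see no change from this patch.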