Diffstat (limited to 'llvm/lib')
-rw-r--r--   llvm/lib/Target/AArch64/AArch64.td                       | 142
-rw-r--r--   llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp    |   4
-rw-r--r--   llvm/lib/Target/AArch64/AArch64ISelLowering.cpp          |  15
-rw-r--r--   llvm/lib/Target/AArch64/AArch64InstrInfo.cpp             |  19
-rw-r--r--   llvm/lib/Target/AArch64/AArch64InstrInfo.td              |   9
-rw-r--r--   llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp    |  16
-rw-r--r--   llvm/lib/Target/AArch64/AArch64Subtarget.cpp             |  35
-rw-r--r--   llvm/lib/Target/AArch64/AArch64Subtarget.h               |  69
-rw-r--r--   llvm/lib/Target/AArch64/AArch64TargetMachine.cpp         |   3
-rw-r--r--   llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp   |  27
10 files changed, 224 insertions(+), 115 deletions(-)
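
The pattern throughout the patch below is to replace CPU-name tests with feature-backed accessors on AArch64Subtarget. A minimal sketch of the before/after shape, for orientation only: the shouldBalance* helpers are hypothetical, while isCortexA53()/isCortexA57() and balanceFPOps() are the accessors removed and added by this patch.

    #include "AArch64Subtarget.h"
    using namespace llvm;

    // Before: each pass hard-codes the CPUs known to benefit, so every new
    // core requires edits in every such pass.
    static bool shouldBalanceOld(const AArch64Subtarget &ST) {
      return ST.isCortexA53() || ST.isCortexA57();
    }

    // After: the pass queries the property itself; CPUs opt in by listing
    // FeatureBalanceFPOps in their definitions in AArch64.td.
    static bool shouldBalanceNew(const AArch64Subtarget &ST) {
      return ST.balanceFPOps();
    }
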
diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td
index ca382a89692..67a0556e437 100644
--- a/llvm/lib/Target/AArch64/AArch64.td
+++ b/llvm/lib/Target/AArch64/AArch64.td
@@ -58,6 +58,50 @@ def FeatureReserveX18 : SubtargetFeature<"reserve-x18", "ReserveX18", "true",
                                          "Reserve X18, making it unavailable "
                                          "as a GPR">;
 
+def FeatureMergeNarrowLd : SubtargetFeature<"merge-narrow-ld",
+                                            "MergeNarrowLoads", "true",
+                                            "Merge narrow load instructions">;
+
+def FeatureUseAA : SubtargetFeature<"use-aa", "UseAA", "true",
+                                    "Use alias analysis during codegen">;
+
+def FeatureBalanceFPOps : SubtargetFeature<"balance-fp-ops", "BalanceFPOps",
+    "true",
+    "balance mix of odd and even D-registers for fp multiply(-accumulate) ops">;
+
+def FeaturePredictableSelectIsExpensive : SubtargetFeature<
+    "predictable-select-expensive", "PredictableSelectIsExpensive", "true",
+    "Prefer likely predicted branches over selects">;
+
+def FeatureCustomCheapAsMoveHandling : SubtargetFeature<"custom-cheap-as-move",
+    "CustomAsCheapAsMove", "true",
+    "Use custom code for TargetInstrInfo::isAsCheapAsAMove()">;
+
+def FeaturePostRAScheduler : SubtargetFeature<"use-postra-scheduler",
+    "UsePostRAScheduler", "true", "Schedule again after register allocation">;
+
+def FeatureSlowMisaligned128Store : SubtargetFeature<"slow-misaligned-128store",
+    "Misaligned128StoreIsSlow", "true", "Misaligned 128 bit stores are slow">;
+
+def FeatureAvoidQuadLdStPairs : SubtargetFeature<"no-quad-ldst-pairs",
+    "AvoidQuadLdStPairs", "true",
+    "Do not form quad load/store pair operations">;
+
+def FeatureAlternateSExtLoadCVTF32Pattern : SubtargetFeature<
+    "alternate-sextload-cvt-f32-pattern", "UseAlternateSExtLoadCVTF32Pattern",
+    "true", "Use alternative pattern for sextload convert to f32">;
+
+def FeatureMacroOpFusion : SubtargetFeature<
+    "macroop-fusion", "HasMacroOpFusion", "true",
+    "CPU supports macro op fusion">;
+
+def FeatureDisableLatencySchedHeuristic : SubtargetFeature<
+    "disable-latency-sched-heuristic", "DisableLatencySchedHeuristic", "true",
+    "Disable latency scheduling heuristic">;
+
+def FeatureUseRSqrt : SubtargetFeature<
+    "use-reverse-square-root", "UseRSqrt", "true", "Use reverse square root">;
+
 //===----------------------------------------------------------------------===//
 // Architectures.
 //
@@ -94,57 +138,87 @@ include "AArch64SchedM1.td"
 include "AArch64SchedKryo.td"
 
 def ProcA35     : SubtargetFeature<"a35", "ARMProcFamily", "CortexA35",
-                                   "Cortex-A35 ARM processors",
-                                   [FeatureFPARMv8,
-                                    FeatureNEON,
-                                    FeatureCrypto,
+                                   "Cortex-A35 ARM processors", [
                                     FeatureCRC,
-                                    FeaturePerfMon]>;
+                                    FeatureCrypto,
+                                    FeatureFPARMv8,
+                                    FeatureNEON,
+                                    FeaturePerfMon
+                                   ]>;
 
 def ProcA53     : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53",
-                                   "Cortex-A53 ARM processors",
-                                   [FeatureFPARMv8,
-                                    FeatureNEON,
-                                    FeatureCrypto,
+                                   "Cortex-A53 ARM processors", [
+                                    FeatureBalanceFPOps,
                                     FeatureCRC,
-                                    FeaturePerfMon]>;
+                                    FeatureCrypto,
+                                    FeatureCustomCheapAsMoveHandling,
+                                    FeatureFPARMv8,
+                                    FeatureNEON,
+                                    FeaturePerfMon,
+                                    FeaturePostRAScheduler,
+                                    FeatureUseAA
+                                   ]>;
 
 def ProcA57     : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57",
-                                   "Cortex-A57 ARM processors",
-                                   [FeatureFPARMv8,
-                                    FeatureNEON,
-                                    FeatureCrypto,
+                                   "Cortex-A57 ARM processors", [
+                                    FeatureBalanceFPOps,
                                     FeatureCRC,
-                                    FeaturePerfMon]>;
+                                    FeatureCrypto,
+                                    FeatureCustomCheapAsMoveHandling,
+                                    FeatureFPARMv8,
+                                    FeatureMergeNarrowLd,
+                                    FeatureNEON,
+                                    FeaturePerfMon,
+                                    FeaturePostRAScheduler,
+                                    FeaturePredictableSelectIsExpensive
+                                   ]>;
 
 def ProcCyclone : SubtargetFeature<"cyclone", "ARMProcFamily", "Cyclone",
-                                   "Cyclone",
-                                   [FeatureFPARMv8,
-                                    FeatureNEON,
+                                   "Cyclone", [
+                                    FeatureAlternateSExtLoadCVTF32Pattern,
                                     FeatureCrypto,
+                                    FeatureDisableLatencySchedHeuristic,
+                                    FeatureFPARMv8,
+                                    FeatureMacroOpFusion,
+                                    FeatureNEON,
                                     FeaturePerfMon,
-                                    FeatureZCRegMove, FeatureZCZeroing]>;
+                                    FeatureSlowMisaligned128Store,
+                                    FeatureZCRegMove,
+                                    FeatureZCZeroing
+                                   ]>;
 
 def ProcExynosM1 : SubtargetFeature<"exynosm1", "ARMProcFamily", "ExynosM1",
-                                    "Samsung Exynos-M1 processors",
-                                    [FeatureFPARMv8,
-                                     FeatureNEON,
-                                     FeatureCrypto,
+                                    "Samsung Exynos-M1 processors", [
+                                     FeatureAvoidQuadLdStPairs,
                                      FeatureCRC,
-                                     FeaturePerfMon]>;
+                                     FeatureCrypto,
+                                     FeatureCustomCheapAsMoveHandling,
+                                     FeatureFPARMv8,
+                                     FeatureNEON,
+                                     FeaturePerfMon,
+                                     FeatureUseRSqrt
+                                    ]>;
 
 def ProcKryo    : SubtargetFeature<"kryo", "ARMProcFamily", "Kryo",
-                                   "Qualcomm Kryo processors",
-                                   [FeatureFPARMv8,
-                                    FeatureNEON,
-                                    FeatureCrypto,
+                                   "Qualcomm Kryo processors", [
                                     FeatureCRC,
-                                    FeaturePerfMon]>;
-
-def : ProcessorModel<"generic", NoSchedModel, [FeatureFPARMv8,
-                                               FeatureNEON,
-                                               FeatureCRC,
-                                               FeaturePerfMon]>;
+                                    FeatureCrypto,
+                                    FeatureCustomCheapAsMoveHandling,
+                                    FeatureFPARMv8,
+                                    FeatureMergeNarrowLd,
+                                    FeatureNEON,
+                                    FeaturePerfMon,
+                                    FeaturePostRAScheduler,
+                                    FeaturePredictableSelectIsExpensive
+                                   ]>;
+
+def : ProcessorModel<"generic", NoSchedModel, [
+                                    FeatureCRC,
+                                    FeatureFPARMv8,
+                                    FeatureNEON,
+                                    FeaturePerfMon,
+                                    FeaturePostRAScheduler
+                                   ]>;
 
 // FIXME: Cortex-A35 is currently modelled as a Cortex-A53
 def : ProcessorModel<"cortex-a35", CortexA53Model, [ProcA35]>;
diff --git a/llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp b/llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
index b8f6adba0c1..71a9c7e8b87 100644
--- a/llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
+++ b/llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
@@ -314,9 +314,7 @@ bool AArch64A57FPLoadBalancing::runOnMachineFunction(MachineFunction &F) {
   if (skipFunction(*F.getFunction()))
     return false;
 
-  // Don't do anything if this isn't an A53 or A57.
-  if (!(F.getSubtarget<AArch64Subtarget>().isCortexA53() ||
-        F.getSubtarget<AArch64Subtarget>().isCortexA57()))
+  if (!F.getSubtarget<AArch64Subtarget>().balanceFPOps())
     return false;
 
   bool Changed = false;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index c6c47ab148b..8d9124670b0 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -634,9 +634,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
     }
   }
 
-  // Prefer likely predicted branches to selects on out-of-order cores.
-  if (Subtarget->isCortexA57() || Subtarget->isKryo())
-    PredictableSelectIsExpensive = true;
+  PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
 }
 
 void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) {
@@ -814,12 +812,9 @@ bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
   if (Subtarget->requiresStrictAlign())
     return false;
 
-  // FIXME: This is mostly true for Cyclone, but not necessarily others.
   if (Fast) {
-    // FIXME: Define an attribute for slow unaligned accesses instead of
-    // relying on the CPU type as a proxy.
-    // On Cyclone, unaligned 128-bit stores are slow.
-    *Fast = !Subtarget->isCyclone() || VT.getStoreSize() != 16 ||
+    // Some CPUs are fine with unaligned stores except for 128-bit ones.
+    *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
             // See comments in performSTORECombine() for more details about
             // these conditions.
@@ -8792,9 +8787,7 @@ static SDValue split16BStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
   // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be
   // a call to that function here.
 
-  // Cyclone has bad performance on unaligned 16B stores when crossing line and
-  // page boundaries. We want to split such stores.
-  if (!Subtarget->isCyclone())
+  if (!Subtarget->isMisaligned128StoreSlow())
     return SDValue();
 
   // Don't split at -Oz.
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 1e4373b6294..d0d08083e28 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -544,8 +544,7 @@ static bool canBeExpandedToORR(const MachineInstr *MI, unsigned BitSize) {
 // FIXME: this implementation should be micro-architecture dependent, so a
 // micro-architecture target hook should be introduced here in future.
 bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr *MI) const {
-  if (!Subtarget.isCortexA57() && !Subtarget.isCortexA53() &&
-      !Subtarget.isExynosM1() && !Subtarget.isKryo())
+  if (!Subtarget.hasCustomCheapAsMoveHandling())
     return MI->isAsCheapAsAMove();
 
   unsigned Imm;
@@ -559,7 +558,7 @@ bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr *MI) const {
   case AArch64::ADDXri:
   case AArch64::SUBWri:
   case AArch64::SUBXri:
-    return (Subtarget.isExynosM1() ||
+    return (Subtarget.getProcFamily() == AArch64Subtarget::ExynosM1 ||
             MI->getOperand(3).getImm() == 0);
 
   // add/sub on register with shift
@@ -568,7 +567,7 @@ bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr *MI) const {
   case AArch64::SUBWrs:
   case AArch64::SUBXrs:
     Imm = MI->getOperand(3).getImm();
-    return (Subtarget.isExynosM1() &&
+    return (Subtarget.getProcFamily() == AArch64Subtarget::ExynosM1 &&
             AArch64_AM::getArithShiftValue(Imm) < 4);
 
   // logical ops on immediate
@@ -609,7 +608,7 @@ bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr *MI) const {
   case AArch64::ORRWrs:
   case AArch64::ORRXrs:
     Imm = MI->getOperand(3).getImm();
-    return (Subtarget.isExynosM1() &&
+    return (Subtarget.getProcFamily() == AArch64Subtarget::ExynosM1 &&
             AArch64_AM::getShiftValue(Imm) < 4 &&
             AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL);
 
@@ -1522,8 +1521,8 @@ bool AArch64InstrInfo::isCandidateToMergeOrPair(MachineInstr *MI) const {
   if (isLdStPairSuppressed(MI))
     return false;
 
-  // Do not pair quad ld/st for Exynos.
-  if (Subtarget.isExynosM1()) {
+  // On some CPUs quad load/store pairs are slower than two single load/stores.
+  if (Subtarget.avoidQuadLdStPairs()) {
     switch (MI->getOpcode()) {
     default:
       break;
@@ -1801,8 +1800,8 @@ bool AArch64InstrInfo::shouldClusterMemOps(MachineInstr *FirstLdSt,
 
 bool AArch64InstrInfo::shouldScheduleAdjacent(MachineInstr *First,
                                               MachineInstr *Second) const {
-  if (Subtarget.isCyclone()) {
-    // Cyclone can fuse CMN, CMP, TST followed by Bcc.
+  if (Subtarget.hasMacroOpFusion()) {
+    // Fuse CMN, CMP, TST followed by Bcc.
     unsigned SecondOpcode = Second->getOpcode();
     if (SecondOpcode == AArch64::Bcc) {
       switch (First->getOpcode()) {
@@ -1817,7 +1816,7 @@ bool AArch64InstrInfo::shouldScheduleAdjacent(MachineInstr *First,
         return true;
       }
     }
-    // Cyclone B0 also supports ALU operations followed by CBZ/CBNZ.
+    // Fuse ALU operations followed by CBZ/CBNZ.
     if (SecondOpcode == AArch64::CBNZW || SecondOpcode == AArch64::CBNZX ||
         SecondOpcode == AArch64::CBZW || SecondOpcode == AArch64::CBZX) {
       switch (First->getOpcode()) {
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 9e3954905ce..d79d603e1c6 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -34,7 +34,8 @@ def HasSPE : Predicate<"Subtarget->hasSPE()">,
 
 def IsLE : Predicate<"Subtarget->isLittleEndian()">;
 def IsBE : Predicate<"!Subtarget->isLittleEndian()">;
-def IsCyclone : Predicate<"Subtarget->isCyclone()">;
+def UseAlternateSExtLoadCVTF32
+    : Predicate<"Subtarget->useAlternateSExtLoadCVTF32Pattern()">;
 
 //===----------------------------------------------------------------------===//
 // AArch64-specific DAG Nodes.
@@ -4957,7 +4958,8 @@ class SExtLoadi8CVTf32Pat<dag addrmode, dag INST>
                             0),
                           dsub)),
                        0),
-                     ssub)))>, Requires<[NotForCodeSize, IsCyclone]>;
+                     ssub)))>,
+    Requires<[NotForCodeSize, UseAlternateSExtLoadCVTF32]>;
 
 def : SExtLoadi8CVTf32Pat<(ro8.Wpat GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$ext),
                           (LDRBroW GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$ext)>;
@@ -5010,7 +5012,8 @@ class SExtLoadi16CVTf64Pat<dag addrmode, dag INST>
                             0),
                           dsub)),
                        0),
-                     dsub)))>, Requires<[NotForCodeSize, IsCyclone]>;
+                     dsub)))>,
+    Requires<[NotForCodeSize, UseAlternateSExtLoadCVTF32]>;
 
 def : SExtLoadi16CVTf64Pat<(ro16.Wpat GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext),
                           (LDRHroW GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext)>;
diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
index 49dc7d6a437..8c44fa615ee 100644
--- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -160,10 +160,6 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
   // Find and promote load instructions which read directly from store.
   bool tryToPromoteLoadFromStore(MachineBasicBlock::iterator &MBBI);
 
-  // Check if converting two narrow loads into a single wider load with
-  // bitfield extracts could be enabled.
-  bool enableNarrowLdMerge(MachineFunction &Fn);
-
   bool optimizeBlock(MachineBasicBlock &MBB, bool enableNarrowLdOpt);
 
   bool runOnMachineFunction(MachineFunction &Fn) override;
@@ -1912,15 +1908,6 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
   return Modified;
 }
 
-bool AArch64LoadStoreOpt::enableNarrowLdMerge(MachineFunction &Fn) {
-  bool ProfitableArch = Subtarget->isCortexA57() || Subtarget->isKryo();
-  // FIXME: The benefit from converting narrow loads into a wider load could be
-  // microarchitectural as it assumes that a single load with two bitfield
-  // extracts is cheaper than two narrow loads. Currently, this conversion is
-  // enabled only in cortex-a57 on which performance benefits were verified.
-  return ProfitableArch && !Subtarget->requiresStrictAlign();
-}
-
 bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
   if (skipFunction(*Fn.getFunction()))
     return false;
@@ -1936,7 +1923,8 @@ bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
   UsedRegs.resize(TRI->getNumRegs());
 
   bool Modified = false;
-  bool enableNarrowLdOpt = enableNarrowLdMerge(Fn);
+  bool enableNarrowLdOpt =
+      Subtarget->mergeNarrowLoads() && !Subtarget->requiresStrictAlign();
   for (auto &MBB : Fn)
     Modified |= optimizeBlock(MBB, enableNarrowLdOpt);
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
index bedbcf54c93..57957d1de53 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -44,9 +44,36 @@ AArch64Subtarget::initializeSubtargetDependencies(StringRef FS) {
     CPUString = "generic";
 
   ParseSubtargetFeatures(CPUString, FS);
+  initializeProperties();
+
   return *this;
 }
 
+void AArch64Subtarget::initializeProperties() {
+  // Initialize CPU specific properties. We should add a tablegen feature for
+  // this in the future so we can specify it together with the subtarget
+  // features.
+  switch (ARMProcFamily) {
+  case Cyclone:
+    CacheLineSize = 64;
+    PrefetchDistance = 280;
+    MinPrefetchStride = 2048;
+    MaxPrefetchIterationsAhead = 3;
+    break;
+  case CortexA57:
+    MaxInterleaveFactor = 4;
+    break;
+  case Kryo:
+    MaxInterleaveFactor = 4;
+    VectorInsertExtractBaseCost = 2;
+    break;
+  case Others: break;
+  case CortexA35: break;
+  case CortexA53: break;
+  case ExynosM1: break;
+  }
+}
+
 AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU,
                                    const std::string &FS,
                                    const TargetMachine &TM, bool LittleEndian)
@@ -110,8 +137,7 @@ void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
   // Enabling or Disabling the latency heuristic is a close call: It seems to
   // help nearly no benchmark on out-of-order architectures, on the other hand
   // it regresses register pressure on a few benchmarking.
-  if (isCyclone())
-    Policy.DisableLatencyHeuristic = true;
+  Policy.DisableLatencyHeuristic = DisableLatencySchedHeuristic;
 }
 
 bool AArch64Subtarget::enableEarlyIfConversion() const {
@@ -133,8 +159,5 @@ bool AArch64Subtarget::supportsAddressTopByteIgnored() const {
 
 std::unique_ptr<PBQPRAConstraint>
 AArch64Subtarget::getCustomPBQPConstraints() const {
-  if (!isCortexA57())
-    return nullptr;
-
-  return llvm::make_unique<A57ChainingConstraint>();
+  return balanceFPOps() ? llvm::make_unique<A57ChainingConstraint>() : nullptr;
 }
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h
index a920deea886..e4e88b35ccb 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -33,8 +33,8 @@ class StringRef;
 class Triple;
 
 class AArch64Subtarget : public AArch64GenSubtargetInfo {
-protected:
-  enum ARMProcFamilyEnum {
+public:
+  enum ARMProcFamilyEnum : uint8_t {
     Others,
     CortexA35,
     CortexA53,
@@ -44,6 +44,7 @@ protected:
     Kryo
   };
 
+protected:
   /// ARMProcFamily - ARM processor family: Cortex-A53, Cortex-A57, and others.
   ARMProcFamilyEnum ARMProcFamily = Others;
 
@@ -66,6 +67,24 @@ protected:
   // StrictAlign - Disallow unaligned memory accesses.
   bool StrictAlign = false;
 
+  bool MergeNarrowLoads = false;
+  bool UseAA = false;
+  bool PredictableSelectIsExpensive = false;
+  bool BalanceFPOps = false;
+  bool CustomAsCheapAsMove = false;
+  bool UsePostRAScheduler = false;
+  bool Misaligned128StoreIsSlow = false;
+  bool AvoidQuadLdStPairs = false;
+  bool UseAlternateSExtLoadCVTF32Pattern = false;
+  bool HasMacroOpFusion = false;
+  bool DisableLatencySchedHeuristic = false;
+  bool UseRSqrt = false;
+  uint8_t MaxInterleaveFactor = 2;
+  uint8_t VectorInsertExtractBaseCost = 3;
+  uint16_t CacheLineSize = 0;
+  uint16_t PrefetchDistance = 0;
+  uint16_t MinPrefetchStride = 1;
+  unsigned MaxPrefetchIterationsAhead = UINT_MAX;
   // ReserveX18 - X18 is not available as a general purpose register.
   bool ReserveX18;
@@ -93,6 +112,9 @@ private:
   /// subtarget initialization.
   AArch64Subtarget &initializeSubtargetDependencies(StringRef FS);
 
+  /// Initialize properties based on the selected processor family.
+  void initializeProperties();
+
 public:
   /// This constructor initializes the data members to match that
   /// of the specified triple.
@@ -123,7 +145,15 @@ public:
   const Triple &getTargetTriple() const { return TargetTriple; }
   bool enableMachineScheduler() const override { return true; }
   bool enablePostRAScheduler() const override {
-    return isGeneric() || isCortexA53() || isCortexA57() || isKryo();
+    return UsePostRAScheduler;
+  }
+
+  /// Returns ARM processor family.
+  /// Avoid this function! CPU specifics should be kept local to this class
+  /// and preferably modeled with SubtargetFeatures or properties in
+  /// initializeProperties().
+  ARMProcFamilyEnum getProcFamily() const {
+    return ARMProcFamily;
   }
 
   bool hasV8_1aOps() const { return HasV8_1aOps; }
@@ -140,6 +170,30 @@ public:
   bool hasNEON() const { return HasNEON; }
   bool hasCrypto() const { return HasCrypto; }
   bool hasCRC() const { return HasCRC; }
+  bool mergeNarrowLoads() const { return MergeNarrowLoads; }
+  bool balanceFPOps() const { return BalanceFPOps; }
+  bool predictableSelectIsExpensive() const {
+    return PredictableSelectIsExpensive;
+  }
+  bool hasCustomCheapAsMoveHandling() const { return CustomAsCheapAsMove; }
+  bool isMisaligned128StoreSlow() const { return Misaligned128StoreIsSlow; }
+  bool avoidQuadLdStPairs() const { return AvoidQuadLdStPairs; }
+  bool useAlternateSExtLoadCVTF32Pattern() const {
+    return UseAlternateSExtLoadCVTF32Pattern;
+  }
+  bool hasMacroOpFusion() const { return HasMacroOpFusion; }
+  bool useRSqrt() const { return UseRSqrt; }
+  unsigned getMaxInterleaveFactor() const { return MaxInterleaveFactor; }
+  unsigned getVectorInsertExtractBaseCost() const {
+    return VectorInsertExtractBaseCost;
+  }
+  unsigned getCacheLineSize() const { return CacheLineSize; }
+  unsigned getPrefetchDistance() const { return PrefetchDistance; }
+  unsigned getMinPrefetchStride() const { return MinPrefetchStride; }
+  unsigned getMaxPrefetchIterationsAhead() const {
+    return MaxPrefetchIterationsAhead;
+  }
+
   /// CPU has TBI (top byte of addresses is ignored during HW address
   /// translation) and OS enables it.
   bool supportsAddressTopByteIgnored() const;
@@ -160,14 +214,7 @@ public:
   bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); }
   bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); }
 
-  bool isGeneric() const { return CPUString == "generic"; }
-  bool isCyclone() const { return CPUString == "cyclone"; }
-  bool isCortexA57() const { return CPUString == "cortex-a57"; }
-  bool isCortexA53() const { return CPUString == "cortex-a53"; }
-  bool isExynosM1() const { return CPUString == "exynos-m1"; }
-  bool isKryo() const { return CPUString == "kryo"; }
-
-  bool useAA() const override { return isCortexA53(); }
+  bool useAA() const override { return UseAA; }
 
   /// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size
   /// that still makes it profitable to inline the call.
diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
index 8633d87dbd1..c9ca394cbed 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -147,8 +147,7 @@ static void initReciprocals(AArch64TargetMachine& TM, AArch64Subtarget& ST)
   // (52 mantissa bits) are 2 and 3, respectively.
   unsigned ExtraStepsF = 2,
            ExtraStepsD = ExtraStepsF + 1;
-  // FIXME: Enable x^-1/2 only for Exynos M1 at the moment.
-  bool UseRsqrt = ST.isExynosM1();
+  bool UseRsqrt = ST.useRSqrt();
 
   TM.Options.Reciprocals.setDefaults("sqrtf", UseRsqrt, ExtraStepsF);
   TM.Options.Reciprocals.setDefaults("sqrtd", UseRsqrt, ExtraStepsD);
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 1684d2f769d..ecf4d93068a 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -368,9 +368,7 @@ int AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
   }
 
   // All other insert/extracts cost this much.
-  if (ST->isKryo())
-    return 2;
-  return 3;
+  return ST->getVectorInsertExtractBaseCost();
 }
 
 int AArch64TTIImpl::getArithmeticInstrCost(
@@ -529,9 +527,7 @@ int AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
 }
 
 unsigned AArch64TTIImpl::getMaxInterleaveFactor(unsigned VF) {
-  if (ST->isCortexA57() || ST->isKryo())
-    return 4;
-  return 2;
+  return ST->getMaxInterleaveFactor();
 }
 
 void AArch64TTIImpl::getUnrollingPreferences(Loop *L,
@@ -630,28 +626,17 @@ bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
 }
 
 unsigned AArch64TTIImpl::getCacheLineSize() {
-  if (ST->isCyclone())
-    return 64;
-  return BaseT::getCacheLineSize();
+  return ST->getCacheLineSize();
 }
 
 unsigned AArch64TTIImpl::getPrefetchDistance() {
-  if (ST->isCyclone())
-    return 280;
-  return BaseT::getPrefetchDistance();
+  return ST->getPrefetchDistance();
 }
 
 unsigned AArch64TTIImpl::getMinPrefetchStride() {
-  if (ST->isCyclone())
-    // The HW prefetcher handles accesses with strides up to 2KB.
-    return 2048;
-  return BaseT::getMinPrefetchStride();
+  return ST->getMinPrefetchStride();
 }
 
 unsigned AArch64TTIImpl::getMaxPrefetchIterationsAhead() {
-  if (ST->isCyclone())
-    // Be conservative for now and don't prefetch ahead too much since the loop
-    // may terminate early.
-    return 3;
-  return BaseT::getMaxPrefetchIterationsAhead();
+  return ST->getMaxPrefetchIterationsAhead();
 }