Diffstat (limited to 'llvm/lib')
-rw-r--r--   llvm/lib/Target/AArch64/AArch64.td                       | 142
-rw-r--r--   llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp    |   4
-rw-r--r--   llvm/lib/Target/AArch64/AArch64ISelLowering.cpp          |  15
-rw-r--r--   llvm/lib/Target/AArch64/AArch64InstrInfo.cpp             |  19
-rw-r--r--   llvm/lib/Target/AArch64/AArch64InstrInfo.td              |   9
-rw-r--r--   llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp    |  16
-rw-r--r--   llvm/lib/Target/AArch64/AArch64Subtarget.cpp             |  35
-rw-r--r--   llvm/lib/Target/AArch64/AArch64Subtarget.h               |  69
-rw-r--r--   llvm/lib/Target/AArch64/AArch64TargetMachine.cpp         |   3
-rw-r--r--   llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp   |  27
10 files changed, 224 insertions(+), 115 deletions(-)
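
The pattern throughout the patch below is to replace CPU-name tests with feature-backed accessors on AArch64Subtarget. A minimal sketch of the before/after shape, for orientation only: the shouldBalance* helpers are hypothetical, while isCortexA53()/isCortexA57() and balanceFPOps() are the accessors removed and added by this patch.

    #include "AArch64Subtarget.h"
    using namespace llvm;

    // Before: each pass hard-codes the CPUs known to benefit, so every new
    // core requires edits in every such pass.
    static bool shouldBalanceOld(const AArch64Subtarget &ST) {
      return ST.isCortexA53() || ST.isCortexA57();
    }

    // After: the pass queries the property itself; CPUs opt in by listing
    // FeatureBalanceFPOps in their definitions in AArch64.td.
    static bool shouldBalanceNew(const AArch64Subtarget &ST) {
      return ST.balanceFPOps();
    }
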
diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td
index ca382a89692..67a0556e437 100644
--- a/llvm/lib/Target/AArch64/AArch64.td
+++ b/llvm/lib/Target/AArch64/AArch64.td
@@ -58,6 +58,50 @@ def FeatureReserveX18 : SubtargetFeature<"reserve-x18", "ReserveX18", "true",
                                          "Reserve X18, making it unavailable "
                                          "as a GPR">;
 
+def FeatureMergeNarrowLd : SubtargetFeature<"merge-narrow-ld",
+                                            "MergeNarrowLoads", "true",
+                                            "Merge narrow load instructions">;
+
+def FeatureUseAA : SubtargetFeature<"use-aa", "UseAA", "true",
+                                    "Use alias analysis during codegen">;
+
+def FeatureBalanceFPOps : SubtargetFeature<"balance-fp-ops", "BalanceFPOps",
+    "true",
+    "balance mix of odd and even D-registers for fp multiply(-accumulate) ops">;
+
+def FeaturePredictableSelectIsExpensive : SubtargetFeature<
+    "predictable-select-expensive", "PredictableSelectIsExpensive", "true",
+    "Prefer likely predicted branches over selects">;
+
+def FeatureCustomCheapAsMoveHandling : SubtargetFeature<"custom-cheap-as-move",
+    "CustomAsCheapAsMove", "true",
+    "Use custom code for TargetInstrInfo::isAsCheapAsAMove()">;
+
+def FeaturePostRAScheduler : SubtargetFeature<"use-postra-scheduler",
+    "UsePostRAScheduler", "true", "Schedule again after register allocation">;
+
+def FeatureSlowMisaligned128Store : SubtargetFeature<"slow-misaligned-128store",
+    "Misaligned128StoreIsSlow", "true", "Misaligned 128 bit stores are slow">;
+
+def FeatureAvoidQuadLdStPairs : SubtargetFeature<"no-quad-ldst-pairs",
+    "AvoidQuadLdStPairs", "true",
+    "Do not form quad load/store pair operations">;
+
+def FeatureAlternateSExtLoadCVTF32Pattern : SubtargetFeature<
+    "alternate-sextload-cvt-f32-pattern", "UseAlternateSExtLoadCVTF32Pattern",
+    "true", "Use alternative pattern for sextload convert to f32">;
+
+def FeatureMacroOpFusion : SubtargetFeature<
+    "macroop-fusion", "HasMacroOpFusion", "true",
+    "CPU supports macro op fusion">;
+
+def FeatureDisableLatencySchedHeuristic : SubtargetFeature<
+    "disable-latency-sched-heuristic", "DisableLatencySchedHeuristic", "true",
+    "Disable latency scheduling heuristic">;
+
+def FeatureUseRSqrt : SubtargetFeature<
+    "use-reverse-square-root", "UseRSqrt", "true", "Use reverse square root">;
+
 //===----------------------------------------------------------------------===//
 // Architectures.
 //
@@ -94,57 +138,87 @@ include "AArch64SchedM1.td"
 include "AArch64SchedKryo.td"
 
 def ProcA35     : SubtargetFeature<"a35", "ARMProcFamily", "CortexA35",
-                                   "Cortex-A35 ARM processors",
-                                   [FeatureFPARMv8,
-                                    FeatureNEON,
-                                    FeatureCrypto,
+                                   "Cortex-A35 ARM processors", [
                                     FeatureCRC,
-                                    FeaturePerfMon]>;
+                                    FeatureCrypto,
+                                    FeatureFPARMv8,
+                                    FeatureNEON,
+                                    FeaturePerfMon
+                                   ]>;
 
 def ProcA53     : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53",
-                                   "Cortex-A53 ARM processors",
-                                   [FeatureFPARMv8,
-                                    FeatureNEON,
-                                    FeatureCrypto,
+                                   "Cortex-A53 ARM processors", [
+                                    FeatureBalanceFPOps,
                                     FeatureCRC,
-                                    FeaturePerfMon]>;
+                                    FeatureCrypto,
+                                    FeatureCustomCheapAsMoveHandling,
+                                    FeatureFPARMv8,
+                                    FeatureNEON,
+                                    FeaturePerfMon,
+                                    FeaturePostRAScheduler,
+                                    FeatureUseAA
+                                   ]>;
 
 def ProcA57     : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57",
-                                   "Cortex-A57 ARM processors",
-                                   [FeatureFPARMv8,
-                                    FeatureNEON,
-                                    FeatureCrypto,
+                                   "Cortex-A57 ARM processors", [
+                                    FeatureBalanceFPOps,
                                     FeatureCRC,
-                                    FeaturePerfMon]>;
+                                    FeatureCrypto,
+                                    FeatureCustomCheapAsMoveHandling,
+                                    FeatureFPARMv8,
+                                    FeatureMergeNarrowLd,
+                                    FeatureNEON,
+                                    FeaturePerfMon,
+                                    FeaturePostRAScheduler,
+                                    FeaturePredictableSelectIsExpensive
+                                   ]>;
 
 def ProcCyclone : SubtargetFeature<"cyclone", "ARMProcFamily", "Cyclone",
-                                   "Cyclone",
-                                   [FeatureFPARMv8,
-                                    FeatureNEON,
+                                   "Cyclone", [
+                                    FeatureAlternateSExtLoadCVTF32Pattern,
                                     FeatureCrypto,
+                                    FeatureDisableLatencySchedHeuristic,
+                                    FeatureFPARMv8,
+                                    FeatureMacroOpFusion,
+                                    FeatureNEON,
                                     FeaturePerfMon,
-                                    FeatureZCRegMove, FeatureZCZeroing]>;
+                                    FeatureSlowMisaligned128Store,
+                                    FeatureZCRegMove,
+                                    FeatureZCZeroing
+                                   ]>;
 
 def ProcExynosM1 : SubtargetFeature<"exynosm1", "ARMProcFamily", "ExynosM1",
-                                    "Samsung Exynos-M1 processors",
-                                    [FeatureFPARMv8,
-                                     FeatureNEON,
-                                     FeatureCrypto,
+                                    "Samsung Exynos-M1 processors", [
+                                     FeatureAvoidQuadLdStPairs,
                                      FeatureCRC,
-                                     FeaturePerfMon]>;
+                                     FeatureCrypto,
+                                     FeatureCustomCheapAsMoveHandling,
+                                     FeatureFPARMv8,
+                                     FeatureNEON,
+                                     FeaturePerfMon,
+                                     FeatureUseRSqrt
+                                    ]>;
 
 def ProcKryo    : SubtargetFeature<"kryo", "ARMProcFamily", "Kryo",
-                                   "Qualcomm Kryo processors",
-                                   [FeatureFPARMv8,
-                                    FeatureNEON,
-                                    FeatureCrypto,
+                                   "Qualcomm Kryo processors", [
                                     FeatureCRC,
-                                    FeaturePerfMon]>;
-
-def : ProcessorModel<"generic", NoSchedModel, [FeatureFPARMv8,
-                                               FeatureNEON,
-                                               FeatureCRC,
-                                               FeaturePerfMon]>;
+                                    FeatureCrypto,
+                                    FeatureCustomCheapAsMoveHandling,
+                                    FeatureFPARMv8,
+                                    FeatureMergeNarrowLd,
+                                    FeatureNEON,
+                                    FeaturePerfMon,
+                                    FeaturePostRAScheduler,
+                                    FeaturePredictableSelectIsExpensive
+                                   ]>;
+
+def : ProcessorModel<"generic", NoSchedModel, [
+                                    FeatureCRC,
+                                    FeatureFPARMv8,
+                                    FeatureNEON,
+                                    FeaturePerfMon,
+                                    FeaturePostRAScheduler
+                                   ]>;
 
 // FIXME: Cortex-A35 is currently modelled as a Cortex-A53
 def : ProcessorModel<"cortex-a35", CortexA53Model, [ProcA35]>;
diff --git a/llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp b/llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
index b8f6adba0c1..71a9c7e8b87 100644
--- a/llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
+++ b/llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
@@ -314,9 +314,7 @@ bool AArch64A57FPLoadBalancing::runOnMachineFunction(MachineFunction &F) {
   if (skipFunction(*F.getFunction()))
     return false;
 
-  // Don't do anything if this isn't an A53 or A57.
-  if (!(F.getSubtarget<AArch64Subtarget>().isCortexA53() ||
-        F.getSubtarget<AArch64Subtarget>().isCortexA57()))
+  if (!F.getSubtarget<AArch64Subtarget>().balanceFPOps())
     return false;
 
   bool Changed = false;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index c6c47ab148b..8d9124670b0 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -634,9 +634,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
     }
   }
 
-  // Prefer likely predicted branches to selects on out-of-order cores.
-  if (Subtarget->isCortexA57() || Subtarget->isKryo())
-    PredictableSelectIsExpensive = true;
+  PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
 }
 
 void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) {
@@ -814,12 +812,9 @@ bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
   if (Subtarget->requiresStrictAlign())
     return false;
 
-  // FIXME: This is mostly true for Cyclone, but not necessarily others.
   if (Fast) {
-    // FIXME: Define an attribute for slow unaligned accesses instead of
-    // relying on the CPU type as a proxy.
-    // On Cyclone, unaligned 128-bit stores are slow.
-    *Fast = !Subtarget->isCyclone() || VT.getStoreSize() != 16 ||
+    // Some CPUs are fine with unaligned stores except for 128-bit ones.
+    *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
             // See comments in performSTORECombine() for more details about
             // these conditions.
@@ -8792,9 +8787,7 @@ static SDValue split16BStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
   // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be
   // a call to that function here.
 
-  // Cyclone has bad performance on unaligned 16B stores when crossing line and
-  // page boundaries. We want to split such stores.
-  if (!Subtarget->isCyclone())
+  if (!Subtarget->isMisaligned128StoreSlow())
     return SDValue();
 
   // Don't split at -Oz.
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 1e4373b6294..d0d08083e28 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -544,8 +544,7 @@ static bool canBeExpandedToORR(const MachineInstr *MI, unsigned BitSize) {
 // FIXME: this implementation should be micro-architecture dependent, so a
 // micro-architecture target hook should be introduced here in future.
 bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr *MI) const {
-  if (!Subtarget.isCortexA57() && !Subtarget.isCortexA53() &&
-      !Subtarget.isExynosM1() && !Subtarget.isKryo())
+  if (!Subtarget.hasCustomCheapAsMoveHandling())
     return MI->isAsCheapAsAMove();
 
   unsigned Imm;
@@ -559,7 +558,7 @@ bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr *MI) const {
   case AArch64::ADDXri:
   case AArch64::SUBWri:
   case AArch64::SUBXri:
-    return (Subtarget.isExynosM1() ||
+    return (Subtarget.getProcFamily() == AArch64Subtarget::ExynosM1 ||
             MI->getOperand(3).getImm() == 0);
 
   // add/sub on register with shift
@@ -568,7 +567,7 @@ bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr *MI) const {
   case AArch64::SUBWrs:
   case AArch64::SUBXrs:
     Imm = MI->getOperand(3).getImm();
-    return (Subtarget.isExynosM1() &&
+    return (Subtarget.getProcFamily() == AArch64Subtarget::ExynosM1 &&
             AArch64_AM::getArithShiftValue(Imm) < 4);
 
   // logical ops on immediate
@@ -609,7 +608,7 @@ bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr *MI) const {
   case AArch64::ORRWrs:
   case AArch64::ORRXrs:
     Imm = MI->getOperand(3).getImm();
-    return (Subtarget.isExynosM1() &&
+    return (Subtarget.getProcFamily() == AArch64Subtarget::ExynosM1 &&
             AArch64_AM::getShiftValue(Imm) < 4 &&
             AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL);
 
@@ -1522,8 +1521,8 @@ bool AArch64InstrInfo::isCandidateToMergeOrPair(MachineInstr *MI) const {
   if (isLdStPairSuppressed(MI))
     return false;
 
-  // Do not pair quad ld/st for Exynos.
-  if (Subtarget.isExynosM1()) {
+  // On some CPUs quad load/store pairs are slower than two single load/stores.
+  if (Subtarget.avoidQuadLdStPairs()) {
     switch (MI->getOpcode()) {
     default:
       break;
@@ -1801,8 +1800,8 @@ bool AArch64InstrInfo::shouldClusterMemOps(MachineInstr *FirstLdSt,
 
 bool AArch64InstrInfo::shouldScheduleAdjacent(MachineInstr *First,
                                               MachineInstr *Second) const {
-  if (Subtarget.isCyclone()) {
-    // Cyclone can fuse CMN, CMP, TST followed by Bcc.
+  if (Subtarget.hasMacroOpFusion()) {
+    // Fuse CMN, CMP, TST followed by Bcc.
     unsigned SecondOpcode = Second->getOpcode();
     if (SecondOpcode == AArch64::Bcc) {
       switch (First->getOpcode()) {
@@ -1817,7 +1816,7 @@ bool AArch64InstrInfo::shouldScheduleAdjacent(MachineInstr *First,
         return true;
       }
     }
-    // Cyclone B0 also supports ALU operations followed by CBZ/CBNZ.
+    // Fuse ALU operations followed by CBZ/CBNZ.
     if (SecondOpcode == AArch64::CBNZW || SecondOpcode == AArch64::CBNZX ||
         SecondOpcode == AArch64::CBZW || SecondOpcode == AArch64::CBZX) {
       switch (First->getOpcode()) {
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 9e3954905ce..d79d603e1c6 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -34,7 +34,8 @@ def HasSPE : Predicate<"Subtarget->hasSPE()">,
 
 def IsLE : Predicate<"Subtarget->isLittleEndian()">;
 def IsBE : Predicate<"!Subtarget->isLittleEndian()">;
-def IsCyclone : Predicate<"Subtarget->isCyclone()">;
+def UseAlternateSExtLoadCVTF32
+    : Predicate<"Subtarget->useAlternateSExtLoadCVTF32Pattern()">;
 
 //===----------------------------------------------------------------------===//
 // AArch64-specific DAG Nodes.
@@ -4957,7 +4958,8 @@ class SExtLoadi8CVTf32Pat<dag addrmode, dag INST>
                             0),
                           dsub)),
                        0),
-                     ssub)))>, Requires<[NotForCodeSize, IsCyclone]>;
+                     ssub)))>,
+    Requires<[NotForCodeSize, UseAlternateSExtLoadCVTF32]>;
 
 def : SExtLoadi8CVTf32Pat<(ro8.Wpat GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$ext),
                           (LDRBroW GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$ext)>;
@@ -5010,7 +5012,8 @@ class SExtLoadi16CVTf64Pat<dag addrmode, dag INST>
                             0),
                           dsub)),
                        0),
-                     dsub)))>, Requires<[NotForCodeSize, IsCyclone]>;
+                     dsub)))>,
+    Requires<[NotForCodeSize, UseAlternateSExtLoadCVTF32]>;
 
 def : SExtLoadi16CVTf64Pat<(ro16.Wpat GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext),
                           (LDRHroW GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext)>;
diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
index 49dc7d6a437..8c44fa615ee 100644
--- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -160,10 +160,6 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
   // Find and promote load instructions which read directly from store.
   bool tryToPromoteLoadFromStore(MachineBasicBlock::iterator &MBBI);
 
-  // Check if converting two narrow loads into a single wider load with
-  // bitfield extracts could be enabled.
-  bool enableNarrowLdMerge(MachineFunction &Fn);
-
   bool optimizeBlock(MachineBasicBlock &MBB, bool enableNarrowLdOpt);
 
   bool runOnMachineFunction(MachineFunction &Fn) override;
@@ -1912,15 +1908,6 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
   return Modified;
 }
 
-bool AArch64LoadStoreOpt::enableNarrowLdMerge(MachineFunction &Fn) {
-  bool ProfitableArch = Subtarget->isCortexA57() || Subtarget->isKryo();
-  // FIXME: The benefit from converting narrow loads into a wider load could be
-  // microarchitectural as it assumes that a single load with two bitfield
-  // extracts is cheaper than two narrow loads. Currently, this conversion is
-  // enabled only in cortex-a57 on which performance benefits were verified.
-  return ProfitableArch && !Subtarget->requiresStrictAlign();
-}
-
 bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
   if (skipFunction(*Fn.getFunction()))
     return false;
@@ -1936,7 +1923,8 @@ bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
   UsedRegs.resize(TRI->getNumRegs());
 
   bool Modified = false;
-  bool enableNarrowLdOpt = enableNarrowLdMerge(Fn);
+  bool enableNarrowLdOpt =
+      Subtarget->mergeNarrowLoads() && !Subtarget->requiresStrictAlign();
   for (auto &MBB : Fn)
     Modified |= optimizeBlock(MBB, enableNarrowLdOpt);
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
index bedbcf54c93..57957d1de53 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -44,9 +44,36 @@ AArch64Subtarget::initializeSubtargetDependencies(StringRef FS) {
     CPUString = "generic";
 
   ParseSubtargetFeatures(CPUString, FS);
+  initializeProperties();
+
   return *this;
 }
 
+void AArch64Subtarget::initializeProperties() {
+  // Initialize CPU specific properties. We should add a tablegen feature for
+  // this in the future so we can specify it together with the subtarget
+  // features.
+  switch (ARMProcFamily) {
+  case Cyclone:
+    CacheLineSize = 64;
+    PrefetchDistance = 280;
+    MinPrefetchStride = 2048;
+    MaxPrefetchIterationsAhead = 3;
+    break;
+  case CortexA57:
+    MaxInterleaveFactor = 4;
+    break;
+  case Kryo:
+    MaxInterleaveFactor = 4;
+    VectorInsertExtractBaseCost = 2;
+    break;
+  case Others: break;
+  case CortexA35: break;
+  case CortexA53: break;
+  case ExynosM1: break;
+  }
+}
+
 AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU,
                                    const std::string &FS,
                                    const TargetMachine &TM, bool LittleEndian)
@@ -110,8 +137,7 @@ void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
   // Enabling or Disabling the latency heuristic is a close call: It seems to
   // help nearly no benchmark on out-of-order architectures, on the other hand
   // it regresses register pressure on a few benchmarking.
-  if (isCyclone())
-    Policy.DisableLatencyHeuristic = true;
+  Policy.DisableLatencyHeuristic = DisableLatencySchedHeuristic;
 }
 
 bool AArch64Subtarget::enableEarlyIfConversion() const {
@@ -133,8 +159,5 @@ bool AArch64Subtarget::supportsAddressTopByteIgnored() const {
 
 std::unique_ptr<PBQPRAConstraint>
 AArch64Subtarget::getCustomPBQPConstraints() const {
-  if (!isCortexA57())
-    return nullptr;
-
-  return llvm::make_unique<A57ChainingConstraint>();
+  return balanceFPOps() ? llvm::make_unique<A57ChainingConstraint>() : nullptr;
 }
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h
index a920deea886..e4e88b35ccb 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -33,8 +33,8 @@ class StringRef;
 class Triple;
 
 class AArch64Subtarget : public AArch64GenSubtargetInfo {
-protected:
-  enum ARMProcFamilyEnum {
+public:
+  enum ARMProcFamilyEnum : uint8_t {
     Others,
     CortexA35,
     CortexA53,
@@ -44,6 +44,7 @@ protected:
     Kryo
   };
 
+protected:
   /// ARMProcFamily - ARM processor family: Cortex-A53, Cortex-A57, and others.
   ARMProcFamilyEnum ARMProcFamily = Others;
 
@@ -66,6 +67,24 @@ protected:
   // StrictAlign - Disallow unaligned memory accesses.
   bool StrictAlign = false;
 
+  bool MergeNarrowLoads = false;
+  bool UseAA = false;
+  bool PredictableSelectIsExpensive = false;
+  bool BalanceFPOps = false;
+  bool CustomAsCheapAsMove = false;
+  bool UsePostRAScheduler = false;
+  bool Misaligned128StoreIsSlow = false;
+  bool AvoidQuadLdStPairs = false;
+  bool UseAlternateSExtLoadCVTF32Pattern = false;
+  bool HasMacroOpFusion = false;
+  bool DisableLatencySchedHeuristic = false;
+  bool UseRSqrt = false;
+  uint8_t MaxInterleaveFactor = 2;
+  uint8_t VectorInsertExtractBaseCost = 3;
+  uint16_t CacheLineSize = 0;
+  uint16_t PrefetchDistance = 0;
+  uint16_t MinPrefetchStride = 1;
+  unsigned MaxPrefetchIterationsAhead = UINT_MAX;
   // ReserveX18 - X18 is not available as a general purpose register.
   bool ReserveX18;
@@ -93,6 +112,9 @@ private:
   /// subtarget initialization.
   AArch64Subtarget &initializeSubtargetDependencies(StringRef FS);
 
+  /// Initialize properties based on the selected processor family.
+  void initializeProperties();
+
 public:
   /// This constructor initializes the data members to match that
   /// of the specified triple.
@@ -123,7 +145,15 @@ public:
   const Triple &getTargetTriple() const { return TargetTriple; }
   bool enableMachineScheduler() const override { return true; }
   bool enablePostRAScheduler() const override {
-    return isGeneric() || isCortexA53() || isCortexA57() || isKryo();
+    return UsePostRAScheduler;
+  }
+
+  /// Returns ARM processor family.
+  /// Avoid this function! CPU specifics should be kept local to this class
+  /// and preferably modeled with SubtargetFeatures or properties in
+  /// initializeProperties().
+  ARMProcFamilyEnum getProcFamily() const {
+    return ARMProcFamily;
   }
 
   bool hasV8_1aOps() const { return HasV8_1aOps; }
@@ -140,6 +170,30 @@ public:
   bool hasNEON() const { return HasNEON; }
   bool hasCrypto() const { return HasCrypto; }
   bool hasCRC() const { return HasCRC; }
+  bool mergeNarrowLoads() const { return MergeNarrowLoads; }
+  bool balanceFPOps() const { return BalanceFPOps; }
+  bool predictableSelectIsExpensive() const {
+    return PredictableSelectIsExpensive;
+  }
+  bool hasCustomCheapAsMoveHandling() const { return CustomAsCheapAsMove; }
+  bool isMisaligned128StoreSlow() const { return Misaligned128StoreIsSlow; }
+  bool avoidQuadLdStPairs() const { return AvoidQuadLdStPairs; }
+  bool useAlternateSExtLoadCVTF32Pattern() const {
+    return UseAlternateSExtLoadCVTF32Pattern;
+  }
+  bool hasMacroOpFusion() const { return HasMacroOpFusion; }
+  bool useRSqrt() const { return UseRSqrt; }
+  unsigned getMaxInterleaveFactor() const { return MaxInterleaveFactor; }
+  unsigned getVectorInsertExtractBaseCost() const {
+    return VectorInsertExtractBaseCost;
+  }
+  unsigned getCacheLineSize() const { return CacheLineSize; }
+  unsigned getPrefetchDistance() const { return PrefetchDistance; }
+  unsigned getMinPrefetchStride() const { return MinPrefetchStride; }
+  unsigned getMaxPrefetchIterationsAhead() const {
+    return MaxPrefetchIterationsAhead;
+  }
+
   /// CPU has TBI (top byte of addresses is ignored during HW address
   /// translation) and OS enables it.
   bool supportsAddressTopByteIgnored() const;
@@ -160,14 +214,7 @@ public:
   bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); }
   bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); }
 
-  bool isGeneric() const { return CPUString == "generic"; }
-  bool isCyclone() const { return CPUString == "cyclone"; }
-  bool isCortexA57() const { return CPUString == "cortex-a57"; }
-  bool isCortexA53() const { return CPUString == "cortex-a53"; }
-  bool isExynosM1() const { return CPUString == "exynos-m1"; }
-  bool isKryo() const { return CPUString == "kryo"; }
-
-  bool useAA() const override { return isCortexA53(); }
+  bool useAA() const override { return UseAA; }
 
   /// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size
   /// that still makes it profitable to inline the call.
diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
index 8633d87dbd1..c9ca394cbed 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -147,8 +147,7 @@ static void initReciprocals(AArch64TargetMachine& TM, AArch64Subtarget& ST)
   // (52 mantissa bits) are 2 and 3, respectively.
   unsigned ExtraStepsF = 2,
            ExtraStepsD = ExtraStepsF + 1;
-  // FIXME: Enable x^-1/2 only for Exynos M1 at the moment.
-  bool UseRsqrt = ST.isExynosM1();
+  bool UseRsqrt = ST.useRSqrt();
 
   TM.Options.Reciprocals.setDefaults("sqrtf", UseRsqrt, ExtraStepsF);
   TM.Options.Reciprocals.setDefaults("sqrtd", UseRsqrt, ExtraStepsD);
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 1684d2f769d..ecf4d93068a 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -368,9 +368,7 @@ int AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
   }
 
   // All other insert/extracts cost this much.
-  if (ST->isKryo())
-    return 2;
-  return 3;
+  return ST->getVectorInsertExtractBaseCost();
 }
 
 int AArch64TTIImpl::getArithmeticInstrCost(
@@ -529,9 +527,7 @@ int AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
 }
 
 unsigned AArch64TTIImpl::getMaxInterleaveFactor(unsigned VF) {
-  if (ST->isCortexA57() || ST->isKryo())
-    return 4;
-  return 2;
+  return ST->getMaxInterleaveFactor();
 }
 
 void AArch64TTIImpl::getUnrollingPreferences(Loop *L,
@@ -630,28 +626,17 @@ bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
 }
 
 unsigned AArch64TTIImpl::getCacheLineSize() {
-  if (ST->isCyclone())
-    return 64;
-  return BaseT::getCacheLineSize();
+  return ST->getCacheLineSize();
 }
 
 unsigned AArch64TTIImpl::getPrefetchDistance() {
-  if (ST->isCyclone())
-    return 280;
-  return BaseT::getPrefetchDistance();
+  return ST->getPrefetchDistance();
 }
 
 unsigned AArch64TTIImpl::getMinPrefetchStride() {
-  if (ST->isCyclone())
-    // The HW prefetcher handles accesses with strides up to 2KB.
-    return 2048;
-  return BaseT::getMinPrefetchStride();
+  return ST->getMinPrefetchStride();
 }
 
 unsigned AArch64TTIImpl::getMaxPrefetchIterationsAhead() {
-  if (ST->isCyclone())
-    // Be conservative for now and don't prefetch ahead too much since the loop
-    // may terminate early.
-    return 3;
-  return BaseT::getMaxPrefetchIterationsAhead();
+  return ST->getMaxPrefetchIterationsAhead();
 }