Diffstat (limited to 'llvm/lib/Target')

 llvm/lib/Target/ARM/ARM.td                     |  9
 llvm/lib/Target/ARM/ARMSubtarget.cpp           |  4
 llvm/lib/Target/ARM/ARMSubtarget.h             |  7
 llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp | 88
 4 files changed, 87 insertions(+), 21 deletions(-)
diff --git a/llvm/lib/Target/ARM/ARM.td b/llvm/lib/Target/ARM/ARM.td
index cac14dbaa59..8dcddd25429 100644
--- a/llvm/lib/Target/ARM/ARM.td
+++ b/llvm/lib/Target/ARM/ARM.td
@@ -305,6 +305,15 @@ def FeaturePref32BitThumb : SubtargetFeature<"32bit", "Pref32BitThumb", "true",
 def FeaturePrefLoopAlign32 : SubtargetFeature<"loop-align", "PrefLoopAlignment","2",
                                               "Prefer 32-bit alignment for loops">;
 
+def FeatureMVEVectorCostFactor1 : SubtargetFeature<"mve1beat", "MVEVectorCostFactor", "4",
+  "Model MVE instructions as a 1 beat per tick architecture">;
+
+def FeatureMVEVectorCostFactor2 : SubtargetFeature<"mve2beat", "MVEVectorCostFactor", "2",
+  "Model MVE instructions as a 2 beats per tick architecture">;
+
+def FeatureMVEVectorCostFactor4 : SubtargetFeature<"mve4beat", "MVEVectorCostFactor", "1",
+  "Model MVE instructions as a 4 beats per tick architecture">;
+
 /// Some instructions update CPSR partially, which can add false dependency for
 /// out-of-order implementation, e.g. Cortex-A9, unless each individual bit is
 /// mapped to a separate physical register. Avoid partial CPSR update for these
diff --git a/llvm/lib/Target/ARM/ARMSubtarget.cpp b/llvm/lib/Target/ARM/ARMSubtarget.cpp
index 1773d17e1ac..54443d2126f 100644
--- a/llvm/lib/Target/ARM/ARMSubtarget.cpp
+++ b/llvm/lib/Target/ARM/ARMSubtarget.cpp
@@ -253,6 +253,10 @@ void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
   if (isRWPI())
     ReserveR9 = true;
 
+  // If MVEVectorCostFactor is still 0 (has not been set to anything else), default it to 2
+  if (MVEVectorCostFactor == 0)
+    MVEVectorCostFactor = 2;
+
   // FIXME: Teach TableGen to deal with these instead of doing it manually here.
   switch (ARMProcFamily) {
   case Others:
diff --git a/llvm/lib/Target/ARM/ARMSubtarget.h b/llvm/lib/Target/ARM/ARMSubtarget.h
index 637501214b1..dde9dcbdb1c 100644
--- a/llvm/lib/Target/ARM/ARMSubtarget.h
+++ b/llvm/lib/Target/ARM/ARMSubtarget.h
@@ -472,6 +472,11 @@ protected:
   /// What alignment is preferred for loop bodies, in log2(bytes).
   unsigned PrefLoopAlignment = 0;
 
+  /// The cost factor for MVE instructions, representing the multiple beats an
+  /// instruction can take. The default is 2, set in initSubtargetFeatures so
+  /// that subtarget features can set it to values less than 2.
+  unsigned MVEVectorCostFactor = 0;
+
  /// OptMinSize - True if we're optimising for minimum code size, equal to
  /// the function attribute.
  bool OptMinSize = false;
@@ -858,6 +863,8 @@ public:
     return PrefLoopAlignment;
   }
 
+  unsigned getMVEVectorCostFactor() const { return MVEVectorCostFactor; }
+
   bool ignoreCSRForAllocationOrder(const MachineFunction &MF,
                                    unsigned PhysReg) const override;
   unsigned getGPRAllocationOrder(const MachineFunction &MF) const;
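The three mveNbeat features above are ordinary subtarget features, so the beat model can be selected from the command line like any other attribute, e.g. llc -mattr=+mve,+mve1beat. The mapping from feature name to cost factor follows from MVE executing each 128-bit operation as four 32-bit beats: a core that retires BeatsPerTick beats per cycle occupies the vector unit for 4 / BeatsPerTick cycles. A standalone sketch of that arithmetic (the helper name is illustrative, not part of the patch):

    #include <cassert>
    #include <cstdio>

    // Derive the factor the mveNbeat features hard-code: four beats of work
    // divided by the beats the core can retire each tick.
    static unsigned costFactorForBeats(unsigned BeatsPerTick) {
      assert(BeatsPerTick == 1 || BeatsPerTick == 2 || BeatsPerTick == 4);
      return 4 / BeatsPerTick;
    }

    int main() {
      std::printf("mve1beat -> %u\n", costFactorForBeats(1)); // 4
      std::printf("mve2beat -> %u\n", costFactorForBeats(2)); // 2 (the default)
      std::printf("mve4beat -> %u\n", costFactorForBeats(4)); // 1
    }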
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index b1868cd833c..a8d04eef8c5 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -353,7 +353,10 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
       return Entry->Cost;
   }
 
-  return BaseT::getCastInstrCost(Opcode, Dst, Src);
+  int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
+                     ? ST->getMVEVectorCostFactor()
+                     : 1;
+  return BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src);
 }
 
 int ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
@@ -378,6 +381,17 @@ int ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
     return std::max(BaseT::getVectorInstrCost(Opcode, ValTy, Index), 2U);
   }
 
+  if (ST->hasMVEIntegerOps() && (Opcode == Instruction::InsertElement ||
+                                 Opcode == Instruction::ExtractElement)) {
+    // We say MVE moves cost at least the MVEVectorCostFactor, even though
+    // they are scalar instructions. This helps prevent mixing scalar and
+    // vector, to prevent vectorising where we end up just scalarising the
+    // result anyway.
+    return std::max(BaseT::getVectorInstrCost(Opcode, ValTy, Index),
+                    ST->getMVEVectorCostFactor()) *
+           ValTy->getVectorNumElements() / 2;
+  }
+
   return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
 }
 
@@ -406,7 +420,10 @@ int ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
     return LT.first;
   }
 
-  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
+  int BaseCost = ST->hasMVEIntegerOps() && ValTy->isVectorTy()
+                     ? ST->getMVEVectorCostFactor()
+                     : 1;
+  return BaseCost * BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
 }
 
 int ARMTTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
@@ -549,10 +566,13 @@ int ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
 
       if (const auto *Entry = CostTableLookup(MVEDupTbl, ISD::VECTOR_SHUFFLE,
                                               LT.second))
-        return LT.first * Entry->Cost;
+        return LT.first * Entry->Cost * ST->getMVEVectorCostFactor();
     }
   }
-  return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
+  int BaseCost = ST->hasMVEIntegerOps() && Tp->isVectorTy()
+                     ? ST->getMVEVectorCostFactor()
+                     : 1;
+  return BaseCost * BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
 }
 
 int ARMTTIImpl::getArithmeticInstrCost(
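To make the insert/extract heuristic above concrete, here is a standalone sketch (not LLVM code) of the value it computes: a single lane move is charged at least the vector cost factor, then scaled by half the lane count, so code that would vectorise only to scalarise its results back out looks appropriately expensive:

    #include <algorithm>
    #include <cstdio>

    // Mirrors the new MVE InsertElement/ExtractElement cost, assuming a base
    // per-lane move cost of 1.
    static unsigned mveLaneMoveCost(unsigned BaseCost, unsigned CostFactor,
                                    unsigned NumElements) {
      return std::max(BaseCost, CostFactor) * NumElements / 2;
    }

    int main() {
      // v4i32 on the default 2-beat model: max(1, 2) * 4 / 2 = 4.
      std::printf("v4i32: %u\n", mveLaneMoveCost(1, 2, 4));
      // v8i16 on a 1-beat (factor 4) model: max(1, 4) * 8 / 2 = 16.
      std::printf("v8i16: %u\n", mveLaneMoveCost(1, 4, 8));
    }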
@@ -606,25 +626,48 @@ int ARMTTIImpl::getArithmeticInstrCost(
     // Multiplication.
   };
 
-  if (ST->hasNEON())
+  if (ST->hasNEON()) {
     if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second))
       return LT.first * Entry->Cost;
 
-  int Cost = BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info,
-                                           Opd1PropInfo, Opd2PropInfo);
-
-  // This is somewhat of a hack. The problem that we are facing is that SROA
-  // creates a sequence of shift, and, or instructions to construct values.
-  // These sequences are recognized by the ISel and have zero-cost. Not so for
-  // the vectorized code. Because we have support for v2i64 but not i64 those
-  // sequences look particularly beneficial to vectorize.
-  // To work around this we increase the cost of v2i64 operations to make them
-  // seem less beneficial.
-  if (LT.second == MVT::v2i64 &&
-      Op2Info == TargetTransformInfo::OK_UniformConstantValue)
-    Cost += 4;
-
-  return Cost;
+    int Cost = BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info,
+                                             Opd1PropInfo, Opd2PropInfo);
+
+    // This is somewhat of a hack. The problem that we are facing is that SROA
+    // creates a sequence of shift, and, or instructions to construct values.
+    // These sequences are recognized by the ISel and have zero-cost. Not so for
+    // the vectorized code. Because we have support for v2i64 but not i64 those
+    // sequences look particularly beneficial to vectorize.
+    // To work around this we increase the cost of v2i64 operations to make them
+    // seem less beneficial.
+    if (LT.second == MVT::v2i64 &&
+        Op2Info == TargetTransformInfo::OK_UniformConstantValue)
+      Cost += 4;
+
+    return Cost;
+  }
+
+  int BaseCost = ST->hasMVEIntegerOps() && Ty->isVectorTy()
+                     ? ST->getMVEVectorCostFactor()
+                     : 1;
+
+  // The rest of this mostly follows what is done in
+  // BaseT::getArithmeticInstrCost, without treating floats as more expensive
+  // than scalars or increasing the costs for custom operations. The result is
+  // also multiplied by the MVEVectorCostFactor where appropriate.
+  if (TLI->isOperationLegalOrCustomOrPromote(ISDOpcode, LT.second))
+    return LT.first * BaseCost;
+
+  // Else this is expand, assume that we need to scalarize this op.
+  if (Ty->isVectorTy()) {
+    unsigned Num = Ty->getVectorNumElements();
+    unsigned Cost = getArithmeticInstrCost(Opcode, Ty->getScalarType());
+    // Return the cost of multiple scalar invocation plus the cost of
+    // inserting and extracting the values.
+    return BaseT::getScalarizationOverhead(Ty, Args) + Num * Cost;
+  }
+
+  return BaseCost;
 }
 
 int ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
@@ -637,7 +680,10 @@ int ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
     // We need 4 uops for vst.1/vld.1 vs 1uop for vldr/vstr.
     return LT.first * 4;
   }
-  return LT.first;
+  int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
+                     ? ST->getMVEVectorCostFactor()
+                     : 1;
+  return BaseCost * LT.first;
 }
 
 int ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
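The net effect of the BaseCost multiplier is easiest to see for a legal vector operation, where the reported cost is simply LT.first scaled by the factor. A toy calculation under the three beat models (the helper name is illustrative, not from the patch):

    #include <cstdio>

    // Illustrative only: a legal or custom MVE op now costs
    // LT.first * MVEVectorCostFactor instead of LT.first.
    static unsigned legalMVEOpCost(unsigned NumLegalizedOps,
                                   unsigned CostFactor) {
      return NumLegalizedOps * CostFactor;
    }

    int main() {
      // One legal v4i32 add (LT.first == 1) under each beat model:
      std::printf("mve1beat: %u\n", legalMVEOpCost(1, 4)); // slowest model
      std::printf("mve2beat: %u\n", legalMVEOpCost(1, 2)); // the default
      std::printf("mve4beat: %u\n", legalMVEOpCost(1, 1)); // fastest model
    }

Since BaseCost stays 1 whenever the subtarget lacks MVE integer ops or the type is scalar, non-MVE targets and scalar code see no change from this patch.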