-rw-r--r--  llvm/lib/Target/ARM/ARM.td                          | 17
-rw-r--r--  llvm/lib/Target/ARM/ARMISelLowering.cpp             | 11
-rw-r--r--  llvm/lib/Target/ARM/ARMPredicates.td                |  8
-rw-r--r--  llvm/lib/Target/ARM/ARMSubtarget.h                  |  9
-rw-r--r--  llvm/lib/Target/ARM/ARMTargetTransformInfo.h        | 18
-rw-r--r--  llvm/test/CodeGen/ARM/cortex-a57-misched-vfma.ll    |  6
-rw-r--r--  llvm/test/CodeGen/ARM/fp16-fullfp16.ll              |  2
-rw-r--r--  llvm/test/CodeGen/ARM/fp16-fusedMAC.ll              |  2
-rw-r--r--  llvm/test/CodeGen/Thumb2/float-intrinsics-double.ll |  2
-rw-r--r--  llvm/test/CodeGen/Thumb2/float-intrinsics-float.ll  |  2
10 files changed, 52 insertions, 25 deletions
diff --git a/llvm/lib/Target/ARM/ARM.td b/llvm/lib/Target/ARM/ARM.td
index 66bfd4c82e2..380eaa86368 100644
--- a/llvm/lib/Target/ARM/ARM.td
+++ b/llvm/lib/Target/ARM/ARM.td
@@ -303,6 +303,10 @@ def FeatureNonpipelinedVFP : SubtargetFeature<"nonpipelined-vfp",
 def FeatureHasSlowFPVMLx : SubtargetFeature<"slowfpvmlx", "SlowFPVMLx", "true",
                                             "Disable VFP / NEON MAC instructions">;

+// VFPv4 added VFMA instructions that can similarly be fast or slow.
+def FeatureHasSlowFPVFMx : SubtargetFeature<"slowfpvfmx", "SlowFPVFMx", "true",
+                                            "Disable VFP / NEON FMA instructions">;
+
 // Cortex-A8 / A9 Advanced SIMD has multiplier accumulator forwarding.
 def FeatureVMLxForwarding : SubtargetFeature<"vmlx-forwarding",
                                              "HasVMLxForwarding", "true",
@@ -588,6 +592,7 @@ def ProcExynos : SubtargetFeature<"exynos", "ARMProcFamily", "Exynos",
                                   FeatureHWDivThumb,
                                   FeatureHWDivARM,
                                   FeatureHasSlowFPVMLx,
+                                  FeatureHasSlowFPVFMx,
                                   FeatureHasRetAddrStack,
                                   FeatureFuseLiterals,
                                   FeatureFuseAES,
@@ -918,6 +923,7 @@ def : ProcessorModel<"cortex-a5", CortexA8Model, [ARMv7a, ProcA5,
                      FeatureTrustZone,
                      FeatureSlowFPBrcc,
                      FeatureHasSlowFPVMLx,
+                     FeatureHasSlowFPVFMx,
                      FeatureVMLxForwarding,
                      FeatureMP,
                      FeatureVFP4]>;
@@ -928,6 +934,7 @@ def : ProcessorModel<"cortex-a7", CortexA8Model, [ARMv7a, ProcA7,
                      FeatureSlowFPBrcc,
                      FeatureHasVMLxHazards,
                      FeatureHasSlowFPVMLx,
+                     FeatureHasSlowFPVFMx,
                      FeatureVMLxForwarding,
                      FeatureMP,
                      FeatureVFP4,
@@ -940,6 +947,7 @@ def : ProcessorModel<"cortex-a8", CortexA8Model, [ARMv7a, ProcA8,
                      FeatureSlowFPBrcc,
                      FeatureHasVMLxHazards,
                      FeatureHasSlowFPVMLx,
+                     FeatureHasSlowFPVFMx,
                      FeatureVMLxForwarding]>;

 def : ProcessorModel<"cortex-a9", CortexA9Model, [ARMv7a, ProcA9,
@@ -1009,6 +1017,7 @@ def : ProcessorModel<"swift", SwiftModel, [ARMv7a, ProcSwift,
                      FeatureAvoidPartialCPSR,
                      FeatureAvoidMOVsShOp,
                      FeatureHasSlowFPVMLx,
+                     FeatureHasSlowFPVFMx,
                      FeatureHasVMLxHazards,
                      FeatureProfUnpredicate,
                      FeaturePrefISHSTBarrier,
@@ -1027,6 +1036,7 @@ def : ProcessorModel<"cortex-r4f", CortexA8Model, [ARMv7r, ProcR4,
                      FeatureHasRetAddrStack,
                      FeatureSlowFPBrcc,
                      FeatureHasSlowFPVMLx,
+                     FeatureHasSlowFPVFMx,
                      FeatureVFP3_D16,
                      FeatureAvoidPartialCPSR]>;

@@ -1036,6 +1046,7 @@ def : ProcessorModel<"cortex-r5", CortexA8Model, [ARMv7r, ProcR5,
                      FeatureSlowFPBrcc,
                      FeatureHWDivARM,
                      FeatureHasSlowFPVMLx,
+                     FeatureHasSlowFPVFMx,
                      FeatureAvoidPartialCPSR]>;

 def : ProcessorModel<"cortex-r7", CortexA8Model, [ARMv7r, ProcR7,
@@ -1046,6 +1057,7 @@ def : ProcessorModel<"cortex-r7", CortexA8Model, [ARMv7r, ProcR7,
                      FeatureSlowFPBrcc,
                      FeatureHWDivARM,
                      FeatureHasSlowFPVMLx,
+                     FeatureHasSlowFPVFMx,
                      FeatureAvoidPartialCPSR]>;

 def : ProcessorModel<"cortex-r8", CortexA8Model, [ARMv7r,
@@ -1056,6 +1068,7 @@ def : ProcessorModel<"cortex-r8", CortexA8Model, [ARMv7r,
                      FeatureSlowFPBrcc,
                      FeatureHWDivARM,
                      FeatureHasSlowFPVMLx,
+                     FeatureHasSlowFPVFMx,
                      FeatureAvoidPartialCPSR]>;

 def : ProcessorModel<"cortex-m3", CortexM4Model, [ARMv7m,
@@ -1073,6 +1086,7 @@ def : ProcessorModel<"cortex-m4", CortexM4Model, [ARMv7em,
                      FeatureVFP4_D16_SP,
                      FeaturePrefLoopAlign32,
                      FeatureHasSlowFPVMLx,
+                     FeatureHasSlowFPVFMx,
                      FeatureUseMISched,
                      FeatureHasNoBranchPredictor]>;

@@ -1087,6 +1101,7 @@ def : ProcessorModel<"cortex-m33", CortexM4Model, [ARMv8mMainline,
                      FeatureFPARMv8_D16_SP,
                      FeaturePrefLoopAlign32,
                      FeatureHasSlowFPVMLx,
+                     FeatureHasSlowFPVFMx,
                      FeatureUseMISched,
                      FeatureHasNoBranchPredictor]>;

@@ -1095,6 +1110,7 @@ def : ProcessorModel<"cortex-m35p", CortexM4Model, [ARMv8mMainline,
                      FeatureFPARMv8_D16_SP,
                      FeaturePrefLoopAlign32,
                      FeatureHasSlowFPVMLx,
+                     FeatureHasSlowFPVFMx,
                      FeatureUseMISched,
                      FeatureHasNoBranchPredictor]>;

@@ -1182,6 +1198,7 @@ def : ProcessorModel<"cyclone", SwiftModel, [ARMv8a, ProcSwift,
                      FeatureAvoidPartialCPSR,
                      FeatureAvoidMOVsShOp,
                      FeatureHasSlowFPVMLx,
+                     FeatureHasSlowFPVFMx,
                      FeatureCrypto,
                      FeatureUseMISched,
                      FeatureZCZeroing,
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 714ce069483..1ee4e43398c 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -15018,16 +15018,19 @@ int ARMTargetLowering::getScalingFactorCost(const DataLayout &DL,
 /// patterns (and we don't have the non-fused floating point instruction).
 bool ARMTargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
                                                    EVT VT) const {
-  if (!Subtarget->hasMVEFloatOps())
-    return false;
-
   if (!VT.isSimple())
     return false;

   switch (VT.getSimpleVT().SimpleTy) {
   case MVT::v4f32:
   case MVT::v8f16:
-    return true;
+    return Subtarget->hasMVEFloatOps();
+  case MVT::f16:
+    return Subtarget->useFPVFMx16();
+  case MVT::f32:
+    return Subtarget->useFPVFMx();
+  case MVT::f64:
+    return Subtarget->useFPVFMx64();
   default:
     break;
   }
diff --git a/llvm/lib/Target/ARM/ARMPredicates.td b/llvm/lib/Target/ARM/ARMPredicates.td
index b008d3e2e29..dea1d767beb 100644
--- a/llvm/lib/Target/ARM/ARMPredicates.td
+++ b/llvm/lib/Target/ARM/ARMPredicates.td
@@ -182,11 +182,9 @@ def UseMulOps : Predicate<"Subtarget->useMulOps()">;
 // But only select them if more precision in FP computation is allowed, and when
 // they are not slower than a mul + add sequence.
 // Do not use them for Darwin platforms.
-def UseFusedMAC      : Predicate<"(TM.Options.AllowFPOpFusion =="
-                                 " FPOpFusion::Fast && "
-                                 " Subtarget->hasVFP4Base()) && "
-                                 "!Subtarget->isTargetDarwin() &&"
-                                 "Subtarget->useFPVMLx()">;
+def UseFusedMAC      : Predicate<"TM.Options.AllowFPOpFusion =="
+                                 " FPOpFusion::Fast && "
+                                 "Subtarget->useFPVFMx()">;

 def HasFastVGETLNi32 : Predicate<"!Subtarget->hasSlowVGETLNi32()">;
 def HasSlowVGETLNi32 : Predicate<"Subtarget->hasSlowVGETLNi32()">;
diff --git a/llvm/lib/Target/ARM/ARMSubtarget.h b/llvm/lib/Target/ARM/ARMSubtarget.h
index f582a92f656..6bdd021970e 100644
--- a/llvm/lib/Target/ARM/ARMSubtarget.h
+++ b/llvm/lib/Target/ARM/ARMSubtarget.h
@@ -203,6 +203,10 @@ protected:
   /// whether the FP VML[AS] instructions are slow (if so, don't use them).
   bool SlowFPVMLx = false;

+  /// SlowFPVFMx - If the VFP4 / NEON instructions are available, indicates
+  /// whether the FP VFM[AS] instructions are slow (if so, don't use them).
+  bool SlowFPVFMx = false;
+
   /// HasVMLxForwarding - If true, NEON has special multiplier accumulator
   /// forwarding to allow mul + mla being issued back to back.
   bool HasVMLxForwarding = false;
@@ -632,6 +636,11 @@
   bool useMulOps() const { return UseMulOps; }
   bool useFPVMLx() const { return !SlowFPVMLx; }
+  bool useFPVFMx() const {
+    return !isTargetDarwin() && hasVFP4Base() && !SlowFPVFMx;
+  }
+  bool useFPVFMx16() const { return useFPVFMx() && hasFullFP16(); }
+  bool useFPVFMx64() const { return useFPVFMx() && hasFP64(); }
   bool hasVMLxForwarding() const { return HasVMLxForwarding; }
   bool isFPBrccSlow() const { return SlowFPBrcc; }
   bool hasFP64() const { return HasFP64; }
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
index f1273b4b8b3..6888c8924fc 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -69,15 +69,15 @@ class ARMTTIImpl : public BasicTTIImplBase<ARMTTIImpl> {
       ARM::FeatureDontWidenVMOVS, ARM::FeatureExpandMLx,
       ARM::FeatureHasVMLxHazards, ARM::FeatureNEONForFPMovs,
       ARM::FeatureNEONForFP, ARM::FeatureCheckVLDnAlign,
-      ARM::FeatureHasSlowFPVMLx, ARM::FeatureVMLxForwarding,
-      ARM::FeaturePref32BitThumb, ARM::FeatureAvoidPartialCPSR,
-      ARM::FeatureCheapPredicableCPSR, ARM::FeatureAvoidMOVsShOp,
-      ARM::FeatureHasRetAddrStack, ARM::FeatureHasNoBranchPredictor,
-      ARM::FeatureDSP, ARM::FeatureMP, ARM::FeatureVirtualization,
-      ARM::FeatureMClass, ARM::FeatureRClass, ARM::FeatureAClass,
-      ARM::FeatureNaClTrap, ARM::FeatureStrictAlign, ARM::FeatureLongCalls,
-      ARM::FeatureExecuteOnly, ARM::FeatureReserveR9, ARM::FeatureNoMovt,
-      ARM::FeatureNoNegativeImmediates
+      ARM::FeatureHasSlowFPVMLx, ARM::FeatureHasSlowFPVFMx,
+      ARM::FeatureVMLxForwarding, ARM::FeaturePref32BitThumb,
+      ARM::FeatureAvoidPartialCPSR, ARM::FeatureCheapPredicableCPSR,
+      ARM::FeatureAvoidMOVsShOp, ARM::FeatureHasRetAddrStack,
+      ARM::FeatureHasNoBranchPredictor, ARM::FeatureDSP, ARM::FeatureMP,
+      ARM::FeatureVirtualization, ARM::FeatureMClass, ARM::FeatureRClass,
+      ARM::FeatureAClass, ARM::FeatureNaClTrap, ARM::FeatureStrictAlign,
+      ARM::FeatureLongCalls, ARM::FeatureExecuteOnly, ARM::FeatureReserveR9,
+      ARM::FeatureNoMovt, ARM::FeatureNoNegativeImmediates
   };

   const ARMSubtarget *getST() const { return ST; }
diff --git a/llvm/test/CodeGen/ARM/cortex-a57-misched-vfma.ll b/llvm/test/CodeGen/ARM/cortex-a57-misched-vfma.ll
index 1eb7e423770..c0318a6f6df 100644
--- a/llvm/test/CodeGen/ARM/cortex-a57-misched-vfma.ll
+++ b/llvm/test/CodeGen/ARM/cortex-a57-misched-vfma.ll
@@ -93,12 +93,12 @@ define arm_aapcs_vfpcc float @Test3(float %f1, float %f2, float %f3, float %f4,
 ; CHECK-SAME: Latency=0

 ; CHECK-DEFAULT: VMLSS
-; CHECK-FAST: VFMSS
-; > VMLSS common latency = 9
+; CHECK-FAST: VFNMSS
+; > VFNMSS common latency = 9
 ; CHECK: Latency : 9
 ; CHECK: Successors:
 ; CHECK: Data
-; > VMLSS read-advanced latency to the next VMLSS = 4
+; > VFNMSS read-advanced latency to the next VMLSS = 4
 ; CHECK-SAME: Latency=4

 ; CHECK-DEFAULT: VMLSS
diff --git a/llvm/test/CodeGen/ARM/fp16-fullfp16.ll b/llvm/test/CodeGen/ARM/fp16-fullfp16.ll
index a30b62acbac..a0d41a2f521 100644
--- a/llvm/test/CodeGen/ARM/fp16-fullfp16.ll
+++ b/llvm/test/CodeGen/ARM/fp16-fullfp16.ll
@@ -571,7 +571,7 @@ define void @test_fmuladd(half* %p, half* %q, half* %r) {
 ; CHECK:      vldr.16 s0, [r1]
 ; CHECK-NEXT: vldr.16 s2, [r0]
 ; CHECK-NEXT: vldr.16 s4, [r2]
-; CHECK-NEXT: vmla.f16 s4, s2, s0
+; CHECK-NEXT: vfma.f16 s4, s2, s0
 ; CHECK-NEXT: vstr.16 s4, [r0]
 ; CHECK-NEXT: bx lr
   %a = load half, half* %p, align 2
diff --git a/llvm/test/CodeGen/ARM/fp16-fusedMAC.ll b/llvm/test/CodeGen/ARM/fp16-fusedMAC.ll
index b6387b87262..03909b80059 100644
--- a/llvm/test/CodeGen/ARM/fp16-fusedMAC.ll
+++ b/llvm/test/CodeGen/ARM/fp16-fusedMAC.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=thumbv8.1-m-none-eabi -mattr=+fullfp16 -fp-contract=fast | FileCheck %s
-; RUN: llc < %s -mtriple=thumbv8.1-m-none-eabi -mattr=+fullfp16,+slowfpvmlx -fp-contract=fast | FileCheck %s -check-prefix=DONT-FUSE
+; RUN: llc < %s -mtriple=thumbv8.1-m-none-eabi -mattr=+fullfp16,+slowfpvfmx -fp-contract=fast | FileCheck %s -check-prefix=DONT-FUSE

 ; Check generated fp16 fused MAC and MLS.

diff --git a/llvm/test/CodeGen/Thumb2/float-intrinsics-double.ll b/llvm/test/CodeGen/Thumb2/float-intrinsics-double.ll
index 05d303adb55..acafde53ac8 100644
--- a/llvm/test/CodeGen/Thumb2/float-intrinsics-double.ll
+++ b/llvm/test/CodeGen/Thumb2/float-intrinsics-double.ll
@@ -201,7 +201,7 @@ define double @fmuladd_d(double %a, double %b, double %c) {
 ; SOFT: bl __aeabi_dadd
 ; VFP4: vmul.f64
 ; VFP4: vadd.f64
-; FP-ARMv8: vmla.f64
+; FP-ARMv8: vfma.f64
   %1 = call double @llvm.fmuladd.f64(double %a, double %b, double %c)
   ret double %1
 }
diff --git a/llvm/test/CodeGen/Thumb2/float-intrinsics-float.ll b/llvm/test/CodeGen/Thumb2/float-intrinsics-float.ll
index ec81164b422..1263ae15b46 100644
--- a/llvm/test/CodeGen/Thumb2/float-intrinsics-float.ll
+++ b/llvm/test/CodeGen/Thumb2/float-intrinsics-float.ll
@@ -194,7 +194,7 @@ define float @fmuladd_f(float %a, float %b, float %c) {
 ; CHECK-LABEL: fmuladd_f:
 ; SOFT: bl __aeabi_fmul
 ; SOFT: bl __aeabi_fadd
-; VMLA: vmla.f32
+; VMLA: vfma.f32
 ; NO-VMLA: vmul.f32
 ; NO-VMLA: vadd.f32
   %1 = call float @llvm.fmuladd.f32(float %a, float %b, float %c)
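
The commit does not add a standalone scalar-f32 test for the new attribute, so the following is a minimal sketch (not part of the patch) of how the +slowfpvfmx flag wired up above could be exercised from an .ll test; the triple, attribute mix, and function name are assumptions, and only the absence of a fused vfma is checked, since the non-fused fallback may be emitted either as vmla or as vmul + vadd depending on the rest of the subtarget.

; Hypothetical follow-up test, NOT part of this commit.
; RUN: llc < %s -mtriple=armv7a-none-eabi -mattr=+vfp4,+slowfpvfmx -fp-contract=fast | FileCheck %s

define float @no_fused_mac(float %a, float %b, float %c) {
; CHECK-LABEL: no_fused_mac:
; CHECK-NOT: vfma.f32
; CHECK: bx lr
  %1 = call float @llvm.fmuladd.f32(float %a, float %b, float %c)
  ret float %1
}

declare float @llvm.fmuladd.f32(float, float, float)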