-rw-r--r--  llvm/lib/Target/ARM/ARM.td                          17
-rw-r--r--  llvm/lib/Target/ARM/ARMISelLowering.cpp             11
-rw-r--r--  llvm/lib/Target/ARM/ARMPredicates.td                 8
-rw-r--r--  llvm/lib/Target/ARM/ARMSubtarget.h                   9
-rw-r--r--  llvm/lib/Target/ARM/ARMTargetTransformInfo.h        18
-rw-r--r--  llvm/test/CodeGen/ARM/cortex-a57-misched-vfma.ll     6
-rw-r--r--  llvm/test/CodeGen/ARM/fp16-fullfp16.ll               2
-rw-r--r--  llvm/test/CodeGen/ARM/fp16-fusedMAC.ll               2
-rw-r--r--  llvm/test/CodeGen/Thumb2/float-intrinsics-double.ll  2
-rw-r--r--  llvm/test/CodeGen/Thumb2/float-intrinsics-float.ll   2
10 files changed, 52 insertions, 25 deletions
diff --git a/llvm/lib/Target/ARM/ARM.td b/llvm/lib/Target/ARM/ARM.td
index 66bfd4c82e2..380eaa86368 100644
--- a/llvm/lib/Target/ARM/ARM.td
+++ b/llvm/lib/Target/ARM/ARM.td
@@ -303,6 +303,10 @@ def FeatureNonpipelinedVFP : SubtargetFeature<"nonpipelined-vfp",
def FeatureHasSlowFPVMLx : SubtargetFeature<"slowfpvmlx", "SlowFPVMLx", "true",
"Disable VFP / NEON MAC instructions">;
+// VFPv4 added VFMA instructions that can similarly be fast or slow.
+def FeatureHasSlowFPVFMx : SubtargetFeature<"slowfpvfmx", "SlowFPVFMx", "true",
+ "Disable VFP / NEON FMA instructions">;
+
// Cortex-A8 / A9 Advanced SIMD has multiplier accumulator forwarding.
def FeatureVMLxForwarding : SubtargetFeature<"vmlx-forwarding",
"HasVMLxForwarding", "true",
@@ -588,6 +592,7 @@ def ProcExynos : SubtargetFeature<"exynos", "ARMProcFamily", "Exynos",
FeatureHWDivThumb,
FeatureHWDivARM,
FeatureHasSlowFPVMLx,
+ FeatureHasSlowFPVFMx,
FeatureHasRetAddrStack,
FeatureFuseLiterals,
FeatureFuseAES,
@@ -918,6 +923,7 @@ def : ProcessorModel<"cortex-a5", CortexA8Model, [ARMv7a, ProcA5,
FeatureTrustZone,
FeatureSlowFPBrcc,
FeatureHasSlowFPVMLx,
+ FeatureHasSlowFPVFMx,
FeatureVMLxForwarding,
FeatureMP,
FeatureVFP4]>;
@@ -928,6 +934,7 @@ def : ProcessorModel<"cortex-a7", CortexA8Model, [ARMv7a, ProcA7,
FeatureSlowFPBrcc,
FeatureHasVMLxHazards,
FeatureHasSlowFPVMLx,
+ FeatureHasSlowFPVFMx,
FeatureVMLxForwarding,
FeatureMP,
FeatureVFP4,
@@ -940,6 +947,7 @@ def : ProcessorModel<"cortex-a8", CortexA8Model, [ARMv7a, ProcA8,
FeatureSlowFPBrcc,
FeatureHasVMLxHazards,
FeatureHasSlowFPVMLx,
+ FeatureHasSlowFPVFMx,
FeatureVMLxForwarding]>;
def : ProcessorModel<"cortex-a9", CortexA9Model, [ARMv7a, ProcA9,
@@ -1009,6 +1017,7 @@ def : ProcessorModel<"swift", SwiftModel, [ARMv7a, ProcSwift,
FeatureAvoidPartialCPSR,
FeatureAvoidMOVsShOp,
FeatureHasSlowFPVMLx,
+ FeatureHasSlowFPVFMx,
FeatureHasVMLxHazards,
FeatureProfUnpredicate,
FeaturePrefISHSTBarrier,
@@ -1027,6 +1036,7 @@ def : ProcessorModel<"cortex-r4f", CortexA8Model, [ARMv7r, ProcR4,
FeatureHasRetAddrStack,
FeatureSlowFPBrcc,
FeatureHasSlowFPVMLx,
+ FeatureHasSlowFPVFMx,
FeatureVFP3_D16,
FeatureAvoidPartialCPSR]>;
@@ -1036,6 +1046,7 @@ def : ProcessorModel<"cortex-r5", CortexA8Model, [ARMv7r, ProcR5,
FeatureSlowFPBrcc,
FeatureHWDivARM,
FeatureHasSlowFPVMLx,
+ FeatureHasSlowFPVFMx,
FeatureAvoidPartialCPSR]>;
def : ProcessorModel<"cortex-r7", CortexA8Model, [ARMv7r, ProcR7,
@@ -1046,6 +1057,7 @@ def : ProcessorModel<"cortex-r7", CortexA8Model, [ARMv7r, ProcR7,
FeatureSlowFPBrcc,
FeatureHWDivARM,
FeatureHasSlowFPVMLx,
+ FeatureHasSlowFPVFMx,
FeatureAvoidPartialCPSR]>;
def : ProcessorModel<"cortex-r8", CortexA8Model, [ARMv7r,
@@ -1056,6 +1068,7 @@ def : ProcessorModel<"cortex-r8", CortexA8Model, [ARMv7r,
FeatureSlowFPBrcc,
FeatureHWDivARM,
FeatureHasSlowFPVMLx,
+ FeatureHasSlowFPVFMx,
FeatureAvoidPartialCPSR]>;
def : ProcessorModel<"cortex-m3", CortexM4Model, [ARMv7m,
@@ -1073,6 +1086,7 @@ def : ProcessorModel<"cortex-m4", CortexM4Model, [ARMv7em,
FeatureVFP4_D16_SP,
FeaturePrefLoopAlign32,
FeatureHasSlowFPVMLx,
+ FeatureHasSlowFPVFMx,
FeatureUseMISched,
FeatureHasNoBranchPredictor]>;
@@ -1087,6 +1101,7 @@ def : ProcessorModel<"cortex-m33", CortexM4Model, [ARMv8mMainline,
FeatureFPARMv8_D16_SP,
FeaturePrefLoopAlign32,
FeatureHasSlowFPVMLx,
+ FeatureHasSlowFPVFMx,
FeatureUseMISched,
FeatureHasNoBranchPredictor]>;
@@ -1095,6 +1110,7 @@ def : ProcessorModel<"cortex-m35p", CortexM4Model, [ARMv8mMainline,
FeatureFPARMv8_D16_SP,
FeaturePrefLoopAlign32,
FeatureHasSlowFPVMLx,
+ FeatureHasSlowFPVFMx,
FeatureUseMISched,
FeatureHasNoBranchPredictor]>;
@@ -1182,6 +1198,7 @@ def : ProcessorModel<"cyclone", SwiftModel, [ARMv8a, ProcSwift,
FeatureAvoidPartialCPSR,
FeatureAvoidMOVsShOp,
FeatureHasSlowFPVMLx,
+ FeatureHasSlowFPVFMx,
FeatureCrypto,
FeatureUseMISched,
FeatureZCZeroing,
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 714ce069483..1ee4e43398c 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -15018,16 +15018,19 @@ int ARMTargetLowering::getScalingFactorCost(const DataLayout &DL,
/// patterns (and we don't have the non-fused floating point instruction).
bool ARMTargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
EVT VT) const {
- if (!Subtarget->hasMVEFloatOps())
- return false;
-
if (!VT.isSimple())
return false;
switch (VT.getSimpleVT().SimpleTy) {
case MVT::v4f32:
case MVT::v8f16:
- return true;
+ return Subtarget->hasMVEFloatOps();
+ case MVT::f16:
+ return Subtarget->useFPVFMx16();
+ case MVT::f32:
+ return Subtarget->useFPVFMx();
+ case MVT::f64:
+ return Subtarget->useFPVFMx64();
default:
break;
}
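
For reference, once this hunk is applied the hook reads roughly as follows. This is a reconstruction from the diff context above, not extra patch content; the unchanged tail of the function is assumed to fall through to a final return false.

bool ARMTargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
                                                   EVT VT) const {
  if (!VT.isSimple())
    return false;

  switch (VT.getSimpleVT().SimpleTy) {
  case MVT::v4f32:
  case MVT::v8f16:
    // Vector FMA stays gated on the MVE float ops, as before.
    return Subtarget->hasMVEFloatOps();
  case MVT::f16:
    return Subtarget->useFPVFMx16();   // scalar half: also needs full FP16
  case MVT::f32:
    return Subtarget->useFPVFMx();
  case MVT::f64:
    return Subtarget->useFPVFMx64();   // scalar double: also needs FP64
  default:
    break;
  }

  return false; // assumed unchanged tail of the function
}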
diff --git a/llvm/lib/Target/ARM/ARMPredicates.td b/llvm/lib/Target/ARM/ARMPredicates.td
index b008d3e2e29..dea1d767beb 100644
--- a/llvm/lib/Target/ARM/ARMPredicates.td
+++ b/llvm/lib/Target/ARM/ARMPredicates.td
@@ -182,11 +182,9 @@ def UseMulOps : Predicate<"Subtarget->useMulOps()">;
// But only select them if more precision in FP computation is allowed, and when
// they are not slower than a mul + add sequence.
// Do not use them for Darwin platforms.
-def UseFusedMAC : Predicate<"(TM.Options.AllowFPOpFusion =="
- " FPOpFusion::Fast && "
- " Subtarget->hasVFP4Base()) && "
- "!Subtarget->isTargetDarwin() &&"
- "Subtarget->useFPVMLx()">;
+def UseFusedMAC : Predicate<"TM.Options.AllowFPOpFusion =="
+ " FPOpFusion::Fast && "
+ "Subtarget->useFPVFMx()">;
def HasFastVGETLNi32 : Predicate<"!Subtarget->hasSlowVGETLNi32()">;
def HasSlowVGETLNi32 : Predicate<"Subtarget->hasSlowVGETLNi32()">;
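
The Darwin and hasVFP4Base() checks removed from the predicate string are not lost: they move into the new ARMSubtarget::useFPVFMx() helper (see the ARMSubtarget.h hunk below). As a sketch, the rewritten predicate is equivalent to:

// Sketch only: the condition the UseFusedMAC predicate string evaluates to
// after this change. TM and Subtarget are the usual predicate context, not
// names introduced by the patch.
bool UseFusedMAC =
    TM.Options.AllowFPOpFusion == FPOpFusion::Fast &&
    Subtarget->useFPVFMx(); // i.e. !Darwin && hasVFP4Base() && !SlowFPVFMx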
diff --git a/llvm/lib/Target/ARM/ARMSubtarget.h b/llvm/lib/Target/ARM/ARMSubtarget.h
index f582a92f656..6bdd021970e 100644
--- a/llvm/lib/Target/ARM/ARMSubtarget.h
+++ b/llvm/lib/Target/ARM/ARMSubtarget.h
@@ -203,6 +203,10 @@ protected:
/// whether the FP VML[AS] instructions are slow (if so, don't use them).
bool SlowFPVMLx = false;
+ /// SlowFPVFMx - If the VFP4 / NEON instructions are available, indicates
+ /// whether the FP VFM[AS] instructions are slow (if so, don't use them).
+ bool SlowFPVFMx = false;
+
/// HasVMLxForwarding - If true, NEON has special multiplier accumulator
/// forwarding to allow mul + mla being issued back to back.
bool HasVMLxForwarding = false;
@@ -632,6 +636,11 @@ public:
bool useMulOps() const { return UseMulOps; }
bool useFPVMLx() const { return !SlowFPVMLx; }
+ bool useFPVFMx() const {
+ return !isTargetDarwin() && hasVFP4Base() && !SlowFPVFMx;
+ }
+ bool useFPVFMx16() const { return useFPVFMx() && hasFullFP16(); }
+ bool useFPVFMx64() const { return useFPVFMx() && hasFP64(); }
bool hasVMLxForwarding() const { return HasVMLxForwarding; }
bool isFPBrccSlow() const { return SlowFPBrcc; }
bool hasFP64() const { return HasFP64; }
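
The three new accessors centralize the gating that the old UseFusedMAC predicate string carried inline. A commented restatement, added here only for illustration (the logic is exactly the three one-liners above; the Darwin rationale is carried over from the old predicate comment):

bool useFPVFMx() const {
  return !isTargetDarwin()  // Darwin keeps its existing no-fused-MAC policy
         && hasVFP4Base()   // VFMA/VFMS were introduced with VFPv4
         && !SlowFPVFMx;    // per-core opt-out via FeatureHasSlowFPVFMx
}
bool useFPVFMx16() const { return useFPVFMx() && hasFullFP16(); } // scalar f16
bool useFPVFMx64() const { return useFPVFMx() && hasFP64(); }     // scalar f64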
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
index f1273b4b8b3..6888c8924fc 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -69,15 +69,15 @@ class ARMTTIImpl : public BasicTTIImplBase<ARMTTIImpl> {
ARM::FeatureDontWidenVMOVS, ARM::FeatureExpandMLx,
ARM::FeatureHasVMLxHazards, ARM::FeatureNEONForFPMovs,
ARM::FeatureNEONForFP, ARM::FeatureCheckVLDnAlign,
- ARM::FeatureHasSlowFPVMLx, ARM::FeatureVMLxForwarding,
- ARM::FeaturePref32BitThumb, ARM::FeatureAvoidPartialCPSR,
- ARM::FeatureCheapPredicableCPSR, ARM::FeatureAvoidMOVsShOp,
- ARM::FeatureHasRetAddrStack, ARM::FeatureHasNoBranchPredictor,
- ARM::FeatureDSP, ARM::FeatureMP, ARM::FeatureVirtualization,
- ARM::FeatureMClass, ARM::FeatureRClass, ARM::FeatureAClass,
- ARM::FeatureNaClTrap, ARM::FeatureStrictAlign, ARM::FeatureLongCalls,
- ARM::FeatureExecuteOnly, ARM::FeatureReserveR9, ARM::FeatureNoMovt,
- ARM::FeatureNoNegativeImmediates
+ ARM::FeatureHasSlowFPVMLx, ARM::FeatureHasSlowFPVFMx,
+ ARM::FeatureVMLxForwarding, ARM::FeaturePref32BitThumb,
+ ARM::FeatureAvoidPartialCPSR, ARM::FeatureCheapPredicableCPSR,
+ ARM::FeatureAvoidMOVsShOp, ARM::FeatureHasRetAddrStack,
+ ARM::FeatureHasNoBranchPredictor, ARM::FeatureDSP, ARM::FeatureMP,
+ ARM::FeatureVirtualization, ARM::FeatureMClass, ARM::FeatureRClass,
+ ARM::FeatureAClass, ARM::FeatureNaClTrap, ARM::FeatureStrictAlign,
+ ARM::FeatureLongCalls, ARM::FeatureExecuteOnly, ARM::FeatureReserveR9,
+ ARM::FeatureNoMovt, ARM::FeatureNoNegativeImmediates
};
const ARMSubtarget *getST() const { return ST; }
diff --git a/llvm/test/CodeGen/ARM/cortex-a57-misched-vfma.ll b/llvm/test/CodeGen/ARM/cortex-a57-misched-vfma.ll
index 1eb7e423770..c0318a6f6df 100644
--- a/llvm/test/CodeGen/ARM/cortex-a57-misched-vfma.ll
+++ b/llvm/test/CodeGen/ARM/cortex-a57-misched-vfma.ll
@@ -93,12 +93,12 @@ define arm_aapcs_vfpcc float @Test3(float %f1, float %f2, float %f3, float %f4,
; CHECK-SAME: Latency=0
; CHECK-DEFAULT: VMLSS
-; CHECK-FAST: VFMSS
-; > VMLSS common latency = 9
+; CHECK-FAST: VFNMSS
+; > VFNMSS common latency = 9
; CHECK: Latency : 9
; CHECK: Successors:
; CHECK: Data
-; > VMLSS read-advanced latency to the next VMLSS = 4
+; > VFNMSS read-advanced latency to the next VMLSS = 4
; CHECK-SAME: Latency=4
; CHECK-DEFAULT: VMLSS
diff --git a/llvm/test/CodeGen/ARM/fp16-fullfp16.ll b/llvm/test/CodeGen/ARM/fp16-fullfp16.ll
index a30b62acbac..a0d41a2f521 100644
--- a/llvm/test/CodeGen/ARM/fp16-fullfp16.ll
+++ b/llvm/test/CodeGen/ARM/fp16-fullfp16.ll
@@ -571,7 +571,7 @@ define void @test_fmuladd(half* %p, half* %q, half* %r) {
; CHECK: vldr.16 s0, [r1]
; CHECK-NEXT: vldr.16 s2, [r0]
; CHECK-NEXT: vldr.16 s4, [r2]
-; CHECK-NEXT: vmla.f16 s4, s2, s0
+; CHECK-NEXT: vfma.f16 s4, s2, s0
; CHECK-NEXT: vstr.16 s4, [r0]
; CHECK-NEXT: bx lr
%a = load half, half* %p, align 2
diff --git a/llvm/test/CodeGen/ARM/fp16-fusedMAC.ll b/llvm/test/CodeGen/ARM/fp16-fusedMAC.ll
index b6387b87262..03909b80059 100644
--- a/llvm/test/CodeGen/ARM/fp16-fusedMAC.ll
+++ b/llvm/test/CodeGen/ARM/fp16-fusedMAC.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=thumbv8.1-m-none-eabi -mattr=+fullfp16 -fp-contract=fast | FileCheck %s
-; RUN: llc < %s -mtriple=thumbv8.1-m-none-eabi -mattr=+fullfp16,+slowfpvmlx -fp-contract=fast | FileCheck %s -check-prefix=DONT-FUSE
+; RUN: llc < %s -mtriple=thumbv8.1-m-none-eabi -mattr=+fullfp16,+slowfpvfmx -fp-contract=fast | FileCheck %s -check-prefix=DONT-FUSE
; Check generated fp16 fused MAC and MLS.
diff --git a/llvm/test/CodeGen/Thumb2/float-intrinsics-double.ll b/llvm/test/CodeGen/Thumb2/float-intrinsics-double.ll
index 05d303adb55..acafde53ac8 100644
--- a/llvm/test/CodeGen/Thumb2/float-intrinsics-double.ll
+++ b/llvm/test/CodeGen/Thumb2/float-intrinsics-double.ll
@@ -201,7 +201,7 @@ define double @fmuladd_d(double %a, double %b, double %c) {
; SOFT: bl __aeabi_dadd
; VFP4: vmul.f64
; VFP4: vadd.f64
-; FP-ARMv8: vmla.f64
+; FP-ARMv8: vfma.f64
%1 = call double @llvm.fmuladd.f64(double %a, double %b, double %c)
ret double %1
}
diff --git a/llvm/test/CodeGen/Thumb2/float-intrinsics-float.ll b/llvm/test/CodeGen/Thumb2/float-intrinsics-float.ll
index ec81164b422..1263ae15b46 100644
--- a/llvm/test/CodeGen/Thumb2/float-intrinsics-float.ll
+++ b/llvm/test/CodeGen/Thumb2/float-intrinsics-float.ll
@@ -194,7 +194,7 @@ define float @fmuladd_f(float %a, float %b, float %c) {
; CHECK-LABEL: fmuladd_f:
; SOFT: bl __aeabi_fmul
; SOFT: bl __aeabi_fadd
-; VMLA: vmla.f32
+; VMLA: vfma.f32
; NO-VMLA: vmul.f32
; NO-VMLA: vadd.f32
%1 = call float @llvm.fmuladd.f32(float %a, float %b, float %c)