-rw-r--r--  llvm/lib/Target/ARM/ARM.td                          | 17
-rw-r--r--  llvm/lib/Target/ARM/ARMISelLowering.cpp             | 11
-rw-r--r--  llvm/lib/Target/ARM/ARMPredicates.td                |  8
-rw-r--r--  llvm/lib/Target/ARM/ARMSubtarget.h                  |  9
-rw-r--r--  llvm/lib/Target/ARM/ARMTargetTransformInfo.h        | 18
-rw-r--r--  llvm/test/CodeGen/ARM/cortex-a57-misched-vfma.ll    |  6
-rw-r--r--  llvm/test/CodeGen/ARM/fp16-fullfp16.ll              |  2
-rw-r--r--  llvm/test/CodeGen/ARM/fp16-fusedMAC.ll              |  2
-rw-r--r--  llvm/test/CodeGen/Thumb2/float-intrinsics-double.ll |  2
-rw-r--r--  llvm/test/CodeGen/Thumb2/float-intrinsics-float.ll  |  2
10 files changed, 52 insertions, 25 deletions
diff --git a/llvm/lib/Target/ARM/ARM.td b/llvm/lib/Target/ARM/ARM.td
index 66bfd4c82e2..380eaa86368 100644
--- a/llvm/lib/Target/ARM/ARM.td
+++ b/llvm/lib/Target/ARM/ARM.td
@@ -303,6 +303,10 @@ def FeatureNonpipelinedVFP : SubtargetFeature<"nonpipelined-vfp",
 def FeatureHasSlowFPVMLx : SubtargetFeature<"slowfpvmlx", "SlowFPVMLx", "true",
                                             "Disable VFP / NEON MAC instructions">;

+// VFPv4 added VFMA instructions that can similarly be fast or slow.
+def FeatureHasSlowFPVFMx : SubtargetFeature<"slowfpvfmx", "SlowFPVFMx", "true",
+                                            "Disable VFP / NEON FMA instructions">;
+
 // Cortex-A8 / A9 Advanced SIMD has multiplier accumulator forwarding.
 def FeatureVMLxForwarding : SubtargetFeature<"vmlx-forwarding",
                                              "HasVMLxForwarding", "true",
@@ -588,6 +592,7 @@ def ProcExynos : SubtargetFeature<"exynos", "ARMProcFamily", "Exynos",
                                   FeatureHWDivThumb,
                                   FeatureHWDivARM,
                                   FeatureHasSlowFPVMLx,
+                                  FeatureHasSlowFPVFMx,
                                   FeatureHasRetAddrStack,
                                   FeatureFuseLiterals,
                                   FeatureFuseAES,
@@ -918,6 +923,7 @@ def : ProcessorModel<"cortex-a5", CortexA8Model, [ARMv7a, ProcA5,
                      FeatureTrustZone,
                      FeatureSlowFPBrcc,
                      FeatureHasSlowFPVMLx,
+                     FeatureHasSlowFPVFMx,
                      FeatureVMLxForwarding,
                      FeatureMP,
                      FeatureVFP4]>;
@@ -928,6 +934,7 @@ def : ProcessorModel<"cortex-a7", CortexA8Model, [ARMv7a, ProcA7,
                      FeatureSlowFPBrcc,
                      FeatureHasVMLxHazards,
                      FeatureHasSlowFPVMLx,
+                     FeatureHasSlowFPVFMx,
                      FeatureVMLxForwarding,
                      FeatureMP,
                      FeatureVFP4,
@@ -940,6 +947,7 @@ def : ProcessorModel<"cortex-a8", CortexA8Model, [ARMv7a, ProcA8,
                      FeatureSlowFPBrcc,
                      FeatureHasVMLxHazards,
                      FeatureHasSlowFPVMLx,
+                     FeatureHasSlowFPVFMx,
                      FeatureVMLxForwarding]>;

 def : ProcessorModel<"cortex-a9", CortexA9Model, [ARMv7a, ProcA9,
@@ -1009,6 +1017,7 @@ def : ProcessorModel<"swift", SwiftModel, [ARMv7a, ProcSwift,
                      FeatureAvoidPartialCPSR,
                      FeatureAvoidMOVsShOp,
                      FeatureHasSlowFPVMLx,
+                     FeatureHasSlowFPVFMx,
                      FeatureHasVMLxHazards,
                      FeatureProfUnpredicate,
                      FeaturePrefISHSTBarrier,
@@ -1027,6 +1036,7 @@ def : ProcessorModel<"cortex-r4f", CortexA8Model, [ARMv7r, ProcR4,
                      FeatureHasRetAddrStack,
                      FeatureSlowFPBrcc,
                      FeatureHasSlowFPVMLx,
+                     FeatureHasSlowFPVFMx,
                      FeatureVFP3_D16,
                      FeatureAvoidPartialCPSR]>;

@@ -1036,6 +1046,7 @@ def : ProcessorModel<"cortex-r5", CortexA8Model, [ARMv7r, ProcR5,
                      FeatureSlowFPBrcc,
                      FeatureHWDivARM,
                      FeatureHasSlowFPVMLx,
+                     FeatureHasSlowFPVFMx,
                      FeatureAvoidPartialCPSR]>;

 def : ProcessorModel<"cortex-r7", CortexA8Model, [ARMv7r, ProcR7,
@@ -1046,6 +1057,7 @@ def : ProcessorModel<"cortex-r7", CortexA8Model, [ARMv7r, ProcR7,
                      FeatureSlowFPBrcc,
                      FeatureHWDivARM,
                      FeatureHasSlowFPVMLx,
+                     FeatureHasSlowFPVFMx,
                      FeatureAvoidPartialCPSR]>;

 def : ProcessorModel<"cortex-r8", CortexA8Model, [ARMv7r,
@@ -1056,6 +1068,7 @@ def : ProcessorModel<"cortex-r8", CortexA8Model, [ARMv7r,
                      FeatureSlowFPBrcc,
                      FeatureHWDivARM,
                      FeatureHasSlowFPVMLx,
+                     FeatureHasSlowFPVFMx,
                      FeatureAvoidPartialCPSR]>;

 def : ProcessorModel<"cortex-m3", CortexM4Model, [ARMv7m,
@@ -1073,6 +1086,7 @@ def : ProcessorModel<"cortex-m4", CortexM4Model, [ARMv7em,
                      FeatureVFP4_D16_SP,
                      FeaturePrefLoopAlign32,
                      FeatureHasSlowFPVMLx,
+                     FeatureHasSlowFPVFMx,
                      FeatureUseMISched,
                      FeatureHasNoBranchPredictor]>;

@@ -1087,6 +1101,7 @@ def : ProcessorModel<"cortex-m33", CortexM4Model, [ARMv8mMainline,
                      FeatureFPARMv8_D16_SP,
                      FeaturePrefLoopAlign32,
                      FeatureHasSlowFPVMLx,
+                     FeatureHasSlowFPVFMx,
                      FeatureUseMISched,
                      FeatureHasNoBranchPredictor]>;

@@ -1095,6 +1110,7 @@ def : ProcessorModel<"cortex-m35p", CortexM4Model, [ARMv8mMainline,
                      FeatureFPARMv8_D16_SP,
                      FeaturePrefLoopAlign32,
                      FeatureHasSlowFPVMLx,
+                     FeatureHasSlowFPVFMx,
                      FeatureUseMISched,
                      FeatureHasNoBranchPredictor]>;

@@ -1182,6 +1198,7 @@ def : ProcessorModel<"cyclone", SwiftModel, [ARMv8a, ProcSwift,
                      FeatureAvoidPartialCPSR,
                      FeatureAvoidMOVsShOp,
                      FeatureHasSlowFPVMLx,
+                     FeatureHasSlowFPVFMx,
                      FeatureCrypto,
                      FeatureUseMISched,
                      FeatureZCZeroing,
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 714ce069483..1ee4e43398c 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -15018,16 +15018,19 @@ int ARMTargetLowering::getScalingFactorCost(const DataLayout &DL,
 /// patterns (and we don't have the non-fused floating point instruction).
 bool ARMTargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
                                                    EVT VT) const {
-  if (!Subtarget->hasMVEFloatOps())
-    return false;
-
   if (!VT.isSimple())
     return false;

   switch (VT.getSimpleVT().SimpleTy) {
   case MVT::v4f32:
   case MVT::v8f16:
-    return true;
+    return Subtarget->hasMVEFloatOps();
+  case MVT::f16:
+    return Subtarget->useFPVFMx16();
+  case MVT::f32:
+    return Subtarget->useFPVFMx();
+  case MVT::f64:
+    return Subtarget->useFPVFMx64();
   default:
     break;
   }
diff --git a/llvm/lib/Target/ARM/ARMPredicates.td b/llvm/lib/Target/ARM/ARMPredicates.td
index b008d3e2e29..dea1d767beb 100644
--- a/llvm/lib/Target/ARM/ARMPredicates.td
+++ b/llvm/lib/Target/ARM/ARMPredicates.td
@@ -182,11 +182,9 @@ def UseMulOps : Predicate<"Subtarget->useMulOps()">;
 // But only select them if more precision in FP computation is allowed, and when
 // they are not slower than a mul + add sequence.
 // Do not use them for Darwin platforms.
-def UseFusedMAC      : Predicate<"(TM.Options.AllowFPOpFusion =="
-                                 " FPOpFusion::Fast && "
-                                 " Subtarget->hasVFP4Base()) && "
-                                 "!Subtarget->isTargetDarwin() &&"
-                                 "Subtarget->useFPVMLx()">;
+def UseFusedMAC      : Predicate<"TM.Options.AllowFPOpFusion =="
+                                 " FPOpFusion::Fast && "
+                                 "Subtarget->useFPVFMx()">;

 def HasFastVGETLNi32 : Predicate<"!Subtarget->hasSlowVGETLNi32()">;
 def HasSlowVGETLNi32 : Predicate<"Subtarget->hasSlowVGETLNi32()">;
diff --git a/llvm/lib/Target/ARM/ARMSubtarget.h b/llvm/lib/Target/ARM/ARMSubtarget.h
index f582a92f656..6bdd021970e 100644
--- a/llvm/lib/Target/ARM/ARMSubtarget.h
+++ b/llvm/lib/Target/ARM/ARMSubtarget.h
@@ -203,6 +203,10 @@ protected:
   /// whether the FP VML[AS] instructions are slow (if so, don't use them).
   bool SlowFPVMLx = false;

+  /// SlowFPVFMx - If the VFP4 / NEON instructions are available, indicates
+  /// whether the FP VFM[AS] instructions are slow (if so, don't use them).
+  bool SlowFPVFMx = false;
+
   /// HasVMLxForwarding - If true, NEON has special multiplier accumulator
   /// forwarding to allow mul + mla being issued back to back.
   bool HasVMLxForwarding = false;
@@ -632,6 +636,11 @@
   bool useMulOps() const { return UseMulOps; }
   bool useFPVMLx() const { return !SlowFPVMLx; }
+  bool useFPVFMx() const {
+    return !isTargetDarwin() && hasVFP4Base() && !SlowFPVFMx;
+  }
+  bool useFPVFMx16() const { return useFPVFMx() && hasFullFP16(); }
+  bool useFPVFMx64() const { return useFPVFMx() && hasFP64(); }
   bool hasVMLxForwarding() const { return HasVMLxForwarding; }
   bool isFPBrccSlow() const { return SlowFPBrcc; }
   bool hasFP64() const { return HasFP64; }
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
index f1273b4b8b3..6888c8924fc 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -69,15 +69,15 @@ class ARMTTIImpl : public BasicTTIImplBase<ARMTTIImpl> {
       ARM::FeatureDontWidenVMOVS, ARM::FeatureExpandMLx,
       ARM::FeatureHasVMLxHazards, ARM::FeatureNEONForFPMovs,
       ARM::FeatureNEONForFP, ARM::FeatureCheckVLDnAlign,
-      ARM::FeatureHasSlowFPVMLx, ARM::FeatureVMLxForwarding,
-      ARM::FeaturePref32BitThumb, ARM::FeatureAvoidPartialCPSR,
-      ARM::FeatureCheapPredicableCPSR, ARM::FeatureAvoidMOVsShOp,
-      ARM::FeatureHasRetAddrStack, ARM::FeatureHasNoBranchPredictor,
-      ARM::FeatureDSP, ARM::FeatureMP, ARM::FeatureVirtualization,
-      ARM::FeatureMClass, ARM::FeatureRClass, ARM::FeatureAClass,
-      ARM::FeatureNaClTrap, ARM::FeatureStrictAlign, ARM::FeatureLongCalls,
-      ARM::FeatureExecuteOnly, ARM::FeatureReserveR9, ARM::FeatureNoMovt,
-      ARM::FeatureNoNegativeImmediates
+      ARM::FeatureHasSlowFPVMLx, ARM::FeatureHasSlowFPVFMx,
+      ARM::FeatureVMLxForwarding, ARM::FeaturePref32BitThumb,
+      ARM::FeatureAvoidPartialCPSR, ARM::FeatureCheapPredicableCPSR,
+      ARM::FeatureAvoidMOVsShOp, ARM::FeatureHasRetAddrStack,
+      ARM::FeatureHasNoBranchPredictor, ARM::FeatureDSP, ARM::FeatureMP,
+      ARM::FeatureVirtualization, ARM::FeatureMClass, ARM::FeatureRClass,
+      ARM::FeatureAClass, ARM::FeatureNaClTrap, ARM::FeatureStrictAlign,
+      ARM::FeatureLongCalls, ARM::FeatureExecuteOnly, ARM::FeatureReserveR9,
+      ARM::FeatureNoMovt, ARM::FeatureNoNegativeImmediates
   };

   const ARMSubtarget *getST() const { return ST; }
diff --git a/llvm/test/CodeGen/ARM/cortex-a57-misched-vfma.ll b/llvm/test/CodeGen/ARM/cortex-a57-misched-vfma.ll
index 1eb7e423770..c0318a6f6df 100644
--- a/llvm/test/CodeGen/ARM/cortex-a57-misched-vfma.ll
+++ b/llvm/test/CodeGen/ARM/cortex-a57-misched-vfma.ll
@@ -93,12 +93,12 @@ define arm_aapcs_vfpcc float @Test3(float %f1, float %f2, float %f3, float %f4,
 ; CHECK-SAME: Latency=0

 ; CHECK-DEFAULT: VMLSS
-; CHECK-FAST: VFMSS
-; > VMLSS common latency = 9
+; CHECK-FAST: VFNMSS
+; > VFNMSS common latency = 9
 ; CHECK: Latency : 9
 ; CHECK: Successors:
 ; CHECK: Data
-; > VMLSS read-advanced latency to the next VMLSS = 4
+; > VFNMSS read-advanced latency to the next VMLSS = 4
 ; CHECK-SAME: Latency=4

 ; CHECK-DEFAULT: VMLSS
diff --git a/llvm/test/CodeGen/ARM/fp16-fullfp16.ll b/llvm/test/CodeGen/ARM/fp16-fullfp16.ll
index a30b62acbac..a0d41a2f521 100644
--- a/llvm/test/CodeGen/ARM/fp16-fullfp16.ll
+++ b/llvm/test/CodeGen/ARM/fp16-fullfp16.ll
@@ -571,7 +571,7 @@ define void @test_fmuladd(half* %p, half* %q, half* %r) {
 ; CHECK:      vldr.16 s0, [r1]
 ; CHECK-NEXT: vldr.16 s2, [r0]
 ; CHECK-NEXT: vldr.16 s4, [r2]
-; CHECK-NEXT: vmla.f16 s4, s2, s0
+; CHECK-NEXT: vfma.f16 s4, s2, s0
 ; CHECK-NEXT: vstr.16 s4, [r0]
 ; CHECK-NEXT: bx lr
   %a = load half, half* %p, align 2
diff --git a/llvm/test/CodeGen/ARM/fp16-fusedMAC.ll b/llvm/test/CodeGen/ARM/fp16-fusedMAC.ll
index b6387b87262..03909b80059 100644
--- a/llvm/test/CodeGen/ARM/fp16-fusedMAC.ll
+++ b/llvm/test/CodeGen/ARM/fp16-fusedMAC.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=thumbv8.1-m-none-eabi -mattr=+fullfp16 -fp-contract=fast | FileCheck %s
-; RUN: llc < %s -mtriple=thumbv8.1-m-none-eabi -mattr=+fullfp16,+slowfpvmlx -fp-contract=fast | FileCheck %s -check-prefix=DONT-FUSE
+; RUN: llc < %s -mtriple=thumbv8.1-m-none-eabi -mattr=+fullfp16,+slowfpvfmx -fp-contract=fast | FileCheck %s -check-prefix=DONT-FUSE

 ; Check generated fp16 fused MAC and MLS.

diff --git a/llvm/test/CodeGen/Thumb2/float-intrinsics-double.ll b/llvm/test/CodeGen/Thumb2/float-intrinsics-double.ll
index 05d303adb55..acafde53ac8 100644
--- a/llvm/test/CodeGen/Thumb2/float-intrinsics-double.ll
+++ b/llvm/test/CodeGen/Thumb2/float-intrinsics-double.ll
@@ -201,7 +201,7 @@ define double @fmuladd_d(double %a, double %b, double %c) {
 ; SOFT: bl __aeabi_dadd
 ; VFP4: vmul.f64
 ; VFP4: vadd.f64
-; FP-ARMv8: vmla.f64
+; FP-ARMv8: vfma.f64
   %1 = call double @llvm.fmuladd.f64(double %a, double %b, double %c)
   ret double %1
 }
diff --git a/llvm/test/CodeGen/Thumb2/float-intrinsics-float.ll b/llvm/test/CodeGen/Thumb2/float-intrinsics-float.ll
index ec81164b422..1263ae15b46 100644
--- a/llvm/test/CodeGen/Thumb2/float-intrinsics-float.ll
+++ b/llvm/test/CodeGen/Thumb2/float-intrinsics-float.ll
@@ -194,7 +194,7 @@ define float @fmuladd_f(float %a, float %b, float %c) {
 ; CHECK-LABEL: fmuladd_f:
 ; SOFT: bl __aeabi_fmul
 ; SOFT: bl __aeabi_fadd
-; VMLA: vmla.f32
+; VMLA: vfma.f32
 ; NO-VMLA: vmul.f32
 ; NO-VMLA: vadd.f32
   %1 = call float @llvm.fmuladd.f32(float %a, float %b, float %c)
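
The commit does not add a standalone scalar-f32 test for the new attribute, so the following is a minimal sketch (not part of the patch) of how the +slowfpvfmx flag wired up above could be exercised from an .ll test; the triple, attribute mix, and function name are assumptions, and only the absence of a fused vfma is checked, since the non-fused fallback may be emitted either as vmla or as vmul + vadd depending on the rest of the subtarget.

; Hypothetical follow-up test, NOT part of this commit.
; RUN: llc < %s -mtriple=armv7a-none-eabi -mattr=+vfp4,+slowfpvfmx -fp-contract=fast | FileCheck %s

define float @no_fused_mac(float %a, float %b, float %c) {
; CHECK-LABEL: no_fused_mac:
; CHECK-NOT: vfma.f32
; CHECK: bx lr
  %1 = call float @llvm.fmuladd.f32(float %a, float %b, float %c)
  ret float %1
}

declare float @llvm.fmuladd.f32(float, float, float)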