| author | Sjoerd Meijer <sjoerd.meijer@arm.com> | 2018-09-24 12:02:50 +0000 |
|---|---|---|
| committer | Sjoerd Meijer <sjoerd.meijer@arm.com> | 2018-09-24 12:02:50 +0000 |
| commit | d986ede313227a5f473f83c8ba52130255955514 | |
| tree | 80aca1adfaa4dbbbeba0d7494484d9bbfb33a951 | |
| parent | 5555c009026eba9aef07544731ef6cd421cc7ccc | |
[ARM] Do not fuse VADD and VMUL on the Cortex-M4 and Cortex-M33
A sequence of VMUL and VADD instructions always gives the same or better
performance than a fused VMLA instruction on the Cortex-M4 and Cortex-M33.
Executing the VMUL and VADD back-to-back takes the same number of cycles,
but having separate instructions lets the scheduler avoid the hazard between
the two. The fused VMLA is still selected when optimising for minimum code
size, where the single instruction has the smaller encoding (see the
UseFPVMLx predicate change and the vmla_minsize test below).
Differential Revision: https://reviews.llvm.org/D52289
llvm-svn: 342874
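The effect is easiest to see in IR. Below is a hypothetical standalone
reduction of the fmacs.ll changes in this commit (the file and function
names are illustrative, not part of the patch); the comments describe the
lowering that the new CHECK lines expect:

```llvm
; Illustrative reduction of the fmacs.ll test changes; compile with e.g.:
;   llc -mtriple=arm-linux-gnueabi -mcpu=cortex-m4 -float-abi=hard

; After this patch, expected to lower to vmul.f32 followed by vadd.f32 on
; Cortex-M4 and Cortex-M33, instead of a single fused vmla.f32.
define float @mul_add(float %acc, float %a, float %b) {
entry:
  %0 = fmul float %a, %b
  %1 = fadd float %acc, %0
  ret float %1
}

; With minsize, the single (smaller) vmla.f32 is still expected, per the
; optForMinSize() clause added to the UseFPVMLx predicate below.
define float @mul_add_minsize(float %acc, float %a, float %b) minsize optsize {
entry:
  %0 = fmul float %a, %b
  %1 = fadd float %acc, %0
  ret float %1
}
```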
| file | lines changed |
|---|---|
| llvm/lib/Target/ARM/ARM.td | 2 |
| llvm/lib/Target/ARM/ARMInstrInfo.td | 6 |
| llvm/test/CodeGen/ARM/fmacs.ll | 19 |
| llvm/test/CodeGen/Thumb2/float-intrinsics-float.ll | 5 |
4 files changed, 26 insertions, 6 deletions
```diff
diff --git a/llvm/lib/Target/ARM/ARM.td b/llvm/lib/Target/ARM/ARM.td
index c42a4ebf273..62a32ac12df 100644
--- a/llvm/lib/Target/ARM/ARM.td
+++ b/llvm/lib/Target/ARM/ARM.td
@@ -966,6 +966,7 @@ def : ProcessorModel<"cortex-m4", CortexM3Model, [ARMv7em,
                                                    FeatureVFPOnlySP,
                                                    FeatureD16,
                                                    FeaturePrefLoopAlign32,
+                                                   FeatureHasSlowFPVMLx,
                                                    FeatureHasNoBranchPredictor]>;
 
 def : ProcNoItin<"cortex-m7", [ARMv7em,
@@ -981,6 +982,7 @@ def : ProcessorModel<"cortex-m33", CortexM3Model, [ARMv8mMainline,
                                                     FeatureD16,
                                                     FeatureVFPOnlySP,
                                                     FeaturePrefLoopAlign32,
+                                                    FeatureHasSlowFPVMLx,
                                                     FeatureHasNoBranchPredictor]>;
 
 def : ProcNoItin<"cortex-a32", [ARMv8a,
diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.td b/llvm/lib/Target/ARM/ARMInstrInfo.td
index 57e515c197f..5342b99fc11 100644
--- a/llvm/lib/Target/ARM/ARMInstrInfo.td
+++ b/llvm/lib/Target/ARM/ARMInstrInfo.td
@@ -353,10 +353,10 @@ def UseNegativeImmediates :
 let RecomputePerFunction = 1 in {
   def UseMovt : Predicate<"Subtarget->useMovt(*MF)">;
   def DontUseMovt : Predicate<"!Subtarget->useMovt(*MF)">;
-  def UseMovtInPic : Predicate<"Subtarget->useMovt(*MF) && Subtarget->allowPositionIndependentMovt()">;
-  def DontUseMovtInPic : Predicate<"!Subtarget->useMovt(*MF) || !Subtarget->allowPositionIndependentMovt()">;
+  def UseMovtInPic : Predicate<"Subtarget->useMovt(*MF) && Subtarget->allowPositionIndependentMovt()">;
+  def DontUseMovtInPic : Predicate<"!Subtarget->useMovt(*MF) || !Subtarget->allowPositionIndependentMovt()">;
+  def UseFPVMLx : Predicate<"Subtarget->useFPVMLx() || MF->getFunction().optForMinSize()">;
 }
-def UseFPVMLx : Predicate<"Subtarget->useFPVMLx()">;
 def UseMulOps : Predicate<"Subtarget->useMulOps()">;
 
 // Prefer fused MAC for fp mul + add over fp VMLA / VMLS if they are available.
diff --git a/llvm/test/CodeGen/ARM/fmacs.ll b/llvm/test/CodeGen/ARM/fmacs.ll
index aa492708c0b..027991ef2c9 100644
--- a/llvm/test/CodeGen/ARM/fmacs.ll
+++ b/llvm/test/CodeGen/ARM/fmacs.ll
@@ -3,6 +3,8 @@
 ; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a8 %s -o - | FileCheck %s -check-prefix=A8
 ; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a9 %s -o - | FileCheck %s -check-prefix=A9
 ; RUN: llc -mtriple=arm-linux-gnueabi -mcpu=cortex-a9 -float-abi=hard %s -o - | FileCheck %s -check-prefix=HARD
+; RUN: llc -mtriple=arm-linux-gnueabi -mcpu=cortex-m4 -float-abi=hard %s -o - | FileCheck %s -check-prefix=VMLA
+; RUN: llc -mtriple=arm-linux-gnueabi -mcpu=cortex-m33 -float-abi=hard %s -o - | FileCheck %s -check-prefix=VMLA
 
 define float @t1(float %acc, float %a, float %b) {
 entry:
@@ -15,6 +17,21 @@ entry:
 ; A8-LABEL: t1:
 ; A8: vmul.f32
 ; A8: vadd.f32
+
+; VMLA-LABEL: t1:
+; VMLA: vmul.f32
+; VMLA-NEXT: vadd.f32
+
+  %0 = fmul float %a, %b
+  %1 = fadd float %acc, %0
+  ret float %1
+}
+
+define float @vmla_minsize(float %acc, float %a, float %b) #0 {
+entry:
+; VMLA-LABEL: vmla_minsize:
+; VMLA: vmla.f32 s0, s1, s2
+
   %0 = fmul float %a, %b
   %1 = fadd float %acc, %0
   ret float %1
@@ -102,3 +119,5 @@ entry:
   %3 = fadd float %1, %2
   ret float %3
 }
+
+attributes #0 = { minsize nounwind optsize }
diff --git a/llvm/test/CodeGen/Thumb2/float-intrinsics-float.ll b/llvm/test/CodeGen/Thumb2/float-intrinsics-float.ll
index 847aeacd2f9..8ee2af03eca 100644
--- a/llvm/test/CodeGen/Thumb2/float-intrinsics-float.ll
+++ b/llvm/test/CodeGen/Thumb2/float-intrinsics-float.ll
@@ -1,5 +1,6 @@
 ; RUN: llc < %s -mtriple=thumbv7-none-eabi -mcpu=cortex-m3 | FileCheck %s -check-prefix=CHECK -check-prefix=SOFT -check-prefix=NONE
-; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mcpu=cortex-m4 | FileCheck %s -check-prefix=CHECK -check-prefix=HARD -check-prefix=SP -check-prefix=VMLA
+; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mcpu=cortex-m4 | FileCheck %s -check-prefix=CHECK -check-prefix=HARD -check-prefix=SP -check-prefix=NO-VMLA
+; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mcpu=cortex-m33 | FileCheck %s -check-prefix=CHECK -check-prefix=HARD -check-prefix=SP -check-prefix=NO-VMLA
 ; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mcpu=cortex-m7 | FileCheck %s -check-prefix=CHECK -check-prefix=HARD -check-prefix=DP -check-prefix=VFP -check-prefix=FP-ARMv8 -check-prefix=VMLA
 ; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mcpu=cortex-m7 -mattr=+fp-only-sp | FileCheck %s -check-prefix=CHECK -check-prefix=HARD -check-prefix=SP -check-prefix=FP-ARMv8 -check-prefix=VMLA
 ; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mcpu=cortex-a7 | FileCheck %s -check-prefix=CHECK -check-prefix=HARD -check-prefix=DP -check-prefix=NEON -check-prefix=VFP4 -check-prefix=NO-VMLA
@@ -188,8 +189,6 @@ define float @round_f(float %a) {
   ret float %1
 }
 
-; FIXME: why does cortex-m4 use vmla, while cortex-a7 uses vmul+vadd?
-; (these should be equivalent, even the rounding is the same)
 declare float @llvm.fmuladd.f32(float %a, float %b, float %c)
 define float @fmuladd_f(float %a, float %b, float %c) {
 ; CHECK-LABEL: fmuladd_f:
```
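The added coverage can also be exercised by hand via the new RUN lines, e.g.
`llc -mtriple=arm-linux-gnueabi -mcpu=cortex-m4 -float-abi=hard llvm/test/CodeGen/ARM/fmacs.ll -o - | FileCheck llvm/test/CodeGen/ARM/fmacs.ll -check-prefix=VMLA`
(a sketch; it assumes llc and FileCheck from a matching build are on PATH and
the command is run from the monorepo root).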

