diff options
author | Sjoerd Meijer <sjoerd.meijer@arm.com> | 2018-10-17 07:26:35 +0000 |
---|---|---|
committer | Sjoerd Meijer <sjoerd.meijer@arm.com> | 2018-10-17 07:26:35 +0000 |
commit | ff3ab33ec89042ed93bc8488b6c6971a1b4a9cf1 (patch) | |
tree | bbbc3869f9ff6d1e6e1b71eb1f91751975e67387 /llvm/lib/Target | |
parent | e85af163bcb54f7510f2bd582f749f0d1505ff9b (diff) | |
download | bcm5719-llvm-ff3ab33ec89042ed93bc8488b6c6971a1b4a9cf1.tar.gz bcm5719-llvm-ff3ab33ec89042ed93bc8488b6c6971a1b4a9cf1.zip |
[ARM][NFCI] Do not fuse VADD and VMUL, continued (1/2)
This is a follow-up to rL342874, which stopped fusing muls and adds into VMLAs
for performance reasons on the Cortex-M4 and Cortex-M33. This is a series of 2
patches that tries to achieve the same for VFMA. The second column in the
table below shows what we were generating before rL342874, the third column
what changed with rL342874, and the last column what we want to achieve with
these 2 patches:
--------------------------------------------------------
| Opt | < rL342874 | >= rL342874 | |
|------------------------------------------------------|
|-O3 | vmla | vmul | vmul |
| | | vadd | vadd |
|------------------------------------------------------|
|-Ofast | vfma | vfma | vmul |
| | | | vadd |
|------------------------------------------------------|
|-Oz | vmla | vmla | vmla |
--------------------------------------------------------
This patch, 1/2, is a cleanup of the spaghetti predicate logic on the different
VMLA and VFMA codegen rules, so that we can make the final functional change in
patch 2/2. This also fixes a typo in the regression test added in rL342874.
Differential revision: https://reviews.llvm.org/D53314
llvm-svn: 344671
Diffstat (limited to 'llvm/lib/Target')
-rw-r--r-- | llvm/lib/Target/ARM/ARMInstrInfo.td | 9 | ||||
-rw-r--r-- | llvm/lib/Target/ARM/ARMInstrNEON.td | 20 | ||||
-rw-r--r-- | llvm/lib/Target/ARM/ARMInstrVFP.td | 54 |
3 files changed, 41 insertions, 42 deletions
diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.td b/llvm/lib/Target/ARM/ARMInstrInfo.td index e1a077ef166..8aa05fac8a3 100644 --- a/llvm/lib/Target/ARM/ARMInstrInfo.td +++ b/llvm/lib/Target/ARM/ARMInstrInfo.td @@ -357,7 +357,10 @@ let RecomputePerFunction = 1 in { def DontUseMovt : Predicate<"!Subtarget->useMovt(*MF)">; def UseMovtInPic : Predicate<"Subtarget->useMovt(*MF) && Subtarget->allowPositionIndependentMovt()">; def DontUseMovtInPic : Predicate<"!Subtarget->useMovt(*MF) || !Subtarget->allowPositionIndependentMovt()">; - def UseFPVMLx : Predicate<"Subtarget->useFPVMLx() || MF->getFunction().optForMinSize()">; + + def UseFPVMLx: Predicate<"((Subtarget->useFPVMLx() &&" + " !TM.Options.AllowFPOpFusion == FPOpFusion::Fast) ||" + "MF->getFunction().optForMinSize())">; } def UseMulOps : Predicate<"Subtarget->useMulOps()">; @@ -368,10 +371,6 @@ def UseFusedMAC : Predicate<"(TM.Options.AllowFPOpFusion ==" " FPOpFusion::Fast && " " Subtarget->hasVFP4()) && " "!Subtarget->isTargetDarwin()">; -def DontUseFusedMAC : Predicate<"!(TM.Options.AllowFPOpFusion ==" - " FPOpFusion::Fast &&" - " Subtarget->hasVFP4()) || " - "Subtarget->isTargetDarwin()">; def HasFastVGETLNi32 : Predicate<"!Subtarget->hasSlowVGETLNi32()">; def HasSlowVGETLNi32 : Predicate<"Subtarget->hasSlowVGETLNi32()">; diff --git a/llvm/lib/Target/ARM/ARMInstrNEON.td b/llvm/lib/Target/ARM/ARMInstrNEON.td index a7bb32d31f6..2085507056b 100644 --- a/llvm/lib/Target/ARM/ARMInstrNEON.td +++ b/llvm/lib/Target/ARM/ARMInstrNEON.td @@ -4402,16 +4402,16 @@ defm VMLA : N3VMulOp_QHS<0, 0, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D, IIC_VMACi16Q, IIC_VMACi32Q, "vmla", "i", add>; def VMLAfd : N3VDMulOp<0, 0, 0b00, 0b1101, 1, IIC_VMACD, "vmla", "f32", v2f32, fmul_su, fadd_mlx>, - Requires<[HasNEON, UseFPVMLx, DontUseFusedMAC]>; + Requires<[HasNEON, UseFPVMLx]>; def VMLAfq : N3VQMulOp<0, 0, 0b00, 0b1101, 1, IIC_VMACQ, "vmla", "f32", v4f32, fmul_su, fadd_mlx>, - Requires<[HasNEON, UseFPVMLx, DontUseFusedMAC]>; + Requires<[HasNEON, 
UseFPVMLx]>; def VMLAhd : N3VDMulOp<0, 0, 0b01, 0b1101, 1, IIC_VMACD, "vmla", "f16", v4f16, fmul_su, fadd_mlx>, - Requires<[HasNEON, HasFullFP16, UseFPVMLx, DontUseFusedMAC]>; + Requires<[HasNEON, HasFullFP16, UseFPVMLx]>; def VMLAhq : N3VQMulOp<0, 0, 0b01, 0b1101, 1, IIC_VMACQ, "vmla", "f16", v8f16, fmul_su, fadd_mlx>, - Requires<[HasNEON, HasFullFP16, UseFPVMLx, DontUseFusedMAC]>; + Requires<[HasNEON, HasFullFP16, UseFPVMLx]>; defm VMLAsl : N3VMulOpSL_HS<0b0000, IIC_VMACi16D, IIC_VMACi32D, IIC_VMACi16Q, IIC_VMACi32Q, "vmla", "i", add>; def VMLAslfd : N3VDMulOpSL<0b10, 0b0001, IIC_VMACD, "vmla", "f32", @@ -4632,16 +4632,16 @@ defm VMLS : N3VMulOp_QHS<1, 0, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D, IIC_VMACi16Q, IIC_VMACi32Q, "vmls", "i", sub>; def VMLSfd : N3VDMulOp<0, 0, 0b10, 0b1101, 1, IIC_VMACD, "vmls", "f32", v2f32, fmul_su, fsub_mlx>, - Requires<[HasNEON, UseFPVMLx, DontUseFusedMAC]>; + Requires<[HasNEON, UseFPVMLx]>; def VMLSfq : N3VQMulOp<0, 0, 0b10, 0b1101, 1, IIC_VMACQ, "vmls", "f32", v4f32, fmul_su, fsub_mlx>, - Requires<[HasNEON, UseFPVMLx, DontUseFusedMAC]>; + Requires<[HasNEON, UseFPVMLx]>; def VMLShd : N3VDMulOp<0, 0, 0b11, 0b1101, 1, IIC_VMACD, "vmls", "f16", v4f16, fmul, fsub>, - Requires<[HasNEON, HasFullFP16, UseFPVMLx, DontUseFusedMAC]>; + Requires<[HasNEON, HasFullFP16, UseFPVMLx]>; def VMLShq : N3VQMulOp<0, 0, 0b11, 0b1101, 1, IIC_VMACQ, "vmls", "f16", v8f16, fmul, fsub>, - Requires<[HasNEON, HasFullFP16, UseFPVMLx, DontUseFusedMAC]>; + Requires<[HasNEON, HasFullFP16, UseFPVMLx]>; defm VMLSsl : N3VMulOpSL_HS<0b0100, IIC_VMACi16D, IIC_VMACi32D, IIC_VMACi16Q, IIC_VMACi32Q, "vmls", "i", sub>; def VMLSslfd : N3VDMulOpSL<0b10, 0b0101, IIC_VMACD, "vmls", "f32", @@ -7084,9 +7084,9 @@ def : N3VSPat<fadd, VADDfd>; def : N3VSPat<fsub, VSUBfd>; def : N3VSPat<fmul, VMULfd>; def : N3VSMulOpPat<fmul, fadd, VMLAfd>, - Requires<[HasNEON, UseNEONForFP, UseFPVMLx, DontUseFusedMAC]>; + Requires<[HasNEON, UseNEONForFP, UseFPVMLx]>; def : N3VSMulOpPat<fmul, fsub, 
VMLSfd>, - Requires<[HasNEON, UseNEONForFP, UseFPVMLx, DontUseFusedMAC]>; + Requires<[HasNEON, UseNEONForFP, UseFPVMLx]>; def : N3VSMulOpPat<fmul, fadd, VFMAfd>, Requires<[HasVFP4, UseNEONForFP, UseFusedMAC]>; def : N3VSMulOpPat<fmul, fsub, VFMSfd>, diff --git a/llvm/lib/Target/ARM/ARMInstrVFP.td b/llvm/lib/Target/ARM/ARMInstrVFP.td index b4e28b90747..b58730c452f 100644 --- a/llvm/lib/Target/ARM/ARMInstrVFP.td +++ b/llvm/lib/Target/ARM/ARMInstrVFP.td @@ -1814,7 +1814,7 @@ def VMLAD : ADbI<0b11100, 0b00, 0, 0, [(set DPR:$Dd, (fadd_mlx (fmul_su DPR:$Dn, DPR:$Dm), (f64 DPR:$Ddin)))]>, RegConstraint<"$Ddin = $Dd">, - Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>, + Requires<[HasVFP2,HasDPVFP,UseFPVMLx]>, Sched<[WriteFPMAC64, ReadFPMAC, ReadFPMUL, ReadFPMUL]>; def VMLAS : ASbIn<0b11100, 0b00, 0, 0, @@ -1823,7 +1823,7 @@ def VMLAS : ASbIn<0b11100, 0b00, 0, 0, [(set SPR:$Sd, (fadd_mlx (fmul_su SPR:$Sn, SPR:$Sm), SPR:$Sdin))]>, RegConstraint<"$Sdin = $Sd">, - Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>, + Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>, Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]> { // Some single precision VFP instructions may be executed on both NEON and // VFP pipelines on A8. 
@@ -1836,17 +1836,17 @@ def VMLAH : AHbI<0b11100, 0b00, 0, 0, [(set HPR:$Sd, (fadd_mlx (fmul_su HPR:$Sn, HPR:$Sm), HPR:$Sdin))]>, RegConstraint<"$Sdin = $Sd">, - Requires<[HasFullFP16,UseFPVMLx,DontUseFusedMAC]>; + Requires<[HasFullFP16,UseFPVMLx]>; def : Pat<(fadd_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))), (VMLAD DPR:$dstin, DPR:$a, DPR:$b)>, - Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>; + Requires<[HasVFP2,HasDPVFP,UseFPVMLx]>; def : Pat<(fadd_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)), (VMLAS SPR:$dstin, SPR:$a, SPR:$b)>, - Requires<[HasVFP2,DontUseNEONForFP, UseFPVMLx,DontUseFusedMAC]>; + Requires<[HasVFP2,DontUseNEONForFP, UseFPVMLx]>; def : Pat<(fadd_mlx HPR:$dstin, (fmul_su HPR:$a, HPR:$b)), (VMLAH HPR:$dstin, HPR:$a, HPR:$b)>, - Requires<[HasFullFP16,DontUseNEONForFP, UseFPVMLx,DontUseFusedMAC]>; + Requires<[HasFullFP16,DontUseNEONForFP, UseFPVMLx]>; def VMLSD : ADbI<0b11100, 0b00, 1, 0, @@ -1855,7 +1855,7 @@ def VMLSD : ADbI<0b11100, 0b00, 1, 0, [(set DPR:$Dd, (fadd_mlx (fneg (fmul_su DPR:$Dn,DPR:$Dm)), (f64 DPR:$Ddin)))]>, RegConstraint<"$Ddin = $Dd">, - Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>, + Requires<[HasVFP2,HasDPVFP,UseFPVMLx]>, Sched<[WriteFPMAC64, ReadFPMAC, ReadFPMUL, ReadFPMUL]>; def VMLSS : ASbIn<0b11100, 0b00, 1, 0, @@ -1864,7 +1864,7 @@ def VMLSS : ASbIn<0b11100, 0b00, 1, 0, [(set SPR:$Sd, (fadd_mlx (fneg (fmul_su SPR:$Sn, SPR:$Sm)), SPR:$Sdin))]>, RegConstraint<"$Sdin = $Sd">, - Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>, + Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>, Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]> { // Some single precision VFP instructions may be executed on both NEON and // VFP pipelines on A8. 
@@ -1877,17 +1877,17 @@ def VMLSH : AHbI<0b11100, 0b00, 1, 0, [(set HPR:$Sd, (fadd_mlx (fneg (fmul_su HPR:$Sn, HPR:$Sm)), HPR:$Sdin))]>, RegConstraint<"$Sdin = $Sd">, - Requires<[HasFullFP16,UseFPVMLx,DontUseFusedMAC]>; + Requires<[HasFullFP16,UseFPVMLx]>; def : Pat<(fsub_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))), (VMLSD DPR:$dstin, DPR:$a, DPR:$b)>, - Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>; + Requires<[HasVFP2,HasDPVFP,UseFPVMLx]>; def : Pat<(fsub_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)), (VMLSS SPR:$dstin, SPR:$a, SPR:$b)>, - Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>; + Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>; def : Pat<(fsub_mlx HPR:$dstin, (fmul_su HPR:$a, HPR:$b)), (VMLSH HPR:$dstin, HPR:$a, HPR:$b)>, - Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>; + Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx]>; def VNMLAD : ADbI<0b11100, 0b01, 1, 0, (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm), @@ -1895,7 +1895,7 @@ def VNMLAD : ADbI<0b11100, 0b01, 1, 0, [(set DPR:$Dd,(fsub_mlx (fneg (fmul_su DPR:$Dn,DPR:$Dm)), (f64 DPR:$Ddin)))]>, RegConstraint<"$Ddin = $Dd">, - Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>, + Requires<[HasVFP2,HasDPVFP,UseFPVMLx]>, Sched<[WriteFPMAC64, ReadFPMAC, ReadFPMUL, ReadFPMUL]>; def VNMLAS : ASbI<0b11100, 0b01, 1, 0, @@ -1904,7 +1904,7 @@ def VNMLAS : ASbI<0b11100, 0b01, 1, 0, [(set SPR:$Sd, (fsub_mlx (fneg (fmul_su SPR:$Sn, SPR:$Sm)), SPR:$Sdin))]>, RegConstraint<"$Sdin = $Sd">, - Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>, + Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>, Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]> { // Some single precision VFP instructions may be executed on both NEON and // VFP pipelines on A8. 
@@ -1917,29 +1917,29 @@ def VNMLAH : AHbI<0b11100, 0b01, 1, 0, [(set HPR:$Sd, (fsub_mlx (fneg (fmul_su HPR:$Sn, HPR:$Sm)), HPR:$Sdin))]>, RegConstraint<"$Sdin = $Sd">, - Requires<[HasFullFP16,UseFPVMLx,DontUseFusedMAC]>; + Requires<[HasFullFP16,UseFPVMLx]>; // (-(a * b) - dst) -> -(dst + (a * b)) def : Pat<(fsub_mlx (fneg (fmul_su DPR:$a, (f64 DPR:$b))), DPR:$dstin), (VNMLAD DPR:$dstin, DPR:$a, DPR:$b)>, - Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>; + Requires<[HasVFP2,HasDPVFP,UseFPVMLx]>; def : Pat<(fsub_mlx (fneg (fmul_su SPR:$a, SPR:$b)), SPR:$dstin), (VNMLAS SPR:$dstin, SPR:$a, SPR:$b)>, - Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>; + Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>; def : Pat<(fsub_mlx (fneg (fmul_su HPR:$a, HPR:$b)), HPR:$dstin), (VNMLAH HPR:$dstin, HPR:$a, HPR:$b)>, - Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>; + Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx]>; // (-dst - (a * b)) -> -(dst + (a * b)) def : Pat<(fsub_mlx (fneg DPR:$dstin), (fmul_su DPR:$a, (f64 DPR:$b))), (VNMLAD DPR:$dstin, DPR:$a, DPR:$b)>, - Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>; + Requires<[HasVFP2,HasDPVFP,UseFPVMLx]>; def : Pat<(fsub_mlx (fneg SPR:$dstin), (fmul_su SPR:$a, SPR:$b)), (VNMLAS SPR:$dstin, SPR:$a, SPR:$b)>, - Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>; + Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>; def : Pat<(fsub_mlx (fneg HPR:$dstin), (fmul_su HPR:$a, HPR:$b)), (VNMLAH HPR:$dstin, HPR:$a, HPR:$b)>, - Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>; + Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx]>; def VNMLSD : ADbI<0b11100, 0b01, 0, 0, (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm), @@ -1947,7 +1947,7 @@ def VNMLSD : ADbI<0b11100, 0b01, 0, 0, [(set DPR:$Dd, (fsub_mlx (fmul_su DPR:$Dn, DPR:$Dm), (f64 DPR:$Ddin)))]>, RegConstraint<"$Ddin = $Dd">, - Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>, + 
Requires<[HasVFP2,HasDPVFP,UseFPVMLx]>, Sched<[WriteFPMAC64, ReadFPMAC, ReadFPMUL, ReadFPMUL]>; def VNMLSS : ASbI<0b11100, 0b01, 0, 0, @@ -1955,7 +1955,7 @@ def VNMLSS : ASbI<0b11100, 0b01, 0, 0, IIC_fpMAC32, "vnmls", ".f32\t$Sd, $Sn, $Sm", [(set SPR:$Sd, (fsub_mlx (fmul_su SPR:$Sn, SPR:$Sm), SPR:$Sdin))]>, RegConstraint<"$Sdin = $Sd">, - Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>, + Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>, Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]> { // Some single precision VFP instructions may be executed on both NEON and // VFP pipelines on A8. @@ -1967,17 +1967,17 @@ def VNMLSH : AHbI<0b11100, 0b01, 0, 0, IIC_fpMAC16, "vnmls", ".f16\t$Sd, $Sn, $Sm", [(set HPR:$Sd, (fsub_mlx (fmul_su HPR:$Sn, HPR:$Sm), HPR:$Sdin))]>, RegConstraint<"$Sdin = $Sd">, - Requires<[HasFullFP16,UseFPVMLx,DontUseFusedMAC]>; + Requires<[HasFullFP16,UseFPVMLx]>; def : Pat<(fsub_mlx (fmul_su DPR:$a, (f64 DPR:$b)), DPR:$dstin), (VNMLSD DPR:$dstin, DPR:$a, DPR:$b)>, - Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>; + Requires<[HasVFP2,HasDPVFP,UseFPVMLx]>; def : Pat<(fsub_mlx (fmul_su SPR:$a, SPR:$b), SPR:$dstin), (VNMLSS SPR:$dstin, SPR:$a, SPR:$b)>, - Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>; + Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>; def : Pat<(fsub_mlx (fmul_su HPR:$a, HPR:$b), HPR:$dstin), (VNMLSH HPR:$dstin, HPR:$a, HPR:$b)>, - Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>; + Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx]>; //===----------------------------------------------------------------------===// // Fused FP Multiply-Accumulate Operations. |