 llvm/lib/Target/ARM/ARMISelLowering.cpp                    | 30 ++++++++++++
 llvm/lib/Target/ARM/ARMISelLowering.h                      | 11 +----
 llvm/lib/Target/ARM/ARMInstrMVE.td                         | 24 ++--------
 llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll | 13 +++----
 4 files changed, 41 insertions(+), 37 deletions(-)
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 6e511e68d7a..4464fd1be05 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -14858,6 +14858,36 @@ int ARMTargetLowering::getScalingFactorCost(const DataLayout &DL,
return -1;
}
+/// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
+/// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
+/// expanded to FMAs when this method returns true, otherwise fmuladd is
+/// expanded to fmul + fadd.
+///
+/// ARM supports both fused and unfused multiply-add operations; we already
+/// lower a pair of fmul and fadd to the latter so it's not clear that there
+/// would be a gain or that the gain would be worthwhile enough to risk
+/// correctness bugs.
+///
+/// For MVE, we set this to true as it helps simplify the need for some
+/// patterns (and we don't have the non-fused floating point instruction).
+bool ARMTargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
+ if (!Subtarget->hasMVEFloatOps())
+ return false;
+
+ if (!VT.isSimple())
+ return false;
+
+ switch (VT.getSimpleVT().SimpleTy) {
+ case MVT::v4f32:
+ case MVT::v8f16:
+ return true;
+ default:
+ break;
+ }
+
+ return false;
+}
+
static bool isLegalT1AddressImmediate(int64_t V, EVT VT) {
if (V < 0)
return false;
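
Note (not part of the commit): a minimal LLVM IR sketch, with hypothetical function and file names, of the kind of input the hook above affects. With MVE floating-point ops enabled (e.g. -mtriple=thumbv8.1m.main -mattr=+mve.fp), isFMAFasterThanFMulAndFAdd now returns true for v4f32 and v8f16, so llvm.fmuladd is expanded to a single fused multiply-add node instead of fmul + fadd.

; Hypothetical example input, not from the patch.
define arm_aapcs_vfpcc <4 x float> @fmuladd_v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %acc) {
entry:
  ; Expands to an ISD::FMA node because the hook returns true for
  ; v4f32, which the MVE patterns then select as vfma.f32.
  %r = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %acc)
  ret <4 x float> %r
}

declare <4 x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>)
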
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index 53813fad5af..d3caed884a3 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -738,16 +738,7 @@ class VectorType;
SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
SmallVectorImpl<SDNode *> &Created) const override;
- /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
- /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
- /// expanded to FMAs when this method returns true, otherwise fmuladd is
- /// expanded to fmul + fadd.
- ///
- /// ARM supports both fused and unfused multiply-add operations; we already
- /// lower a pair of fmul and fadd to the latter so it's not clear that there
- /// would be a gain or that the gain would be worthwhile enough to risk
- /// correctness bugs.
- bool isFMAFasterThanFMulAndFAdd(EVT VT) const override { return false; }
+ bool isFMAFasterThanFMulAndFAdd(EVT VT) const override;
SDValue ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td
index e43d64393a6..040b6f64832 100644
--- a/llvm/lib/Target/ARM/ARMInstrMVE.td
+++ b/llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -2808,31 +2808,15 @@ def MVE_VFMSf32 : MVE_VADDSUBFMA_fp<"vfms", "f32", 0b0, 0b1, 0b0, 0b1,
def MVE_VFMSf16 : MVE_VADDSUBFMA_fp<"vfms", "f16", 0b1, 0b1, 0b0, 0b1,
(ins MQPR:$Qd_src), vpred_n, "$Qd = $Qd_src">;
-let Predicates = [HasMVEFloat, UseFusedMAC] in {
- def : Pat<(v8f16 (fadd (v8f16 MQPR:$src1),
- (fmul (v8f16 MQPR:$src2),
- (v8f16 MQPR:$src3)))),
- (v8f16 (MVE_VFMAf16 $src1, $src2, $src3))>;
- def : Pat<(v4f32 (fadd (v4f32 MQPR:$src1),
- (fmul (v4f32 MQPR:$src2),
- (v4f32 MQPR:$src3)))),
- (v4f32 (MVE_VFMAf32 $src1, $src2, $src3))>;
-
- def : Pat<(v8f16 (fsub (v8f16 MQPR:$src1),
- (fmul (v8f16 MQPR:$src2),
- (v8f16 MQPR:$src3)))),
- (v8f16 (MVE_VFMSf16 $src1, $src2, $src3))>;
- def : Pat<(v4f32 (fsub (v4f32 MQPR:$src1),
- (fmul (v4f32 MQPR:$src2),
- (v4f32 MQPR:$src3)))),
- (v4f32 (MVE_VFMSf32 $src1, $src2, $src3))>;
-}
-
let Predicates = [HasMVEFloat] in {
def : Pat<(v8f16 (fma (v8f16 MQPR:$src1), (v8f16 MQPR:$src2), (v8f16 MQPR:$src3))),
(v8f16 (MVE_VFMAf16 $src3, $src1, $src2))>;
def : Pat<(v4f32 (fma (v4f32 MQPR:$src1), (v4f32 MQPR:$src2), (v4f32 MQPR:$src3))),
(v4f32 (MVE_VFMAf32 $src3, $src1, $src2))>;
+ def : Pat<(v8f16 (fma (fneg (v8f16 MQPR:$src1)), (v8f16 MQPR:$src2), (v8f16 MQPR:$src3))),
+ (v8f16 (MVE_VFMSf16 $src3, $src1, $src2))>;
+ def : Pat<(v4f32 (fma (fneg (v4f32 MQPR:$src1)), (v4f32 MQPR:$src2), (v4f32 MQPR:$src3))),
+ (v4f32 (MVE_VFMSf32 $src3, $src1, $src2))>;
}
multiclass MVE_VADDSUB_fp_m<string iname, bit bit_21, MVEVectorVTInfo VTI,
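
Note (not from the patch, names hypothetical): the replacement patterns match a negated multiplicand directly on the fma node, so a vfms is still produced even though the old UseFusedMAC fmul/fsub patterns are gone.

; fneg feeding llvm.fma matches the new (fma (fneg ...)) pattern
; and should select to vfms.f32 under HasMVEFloat.
define arm_aapcs_vfpcc <4 x float> @fms_v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %acc) {
entry:
  %neg = fneg <4 x float> %a
  %r = call <4 x float> @llvm.fma.v4f32(<4 x float> %neg, <4 x float> %b, <4 x float> %acc)
  ret <4 x float> %r
}

declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
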
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll
index 234328cb600..1b4b040573a 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll
@@ -234,7 +234,7 @@ define arm_aapcs_vfpcc float @fast_float_mac(float* nocapture readonly %b, float
; CHECK-NEXT: cbz r2, .LBB1_4
; CHECK-NEXT: @ %bb.1: @ %vector.ph
; CHECK-NEXT: adds r3, r2, #3
-; CHECK-NEXT: vmov.i32 q1, #0x0
+; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: bic r3, r3, #3
; CHECK-NEXT: sub.w r12, r3, #4
; CHECK-NEXT: movs r3, #1
@@ -242,21 +242,20 @@ define arm_aapcs_vfpcc float @fast_float_mac(float* nocapture readonly %b, float
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB1_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: vctp.32 r2
; CHECK-NEXT: vpstt
-; CHECK-NEXT: vldrwt.u32 q1, [r0]
-; CHECK-NEXT: vldrwt.u32 q2, [r1]
+; CHECK-NEXT: vldrwt.u32 q2, [r0]
+; CHECK-NEXT: vldrwt.u32 q3, [r1]
; CHECK-NEXT: mov r3, r2
-; CHECK-NEXT: vmul.f32 q1, q2, q1
; CHECK-NEXT: adds r0, #16
; CHECK-NEXT: adds r1, #16
; CHECK-NEXT: subs r2, #4
-; CHECK-NEXT: vadd.f32 q1, q1, q0
+; CHECK-NEXT: vmov q1, q0
+; CHECK-NEXT: vfma.f32 q0, q3, q2
; CHECK-NEXT: le lr, .LBB1_2
; CHECK-NEXT: @ %bb.3: @ %middle.block
; CHECK-NEXT: vctp.32 r3
-; CHECK-NEXT: vpsel q0, q1, q0
+; CHECK-NEXT: vpsel q0, q0, q1
; CHECK-NEXT: vmov.f32 s4, s2
; CHECK-NEXT: vmov.f32 s5, s3
; CHECK-NEXT: vadd.f32 q0, q0, q1
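
Note: the updated CHECK lines reflect the multiply and accumulate fusing into one instruction. A reduced, hypothetical sketch of the relevant IR shape (not the actual test body): with fast-math flags the DAG combiner may contract fmul + fadd into a single fma once isFMAFasterThanFMulAndFAdd returns true, giving the vfma.f32 above in place of the old vmul.f32/vadd.f32 pair.

; Reduced, hypothetical form of the vector.body accumulation:
define arm_aapcs_vfpcc <4 x float> @mac_step(<4 x float> %x, <4 x float> %y, <4 x float> %acc) {
entry:
  %mul = fmul fast <4 x float> %x, %y     ; contracted with the fadd below...
  %add = fadd fast <4 x float> %acc, %mul ; ...into a single vfma.f32
  ret <4 x float> %add
}
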