diff options
-rw-r--r-- | llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 18 | ||||
-rw-r--r-- | llvm/test/CodeGen/AArch64/fadd-combines.ll | 78 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/fmuladd.f32.ll | 8 |
3 files changed, 100 insertions, 4 deletions
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 03698ac862a..c2be9f3f058 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -9469,6 +9469,14 @@ SDValue DAGCombiner::visitFMULForFMADistributiveCombine(SDNode *N) { return SDValue(); } +static bool isFMulNegTwo(SDValue &N) { + if (N.getOpcode() != ISD::FMUL) + return false; + if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N.getOperand(1))) + return CFP->isExactlyValue(-2.0); + return false; +} + SDValue DAGCombiner::visitFADD(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -9507,6 +9515,16 @@ SDValue DAGCombiner::visitFADD(SDNode *N) { return DAG.getNode(ISD::FSUB, DL, VT, N1, GetNegatedExpression(N0, DAG, LegalOperations), Flags); + // fold (fadd A, (fmul B, -2.0)) -> (fsub A, (fadd B, B)) + // fold (fadd (fmul B, -2.0), A) -> (fsub A, (fadd B, B)) + if ((isFMulNegTwo(N0) && N0.hasOneUse()) || + (isFMulNegTwo(N1) && N1.hasOneUse())) { + bool N1IsFMul = isFMulNegTwo(N1) && N1.hasOneUse(); // else fold N0 + SDValue AddOp = N1IsFMul ? N1.getOperand(0) : N0.getOperand(0); + SDValue Add = DAG.getNode(ISD::FADD, DL, VT, AddOp, AddOp, Flags); + return DAG.getNode(ISD::FSUB, DL, VT, N1IsFMul ? N0 : N1, Add, Flags); + } + // FIXME: Auto-upgrade the target/function-level option. 
if (Options.NoSignedZerosFPMath || N->getFlags().hasNoSignedZeros()) { // fold (fadd A, 0) -> A diff --git a/llvm/test/CodeGen/AArch64/fadd-combines.ll b/llvm/test/CodeGen/AArch64/fadd-combines.ll new file mode 100644 index 00000000000..c106f293ccf --- /dev/null +++ b/llvm/test/CodeGen/AArch64/fadd-combines.ll @@ -0,0 +1,78 @@ +; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -verify-machineinstrs | FileCheck %s + +; CHECK-LABEL: test1: +; CHECK: fadd d1, d1, d1 +; CHECK: fsub d0, d0, d1 +define double @test1(double %a, double %b) local_unnamed_addr #0 { +entry: + %mul = fmul double %b, -2.000000e+00 + %add1 = fadd double %a, %mul + ret double %add1 +} + +; DAGCombine will canonicalize 'a - 2.0*b' to 'a + -2.0*b' +; CHECK-LABEL: test2: +; CHECK: fadd d1, d1, d1 +; CHECK: fsub d0, d0, d1 +define double @test2(double %a, double %b) local_unnamed_addr #0 { +entry: + %mul = fmul double %b, 2.000000e+00 + %add1 = fsub double %a, %mul + ret double %add1 +} + +; CHECK-LABEL: test3: +; CHECK: fmul d0, d0, d1 +; CHECK: fadd d1, d2, d2 +; CHECK: fsub d0, d0, d1 +define double @test3(double %a, double %b, double %c) local_unnamed_addr #0 { +entry: + %mul = fmul double %a, %b + %mul1 = fmul double %c, 2.000000e+00 + %sub = fsub double %mul, %mul1 + ret double %sub +} + +; CHECK-LABEL: test4: +; CHECK: fmul d0, d0, d1 +; CHECK: fadd d1, d2, d2 +; CHECK: fsub d0, d0, d1 +define double @test4(double %a, double %b, double %c) local_unnamed_addr #0 { +entry: + %mul = fmul double %a, %b + %mul1 = fmul double %c, -2.000000e+00 + %add2 = fadd double %mul, %mul1 + ret double %add2 +} + +; CHECK-LABEL: test5: +; CHECK: fadd v1.4s, v1.4s, v1.4s +; CHECK: fsub v0.4s, v0.4s, v1.4s +define <4 x float> @test5(<4 x float> %a, <4 x float> %b) { + %mul = fmul <4 x float> %b, <float -2.0, float -2.0, float -2.0, float -2.0> + %add = fadd <4 x float> %a, %mul + ret <4 x float> %add +} + +; CHECK-LABEL: test6: +; CHECK: fadd v1.4s, v1.4s, v1.4s +; CHECK: fsub v0.4s, v0.4s, v1.4s +define <4 x float> 
@test6(<4 x float> %a, <4 x float> %b) { + %mul = fmul <4 x float> %b, <float 2.0, float 2.0, float 2.0, float 2.0> + %add = fsub <4 x float> %a, %mul + ret <4 x float> %add +} + +; Don't fold (fadd A, (fmul B, -2.0)) -> (fsub A, (fadd B, B)) if the fmul has +; multiple uses. +; CHECK-LABEL: test7: +; CHECK: fmul +define double @test7(double %a, double %b) local_unnamed_addr #0 { +entry: + %mul = fmul double %b, -2.000000e+00 + %add1 = fadd double %a, %mul + call void @use(double %mul) + ret double %add1 +} + +declare void @use(double) diff --git a/llvm/test/CodeGen/AMDGPU/fmuladd.f32.ll b/llvm/test/CodeGen/AMDGPU/fmuladd.f32.ll index fb605dd2e4b..e4225502669 100644 --- a/llvm/test/CodeGen/AMDGPU/fmuladd.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/fmuladd.f32.ll @@ -191,8 +191,8 @@ define amdgpu_kernel void @fadd_b_a_a_f32(float addrspace(1)* %out, ; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]] -; GCN-DENORM-SLOWFMA: v_mul_f32_e32 [[TMP:v[0-9]+]], -2.0, [[R1]] -; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]] +; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] +; GCN-DENORM-SLOWFMA: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]] ; SI-DENORM: buffer_store_dword [[RESULT]] ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] @@ -251,8 +251,8 @@ define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, ; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], -[[R1]], 2.0, [[R2]] -; GCN-DENORM-SLOWFMA: v_mul_f32_e32 [[TMP:v[0-9]+]], -2.0, [[R1]] -; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]] +; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] +; GCN-DENORM-SLOWFMA: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]] ; SI-DENORM: buffer_store_dword [[RESULT]] ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] |