author    Simon Pilgrim <llvm-dev@redking.me.uk>    2015-11-24 20:31:46 +0000
committer Simon Pilgrim <llvm-dev@redking.me.uk>    2015-11-24 20:31:46 +0000
commit    1b4fecb098a5407734128d628733d1fbd86e054d (patch)
tree      c83d383022a41d6f007800e7d964957967ab7417
parent    29ffb68259bfefe0a2ec649f0912cba4f71e85ba (diff)
download  bcm5719-llvm-1b4fecb098a5407734128d628733d1fbd86e054d.tar.gz
          bcm5719-llvm-1b4fecb098a5407734128d628733d1fbd86e054d.zip
[X86][FMA] Optimize FNEG(FMA) Patterns
X86 needs to use its own FMA opcodes, preventing the standard FNEG(FMA) pattern table recognition method used by other platforms. This patch adds support for lowering FNEG(FMA(X,Y,Z)) into a single suitably negated FMA instruction.

Fix for PR24364

Differential Revision: http://reviews.llvm.org/D14906

llvm-svn: 254016
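For reference, the combine relies on four algebraic identities, each of which absorbs the outer negation into the FMA opcode itself:

  -(X*Y + Z)    = (-X)*Y - Z    : FMADD  -> FNMSUB
  -(X*Y - Z)    = (-X)*Y + Z    : FMSUB  -> FNMADD
  -(-(X*Y) + Z) = X*Y - Z       : FNMADD -> FMSUB
  -(-(X*Y) - Z) = X*Y + Z       : FNMSUB -> FMADD

The following is a minimal standalone C++ sketch of the same opcode swap, using a hypothetical FmaKind enum in place of the real X86ISD node IDs (illustration only, not part of the patch):

  // Hypothetical stand-ins for the four X86ISD FMA opcodes.
  enum class FmaKind { FMADD, FMSUB, FNMADD, FNMSUB };

  // Returns the opcode that computes the negation of the given FMA form,
  // mirroring the switch added to PerformFNEGCombine in the diff below.
  FmaKind negatedFma(FmaKind K) {
    switch (K) {
    case FmaKind::FMADD:  return FmaKind::FNMSUB; // -(X*Y + Z)    = (-X)*Y - Z
    case FmaKind::FMSUB:  return FmaKind::FNMADD; // -(X*Y - Z)    = (-X)*Y + Z
    case FmaKind::FNMADD: return FmaKind::FMSUB;  // -(-(X*Y) + Z) = X*Y - Z
    case FmaKind::FNMSUB: return FmaKind::FMADD;  // -(-(X*Y) - Z) = X*Y + Z
    }
    return K; // unreachable for the four kinds above
  }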
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp   29
-rw-r--r--  llvm/test/CodeGen/X86/fma_patterns.ll     68
2 files changed, 97 insertions(+), 0 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 59e16d0a094..0bf5ee68d13 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1773,6 +1773,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setTargetDAGCombine(ISD::ADD);
setTargetDAGCombine(ISD::FADD);
setTargetDAGCombine(ISD::FSUB);
+ setTargetDAGCombine(ISD::FNEG);
setTargetDAGCombine(ISD::FMA);
setTargetDAGCombine(ISD::SUB);
setTargetDAGCombine(ISD::LOAD);
@@ -26148,6 +26149,33 @@ static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+/// Do target-specific dag combines on floating point negations.
+static SDValue PerformFNEGCombine(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ EVT VT = N->getValueType(0);
+ SDValue Arg = N->getOperand(0);
+
+ // If we're negating a FMA node, then we can adjust the
+ // instruction to include the extra negation.
+ if (Arg.hasOneUse()) {
+ switch (Arg.getOpcode()) {
+ case X86ISD::FMADD:
+ return DAG.getNode(X86ISD::FNMSUB, SDLoc(N), VT, Arg.getOperand(0),
+ Arg.getOperand(1), Arg.getOperand(2));
+ case X86ISD::FMSUB:
+ return DAG.getNode(X86ISD::FNMADD, SDLoc(N), VT, Arg.getOperand(0),
+ Arg.getOperand(1), Arg.getOperand(2));
+ case X86ISD::FNMADD:
+ return DAG.getNode(X86ISD::FMSUB, SDLoc(N), VT, Arg.getOperand(0),
+ Arg.getOperand(1), Arg.getOperand(2));
+ case X86ISD::FNMSUB:
+ return DAG.getNode(X86ISD::FMADD, SDLoc(N), VT, Arg.getOperand(0),
+ Arg.getOperand(1), Arg.getOperand(2));
+ }
+ }
+ return SDValue();
+}
+
/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG,
const X86Subtarget *Subtarget) {
@@ -27042,6 +27070,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::UINT_TO_FP: return PerformUINT_TO_FPCombine(N, DAG, Subtarget);
case ISD::FADD: return PerformFADDCombine(N, DAG, Subtarget);
case ISD::FSUB: return PerformFSUBCombine(N, DAG, Subtarget);
+ case ISD::FNEG: return PerformFNEGCombine(N, DAG, Subtarget);
case ISD::TRUNCATE: return PerformTRUNCATECombine(N, DAG, Subtarget);
case X86ISD::FXOR:
case X86ISD::FOR: return PerformFORCombine(N, DAG, Subtarget);
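
Each of the four mappings above can be checked numerically. A small self-contained C++ program (shown only to illustrate the algebra; it is not part of the patch) compares each negated form against the semantics of the rewritten opcode using std::fma:

  #include <cassert>
  #include <cmath>

  int main() {
    const double X = 1.5, Y = -2.25, Z = 0.75;
    // FMADD  -> FNMSUB: -(X*Y + Z)    == -(X*Y) - Z
    assert(-std::fma(X, Y, Z)   == std::fma(-X, Y, -Z));
    // FMSUB  -> FNMADD: -(X*Y - Z)    == -(X*Y) + Z
    assert(-std::fma(X, Y, -Z)  == std::fma(-X, Y, Z));
    // FNMADD -> FMSUB:  -(-(X*Y) + Z) == X*Y - Z
    assert(-std::fma(-X, Y, Z)  == std::fma(X, Y, -Z));
    // FNMSUB -> FMADD:  -(-(X*Y) - Z) == X*Y + Z
    assert(-std::fma(-X, Y, -Z) == std::fma(X, Y, Z));
    return 0;
  }

The equalities hold exactly because IEEE-754 negation is exact and rounding is sign-symmetric, which is why the combine is valid even without fast-math flags. Note also that the transform only fires when the FMA node has a single use (the Arg.hasOneUse() guard above), so an FMA result shared with other users is never recomputed under a different sign.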
diff --git a/llvm/test/CodeGen/X86/fma_patterns.ll b/llvm/test/CodeGen/X86/fma_patterns.ll
index c8667d7b8ad..9c8b12d63fa 100644
--- a/llvm/test/CodeGen/X86/fma_patterns.ll
+++ b/llvm/test/CodeGen/X86/fma_patterns.ll
@@ -568,6 +568,74 @@ define <4 x double> @test_v4f64_interp(<4 x double> %x, <4 x double> %y, <4 x do
ret <4 x double> %r
}
+; (fneg (fma x, y, z)) -> (fma x, -y, -z)
+
+define <4 x float> @test_v4f32_fneg_fmadd(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
+; CHECK_FMA-LABEL: test_v4f32_fneg_fmadd:
+; CHECK_FMA: # BB#0:
+; CHECK_FMA-NEXT: vfnmsub213ps %xmm2, %xmm1, %xmm0
+; CHECK_FMA-NEXT: retq
+;
+; CHECK_FMA4-LABEL: test_v4f32_fneg_fmadd:
+; CHECK_FMA4: # BB#0:
+; CHECK_FMA4-NEXT: vfnmsubps %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4-NEXT: retq
+ %mul = fmul <4 x float> %a0, %a1
+ %add = fadd <4 x float> %mul, %a2
+ %neg = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %add
+ ret <4 x float> %neg
+}
+
+define <4 x double> @test_v4f64_fneg_fmsub(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
+; CHECK_FMA-LABEL: test_v4f64_fneg_fmsub:
+; CHECK_FMA: # BB#0:
+; CHECK_FMA-NEXT: vfnmadd213pd %ymm2, %ymm1, %ymm0
+; CHECK_FMA-NEXT: retq
+;
+; CHECK_FMA4-LABEL: test_v4f64_fneg_fmsub:
+; CHECK_FMA4: # BB#0:
+; CHECK_FMA4-NEXT: vfnmaddpd %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK_FMA4-NEXT: retq
+ %mul = fmul <4 x double> %a0, %a1
+ %sub = fsub <4 x double> %mul, %a2
+ %neg = fsub <4 x double> <double -0.0, double -0.0, double -0.0, double -0.0>, %sub
+ ret <4 x double> %neg
+}
+
+define <4 x float> @test_v4f32_fneg_fnmadd(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
+; CHECK_FMA-LABEL: test_v4f32_fneg_fnmadd:
+; CHECK_FMA: # BB#0:
+; CHECK_FMA-NEXT: vfmsub213ps %xmm2, %xmm1, %xmm0
+; CHECK_FMA-NEXT: retq
+;
+; CHECK_FMA4-LABEL: test_v4f32_fneg_fnmadd:
+; CHECK_FMA4: # BB#0:
+; CHECK_FMA4-NEXT: vfmsubps %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4-NEXT: retq
+ %mul = fmul <4 x float> %a0, %a1
+ %neg0 = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %mul
+ %add = fadd <4 x float> %neg0, %a2
+ %neg1 = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %add
+ ret <4 x float> %neg1
+}
+
+define <4 x double> @test_v4f64_fneg_fnmsub(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
+; CHECK_FMA-LABEL: test_v4f64_fneg_fnmsub:
+; CHECK_FMA: # BB#0:
+; CHECK_FMA-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0
+; CHECK_FMA-NEXT: retq
+;
+; CHECK_FMA4-LABEL: test_v4f64_fneg_fnmsub:
+; CHECK_FMA4: # BB#0:
+; CHECK_FMA4-NEXT: vfmaddpd %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK_FMA4-NEXT: retq
+ %mul = fmul <4 x double> %a0, %a1
+ %neg0 = fsub <4 x double> <double -0.0, double -0.0, double -0.0, double -0.0>, %mul
+ %sub = fsub <4 x double> %neg0, %a2
+ %neg1 = fsub <4 x double> <double -0.0, double -0.0, double -0.0, double -0.0>, %sub
+ ret <4 x double> %neg1
+}
+
; (fma x, c1, (fmul x, c2)) -> (fmul x, c1+c2)
define <4 x float> @test_v4f32_fma_x_c1_fmul_x_c2(<4 x float> %x) #0 {