diff options
author | Sanjay Patel <spatel@rotateright.com> | 2014-10-24 17:02:16 +0000 |
---|---|---|
committer | Sanjay Patel <spatel@rotateright.com> | 2014-10-24 17:02:16 +0000 |
commit | 957efc23bb87d341a1b478d87a48bb888c2d4068 (patch) | |
tree | 48ae584987b7970cb90899c03590938f4d622799 /llvm/lib/Target/X86/X86ISelLowering.cpp | |
parent | 5e3a421bfcb891fc7821daa501e30c113fb1bf16 (diff) | |
download | bcm5719-llvm-957efc23bb87d341a1b478d87a48bb888c2d4068.tar.gz bcm5719-llvm-957efc23bb87d341a1b478d87a48bb888c2d4068.zip |
Use rsqrt (X86) to speed up reciprocal square root calcs
This is a first step for generating SSE rsqrt instructions for
reciprocal square root calcs when fast-math is allowed.
For now, be conservative and only enable this for AMD btver2
where performance improves significantly - for example, 29%
on llvm/projects/test-suite/SingleSource/Benchmarks/BenchmarkGame/n-body.c
(if we convert the data type to single-precision float).
This patch adds a two constant version of the Newton-Raphson
refinement algorithm to DAGCombiner that can be selected by any target
via a parameter returned by getRsqrtEstimate()..
See PR20900 for more details:
http://llvm.org/bugs/show_bug.cgi?id=20900
Differential Revision: http://reviews.llvm.org/D5658
llvm-svn: 220570
Diffstat (limited to 'llvm/lib/Target/X86/X86ISelLowering.cpp')
-rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.cpp | 30 |
1 files changed, 30 insertions, 0 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index dbe3c4aee1c..b3541545403 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -14367,6 +14367,36 @@ SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp, return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl); } +/// The minimum architected relative accuracy is 2^-12. We need one +/// Newton-Raphson step to have a good float result (24 bits of precision). +SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op, + DAGCombinerInfo &DCI, + unsigned &RefinementSteps, + bool &UseOneConstNR) const { + // FIXME: We should use instruction latency models to calculate the cost of + // each potential sequence, but this is very hard to do reliably because + // at least Intel's Core* chips have variable timing based on the number of + // significant digits in the divisor and/or sqrt operand. + if (!Subtarget->useSqrtEst()) + return SDValue(); + + EVT VT = Op.getValueType(); + + // SSE1 has rsqrtss and rsqrtps. + // TODO: Add support for AVX (v8f32) and AVX512 (v16f32). + // It is likely not profitable to do this for f64 because a double-precision + // rsqrt estimate with refinement on x86 prior to FMA requires at least 16 + // instructions: convert to single, rsqrtss, convert back to double, refine + // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA + // along with FMA, this could be a throughput win. + if (Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) { + RefinementSteps = 1; + UseOneConstNR = false; + return DCI.DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op); + } + return SDValue(); +} + static bool isAllOnes(SDValue V) { ConstantSDNode *C = dyn_cast<ConstantSDNode>(V); return C && C->isAllOnesValue(); |