Use rsqrt (X86) to speed up reciprocal square root calcs

This is a first step for generating SSE rsqrt instructions for reciprocal square root calcs when fast-math is allowed. For now, be conservative and only enable this for AMD btver2 where performance improves significantly - for example, 29% on llvm/projects/test-suite/SingleSource/Benchmarks/BenchmarkGame/n-body.c (if we convert the data type to single-precision float). This patch adds a two-constant version of the Newton-Raphson refinement algorithm to DAGCombiner that can be selected by any target via a parameter returned by getRsqrtEstimate(). See PR20900 for more details: http://llvm.org/bugs/show_bug.cgi?id=20900 Differential Revision: http://reviews.llvm.org/D5658 llvm-svn: 220570
author: Sanjay Patel <spatel@rotateright.com> 2014-10-24 17:02:16 +0000
committer: Sanjay Patel <spatel@rotateright.com> 2014-10-24 17:02:16 +0000
commit: 957efc23bb87d341a1b478d87a48bb888c2d4068 (patch)
tree: 48ae584987b7970cb90899c03590938f4d622799 /llvm/lib/Target/X86/X86ISelLowering.cpp
parent: 5e3a421bfcb891fc7821daa501e30c113fb1bf16 (diff)
download: bcm5719-llvm-957efc23bb87d341a1b478d87a48bb888c2d4068.tar.gz
bcm5719-llvm-957efc23bb87d341a1b478d87a48bb888c2d4068.zip
1 files changed, 30 insertions, 0 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index dbe3c4aee1c..b3541545403 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -14367,6 +14367,36 @@ SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
   return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
 }
 
+/// Return an FRSQRT estimate of 1/sqrt(Op) for f32/v4f32, or an empty SDValue.
+/// With 2^-12 architected accuracy, one Newton-Raphson step suffices for f32.
+SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op,
+                                            DAGCombinerInfo &DCI,
+                                            unsigned &RefinementSteps,
+                                            bool &UseOneConstNR) const {
+  // FIXME: We should use instruction latency models to calculate the cost of
+  // each potential sequence, but this is very hard to do reliably because
+  // at least Intel's Core* chips have variable timing based on the number of
+  // significant digits in the divisor and/or sqrt operand.
+  if (!Subtarget->useSqrtEst())
+    return SDValue();
+
+  EVT VT = Op.getValueType();
+  
+  // SSE1 has rsqrtss and rsqrtps.
+  // TODO: Add support for AVX (v8f32) and AVX512 (v16f32).
+  // It is likely not profitable to do this for f64 because a double-precision
+  // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
+  // instructions: convert to single, rsqrtss, convert back to double, refine
+  // (3 steps = at least 13 insts). If an 'rsqrtsd' variant were added to the
+  // ISA along with FMA, this could be a throughput win.
+  if (Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) {
+    RefinementSteps = 1;   // One step is enough for a float-precision result.
+    UseOneConstNR = false; // Select the two-constant Newton-Raphson variant.
+    return DCI.DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
+  }
+  return SDValue();
+}
+
 static bool isAllOnes(SDValue V) {
   ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
   return C && C->isAllOnesValue();
author	Sanjay Patel <spatel@rotateright.com>	2014-10-24 17:02:16 +0000
committer	Sanjay Patel <spatel@rotateright.com>	2014-10-24 17:02:16 +0000
commit	957efc23bb87d341a1b478d87a48bb888c2d4068 (patch)
tree	48ae584987b7970cb90899c03590938f4d622799 /llvm/lib/Target/X86/X86ISelLowering.cpp
parent	5e3a421bfcb891fc7821daa501e30c113fb1bf16 (diff)
download	bcm5719-llvm-957efc23bb87d341a1b478d87a48bb888c2d4068.tar.gz bcm5719-llvm-957efc23bb87d341a1b478d87a48bb888c2d4068.zip