summaryrefslogtreecommitdiffstats
path: root/llvm/lib
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib')
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp117
-rw-r--r--llvm/lib/Target/PowerPC/PPCISelLowering.cpp4
-rw-r--r--llvm/lib/Target/PowerPC/PPCISelLowering.h3
-rw-r--r--llvm/lib/Target/X86/X86.td5
-rw-r--r--llvm/lib/Target/X86/X86ISelLowering.cpp30
-rw-r--r--llvm/lib/Target/X86/X86ISelLowering.h5
-rw-r--r--llvm/lib/Target/X86/X86Subtarget.cpp1
-rw-r--r--llvm/lib/Target/X86/X86Subtarget.h6
8 files changed, 128 insertions, 43 deletions
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 6fe4ade4aa8..4ad2e5dd492 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -331,6 +331,8 @@ namespace {
SDValue BuildUDIV(SDNode *N);
SDValue BuildReciprocalEstimate(SDValue Op);
SDValue BuildRsqrtEstimate(SDValue Op);
+ SDValue BuildRsqrtNROneConst(SDValue Op, SDValue Est, unsigned Iterations);
+ SDValue BuildRsqrtNRTwoConst(SDValue Op, SDValue Est, unsigned Iterations);
SDValue MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1,
bool DemandHighBits = true);
SDValue MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1);
@@ -7033,13 +7035,11 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) {
// into a target-specific square root estimate instruction.
if (N1.getOpcode() == ISD::FSQRT) {
if (SDValue RV = BuildRsqrtEstimate(N1.getOperand(0))) {
- AddToWorklist(RV.getNode());
return DAG.getNode(ISD::FMUL, DL, VT, N0, RV);
}
} else if (N1.getOpcode() == ISD::FP_EXTEND &&
N1.getOperand(0).getOpcode() == ISD::FSQRT) {
if (SDValue RV = BuildRsqrtEstimate(N1.getOperand(0).getOperand(0))) {
- AddToWorklist(RV.getNode());
RV = DAG.getNode(ISD::FP_EXTEND, SDLoc(N1), VT, RV);
AddToWorklist(RV.getNode());
return DAG.getNode(ISD::FMUL, DL, VT, N0, RV);
@@ -7047,7 +7047,6 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) {
} else if (N1.getOpcode() == ISD::FP_ROUND &&
N1.getOperand(0).getOpcode() == ISD::FSQRT) {
if (SDValue RV = BuildRsqrtEstimate(N1.getOperand(0).getOperand(0))) {
- AddToWorklist(RV.getNode());
RV = DAG.getNode(ISD::FP_ROUND, SDLoc(N1), VT, RV, N1.getOperand(1));
AddToWorklist(RV.getNode());
return DAG.getNode(ISD::FMUL, DL, VT, N0, RV);
@@ -7068,7 +7067,6 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) {
// We found a FSQRT, so try to make this fold:
// x / (y * sqrt(z)) -> x * (rsqrt(z) / y)
if (SDValue RV = BuildRsqrtEstimate(SqrtOp.getOperand(0))) {
- AddToWorklist(RV.getNode());
RV = DAG.getNode(ISD::FDIV, SDLoc(N1), VT, RV, OtherOp);
AddToWorklist(RV.getNode());
return DAG.getNode(ISD::FMUL, DL, VT, N0, RV);
@@ -7116,7 +7114,6 @@ SDValue DAGCombiner::visitFSQRT(SDNode *N) {
if (DAG.getTarget().Options.UnsafeFPMath) {
// Compute this as X * (1/sqrt(X)) = X * (X ** -0.5)
if (SDValue RV = BuildRsqrtEstimate(N->getOperand(0))) {
- AddToWorklist(RV.getNode());
EVT VT = RV.getValueType();
RV = DAG.getNode(ISD::FMUL, SDLoc(N), VT, N->getOperand(0), RV);
AddToWorklist(RV.getNode());
@@ -11985,49 +11982,89 @@ SDValue DAGCombiner::BuildReciprocalEstimate(SDValue Op) {
return SDValue();
}
-SDValue DAGCombiner::BuildRsqrtEstimate(SDValue Op) {
- if (Level >= AfterLegalizeDAG)
- return SDValue();
+/// Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i)
+/// For the reciprocal sqrt, we need to find the zero of the function:
+/// F(X) = 1/X^2 - A [which has a zero at X = 1/sqrt(A)]
+/// =>
+/// X_{i+1} = X_i (1.5 - A X_i^2 / 2)
+/// As a result, we precompute A/2 prior to the iteration loop.
+SDValue DAGCombiner::BuildRsqrtNROneConst(SDValue Arg, SDValue Est,
+ unsigned Iterations) {
+ EVT VT = Arg.getValueType();
+ SDLoc DL(Arg);
+ SDValue ThreeHalves = DAG.getConstantFP(1.5, VT);
+
+ // We now need 0.5 * Arg which we can write as (1.5 * Arg - Arg) so that
+ // this entire sequence requires only one FP constant.
+ SDValue HalfArg = DAG.getNode(ISD::FMUL, DL, VT, ThreeHalves, Arg);
+ AddToWorklist(HalfArg.getNode());
+
+ HalfArg = DAG.getNode(ISD::FSUB, DL, VT, HalfArg, Arg);
+ AddToWorklist(HalfArg.getNode());
+
+ // Newton iterations: Est = Est * (1.5 - HalfArg * Est * Est)
+ for (unsigned i = 0; i < Iterations; ++i) {
+ SDValue NewEst = DAG.getNode(ISD::FMUL, DL, VT, Est, Est);
+ AddToWorklist(NewEst.getNode());
+
+ NewEst = DAG.getNode(ISD::FMUL, DL, VT, HalfArg, NewEst);
+ AddToWorklist(NewEst.getNode());
+
+ NewEst = DAG.getNode(ISD::FSUB, DL, VT, ThreeHalves, NewEst);
+ AddToWorklist(NewEst.getNode());
+
+ Est = DAG.getNode(ISD::FMUL, DL, VT, Est, NewEst);
+ AddToWorklist(Est.getNode());
+ }
+ return Est;
+}
- // Expose the DAG combiner to the target combiner implementations.
- TargetLowering::DAGCombinerInfo DCI(DAG, Level, false, this);
- unsigned Iterations = 0;
- if (SDValue Est = TLI.getRsqrtEstimate(Op, DCI, Iterations)) {
- if (Iterations) {
- // Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i)
- // For the reciprocal sqrt, we need to find the zero of the function:
- // F(X) = 1/X^2 - A [which has a zero at X = 1/sqrt(A)]
- // =>
- // X_{i+1} = X_i (1.5 - A X_i^2 / 2)
- // As a result, we precompute A/2 prior to the iteration loop.
- EVT VT = Op.getValueType();
- SDLoc DL(Op);
- SDValue FPThreeHalves = DAG.getConstantFP(1.5, VT);
+/// Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i)
+/// For the reciprocal sqrt, we need to find the zero of the function:
+/// F(X) = 1/X^2 - A [which has a zero at X = 1/sqrt(A)]
+/// =>
+/// X_{i+1} = (-0.5 * X_i) * (A * X_i * X_i + (-3.0))
+SDValue DAGCombiner::BuildRsqrtNRTwoConst(SDValue Arg, SDValue Est,
+ unsigned Iterations) {
+ EVT VT = Arg.getValueType();
+ SDLoc DL(Arg);
+ SDValue MinusThree = DAG.getConstantFP(-3.0, VT);
+ SDValue MinusHalf = DAG.getConstantFP(-0.5, VT);
- AddToWorklist(Est.getNode());
+ // Newton iterations: Est = -0.5 * Est * (-3.0 + Arg * Est * Est)
+ for (unsigned i = 0; i < Iterations; ++i) {
+ SDValue HalfEst = DAG.getNode(ISD::FMUL, DL, VT, Est, MinusHalf);
+ AddToWorklist(HalfEst.getNode());
- // We now need 0.5 * Arg which we can write as (1.5 * Arg - Arg) so that
- // this entire sequence requires only one FP constant.
- SDValue HalfArg = DAG.getNode(ISD::FMUL, DL, VT, FPThreeHalves, Op);
- AddToWorklist(HalfArg.getNode());
+ Est = DAG.getNode(ISD::FMUL, DL, VT, Est, Est);
+ AddToWorklist(Est.getNode());
- HalfArg = DAG.getNode(ISD::FSUB, DL, VT, HalfArg, Op);
- AddToWorklist(HalfArg.getNode());
+ Est = DAG.getNode(ISD::FMUL, DL, VT, Est, Arg);
+ AddToWorklist(Est.getNode());
- // Newton iterations: Est = Est * (1.5 - HalfArg * Est * Est)
- for (unsigned i = 0; i < Iterations; ++i) {
- SDValue NewEst = DAG.getNode(ISD::FMUL, DL, VT, Est, Est);
- AddToWorklist(NewEst.getNode());
+ Est = DAG.getNode(ISD::FADD, DL, VT, Est, MinusThree);
+ AddToWorklist(Est.getNode());
- NewEst = DAG.getNode(ISD::FMUL, DL, VT, HalfArg, NewEst);
- AddToWorklist(NewEst.getNode());
+ Est = DAG.getNode(ISD::FMUL, DL, VT, Est, HalfEst);
+ AddToWorklist(Est.getNode());
+ }
+ return Est;
+}
- NewEst = DAG.getNode(ISD::FSUB, DL, VT, FPThreeHalves, NewEst);
- AddToWorklist(NewEst.getNode());
+SDValue DAGCombiner::BuildRsqrtEstimate(SDValue Op) {
+ if (Level >= AfterLegalizeDAG)
+ return SDValue();
- Est = DAG.getNode(ISD::FMUL, DL, VT, Est, NewEst);
- AddToWorklist(Est.getNode());
- }
+ // Expose the DAG combiner to the target combiner implementations.
+ TargetLowering::DAGCombinerInfo DCI(DAG, Level, false, this);
+ unsigned Iterations = 0;
+ bool UseOneConstNR = false;
+ if (SDValue Est = TLI.getRsqrtEstimate(Op, DCI, Iterations, UseOneConstNR)) {
+ AddToWorklist(Est.getNode());
+ if (Iterations) {
+ Est = UseOneConstNR ?
+ BuildRsqrtNROneConst(Op, Est, Iterations) :
+ BuildRsqrtNRTwoConst(Op, Est, Iterations);
}
return Est;
}
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index ef357f4b7f2..72d3a59192b 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -7466,7 +7466,8 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
SDValue PPCTargetLowering::getRsqrtEstimate(SDValue Operand,
DAGCombinerInfo &DCI,
- unsigned &RefinementSteps) const {
+ unsigned &RefinementSteps,
+ bool &UseOneConstNR) const {
EVT VT = Operand.getValueType();
if ((VT == MVT::f32 && Subtarget.hasFRSQRTES()) ||
(VT == MVT::f64 && Subtarget.hasFRSQRTE()) ||
@@ -7479,6 +7480,7 @@ SDValue PPCTargetLowering::getRsqrtEstimate(SDValue Operand,
RefinementSteps = Subtarget.hasRecipPrec() ? 1 : 3;
if (VT.getScalarType() == MVT::f64)
++RefinementSteps;
+ UseOneConstNR = true;
return DCI.DAG.getNode(PPCISD::FRSQRTE, SDLoc(Operand), VT, Operand);
}
return SDValue();
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
index 39f5987b9cb..7ae3673ecbf 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -702,7 +702,8 @@ namespace llvm {
SDValue DAGCombineTruncBoolExt(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue getRsqrtEstimate(SDValue Operand, DAGCombinerInfo &DCI,
- unsigned &RefinementSteps) const override;
+ unsigned &RefinementSteps,
+ bool &UseOneConstNR) const override;
SDValue getRecipEstimate(SDValue Operand, DAGCombinerInfo &DCI,
unsigned &RefinementSteps) const override;
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index b635f438650..5c88b5df443 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -182,6 +182,8 @@ def FeatureSlowLEA : SubtargetFeature<"slow-lea", "SlowLEA", "true",
"LEA instruction with certain arguments is slow">;
def FeatureSlowIncDec : SubtargetFeature<"slow-incdec", "SlowIncDec", "true",
"INC and DEC instructions are slower than ADD and SUB">;
+def FeatureUseSqrtEst : SubtargetFeature<"use-sqrt-est", "UseSqrtEst", "true",
+ "Use RSQRT* to optimize square root calculations">;
//===----------------------------------------------------------------------===//
// X86 processors supported.
@@ -347,7 +349,8 @@ def : ProcessorModel<"btver2", BtVer2Model,
[FeatureAVX, FeatureSSE4A, FeatureCMPXCHG16B,
FeaturePRFCHW, FeatureAES, FeaturePCLMUL,
FeatureBMI, FeatureF16C, FeatureMOVBE,
- FeatureLZCNT, FeaturePOPCNT, FeatureSlowSHLD]>;
+ FeatureLZCNT, FeaturePOPCNT, FeatureSlowSHLD,
+ FeatureUseSqrtEst]>;
// Bulldozer
def : Proc<"bdver1", [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B,
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index dbe3c4aee1c..b3541545403 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -14367,6 +14367,36 @@ SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
}
+/// The minimum architected relative accuracy is 2^-12. We need one
+/// Newton-Raphson step to have a good float result (24 bits of precision).
+SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op,
+ DAGCombinerInfo &DCI,
+ unsigned &RefinementSteps,
+ bool &UseOneConstNR) const {
+ // FIXME: We should use instruction latency models to calculate the cost of
+ // each potential sequence, but this is very hard to do reliably because
+ // at least Intel's Core* chips have variable timing based on the number of
+ // significant digits in the divisor and/or sqrt operand.
+ if (!Subtarget->useSqrtEst())
+ return SDValue();
+
+ EVT VT = Op.getValueType();
+
+ // SSE1 has rsqrtss and rsqrtps.
+ // TODO: Add support for AVX (v8f32) and AVX512 (v16f32).
+ // It is likely not profitable to do this for f64 because a double-precision
+ // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
+ // instructions: convert to single, rsqrtss, convert back to double, refine
+ // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
+ // along with FMA, this could be a throughput win.
+ if (Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) {
+ RefinementSteps = 1;
+ UseOneConstNR = false;
+ return DCI.DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
+ }
+ return SDValue();
+}
+
static bool isAllOnes(SDValue V) {
ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
return C && C->isAllOnesValue();
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index e8e611d0719..e81a9d1209c 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1017,6 +1017,11 @@ namespace llvm {
/// Convert a comparison if required by the subtarget.
SDValue ConvertCmpIfNecessary(SDValue Cmp, SelectionDAG &DAG) const;
+
+ /// Use rsqrt* to speed up sqrt calculations.
+ SDValue getRsqrtEstimate(SDValue Operand, DAGCombinerInfo &DCI,
+ unsigned &RefinementSteps,
+ bool &UseOneConstNR) const override;
};
namespace X86 {
diff --git a/llvm/lib/Target/X86/X86Subtarget.cpp b/llvm/lib/Target/X86/X86Subtarget.cpp
index 413fc4b94db..82d62b4a365 100644
--- a/llvm/lib/Target/X86/X86Subtarget.cpp
+++ b/llvm/lib/Target/X86/X86Subtarget.cpp
@@ -278,6 +278,7 @@ void X86Subtarget::initializeEnvironment() {
LEAUsesAG = false;
SlowLEA = false;
SlowIncDec = false;
+ UseSqrtEst = false;
stackAlignment = 4;
// FIXME: this is a known good value for Yonah. How about others?
MaxInlineSizeThreshold = 128;
diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h
index 24ee5538347..cdecf2a9cb2 100644
--- a/llvm/lib/Target/X86/X86Subtarget.h
+++ b/llvm/lib/Target/X86/X86Subtarget.h
@@ -192,6 +192,11 @@ protected:
/// SlowIncDec - True if INC and DEC instructions are slow when writing to flags
bool SlowIncDec;
+ /// Use the RSQRT* instructions to optimize square root calculations.
+ /// For this to be profitable, the cost of FSQRT and FDIV must be
+ /// substantially higher than normal FP ops like FADD and FMUL.
+ bool UseSqrtEst;
+
/// Processor has AVX-512 PreFetch Instructions
bool HasPFI;
@@ -369,6 +374,7 @@ public:
bool LEAusesAG() const { return LEAUsesAG; }
bool slowLEA() const { return SlowLEA; }
bool slowIncDec() const { return SlowIncDec; }
+ bool useSqrtEst() const { return UseSqrtEst; }
bool hasCDI() const { return HasCDI; }
bool hasPFI() const { return HasPFI; }
bool hasERI() const { return HasERI; }
OpenPOWER on IntegriCloud