diff options
Diffstat (limited to 'llvm/lib/Target')
-rw-r--r-- | llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 4 | ||||
-rw-r--r-- | llvm/lib/Target/PowerPC/PPCISelLowering.h | 3 | ||||
-rw-r--r-- | llvm/lib/Target/X86/X86.td | 5 | ||||
-rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.cpp | 30 | ||||
-rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.h | 5 | ||||
-rw-r--r-- | llvm/lib/Target/X86/X86Subtarget.cpp | 1 | ||||
-rw-r--r-- | llvm/lib/Target/X86/X86Subtarget.h | 6 |
7 files changed, 51 insertions, 3 deletions
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index ef357f4b7f2..72d3a59192b 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -7466,7 +7466,8 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, SDValue PPCTargetLowering::getRsqrtEstimate(SDValue Operand, DAGCombinerInfo &DCI, - unsigned &RefinementSteps) const { + unsigned &RefinementSteps, + bool &UseOneConstNR) const { EVT VT = Operand.getValueType(); if ((VT == MVT::f32 && Subtarget.hasFRSQRTES()) || (VT == MVT::f64 && Subtarget.hasFRSQRTE()) || @@ -7479,6 +7480,7 @@ SDValue PPCTargetLowering::getRsqrtEstimate(SDValue Operand, RefinementSteps = Subtarget.hasRecipPrec() ? 1 : 3; if (VT.getScalarType() == MVT::f64) ++RefinementSteps; + UseOneConstNR = true; return DCI.DAG.getNode(PPCISD::FRSQRTE, SDLoc(Operand), VT, Operand); } return SDValue(); diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h index 39f5987b9cb..7ae3673ecbf 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.h +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -702,7 +702,8 @@ namespace llvm { SDValue DAGCombineTruncBoolExt(SDNode *N, DAGCombinerInfo &DCI) const; SDValue getRsqrtEstimate(SDValue Operand, DAGCombinerInfo &DCI, - unsigned &RefinementSteps) const override; + unsigned &RefinementSteps, + bool &UseOneConstNR) const override; SDValue getRecipEstimate(SDValue Operand, DAGCombinerInfo &DCI, unsigned &RefinementSteps) const override; diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td index b635f438650..5c88b5df443 100644 --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -182,6 +182,8 @@ def FeatureSlowLEA : SubtargetFeature<"slow-lea", "SlowLEA", "true", "LEA instruction with certain arguments is slow">; def FeatureSlowIncDec : SubtargetFeature<"slow-incdec", "SlowIncDec", "true", "INC and DEC instructions are slower than ADD and SUB">; +def FeatureUseSqrtEst : SubtargetFeature<"use-sqrt-est", "UseSqrtEst", "true", + "Use RSQRT* to optimize square root calculations">; //===----------------------------------------------------------------------===// // X86 processors supported. @@ -347,7 +349,8 @@ def : ProcessorModel<"btver2", BtVer2Model, [FeatureAVX, FeatureSSE4A, FeatureCMPXCHG16B, FeaturePRFCHW, FeatureAES, FeaturePCLMUL, FeatureBMI, FeatureF16C, FeatureMOVBE, - FeatureLZCNT, FeaturePOPCNT, FeatureSlowSHLD]>; + FeatureLZCNT, FeaturePOPCNT, FeatureSlowSHLD, + FeatureUseSqrtEst]>; // Bulldozer def : Proc<"bdver1", [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B, diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index dbe3c4aee1c..b3541545403 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -14367,6 +14367,36 @@ SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp, return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl); } +/// The minimum architected relative accuracy is 2^-12. We need one +/// Newton-Raphson step to have a good float result (24 bits of precision). +SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op, + DAGCombinerInfo &DCI, + unsigned &RefinementSteps, + bool &UseOneConstNR) const { + // FIXME: We should use instruction latency models to calculate the cost of + // each potential sequence, but this is very hard to do reliably because + // at least Intel's Core* chips have variable timing based on the number of + // significant digits in the divisor and/or sqrt operand. + if (!Subtarget->useSqrtEst()) + return SDValue(); + + EVT VT = Op.getValueType(); + + // SSE1 has rsqrtss and rsqrtps. + // TODO: Add support for AVX (v8f32) and AVX512 (v16f32). + // It is likely not profitable to do this for f64 because a double-precision + // rsqrt estimate with refinement on x86 prior to FMA requires at least 16 + // instructions: convert to single, rsqrtss, convert back to double, refine + // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA + // along with FMA, this could be a throughput win. + if (Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) { + RefinementSteps = 1; + UseOneConstNR = false; + return DCI.DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op); + } + return SDValue(); +} + static bool isAllOnes(SDValue V) { ConstantSDNode *C = dyn_cast<ConstantSDNode>(V); return C && C->isAllOnesValue(); diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index e8e611d0719..e81a9d1209c 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -1017,6 +1017,11 @@ namespace llvm { /// Convert a comparison if required by the subtarget. SDValue ConvertCmpIfNecessary(SDValue Cmp, SelectionDAG &DAG) const; + + /// Use rsqrt* to speed up sqrt calculations. + SDValue getRsqrtEstimate(SDValue Operand, DAGCombinerInfo &DCI, + unsigned &RefinementSteps, + bool &UseOneConstNR) const override; }; namespace X86 { diff --git a/llvm/lib/Target/X86/X86Subtarget.cpp b/llvm/lib/Target/X86/X86Subtarget.cpp index 413fc4b94db..82d62b4a365 100644 --- a/llvm/lib/Target/X86/X86Subtarget.cpp +++ b/llvm/lib/Target/X86/X86Subtarget.cpp @@ -278,6 +278,7 @@ void X86Subtarget::initializeEnvironment() { LEAUsesAG = false; SlowLEA = false; SlowIncDec = false; + UseSqrtEst = false; stackAlignment = 4; // FIXME: this is a known good value for Yonah. How about others? MaxInlineSizeThreshold = 128; diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h index 24ee5538347..cdecf2a9cb2 100644 --- a/llvm/lib/Target/X86/X86Subtarget.h +++ b/llvm/lib/Target/X86/X86Subtarget.h @@ -192,6 +192,11 @@ protected: /// SlowIncDec - True if INC and DEC instructions are slow when writing to flags bool SlowIncDec; + /// Use the RSQRT* instructions to optimize square root calculations. + /// For this to be profitable, the cost of FSQRT and FDIV must be + /// substantially higher than normal FP ops like FADD and FMUL. + bool UseSqrtEst; + /// Processor has AVX-512 PreFetch Instructions bool HasPFI; @@ -369,6 +374,7 @@ public: bool LEAusesAG() const { return LEAUsesAG; } bool slowLEA() const { return SlowLEA; } bool slowIncDec() const { return SlowIncDec; } + bool useSqrtEst() const { return UseSqrtEst; } bool hasCDI() const { return HasCDI; } bool hasPFI() const { return HasPFI; } bool hasERI() const { return HasERI; } |