diff options
Diffstat (limited to 'llvm/lib')
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp  |  8
-rw-r--r--  llvm/lib/CodeGen/TargetLoweringBase.cpp        |  1
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp  |  2
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h    |  3
-rw-r--r--  llvm/lib/Target/X86/X86.td                     | 19
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp        | 13
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.h          |  3
-rw-r--r--  llvm/lib/Target/X86/X86Subtarget.cpp           |  2
-rw-r--r--  llvm/lib/Target/X86/X86Subtarget.h             | 10
9 files changed, 54 insertions, 7 deletions
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index df928451fc5..5bcea642e01 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -8907,14 +8907,18 @@ SDValue DAGCombiner::visitFREM(SDNode *N) { } SDValue DAGCombiner::visitFSQRT(SDNode *N) { - if (!DAG.getTarget().Options.UnsafeFPMath || TLI.isFsqrtCheap()) + if (!DAG.getTarget().Options.UnsafeFPMath) + return SDValue(); + + SDValue N0 = N->getOperand(0); + if (TLI.isFsqrtCheap(N0, DAG)) return SDValue(); // TODO: FSQRT nodes should have flags that propagate to the created nodes. // For now, create a Flags object for use with all unsafe math transforms. SDNodeFlags Flags; Flags.setUnsafeAlgebra(true); - return buildSqrtEstimate(N->getOperand(0), &Flags); + return buildSqrtEstimate(N0, &Flags); } /// copysign(x, fp_extend(y)) -> copysign(x, y) diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index 25b246d622c..4bf02664946 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -807,7 +807,6 @@ TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm) : TM(tm) { SelectIsExpensive = false; HasMultipleConditionRegisters = false; HasExtractBitsInsn = false; - FsqrtIsCheap = false; JumpIsExpensive = JumpIsExpensiveOverride; PredictableSelectIsExpensive = false; MaskAndBranchFoldingIsLegal = false; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index a014549297f..b628d2a4c26 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -446,8 +446,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setSelectIsExpensive(false); PredictableSelectIsExpensive = false; - setFsqrtIsCheap(true); - // We want to find all load dependencies for long chains of stores to enable // 
merging into very wide vectors. The problem is with vectors with > 4 // elements. MergeConsecutiveStores will attempt to merge these because x8/x16 diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index 206c93cd0bc..20775df1b10 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -166,6 +166,9 @@ public: const char* getTargetNodeName(unsigned Opcode) const override; + bool isFsqrtCheap(SDValue Operand, SelectionDAG &DAG) const override { + return true; + } SDValue getRsqrtEstimate(SDValue Operand, DAGCombinerInfo &DCI, unsigned &RefinementSteps, diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td index 84be89063ad..9b6fe6893de 100644 --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -249,6 +249,19 @@ def FeatureSoftFloat def FeatureFastPartialYMMWrite : SubtargetFeature<"fast-partial-ymm-write", "HasFastPartialYMMWrite", "true", "Partial writes to YMM registers are fast">; +// FeatureFastScalarFSQRT should be enabled if scalar FSQRT has shorter latency +// than the corresponding NR code. FeatureFastVectorFSQRT should be enabled if +// vector FSQRT has higher throughput than the corresponding NR code. +// The idea is that throughput bound code is likely to be vectorized, so for +// vectorized code we should care about the throughput of SQRT operations. +// But if the code is scalar that probably means that the code has some kind of +// dependency and we should care more about reducing the latency. +def FeatureFastScalarFSQRT + : SubtargetFeature<"fast-scalar-fsqrt", "HasFastScalarFSQRT", + "true", "Scalar SQRT is fast (disable Newton-Raphson)">; +def FeatureFastVectorFSQRT + : SubtargetFeature<"fast-vector-fsqrt", "HasFastVectorFSQRT", + "true", "Vector SQRT is fast (disable Newton-Raphson)">; //===----------------------------------------------------------------------===// // X86 processors supported. 
@@ -442,7 +455,8 @@ def SNBFeatures : ProcessorFeatures<[], [ FeaturePCLMUL, FeatureXSAVE, FeatureXSAVEOPT, - FeatureLAHFSAHF + FeatureLAHFSAHF, + FeatureFastScalarFSQRT ]>; class SandyBridgeProc<string Name> : ProcModel<Name, SandyBridgeModel, @@ -500,7 +514,8 @@ def SKLFeatures : ProcessorFeatures<BDWFeatures.Value, [ FeatureXSAVEC, FeatureXSAVES, FeatureSGX, - FeatureCLFLUSHOPT + FeatureCLFLUSHOPT, + FeatureFastVectorFSQRT ]>; // FIXME: define SKL model diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 5ca486d8719..d5be62c7a67 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -15081,6 +15081,19 @@ SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp, return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl); } +/// Check if replacement of SQRT with RSQRT should be disabled. +bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + + // We never want to use both SQRT and RSQRT instructions for the same input. + if (DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(VT), Op)) + return false; + + if (VT.isVector()) + return Subtarget.hasFastVectorFSQRT(); + return Subtarget.hasFastScalarFSQRT(); +} + /// The minimum architected relative accuracy is 2^-12. We need one /// Newton-Raphson step to have a good float result (24 bits of precision). SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op, diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index 772f29f8934..d2df8291356 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -1219,6 +1219,9 @@ namespace llvm { /// Convert a comparison if required by the subtarget. SDValue ConvertCmpIfNecessary(SDValue Cmp, SelectionDAG &DAG) const; + /// Check if replacement of SQRT with RSQRT should be disabled. 
+ bool isFsqrtCheap(SDValue Operand, SelectionDAG &DAG) const override; + /// Use rsqrt* to speed up sqrt calculations. SDValue getRsqrtEstimate(SDValue Operand, DAGCombinerInfo &DCI, unsigned &RefinementSteps, diff --git a/llvm/lib/Target/X86/X86Subtarget.cpp b/llvm/lib/Target/X86/X86Subtarget.cpp index 8f77682d227..156c0b99a2b 100644 --- a/llvm/lib/Target/X86/X86Subtarget.cpp +++ b/llvm/lib/Target/X86/X86Subtarget.cpp @@ -282,6 +282,8 @@ void X86Subtarget::initializeEnvironment() { HasCmpxchg16b = false; UseLeaForSP = false; HasFastPartialYMMWrite = false; + HasFastScalarFSQRT = false; + HasFastVectorFSQRT = false; HasSlowDivide32 = false; HasSlowDivide64 = false; PadShortFunctions = false; diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h index a274b797ca8..c1f862d5077 100644 --- a/llvm/lib/Target/X86/X86Subtarget.h +++ b/llvm/lib/Target/X86/X86Subtarget.h @@ -199,6 +199,14 @@ protected: /// of a YMM register without clearing the upper part. bool HasFastPartialYMMWrite; + /// True if hardware SQRTSS instruction is at least as fast (latency) as + /// RSQRTSS followed by a Newton-Raphson iteration. + bool HasFastScalarFSQRT; + + /// True if hardware SQRTPS/VSQRTPS instructions are at least as fast + /// (throughput) as RSQRTPS/VRSQRTPS followed by a Newton-Raphson iteration. + bool HasFastVectorFSQRT; + /// True if 8-bit divisions are significantly faster than /// 32-bit divisions and should be used when possible. 
bool HasSlowDivide32; @@ -434,6 +442,8 @@ public: bool hasCmpxchg16b() const { return HasCmpxchg16b; } bool useLeaForSP() const { return UseLeaForSP; } bool hasFastPartialYMMWrite() const { return HasFastPartialYMMWrite; } + bool hasFastScalarFSQRT() const { return HasFastScalarFSQRT; } + bool hasFastVectorFSQRT() const { return HasFastVectorFSQRT; } bool hasSlowDivide32() const { return HasSlowDivide32; } bool hasSlowDivide64() const { return HasSlowDivide64; } bool padShortFunctions() const { return PadShortFunctions; } |