diff options
| -rw-r--r-- | llvm/include/llvm/Target/TargetLowering.h | 14 | ||||
| -rw-r--r-- | llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 8 | ||||
| -rw-r--r-- | llvm/lib/CodeGen/TargetLoweringBase.cpp | 1 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 2 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 3 | ||||
| -rw-r--r-- | llvm/lib/Target/X86/X86.td | 19 | ||||
| -rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.cpp | 13 | ||||
| -rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.h | 3 | ||||
| -rw-r--r-- | llvm/lib/Target/X86/X86Subtarget.cpp | 2 | ||||
| -rw-r--r-- | llvm/lib/Target/X86/X86Subtarget.h | 10 | ||||
| -rw-r--r-- | llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll | 57 |
11 files changed, 115 insertions, 17 deletions
diff --git a/llvm/include/llvm/Target/TargetLowering.h b/llvm/include/llvm/Target/TargetLowering.h index 4586a172e76..baeb487208d 100644 --- a/llvm/include/llvm/Target/TargetLowering.h +++ b/llvm/include/llvm/Target/TargetLowering.h @@ -243,9 +243,10 @@ public: return true; } - /// Return true if sqrt(x) is as cheap or cheaper than 1 / rsqrt(x) - bool isFsqrtCheap() const { - return FsqrtIsCheap; + /// Return true if SQRT(X) shouldn't be replaced with X*RSQRT(X). + virtual bool isFsqrtCheap(SDValue X, SelectionDAG &DAG) const { + // Default behavior is to replace SQRT(X) with X*RSQRT(X). + return false; } /// Returns true if target has indicated at least one type should be bypassed. @@ -1381,10 +1382,6 @@ protected: /// control. void setJumpIsExpensive(bool isExpensive = true); - /// Tells the code generator that fsqrt is cheap, and should not be replaced - /// with an alternative sequence of instructions. - void setFsqrtIsCheap(bool isCheap = true) { FsqrtIsCheap = isCheap; } - /// Tells the code generator that this target supports floating point /// exceptions and cares about preserving floating point exception behavior. void setHasFloatingPointExceptions(bool FPExceptions = true) { @@ -1910,9 +1907,6 @@ private: /// combined with "shift" to BitExtract instructions. bool HasExtractBitsInsn; - // Don't expand fsqrt with an approximation based on the inverse sqrt. - bool FsqrtIsCheap; - /// Tells the code generator to bypass slow divide or remainder /// instructions. For example, BypassSlowDivWidths[32,8] tells the code /// generator to bypass 32-bit integer div/rem with an 8-bit unsigned integer diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index df928451fc5..5bcea642e01 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -8907,14 +8907,18 @@ SDValue DAGCombiner::visitFREM(SDNode *N) { } SDValue DAGCombiner::visitFSQRT(SDNode *N) { - if (!DAG.getTarget().Options.UnsafeFPMath || TLI.isFsqrtCheap()) + if (!DAG.getTarget().Options.UnsafeFPMath) + return SDValue(); + + SDValue N0 = N->getOperand(0); + if (TLI.isFsqrtCheap(N0, DAG)) return SDValue(); // TODO: FSQRT nodes should have flags that propagate to the created nodes. // For now, create a Flags object for use with all unsafe math transforms. SDNodeFlags Flags; Flags.setUnsafeAlgebra(true); - return buildSqrtEstimate(N->getOperand(0), &Flags); + return buildSqrtEstimate(N0, &Flags); } /// copysign(x, fp_extend(y)) -> copysign(x, y) diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index 25b246d622c..4bf02664946 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -807,7 +807,6 @@ TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm) : TM(tm) { SelectIsExpensive = false; HasMultipleConditionRegisters = false; HasExtractBitsInsn = false; - FsqrtIsCheap = false; JumpIsExpensive = JumpIsExpensiveOverride; PredictableSelectIsExpensive = false; MaskAndBranchFoldingIsLegal = false; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index a014549297f..b628d2a4c26 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -446,8 +446,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setSelectIsExpensive(false); PredictableSelectIsExpensive = false; - setFsqrtIsCheap(true); - // We want to find all load dependencies for long chains of stores to enable // merging into very wide vectors. The problem is with vectors with > 4 // elements. MergeConsecutiveStores will attempt to merge these because x8/x16 diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index 206c93cd0bc..20775df1b10 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -166,6 +166,9 @@ public: const char* getTargetNodeName(unsigned Opcode) const override; + bool isFsqrtCheap(SDValue Operand, SelectionDAG &DAG) const override { + return true; + } SDValue getRsqrtEstimate(SDValue Operand, DAGCombinerInfo &DCI, unsigned &RefinementSteps, diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td index 84be89063ad..9b6fe6893de 100644 --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -249,6 +249,19 @@ def FeatureSoftFloat def FeatureFastPartialYMMWrite : SubtargetFeature<"fast-partial-ymm-write", "HasFastPartialYMMWrite", "true", "Partial writes to YMM registers are fast">; +// FeatureFastScalarFSQRT should be enabled if scalar FSQRT has shorter latency +// than the corresponding NR code. FeatureFastVectorFSQRT should be enabled if +// vector FSQRT has higher throughput than the corresponding NR code. +// The idea is that throughput bound code is likely to be vectorized, so for +// vectorized code we should care about the throughput of SQRT operations. +// But if the code is scalar that probably means that the code has some kind of +// dependency and we should care more about reducing the latency. +def FeatureFastScalarFSQRT + : SubtargetFeature<"fast-scalar-fsqrt", "HasFastScalarFSQRT", + "true", "Scalar SQRT is fast (disable Newton-Raphson)">; +def FeatureFastVectorFSQRT + : SubtargetFeature<"fast-vector-fsqrt", "HasFastVectorFSQRT", + "true", "Vector SQRT is fast (disable Newton-Raphson)">; //===----------------------------------------------------------------------===// // X86 processors supported. @@ -442,7 +455,8 @@ def SNBFeatures : ProcessorFeatures<[], [ FeaturePCLMUL, FeatureXSAVE, FeatureXSAVEOPT, - FeatureLAHFSAHF + FeatureLAHFSAHF, + FeatureFastScalarFSQRT ]>; class SandyBridgeProc<string Name> : ProcModel<Name, SandyBridgeModel, @@ -500,7 +514,8 @@ def SKLFeatures : ProcessorFeatures<BDWFeatures.Value, [ FeatureXSAVEC, FeatureXSAVES, FeatureSGX, - FeatureCLFLUSHOPT + FeatureCLFLUSHOPT, + FeatureFastVectorFSQRT ]>; // FIXME: define SKL model diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 5ca486d8719..d5be62c7a67 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -15081,6 +15081,19 @@ SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp, return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl); } +/// Check if replacement of SQRT with RSQRT should be disabled. +bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + + // We never want to use both SQRT and RSQRT instructions for the same input. + if (DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(VT), Op)) + return false; + + if (VT.isVector()) + return Subtarget.hasFastVectorFSQRT(); + return Subtarget.hasFastScalarFSQRT(); +} + /// The minimum architected relative accuracy is 2^-12. We need one /// Newton-Raphson step to have a good float result (24 bits of precision). SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op, diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index 772f29f8934..d2df8291356 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -1219,6 +1219,9 @@ namespace llvm { /// Convert a comparison if required by the subtarget. SDValue ConvertCmpIfNecessary(SDValue Cmp, SelectionDAG &DAG) const; + /// Check if replacement of SQRT with RSQRT should be disabled. + bool isFsqrtCheap(SDValue Operand, SelectionDAG &DAG) const override; + /// Use rsqrt* to speed up sqrt calculations. SDValue getRsqrtEstimate(SDValue Operand, DAGCombinerInfo &DCI, unsigned &RefinementSteps, diff --git a/llvm/lib/Target/X86/X86Subtarget.cpp b/llvm/lib/Target/X86/X86Subtarget.cpp index 8f77682d227..156c0b99a2b 100644 --- a/llvm/lib/Target/X86/X86Subtarget.cpp +++ b/llvm/lib/Target/X86/X86Subtarget.cpp @@ -282,6 +282,8 @@ void X86Subtarget::initializeEnvironment() { HasCmpxchg16b = false; UseLeaForSP = false; HasFastPartialYMMWrite = false; + HasFastScalarFSQRT = false; + HasFastVectorFSQRT = false; HasSlowDivide32 = false; HasSlowDivide64 = false; PadShortFunctions = false; diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h index a274b797ca8..c1f862d5077 100644 --- a/llvm/lib/Target/X86/X86Subtarget.h +++ b/llvm/lib/Target/X86/X86Subtarget.h @@ -199,6 +199,14 @@ protected: /// of a YMM register without clearing the upper part. bool HasFastPartialYMMWrite; + /// True if hardware SQRTSS instruction is at least as fast (latency) as + /// RSQRTSS followed by a Newton-Raphson iteration. + bool HasFastScalarFSQRT; + + /// True if hardware SQRTPS/VSQRTPS instructions are at least as fast + /// (throughput) as RSQRTPS/VRSQRTPS followed by a Newton-Raphson iteration. + bool HasFastVectorFSQRT; + /// True if 8-bit divisions are significantly faster than /// 32-bit divisions and should be used when possible. bool HasSlowDivide32; @@ -434,6 +442,8 @@ public: bool hasCmpxchg16b() const { return HasCmpxchg16b; } bool useLeaForSP() const { return UseLeaForSP; } bool hasFastPartialYMMWrite() const { return HasFastPartialYMMWrite; } + bool hasFastScalarFSQRT() const { return HasFastScalarFSQRT; } + bool hasFastVectorFSQRT() const { return HasFastVectorFSQRT; } bool hasSlowDivide32() const { return HasSlowDivide32; } bool hasSlowDivide64() const { return HasSlowDivide64; } bool padShortFunctions() const { return PadShortFunctions; } diff --git a/llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll b/llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll new file mode 100644 index 00000000000..afa01b674a6 --- /dev/null +++ b/llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll @@ -0,0 +1,57 @@ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -O2 -mcpu=nehalem | FileCheck %s --check-prefix=SCALAR-EST --check-prefix=VECTOR-EST +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -O2 -mcpu=sandybridge | FileCheck %s --check-prefix=SCALAR-ACC --check-prefix=VECTOR-EST +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -O2 -mcpu=broadwell | FileCheck %s --check-prefix=SCALAR-ACC --check-prefix=VECTOR-EST +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -O2 -mcpu=skylake | FileCheck %s --check-prefix=SCALAR-ACC --check-prefix=VECTOR-ACC + +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -O2 -mattr=+fast-scalar-fsqrt,-fast-vector-fsqrt | FileCheck %s --check-prefix=SCALAR-ACC --check-prefix=VECTOR-EST +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -O2 -mattr=-fast-scalar-fsqrt,+fast-vector-fsqrt | FileCheck %s --check-prefix=SCALAR-EST --check-prefix=VECTOR-ACC + +declare float @llvm.sqrt.f32(float) #0 +declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) #0 +declare <8 x float> @llvm.sqrt.v8f32(<8 x float>) #0 + +define float @foo_x1(float %f) #0 { +; SCALAR-EST-LABEL: foo_x1: +; SCALAR-EST: # BB#0: +; SCALAR-EST-NEXT: rsqrtss %xmm0 +; SCALAR-EST: retq +; +; SCALAR-ACC-LABEL: foo_x1: +; SCALAR-ACC: # BB#0: +; SCALAR-ACC-NEXT: {{^ *v?sqrtss %xmm0}} +; SCALAR-ACC-NEXT: retq + %call = tail call float @llvm.sqrt.f32(float %f) #1 + ret float %call +} + +define <4 x float> @foo_x4(<4 x float> %f) #0 { +; VECTOR-EST-LABEL: foo_x4: +; VECTOR-EST: # BB#0: +; VECTOR-EST-NEXT: rsqrtps %xmm0 +; VECTOR-EST: retq +; +; VECTOR-ACC-LABEL: foo_x4: +; VECTOR-ACC: # BB#0: +; VECTOR-ACC-NEXT: {{^ *v?sqrtps %xmm0}} +; VECTOR-ACC-NEXT: retq + %call = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %f) #1 + ret <4 x float> %call +} + +define <8 x float> @foo_x8(<8 x float> %f) #0 { +; VECTOR-EST-LABEL: foo_x8: +; VECTOR-EST: # BB#0: +; VECTOR-EST-NEXT: rsqrtps +; VECTOR-EST: retq +; +; VECTOR-ACC-LABEL: foo_x8: +; VECTOR-ACC: # BB#0: +; VECTOR-ACC-NEXT: {{^ *v?sqrtps %[xy]mm0}} +; VECTOR-ACC-NOT: rsqrt +; VECTOR-ACC: retq + %call = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %f) #1 + ret <8 x float> %call +} + +attributes #0 = { "unsafe-fp-math"="true" } +attributes #1 = { nounwind readnone } |

