Diffstat (limited to 'llvm/lib')
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp  |  8
-rw-r--r--  llvm/lib/CodeGen/TargetLoweringBase.cpp        |  1
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp  |  2
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h    |  3
-rw-r--r--  llvm/lib/Target/X86/X86.td                     | 19
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp        | 13
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.h          |  3
-rw-r--r--  llvm/lib/Target/X86/X86Subtarget.cpp           |  2
-rw-r--r--  llvm/lib/Target/X86/X86Subtarget.h             | 10
9 files changed, 54 insertions, 7 deletions
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index df928451fc5..5bcea642e01 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -8907,14 +8907,18 @@ SDValue DAGCombiner::visitFREM(SDNode *N) {
}
SDValue DAGCombiner::visitFSQRT(SDNode *N) {
- if (!DAG.getTarget().Options.UnsafeFPMath || TLI.isFsqrtCheap())
+ if (!DAG.getTarget().Options.UnsafeFPMath)
+ return SDValue();
+
+ SDValue N0 = N->getOperand(0);
+ if (TLI.isFsqrtCheap(N0, DAG))
return SDValue();
// TODO: FSQRT nodes should have flags that propagate to the created nodes.
// For now, create a Flags object for use with all unsafe math transforms.
SDNodeFlags Flags;
Flags.setUnsafeAlgebra(true);
- return buildSqrtEstimate(N->getOperand(0), &Flags);
+ return buildSqrtEstimate(N0, &Flags);
}
/// copysign(x, fp_extend(y)) -> copysign(x, y)
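
For context, the estimate sequence that buildSqrtEstimate emits under
unsafe-fp-math is arithmetically equivalent to the following standalone
sketch (a hypothetical illustration, not the actual DAG construction; it
assumes SSE, a single refinement step, and ignores x == 0, where the raw
RSQRT estimate returns +inf):

#include <xmmintrin.h>

// sqrt(a) computed as a * rsqrt(a), with one Newton-Raphson step to
// refine the ~12-bit hardware estimate to roughly 24 bits.
static float sqrt_via_rsqrt(float a) {
  float est = _mm_cvtss_f32(_mm_rsqrt_ss(_mm_set_ss(a))); // ~2^-12 estimate
  est = est * (1.5f - 0.5f * a * est * est);              // Newton-Raphson step
  return a * est;                                         // sqrt(a) = a * rsqrt(a)
}
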
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index 25b246d622c..4bf02664946 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -807,7 +807,6 @@ TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm) : TM(tm) {
SelectIsExpensive = false;
HasMultipleConditionRegisters = false;
HasExtractBitsInsn = false;
- FsqrtIsCheap = false;
JumpIsExpensive = JumpIsExpensiveOverride;
PredictableSelectIsExpensive = false;
MaskAndBranchFoldingIsLegal = false;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index a014549297f..b628d2a4c26 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -446,8 +446,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setSelectIsExpensive(false);
PredictableSelectIsExpensive = false;
- setFsqrtIsCheap(true);
-
// We want to find all load dependencies for long chains of stores to enable
// merging into very wide vectors. The problem is with vectors with > 4
// elements. MergeConsecutiveStores will attempt to merge these because x8/x16
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 206c93cd0bc..20775df1b10 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -166,6 +166,9 @@ public:
const char* getTargetNodeName(unsigned Opcode) const override;
+ bool isFsqrtCheap(SDValue Operand, SelectionDAG &DAG) const override {
+ return true;
+ }
SDValue getRsqrtEstimate(SDValue Operand,
DAGCombinerInfo &DCI,
unsigned &RefinementSteps,
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 84be89063ad..9b6fe6893de 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -249,6 +249,19 @@ def FeatureSoftFloat
def FeatureFastPartialYMMWrite
: SubtargetFeature<"fast-partial-ymm-write", "HasFastPartialYMMWrite",
"true", "Partial writes to YMM registers are fast">;
+// FeatureFastScalarFSQRT should be enabled if scalar FSQRT has shorter latency
+// than the corresponding NR code. FeatureFastVectorFSQRT should be enabled if
+// vector FSQRT has higher throughput than the corresponding NR code.
+// The idea is that throughput-bound code is likely to be vectorized, so for
+// vectorized code we should care about the throughput of SQRT operations.
+// But if the code is scalar, that probably means it has some kind of
+// dependency, and we should care more about reducing the latency.
+def FeatureFastScalarFSQRT
+ : SubtargetFeature<"fast-scalar-fsqrt", "HasFastScalarFSQRT",
+ "true", "Scalar SQRT is fast (disable Newton-Raphson)">;
+def FeatureFastVectorFSQRT
+ : SubtargetFeature<"fast-vector-fsqrt", "HasFastVectorFSQRT",
+ "true", "Vector SQRT is fast (disable Newton-Raphson)">;
//===----------------------------------------------------------------------===//
// X86 processors supported.
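
To make the latency/throughput distinction above concrete, here is a
hypothetical pair of loops (illustrative only; the function names are
invented for this sketch):

#include <cmath>

// Latency-bound: each sqrt depends on the previous result, so the
// critical path is the latency of one sqrt -- a fast SQRTSS wins here.
float chained_sqrt(float x, int n) {
  for (int i = 0; i < n; ++i)
    x = std::sqrt(x + 1.0f);
  return x;
}

// Throughput-bound: the sqrts are independent, so a vectorizer can
// batch them, and sustained SQRT throughput is what matters.
void batched_sqrt(const float *in, float *out, int n) {
  for (int i = 0; i < n; ++i)
    out[i] = std::sqrt(in[i]);
}
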
@@ -442,7 +455,8 @@ def SNBFeatures : ProcessorFeatures<[], [
FeaturePCLMUL,
FeatureXSAVE,
FeatureXSAVEOPT,
- FeatureLAHFSAHF
+ FeatureLAHFSAHF,
+ FeatureFastScalarFSQRT
]>;
class SandyBridgeProc<string Name> : ProcModel<Name, SandyBridgeModel,
@@ -500,7 +514,8 @@ def SKLFeatures : ProcessorFeatures<BDWFeatures.Value, [
FeatureXSAVEC,
FeatureXSAVES,
FeatureSGX,
- FeatureCLFLUSHOPT
+ FeatureCLFLUSHOPT,
+ FeatureFastVectorFSQRT
]>;
// FIXME: define SKL model
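
On CPUs not covered by the processor definitions above, the new features
can also be toggled explicitly, for example via a standard llc invocation
(input file name hypothetical):

llc -mattr=+fast-scalar-fsqrt,+fast-vector-fsqrt input.ll
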
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 5ca486d8719..d5be62c7a67 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -15081,6 +15081,19 @@ SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
}
+/// Check if replacement of SQRT with RSQRT should be disabled.
+bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+
+ // We never want to use both SQRT and RSQRT instructions for the same input.
+ if (DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
+ return false;
+
+ if (VT.isVector())
+ return Subtarget.hasFastVectorFSQRT();
+ return Subtarget.hasFastScalarFSQRT();
+}
+
/// The minimum architected relative accuracy is 2^-12. We need one
/// Newton-Raphson step to have a good float result (24 bits of precision).
SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op,
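
A sketch of the error analysis behind the comment above (standard
Newton-Raphson reasoning, not part of the patch): writing the initial
estimate as x_0 = (1+e)/\sqrt{a} with |e| \le 2^{-12}, one refinement step
gives

  x_1 = \frac{x_0 (3 - a x_0^2)}{2} = \frac{1 - \tfrac{3}{2}e^2 - \tfrac{1}{2}e^3}{\sqrt{a}}

so the relative error drops to about (3/2)e^2 \le 1.5 \cdot 2^{-24},
covering roughly the full 24-bit float significand after a single step.
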
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 772f29f8934..d2df8291356 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1219,6 +1219,9 @@ namespace llvm {
/// Convert a comparison if required by the subtarget.
SDValue ConvertCmpIfNecessary(SDValue Cmp, SelectionDAG &DAG) const;
+ /// Check if replacement of SQRT with RSQRT should be disabled.
+ bool isFsqrtCheap(SDValue Operand, SelectionDAG &DAG) const override;
+
/// Use rsqrt* to speed up sqrt calculations.
SDValue getRsqrtEstimate(SDValue Operand, DAGCombinerInfo &DCI,
unsigned &RefinementSteps,
diff --git a/llvm/lib/Target/X86/X86Subtarget.cpp b/llvm/lib/Target/X86/X86Subtarget.cpp
index 8f77682d227..156c0b99a2b 100644
--- a/llvm/lib/Target/X86/X86Subtarget.cpp
+++ b/llvm/lib/Target/X86/X86Subtarget.cpp
@@ -282,6 +282,8 @@ void X86Subtarget::initializeEnvironment() {
HasCmpxchg16b = false;
UseLeaForSP = false;
HasFastPartialYMMWrite = false;
+ HasFastScalarFSQRT = false;
+ HasFastVectorFSQRT = false;
HasSlowDivide32 = false;
HasSlowDivide64 = false;
PadShortFunctions = false;
diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h
index a274b797ca8..c1f862d5077 100644
--- a/llvm/lib/Target/X86/X86Subtarget.h
+++ b/llvm/lib/Target/X86/X86Subtarget.h
@@ -199,6 +199,14 @@ protected:
/// of a YMM register without clearing the upper part.
bool HasFastPartialYMMWrite;
+ /// True if the hardware SQRTSS instruction is at least as fast (in latency)
+ /// as RSQRTSS followed by a Newton-Raphson iteration.
+ bool HasFastScalarFSQRT;
+
+ /// True if the hardware SQRTPS/VSQRTPS instructions are at least as fast
+ /// (in throughput) as RSQRTPS/VRSQRTPS followed by a Newton-Raphson iteration.
+ bool HasFastVectorFSQRT;
+
/// True if 8-bit divisions are significantly faster than
/// 32-bit divisions and should be used when possible.
bool HasSlowDivide32;
@@ -434,6 +442,8 @@ public:
bool hasCmpxchg16b() const { return HasCmpxchg16b; }
bool useLeaForSP() const { return UseLeaForSP; }
bool hasFastPartialYMMWrite() const { return HasFastPartialYMMWrite; }
+ bool hasFastScalarFSQRT() const { return HasFastScalarFSQRT; }
+ bool hasFastVectorFSQRT() const { return HasFastVectorFSQRT; }
bool hasSlowDivide32() const { return HasSlowDivide32; }
bool hasSlowDivide64() const { return HasSlowDivide64; }
bool padShortFunctions() const { return PadShortFunctions; }