summaryrefslogtreecommitdiffstats
path: root/llvm/lib/Target
diff options
context:
space:
mode:
authorNikolai Bozhenov <nikolai.bozhenov@intel.com>2016-08-04 12:47:28 +0000
committerNikolai Bozhenov <nikolai.bozhenov@intel.com>2016-08-04 12:47:28 +0000
commitf679530ba18023d29765bde397fa77048bf17985 (patch)
tree26d32ee662bbb6f153eb39b81350d1d6859cd044 /llvm/lib/Target
parent8950cead7f2032d4dee6b17be4eb4c6b5d755403 (diff)
downloadbcm5719-llvm-f679530ba18023d29765bde397fa77048bf17985.tar.gz
bcm5719-llvm-f679530ba18023d29765bde397fa77048bf17985.zip
[X86] Heuristic to selectively build Newton-Raphson SQRT estimation
On modern Intel processors hardware SQRT in many cases is faster than RSQRT
followed by Newton-Raphson refinement. The patch introduces a simple heuristic
to choose between hardware SQRT instruction and Newton-Raphson software
estimation.

The patch treats scalars and vectors differently. The heuristic is that for
scalars the compiler should optimize for latency while for vectors it should
optimize for throughput. It is based on the assumption that throughput bound
code is likely to be vectorized.

Basically, the patch disables scalar NR for big cores and disables NR
completely for Skylake. Firstly, scalar SQRT has shorter latency than NR code
in big cores. Secondly, vector SQRT has been greatly improved in Skylake and
has better throughput compared to NR.

Differential Revision: https://reviews.llvm.org/D21379

llvm-svn: 277725
Diffstat (limited to 'llvm/lib/Target')
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp2
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h3
-rw-r--r--llvm/lib/Target/X86/X86.td19
-rw-r--r--llvm/lib/Target/X86/X86ISelLowering.cpp13
-rw-r--r--llvm/lib/Target/X86/X86ISelLowering.h3
-rw-r--r--llvm/lib/Target/X86/X86Subtarget.cpp2
-rw-r--r--llvm/lib/Target/X86/X86Subtarget.h10
7 files changed, 48 insertions, 4 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index a014549297f..b628d2a4c26 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -446,8 +446,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setSelectIsExpensive(false);
PredictableSelectIsExpensive = false;
- setFsqrtIsCheap(true);
-
// We want to find all load dependencies for long chains of stores to enable
// merging into very wide vectors. The problem is with vectors with > 4
// elements. MergeConsecutiveStores will attempt to merge these because x8/x16
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 206c93cd0bc..20775df1b10 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -166,6 +166,9 @@ public:
const char* getTargetNodeName(unsigned Opcode) const override;
+ bool isFsqrtCheap(SDValue Operand, SelectionDAG &DAG) const override {
+ return true;
+ }
SDValue getRsqrtEstimate(SDValue Operand,
DAGCombinerInfo &DCI,
unsigned &RefinementSteps,
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 84be89063ad..9b6fe6893de 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -249,6 +249,19 @@ def FeatureSoftFloat
def FeatureFastPartialYMMWrite
: SubtargetFeature<"fast-partial-ymm-write", "HasFastPartialYMMWrite",
"true", "Partial writes to YMM registers are fast">;
+// FeatureFastScalarFSQRT should be enabled if scalar FSQRT has shorter latency
+// than the corresponding NR code. FeatureFastVectorFSQRT should be enabled if
+// vector FSQRT has higher throughput than the corresponding NR code.
+// The idea is that throughput bound code is likely to be vectorized, so for
+// vectorized code we should care about the throughput of SQRT operations.
+// But if the code is scalar that probably means that the code has some kind of
+// dependency and we should care more about reducing the latency.
+def FeatureFastScalarFSQRT
+ : SubtargetFeature<"fast-scalar-fsqrt", "HasFastScalarFSQRT",
+ "true", "Scalar SQRT is fast (disable Newton-Raphson)">;
+def FeatureFastVectorFSQRT
+ : SubtargetFeature<"fast-vector-fsqrt", "HasFastVectorFSQRT",
+ "true", "Vector SQRT is fast (disable Newton-Raphson)">;
//===----------------------------------------------------------------------===//
// X86 processors supported.
@@ -442,7 +455,8 @@ def SNBFeatures : ProcessorFeatures<[], [
FeaturePCLMUL,
FeatureXSAVE,
FeatureXSAVEOPT,
- FeatureLAHFSAHF
+ FeatureLAHFSAHF,
+ FeatureFastScalarFSQRT
]>;
class SandyBridgeProc<string Name> : ProcModel<Name, SandyBridgeModel,
@@ -500,7 +514,8 @@ def SKLFeatures : ProcessorFeatures<BDWFeatures.Value, [
FeatureXSAVEC,
FeatureXSAVES,
FeatureSGX,
- FeatureCLFLUSHOPT
+ FeatureCLFLUSHOPT,
+ FeatureFastVectorFSQRT
]>;
// FIXME: define SKL model
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 5ca486d8719..d5be62c7a67 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -15081,6 +15081,19 @@ SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
}
+/// Check if replacement of SQRT with RSQRT should be disabled.
+bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+
+ // We never want to use both SQRT and RSQRT instructions for the same input.
+ if (DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
+ return false;
+
+ if (VT.isVector())
+ return Subtarget.hasFastVectorFSQRT();
+ return Subtarget.hasFastScalarFSQRT();
+}
+
/// The minimum architected relative accuracy is 2^-12. We need one
/// Newton-Raphson step to have a good float result (24 bits of precision).
SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op,
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 772f29f8934..d2df8291356 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1219,6 +1219,9 @@ namespace llvm {
/// Convert a comparison if required by the subtarget.
SDValue ConvertCmpIfNecessary(SDValue Cmp, SelectionDAG &DAG) const;
+ /// Check if replacement of SQRT with RSQRT should be disabled.
+ bool isFsqrtCheap(SDValue Operand, SelectionDAG &DAG) const override;
+
/// Use rsqrt* to speed up sqrt calculations.
SDValue getRsqrtEstimate(SDValue Operand, DAGCombinerInfo &DCI,
unsigned &RefinementSteps,
diff --git a/llvm/lib/Target/X86/X86Subtarget.cpp b/llvm/lib/Target/X86/X86Subtarget.cpp
index 8f77682d227..156c0b99a2b 100644
--- a/llvm/lib/Target/X86/X86Subtarget.cpp
+++ b/llvm/lib/Target/X86/X86Subtarget.cpp
@@ -282,6 +282,8 @@ void X86Subtarget::initializeEnvironment() {
HasCmpxchg16b = false;
UseLeaForSP = false;
HasFastPartialYMMWrite = false;
+ HasFastScalarFSQRT = false;
+ HasFastVectorFSQRT = false;
HasSlowDivide32 = false;
HasSlowDivide64 = false;
PadShortFunctions = false;
diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h
index a274b797ca8..c1f862d5077 100644
--- a/llvm/lib/Target/X86/X86Subtarget.h
+++ b/llvm/lib/Target/X86/X86Subtarget.h
@@ -199,6 +199,14 @@ protected:
/// of a YMM register without clearing the upper part.
bool HasFastPartialYMMWrite;
+ /// True if hardware SQRTSS instruction is at least as fast (latency) as
+ /// RSQRTSS followed by a Newton-Raphson iteration.
+ bool HasFastScalarFSQRT;
+
+ /// True if hardware SQRTPS/VSQRTPS instructions are at least as fast
+ /// (throughput) as RSQRTPS/VRSQRTPS followed by a Newton-Raphson iteration.
+ bool HasFastVectorFSQRT;
+
/// True if 8-bit divisions are significantly faster than
/// 32-bit divisions and should be used when possible.
bool HasSlowDivide32;
@@ -434,6 +442,8 @@ public:
bool hasCmpxchg16b() const { return HasCmpxchg16b; }
bool useLeaForSP() const { return UseLeaForSP; }
bool hasFastPartialYMMWrite() const { return HasFastPartialYMMWrite; }
+ bool hasFastScalarFSQRT() const { return HasFastScalarFSQRT; }
+ bool hasFastVectorFSQRT() const { return HasFastVectorFSQRT; }
bool hasSlowDivide32() const { return HasSlowDivide32; }
bool hasSlowDivide64() const { return HasSlowDivide64; }
bool padShortFunctions() const { return PadShortFunctions; }
OpenPOWER on IntegriCloud