author    Rafael Espindola <rafael.espindola@gmail.com>  2015-06-03 05:32:44 +0000
committer Rafael Espindola <rafael.espindola@gmail.com>  2015-06-03 05:32:44 +0000
commit    cf8beece97c5b8695fa1e8a5f5727ad730424595 (patch)
tree      e66e2cd3be4fa4de338662514ebddbd233f822a3 /llvm/lib/Target/X86
parent    75d5b5495f1514e239c1b18254ddcf7a297e80ee (diff)
Revert "make reciprocal estimate code generation more flexible by adding command-line options (2nd try)"
This reverts commit r238842. It broke -DBUILD_SHARED_LIBS=ON build. llvm-svn: 238900
Diffstat (limited to 'llvm/lib/Target/X86')
-rw-r--r--  llvm/lib/Target/X86/X86.td                6
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp  68
-rw-r--r--  llvm/lib/Target/X86/X86Subtarget.cpp      2
-rw-r--r--  llvm/lib/Target/X86/X86Subtarget.h       12
-rw-r--r--  llvm/lib/Target/X86/X86TargetMachine.cpp  7
5 files changed, 54 insertions(+), 41 deletions(-)
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 7001910a61e..c70e2e95463 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -188,6 +188,10 @@ def FeatureSlowLEA : SubtargetFeature<"slow-lea", "SlowLEA", "true",
"LEA instruction with certain arguments is slow">;
def FeatureSlowIncDec : SubtargetFeature<"slow-incdec", "SlowIncDec", "true",
"INC and DEC instructions are slower than ADD and SUB">;
+def FeatureUseSqrtEst : SubtargetFeature<"use-sqrt-est", "UseSqrtEst", "true",
+ "Use RSQRT* to optimize square root calculations">;
+def FeatureUseRecipEst : SubtargetFeature<"use-recip-est", "UseReciprocalEst",
+ "true", "Use RCP* to optimize division calculations">;
def FeatureSoftFloat
: SubtargetFeature<"soft-float", "UseSoftFloat", "true",
"Use software floating point features.">;
@@ -440,7 +444,7 @@ def : ProcessorModel<"btver2", BtVer2Model,
FeaturePRFCHW, FeatureAES, FeaturePCLMUL,
FeatureBMI, FeatureF16C, FeatureMOVBE,
FeatureLZCNT, FeaturePOPCNT, FeatureFastUAMem,
- FeatureSlowSHLD]>;
+ FeatureSlowSHLD, FeatureUseSqrtEst, FeatureUseRecipEst]>;
// TODO: We should probably add 'FeatureFastUAMem' to all of the AMD chips.
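[The two flags restored above are ordinary subtarget feature bits, so besides being implied by -mcpu=btver2 they can be forced on for any CPU with the usual attribute syntax. A usage sketch; the input file name is hypothetical, and the estimate paths only fire under unsafe FP math:

    llc -mcpu=btver2 -enable-unsafe-fp-math recip.ll
    llc -mattr=+use-sqrt-est,+use-recip-est -enable-unsafe-fp-math recip.ll
]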
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 248c452d544..a4787c81661 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -67,6 +67,12 @@ static cl::opt<bool> ExperimentalVectorWideningLegalization(
"rather than promotion."),
cl::Hidden);
+static cl::opt<int> ReciprocalEstimateRefinementSteps(
+ "x86-recip-refinement-steps", cl::init(1),
+ cl::desc("Specify the number of Newton-Raphson iterations applied to the "
+ "result of the hardware reciprocal estimate instruction."),
+ cl::NotHidden);
+
// Forward declarations.
static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
SDValue V2);
@@ -13000,31 +13006,29 @@ SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op,
DAGCombinerInfo &DCI,
unsigned &RefinementSteps,
bool &UseOneConstNR) const {
+ // FIXME: We should use instruction latency models to calculate the cost of
+ // each potential sequence, but this is very hard to do reliably because
+ // at least Intel's Core* chips have variable timing based on the number of
+ // significant digits in the divisor and/or sqrt operand.
+ if (!Subtarget->useSqrtEst())
+ return SDValue();
+
EVT VT = Op.getValueType();
- const char *RecipOp;
- // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
+ // SSE1 has rsqrtss and rsqrtps.
// TODO: Add support for AVX512 (v16f32).
// It is likely not profitable to do this for f64 because a double-precision
// rsqrt estimate with refinement on x86 prior to FMA requires at least 16
// instructions: convert to single, rsqrtss, convert back to double, refine
// (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
// along with FMA, this could be a throughput win.
- if (VT == MVT::f32 && Subtarget->hasSSE1())
- RecipOp = "sqrtf";
- else if ((VT == MVT::v4f32 && Subtarget->hasSSE1()) ||
- (VT == MVT::v8f32 && Subtarget->hasAVX()))
- RecipOp = "vec-sqrtf";
- else
- return SDValue();
-
- TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals;
- if (!Recips.isEnabled(RecipOp))
- return SDValue();
-
- RefinementSteps = Recips.getRefinementSteps(RecipOp);
- UseOneConstNR = false;
- return DCI.DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
+ if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
+ (Subtarget->hasAVX() && VT == MVT::v8f32)) {
+ RefinementSteps = 1;
+ UseOneConstNR = false;
+ return DCI.DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
+ }
+ return SDValue();
}
/// The minimum architected relative accuracy is 2^-12. We need one
@@ -13032,9 +13036,15 @@ SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op,
SDValue X86TargetLowering::getRecipEstimate(SDValue Op,
DAGCombinerInfo &DCI,
unsigned &RefinementSteps) const {
+ // FIXME: We should use instruction latency models to calculate the cost of
+ // each potential sequence, but this is very hard to do reliably because
+ // at least Intel's Core* chips have variable timing based on the number of
+ // significant digits in the divisor.
+ if (!Subtarget->useReciprocalEst())
+ return SDValue();
+
EVT VT = Op.getValueType();
- const char *RecipOp;
-
+
// SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
// TODO: Add support for AVX512 (v16f32).
// It is likely not profitable to do this for f64 because a double-precision
@@ -13042,20 +13052,12 @@ SDValue X86TargetLowering::getRecipEstimate(SDValue Op,
// 15 instructions: convert to single, rcpss, convert back to double, refine
// (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
// along with FMA, this could be a throughput win.
- if (VT == MVT::f32 && Subtarget->hasSSE1())
- RecipOp = "divf";
- else if ((VT == MVT::v4f32 && Subtarget->hasSSE1()) ||
- (VT == MVT::v8f32 && Subtarget->hasAVX()))
- RecipOp = "vec-divf";
- else
- return SDValue();
-
- TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals;
- if (!Recips.isEnabled(RecipOp))
- return SDValue();
-
- RefinementSteps = Recips.getRefinementSteps(RecipOp);
- return DCI.DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
+ if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
+ (Subtarget->hasAVX() && VT == MVT::v8f32)) {
+ RefinementSteps = ReciprocalEstimateRefinementSteps;
+ return DCI.DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
+ }
+ return SDValue();
}
/// If we have at least two divisions that use the same divisor, convert to
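[The Newton-Raphson refinement that consumes RefinementSteps is emitted by the generic DAG combiner, not by this file. As a standalone illustration of the iteration these comments refer to, here is a minimal scalar C++ sketch; the formulas are the standard NR updates, and the seed error of 2^-12 matches the architected minimum accuracy cited above:

    #include <cmath>
    #include <cstdio>

    // One NR step for a reciprocal estimate x ~= 1/a: x' = x*(2 - a*x).
    // The relative error e becomes -e^2, i.e. it is squared per step.
    static float refineRecip(float a, float x) { return x * (2.0f - a * x); }

    // One NR step for an rsqrt estimate x ~= 1/sqrt(a):
    // x' = x*(1.5 - 0.5*a*x*x); the relative error becomes ~ -1.5*e^2.
    static float refineRsqrt(float a, float x) {
      return x * (1.5f - 0.5f * a * x * x);
    }

    int main() {
      const float a = 3.0f;
      const float e = 1.0f / 4096.0f; // worst-case 2^-12 estimate error
      float rcp = (1.0f / a) * (1.0f + e);
      float rsq = (1.0f / std::sqrt(a)) * (1.0f + e);
      rcp = refineRecip(a, rcp); // one step, matching RefinementSteps = 1
      rsq = refineRsqrt(a, rsq);
      // Both residuals drop from ~2^-12 to near single-precision round-off.
      std::printf("%g %g\n", std::fabs((double)rcp * a - 1.0),
                  std::fabs((double)rsq * rsq * a - 1.0));
      return 0;
    }

One step already reaches roughly the 2^-23 relative accuracy a float can hold, which is why the rsqrt path above hardcodes a single step.]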
diff --git a/llvm/lib/Target/X86/X86Subtarget.cpp b/llvm/lib/Target/X86/X86Subtarget.cpp
index 65e702e7f35..1cdab14e034 100644
--- a/llvm/lib/Target/X86/X86Subtarget.cpp
+++ b/llvm/lib/Target/X86/X86Subtarget.cpp
@@ -273,6 +273,8 @@ void X86Subtarget::initializeEnvironment() {
LEAUsesAG = false;
SlowLEA = false;
SlowIncDec = false;
+ UseSqrtEst = false;
+ UseReciprocalEst = false;
stackAlignment = 4;
// FIXME: this is a known good value for Yonah. How about others?
MaxInlineSizeThreshold = 128;
diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h
index 27429d050bd..455dd7744d7 100644
--- a/llvm/lib/Target/X86/X86Subtarget.h
+++ b/llvm/lib/Target/X86/X86Subtarget.h
@@ -190,6 +190,16 @@ protected:
/// True if INC and DEC instructions are slow when writing to flags
bool SlowIncDec;
+ /// Use the RSQRT* instructions to optimize square root calculations.
+ /// For this to be profitable, the cost of FSQRT and FDIV must be
+ /// substantially higher than normal FP ops like FADD and FMUL.
+ bool UseSqrtEst;
+
+ /// Use the RCP* instructions to optimize FP division calculations.
+ /// For this to be profitable, the cost of FDIV must be
+ /// substantially higher than normal FP ops like FADD and FMUL.
+ bool UseReciprocalEst;
+
/// Processor has AVX-512 PreFetch Instructions
bool HasPFI;
@@ -367,6 +377,8 @@ public:
bool LEAusesAG() const { return LEAUsesAG; }
bool slowLEA() const { return SlowLEA; }
bool slowIncDec() const { return SlowIncDec; }
+ bool useSqrtEst() const { return UseSqrtEst; }
+ bool useReciprocalEst() const { return UseReciprocalEst; }
bool hasCDI() const { return HasCDI; }
bool hasPFI() const { return HasPFI; }
bool hasERI() const { return HasERI; }
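[To make the profitability comments concrete: with one refinement step, the reciprocal path trades a single high-latency divps for an rcpps plus a few short-latency mul/sub ops. A hedged intrinsics sketch of that shape, not the compiler's actual expansion (which the DAG combiner builds):

    #include <immintrin.h>

    // Exact: one divps, long latency on most cores.
    static __m128 divExact(__m128 a, __m128 b) { return _mm_div_ps(a, b); }

    // Estimate + one Newton-Raphson step: rcpps, two mulps, one subps,
    // plus a final multiply by the numerator -- all short-latency ops.
    static __m128 divEstimate(__m128 a, __m128 b) {
      __m128 x = _mm_rcp_ps(b);                        // ~2^-12 accurate 1/b
      x = _mm_mul_ps(x, _mm_sub_ps(_mm_set1_ps(2.0f),
                                   _mm_mul_ps(b, x))); // x*(2 - b*x)
      return _mm_mul_ps(a, x);
    }

The feature bits only pay off where, as on btver2, divps/sqrtps latency dwarfs the cost of those four or five replacement ops.]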
diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp
index 646cff7c5bd..4116cef9b07 100644
--- a/llvm/lib/Target/X86/X86TargetMachine.cpp
+++ b/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -105,13 +105,6 @@ X86TargetMachine::X86TargetMachine(const Target &T, StringRef TT, StringRef CPU,
if (Subtarget.isTargetWin64())
this->Options.TrapUnreachable = true;
- // TODO: By default, all reciprocal estimate operations are off because
- // that matches the behavior before TargetRecip was added (except for btver2
- // which used subtarget features to enable this type of codegen).
- // We should change this to match GCC behavior where everything but
- // scalar division estimates are turned on by default with -ffast-math.
- this->Options.Reciprocals.setDefaults("all", false, 1);
-
initAsmInfo();
}
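[With the TargetRecip defaults gone, the only remaining control over the iteration count is the x86-recip-refinement-steps option restored in X86ISelLowering.cpp, which defaults to 1 and affects only the FRCP path (the rsqrt path hardcodes one step). A usage sketch with a hypothetical input file:

    llc -mcpu=btver2 -enable-unsafe-fp-math -x86-recip-refinement-steps=2 recip.ll
]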