| author | Rafael Espindola <rafael.espindola@gmail.com> | 2015-06-03 05:32:44 +0000 |
|---|---|---|
| committer | Rafael Espindola <rafael.espindola@gmail.com> | 2015-06-03 05:32:44 +0000 |
| commit | cf8beece97c5b8695fa1e8a5f5727ad730424595 | |
| tree | e66e2cd3be4fa4de338662514ebddbd233f822a3 | /llvm/lib/Target/X86 |
| parent | 75d5b5495f1514e239c1b18254ddcf7a297e80ee | |
Revert "make reciprocal estimate code generation more flexible by adding command-line options (2nd try)"
This reverts commit r238842.
It broke the -DBUILD_SHARED_LIBS=ON build.
llvm-svn: 238900
Diffstat (limited to 'llvm/lib/Target/X86')
| -rw-r--r-- | llvm/lib/Target/X86/X86.td | 6 |
| -rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.cpp | 68 |
| -rw-r--r-- | llvm/lib/Target/X86/X86Subtarget.cpp | 2 |
| -rw-r--r-- | llvm/lib/Target/X86/X86Subtarget.h | 12 |
| -rw-r--r-- | llvm/lib/Target/X86/X86TargetMachine.cpp | 7 |
5 files changed, 54 insertions, 41 deletions
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 7001910a61e..c70e2e95463 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -188,6 +188,10 @@ def FeatureSlowLEA : SubtargetFeature<"slow-lea", "SlowLEA", "true",
                                      "LEA instruction with certain arguments is slow">;
 def FeatureSlowIncDec : SubtargetFeature<"slow-incdec", "SlowIncDec", "true",
                                          "INC and DEC instructions are slower than ADD and SUB">;
+def FeatureUseSqrtEst : SubtargetFeature<"use-sqrt-est", "UseSqrtEst", "true",
+                                         "Use RSQRT* to optimize square root calculations">;
+def FeatureUseRecipEst : SubtargetFeature<"use-recip-est", "UseReciprocalEst",
+                                          "true", "Use RCP* to optimize division calculations">;
 def FeatureSoftFloat
     : SubtargetFeature<"soft-float", "UseSoftFloat", "true",
                        "Use software floating point features.">;
@@ -440,7 +444,7 @@ def : ProcessorModel<"btver2", BtVer2Model,
                      FeaturePRFCHW, FeatureAES, FeaturePCLMUL,
                      FeatureBMI, FeatureF16C, FeatureMOVBE,
                      FeatureLZCNT, FeaturePOPCNT, FeatureFastUAMem,
-                     FeatureSlowSHLD]>;
+                     FeatureSlowSHLD, FeatureUseSqrtEst, FeatureUseRecipEst]>;
 
 // TODO: We should probably add 'FeatureFastUAMem' to all of the AMD chips.
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 248c452d544..a4787c81661 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -67,6 +67,12 @@ static cl::opt<bool> ExperimentalVectorWideningLegalization(
                     "rather than promotion."),
     cl::Hidden);
 
+static cl::opt<int> ReciprocalEstimateRefinementSteps(
+    "x86-recip-refinement-steps", cl::init(1),
+    cl::desc("Specify the number of Newton-Raphson iterations applied to the "
+             "result of the hardware reciprocal estimate instruction."),
+    cl::NotHidden);
+
 // Forward declarations.
 static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
                        SDValue V2);
@@ -13000,31 +13006,29 @@ SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op,
                                             DAGCombinerInfo &DCI,
                                             unsigned &RefinementSteps,
                                             bool &UseOneConstNR) const {
+  // FIXME: We should use instruction latency models to calculate the cost of
+  // each potential sequence, but this is very hard to do reliably because
+  // at least Intel's Core* chips have variable timing based on the number of
+  // significant digits in the divisor and/or sqrt operand.
+  if (!Subtarget->useSqrtEst())
+    return SDValue();
+
   EVT VT = Op.getValueType();
-  const char *RecipOp;
-
+
   // SSE1 has rsqrtss and rsqrtps.
   // TODO: Add support for AVX512 (v16f32).
   // It is likely not profitable to do this for f64 because a double-precision
   // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
   // instructions: convert to single, rsqrtss, convert back to double, refine
   // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
   // along with FMA, this could be a throughput win.
-  if (VT == MVT::f32 && Subtarget->hasSSE1())
-    RecipOp = "sqrtf";
-  else if ((VT == MVT::v4f32 && Subtarget->hasSSE1()) ||
-           (VT == MVT::v8f32 && Subtarget->hasAVX()))
-    RecipOp = "vec-sqrtf";
-  else
-    return SDValue();
-
-  TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals;
-  if (!Recips.isEnabled(RecipOp))
-    return SDValue();
-
-  RefinementSteps = Recips.getRefinementSteps(RecipOp);
-  UseOneConstNR = false;
-  return DCI.DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
+  if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
+      (Subtarget->hasAVX() && VT == MVT::v8f32)) {
+    RefinementSteps = 1;
+    UseOneConstNR = false;
+    return DCI.DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
+  }
+  return SDValue();
 }
 
 /// The minimum architected relative accuracy is 2^-12. We need one
@@ -13032,9 +13036,15 @@ SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op,
 SDValue X86TargetLowering::getRecipEstimate(SDValue Op,
                                             DAGCombinerInfo &DCI,
                                             unsigned &RefinementSteps) const {
+  // FIXME: We should use instruction latency models to calculate the cost of
+  // each potential sequence, but this is very hard to do reliably because
+  // at least Intel's Core* chips have variable timing based on the number of
+  // significant digits in the divisor.
+  if (!Subtarget->useReciprocalEst())
+    return SDValue();
+
   EVT VT = Op.getValueType();
-  const char *RecipOp;
-
+
   // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
   // TODO: Add support for AVX512 (v16f32).
   // It is likely not profitable to do this for f64 because a double-precision
@@ -13042,20 +13052,12 @@ SDValue X86TargetLowering::getRecipEstimate(SDValue Op,
   // 15 instructions: convert to single, rcpss, convert back to double, refine
   // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
   // along with FMA, this could be a throughput win.
-  if (VT == MVT::f32 && Subtarget->hasSSE1())
-    RecipOp = "divf";
-  else if ((VT == MVT::v4f32 && Subtarget->hasSSE1()) ||
-           (VT == MVT::v8f32 && Subtarget->hasAVX()))
-    RecipOp = "vec-divf";
-  else
-    return SDValue();
-
-  TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals;
-  if (!Recips.isEnabled(RecipOp))
-    return SDValue();
-
-  RefinementSteps = Recips.getRefinementSteps(RecipOp);
-  return DCI.DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
+  if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
+      (Subtarget->hasAVX() && VT == MVT::v8f32)) {
+    RefinementSteps = ReciprocalEstimateRefinementSteps;
+    return DCI.DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
+  }
+  return SDValue();
 }
 
 /// If we have at least two divisions that use the same divisor, convert to
diff --git a/llvm/lib/Target/X86/X86Subtarget.cpp b/llvm/lib/Target/X86/X86Subtarget.cpp
index 65e702e7f35..1cdab14e034 100644
--- a/llvm/lib/Target/X86/X86Subtarget.cpp
+++ b/llvm/lib/Target/X86/X86Subtarget.cpp
@@ -273,6 +273,8 @@ void X86Subtarget::initializeEnvironment() {
   LEAUsesAG = false;
   SlowLEA = false;
   SlowIncDec = false;
+  UseSqrtEst = false;
+  UseReciprocalEst = false;
   stackAlignment = 4;
   // FIXME: this is a known good value for Yonah. How about others?
   MaxInlineSizeThreshold = 128;
diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h
index 27429d050bd..455dd7744d7 100644
--- a/llvm/lib/Target/X86/X86Subtarget.h
+++ b/llvm/lib/Target/X86/X86Subtarget.h
@@ -190,6 +190,16 @@ protected:
   /// True if INC and DEC instructions are slow when writing to flags
   bool SlowIncDec;
 
+  /// Use the RSQRT* instructions to optimize square root calculations.
+  /// For this to be profitable, the cost of FSQRT and FDIV must be
+  /// substantially higher than normal FP ops like FADD and FMUL.
+  bool UseSqrtEst;
+
+  /// Use the RCP* instructions to optimize FP division calculations.
+  /// For this to be profitable, the cost of FDIV must be
+  /// substantially higher than normal FP ops like FADD and FMUL.
+  bool UseReciprocalEst;
+
   /// Processor has AVX-512 PreFetch Instructions
   bool HasPFI;
 
@@ -367,6 +377,8 @@ public:
   bool LEAusesAG() const { return LEAUsesAG; }
   bool slowLEA() const { return SlowLEA; }
   bool slowIncDec() const { return SlowIncDec; }
+  bool useSqrtEst() const { return UseSqrtEst; }
+  bool useReciprocalEst() const { return UseReciprocalEst; }
   bool hasCDI() const { return HasCDI; }
   bool hasPFI() const { return HasPFI; }
   bool hasERI() const { return HasERI; }
diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp
index 646cff7c5bd..4116cef9b07 100644
--- a/llvm/lib/Target/X86/X86TargetMachine.cpp
+++ b/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -105,13 +105,6 @@ X86TargetMachine::X86TargetMachine(const Target &T, StringRef TT, StringRef CPU,
   if (Subtarget.isTargetWin64())
     this->Options.TrapUnreachable = true;
 
-  // TODO: By default, all reciprocal estimate operations are off because
-  // that matches the behavior before TargetRecip was added (except for btver2
-  // which used subtarget features to enable this type of codegen).
-  // We should change this to match GCC behavior where everything but
-  // scalar division estimates are turned on by default with -ffast-math.
-  this->Options.Reciprocals.setDefaults("all", false, 1);
-
   initAsmInfo();
 }
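Background for readers skimming the revert: on both sides of this patch, the backend replaces `1/x` and `1/sqrt(x)` with the ~12-bit RCPPS/RSQRTPS hardware estimates followed by some number of Newton-Raphson refinement steps (the `RefinementSteps` value above; the reverted change made that count configurable per operation). The sketch below is illustrative only, not code from this commit — the helper names and the demo driver are hypothetical — but it shows the same refinement math the FRCP/FRSQRT lowering expands to, written with SSE intrinsics:

```cpp
// Illustrative sketch: mimics the RCP/RSQRT + Newton-Raphson sequences the
// backend emits. Not LLVM code; names and the driver are hypothetical.
#include <xmmintrin.h>  // SSE1: _mm_rcp_ps, _mm_rsqrt_ps
#include <cstdio>

// Refine a reciprocal estimate e ~ 1/d with Newton-Raphson:
//   e' = e * (2 - d * e)
// 'steps' plays the role of the x86-recip-refinement-steps option.
static __m128 recip_estimate(__m128 d, int steps) {
  __m128 e = _mm_rcp_ps(d);  // hardware estimate, ~2^-12 relative error
  const __m128 two = _mm_set1_ps(2.0f);
  for (int i = 0; i < steps; ++i)
    e = _mm_mul_ps(e, _mm_sub_ps(two, _mm_mul_ps(d, e)));
  return e;
}

// Refine a reciprocal-square-root estimate e ~ 1/sqrt(d):
//   e' = e * (1.5 - 0.5 * d * e * e)
static __m128 rsqrt_estimate(__m128 d, int steps) {
  __m128 e = _mm_rsqrt_ps(d);
  const __m128 half = _mm_set1_ps(0.5f);
  const __m128 three_halves = _mm_set1_ps(1.5f);
  for (int i = 0; i < steps; ++i) {
    __m128 he2 = _mm_mul_ps(half, _mm_mul_ps(d, _mm_mul_ps(e, e)));
    e = _mm_mul_ps(e, _mm_sub_ps(three_halves, he2));
  }
  return e;
}

int main() {
  float r[4], s[4];
  __m128 d = _mm_set1_ps(3.0f);
  _mm_storeu_ps(r, recip_estimate(d, 1));  // ~1/3, refined once
  _mm_storeu_ps(s, rsqrt_estimate(d, 1));  // ~1/sqrt(3), refined once
  std::printf("1/3 ~= %.8f, 1/sqrt(3) ~= %.8f\n", r[0], s[0]);
  return 0;
}
```

Each Newton-Raphson step roughly squares the relative error, so one step takes the architected 2^-12 estimate to about 2^-23, near full single precision. That is why the code above hardwires `RefinementSteps = 1` and why the doc comment in the diff notes that the minimum architected relative accuracy is 2^-12 and one step is needed.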

