diff options
author | Rafael Espindola <rafael.espindola@gmail.com> | 2015-06-03 05:32:44 +0000 |
---|---|---|
committer | Rafael Espindola <rafael.espindola@gmail.com> | 2015-06-03 05:32:44 +0000 |
commit | cf8beece97c5b8695fa1e8a5f5727ad730424595 (patch) | |
tree | e66e2cd3be4fa4de338662514ebddbd233f822a3 /llvm/lib | |
parent | 75d5b5495f1514e239c1b18254ddcf7a297e80ee (diff) | |
download | bcm5719-llvm-cf8beece97c5b8695fa1e8a5f5727ad730424595.tar.gz bcm5719-llvm-cf8beece97c5b8695fa1e8a5f5727ad730424595.zip |
Revert "make reciprocal estimate code generation more flexible by adding command-line options (2nd try)"
This reverts commit r238842.
It broke the -DBUILD_SHARED_LIBS=ON build.
llvm-svn: 238900
Diffstat (limited to 'llvm/lib')
-rw-r--r-- | llvm/lib/Target/CMakeLists.txt | 1 | ||||
-rw-r--r-- | llvm/lib/Target/TargetRecip.cpp | 225 | ||||
-rw-r--r-- | llvm/lib/Target/X86/X86.td | 6 | ||||
-rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.cpp | 68 | ||||
-rw-r--r-- | llvm/lib/Target/X86/X86Subtarget.cpp | 2 | ||||
-rw-r--r-- | llvm/lib/Target/X86/X86Subtarget.h | 12 | ||||
-rw-r--r-- | llvm/lib/Target/X86/X86TargetMachine.cpp | 7 |
7 files changed, 54 insertions, 267 deletions
diff --git a/llvm/lib/Target/CMakeLists.txt b/llvm/lib/Target/CMakeLists.txt index e6d0199952f..1805437b12f 100644 --- a/llvm/lib/Target/CMakeLists.txt +++ b/llvm/lib/Target/CMakeLists.txt @@ -6,7 +6,6 @@ add_llvm_library(LLVMTarget TargetLoweringObjectFile.cpp TargetMachine.cpp TargetMachineC.cpp - TargetRecip.cpp TargetSubtargetInfo.cpp ADDITIONAL_HEADER_DIRS diff --git a/llvm/lib/Target/TargetRecip.cpp b/llvm/lib/Target/TargetRecip.cpp deleted file mode 100644 index 42bc487fe6d..00000000000 --- a/llvm/lib/Target/TargetRecip.cpp +++ /dev/null @@ -1,225 +0,0 @@ -//===-------------------------- TargetRecip.cpp ---------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This class is used to customize machine-specific reciprocal estimate code -// generation in a target-independent way. -// If a target does not support operations in this specification, then code -// generation will default to using supported operations. -// -//===----------------------------------------------------------------------===// - -#include "llvm/ADT/StringRef.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Target/TargetRecip.h" -#include <map> - -using namespace llvm; - -// These are the names of the individual reciprocal operations. These are -// the key strings for queries and command-line inputs. -// In addition, the command-line interface recognizes the global parameters -// "all", "none", and "default". -static const char *RecipOps[] = { - "divd", - "divf", - "vec-divd", - "vec-divf", - "sqrtd", - "sqrtf", - "vec-sqrtd", - "vec-sqrtf", -}; - -// The uninitialized state is needed for the enabled settings and refinement -// steps because custom settings may arrive via the command-line before target -// defaults are set. 
-TargetRecip::TargetRecip() { - unsigned NumStrings = llvm::array_lengthof(RecipOps); - for (unsigned i = 0; i < NumStrings; ++i) - RecipMap.insert(std::make_pair(RecipOps[i], RecipParams())); -} - -static bool parseRefinementStep(const StringRef &In, size_t &Position, - uint8_t &Value) { - const char RefStepToken = ':'; - Position = In.find(RefStepToken); - if (Position == StringRef::npos) - return false; - - StringRef RefStepString = In.substr(Position + 1); - // Allow exactly one numeric character for the additional refinement - // step parameter. - if (RefStepString.size() == 1) { - char RefStepChar = RefStepString[0]; - if (RefStepChar >= '0' && RefStepChar <= '9') { - Value = RefStepChar - '0'; - return true; - } - } - report_fatal_error("Invalid refinement step for -recip."); -} - -bool TargetRecip::parseGlobalParams(const std::string &Arg) { - StringRef ArgSub = Arg; - - // Look for an optional setting of the number of refinement steps needed - // for this type of reciprocal operation. - size_t RefPos; - uint8_t RefSteps; - StringRef RefStepString; - if (parseRefinementStep(ArgSub, RefPos, RefSteps)) { - // Split the string for further processing. - RefStepString = ArgSub.substr(RefPos + 1); - ArgSub = ArgSub.substr(0, RefPos); - } - bool Enable; - bool UseDefaults; - if (ArgSub == "all") { - UseDefaults = false; - Enable = true; - } else if (ArgSub == "none") { - UseDefaults = false; - Enable = false; - } else if (ArgSub == "default") { - UseDefaults = true; - } else { - // Any other string is invalid or an individual setting. - return false; - } - - // All enable values will be initialized to target defaults if 'default' was - // specified. - if (!UseDefaults) - for (auto &KV : RecipMap) - KV.second.Enabled = Enable; - - // Custom refinement count was specified with all, none, or default. 
- if (!RefStepString.empty()) - for (auto &KV : RecipMap) - KV.second.RefinementSteps = RefSteps; - - return true; -} - -void TargetRecip::parseIndividualParams(const std::vector<std::string> &Args) { - static const char DisabledPrefix = '!'; - unsigned NumArgs = Args.size(); - - for (unsigned i = 0; i != NumArgs; ++i) { - StringRef Val = Args[i]; - - bool IsDisabled = Val[0] == DisabledPrefix; - // Ignore the disablement token for string matching. - if (IsDisabled) - Val = Val.substr(1); - - size_t RefPos; - uint8_t RefSteps; - StringRef RefStepString; - if (parseRefinementStep(Val, RefPos, RefSteps)) { - // Split the string for further processing. - RefStepString = Val.substr(RefPos + 1); - Val = Val.substr(0, RefPos); - } - - RecipIter Iter = RecipMap.find(Val); - if (Iter == RecipMap.end()) { - // Try again specifying float suffix. - Iter = RecipMap.find(Val.str() + 'f'); - if (Iter == RecipMap.end()) { - Iter = RecipMap.find(Val.str() + 'd'); - assert(Iter == RecipMap.end() && "Float entry missing from map"); - report_fatal_error("Invalid option for -recip."); - } - - // The option was specified without a float or double suffix. - if (RecipMap[Val.str() + 'd'].Enabled != Uninitialized) { - // Make sure that the double entry was not already specified. - // The float entry will be checked below. - report_fatal_error("Duplicate option for -recip."); - } - } - - if (Iter->second.Enabled != Uninitialized) - report_fatal_error("Duplicate option for -recip."); - - // Mark the matched option as found. Do not allow duplicate specifiers. - Iter->second.Enabled = !IsDisabled; - if (!RefStepString.empty()) - Iter->second.RefinementSteps = RefSteps; - - // If the precision was not specified, the double entry is also initialized. 
- if (Val.back() != 'f' && Val.back() != 'd') { - RecipMap[Val.str() + 'd'].Enabled = !IsDisabled; - if (!RefStepString.empty()) - RecipMap[Val.str() + 'd'].RefinementSteps = RefSteps; - } - } -} - -TargetRecip::TargetRecip(const std::vector<std::string> &Args) : - TargetRecip() { - unsigned NumArgs = Args.size(); - - // Check if "all", "default", or "none" was specified. - if (NumArgs == 1 && parseGlobalParams(Args[0])) - return; - - parseIndividualParams(Args); -} - -bool TargetRecip::isEnabled(const StringRef &Key) const { - ConstRecipIter Iter = RecipMap.find(Key); - assert(Iter != RecipMap.end() && "Unknown name for reciprocal map"); - assert(Iter->second.Enabled != Uninitialized && - "Enablement setting was not initialized"); - return Iter->second.Enabled; -} - -unsigned TargetRecip::getRefinementSteps(const StringRef &Key) const { - ConstRecipIter Iter = RecipMap.find(Key); - assert(Iter != RecipMap.end() && "Unknown name for reciprocal map"); - assert(Iter->second.RefinementSteps != Uninitialized && - "Refinement step setting was not initialized"); - return Iter->second.RefinementSteps; -} - -/// Custom settings (previously initialized values) override target defaults. 
-void TargetRecip::setDefaults(const StringRef &Key, bool Enable, - unsigned RefSteps) { - if (Key == "all") { - for (auto &KV : RecipMap) { - RecipParams &RP = KV.second; - if (RP.Enabled == Uninitialized) - RP.Enabled = Enable; - if (RP.RefinementSteps == Uninitialized) - RP.RefinementSteps = RefSteps; - } - } else { - RecipParams &RP = RecipMap[Key]; - if (RP.Enabled == Uninitialized) - RP.Enabled = Enable; - if (RP.RefinementSteps == Uninitialized) - RP.RefinementSteps = RefSteps; - } -} - -bool TargetRecip::operator==(const TargetRecip &Other) const { - for (const auto &KV : RecipMap) { - const StringRef &Op = KV.first; - const RecipParams &RP = KV.second; - const RecipParams &OtherRP = Other.RecipMap.find(Op)->second; - if (RP.RefinementSteps != OtherRP.RefinementSteps) - return false; - if (RP.Enabled != OtherRP.Enabled) - return false; - } - return true; -} diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td index 7001910a61e..c70e2e95463 100644 --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -188,6 +188,10 @@ def FeatureSlowLEA : SubtargetFeature<"slow-lea", "SlowLEA", "true", "LEA instruction with certain arguments is slow">; def FeatureSlowIncDec : SubtargetFeature<"slow-incdec", "SlowIncDec", "true", "INC and DEC instructions are slower than ADD and SUB">; +def FeatureUseSqrtEst : SubtargetFeature<"use-sqrt-est", "UseSqrtEst", "true", + "Use RSQRT* to optimize square root calculations">; +def FeatureUseRecipEst : SubtargetFeature<"use-recip-est", "UseReciprocalEst", + "true", "Use RCP* to optimize division calculations">; def FeatureSoftFloat : SubtargetFeature<"soft-float", "UseSoftFloat", "true", "Use software floating point features.">; @@ -440,7 +444,7 @@ def : ProcessorModel<"btver2", BtVer2Model, FeaturePRFCHW, FeatureAES, FeaturePCLMUL, FeatureBMI, FeatureF16C, FeatureMOVBE, FeatureLZCNT, FeaturePOPCNT, FeatureFastUAMem, - FeatureSlowSHLD]>; + FeatureSlowSHLD, FeatureUseSqrtEst, FeatureUseRecipEst]>; // TODO: 
We should probably add 'FeatureFastUAMem' to all of the AMD chips. diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 248c452d544..a4787c81661 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -67,6 +67,12 @@ static cl::opt<bool> ExperimentalVectorWideningLegalization( "rather than promotion."), cl::Hidden); +static cl::opt<int> ReciprocalEstimateRefinementSteps( + "x86-recip-refinement-steps", cl::init(1), + cl::desc("Specify the number of Newton-Raphson iterations applied to the " + "result of the hardware reciprocal estimate instruction."), + cl::NotHidden); + // Forward declarations. static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1, SDValue V2); @@ -13000,31 +13006,29 @@ SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op, DAGCombinerInfo &DCI, unsigned &RefinementSteps, bool &UseOneConstNR) const { + // FIXME: We should use instruction latency models to calculate the cost of + // each potential sequence, but this is very hard to do reliably because + // at least Intel's Core* chips have variable timing based on the number of + // significant digits in the divisor and/or sqrt operand. + if (!Subtarget->useSqrtEst()) + return SDValue(); + EVT VT = Op.getValueType(); - const char *RecipOp; - // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps. + // SSE1 has rsqrtss and rsqrtps. // TODO: Add support for AVX512 (v16f32). // It is likely not profitable to do this for f64 because a double-precision // rsqrt estimate with refinement on x86 prior to FMA requires at least 16 // instructions: convert to single, rsqrtss, convert back to double, refine // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA // along with FMA, this could be a throughput win. 
- if (VT == MVT::f32 && Subtarget->hasSSE1()) - RecipOp = "sqrtf"; - else if ((VT == MVT::v4f32 && Subtarget->hasSSE1()) || - (VT == MVT::v8f32 && Subtarget->hasAVX())) - RecipOp = "vec-sqrtf"; - else - return SDValue(); - - TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals; - if (!Recips.isEnabled(RecipOp)) - return SDValue(); - - RefinementSteps = Recips.getRefinementSteps(RecipOp); - UseOneConstNR = false; - return DCI.DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op); + if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) || + (Subtarget->hasAVX() && VT == MVT::v8f32)) { + RefinementSteps = 1; + UseOneConstNR = false; + return DCI.DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op); + } + return SDValue(); } /// The minimum architected relative accuracy is 2^-12. We need one @@ -13032,9 +13036,15 @@ SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op, SDValue X86TargetLowering::getRecipEstimate(SDValue Op, DAGCombinerInfo &DCI, unsigned &RefinementSteps) const { + // FIXME: We should use instruction latency models to calculate the cost of + // each potential sequence, but this is very hard to do reliably because + // at least Intel's Core* chips have variable timing based on the number of + // significant digits in the divisor. + if (!Subtarget->useReciprocalEst()) + return SDValue(); + EVT VT = Op.getValueType(); - const char *RecipOp; - + // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps. // TODO: Add support for AVX512 (v16f32). // It is likely not profitable to do this for f64 because a double-precision @@ -13042,20 +13052,12 @@ SDValue X86TargetLowering::getRecipEstimate(SDValue Op, // 15 instructions: convert to single, rcpss, convert back to double, refine // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA // along with FMA, this could be a throughput win. 
- if (VT == MVT::f32 && Subtarget->hasSSE1()) - RecipOp = "divf"; - else if ((VT == MVT::v4f32 && Subtarget->hasSSE1()) || - (VT == MVT::v8f32 && Subtarget->hasAVX())) - RecipOp = "vec-divf"; - else - return SDValue(); - - TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals; - if (!Recips.isEnabled(RecipOp)) - return SDValue(); - - RefinementSteps = Recips.getRefinementSteps(RecipOp); - return DCI.DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op); + if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) || + (Subtarget->hasAVX() && VT == MVT::v8f32)) { + RefinementSteps = ReciprocalEstimateRefinementSteps; + return DCI.DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op); + } + return SDValue(); } /// If we have at least two divisions that use the same divisor, convert to diff --git a/llvm/lib/Target/X86/X86Subtarget.cpp b/llvm/lib/Target/X86/X86Subtarget.cpp index 65e702e7f35..1cdab14e034 100644 --- a/llvm/lib/Target/X86/X86Subtarget.cpp +++ b/llvm/lib/Target/X86/X86Subtarget.cpp @@ -273,6 +273,8 @@ void X86Subtarget::initializeEnvironment() { LEAUsesAG = false; SlowLEA = false; SlowIncDec = false; + UseSqrtEst = false; + UseReciprocalEst = false; stackAlignment = 4; // FIXME: this is a known good value for Yonah. How about others? MaxInlineSizeThreshold = 128; diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h index 27429d050bd..455dd7744d7 100644 --- a/llvm/lib/Target/X86/X86Subtarget.h +++ b/llvm/lib/Target/X86/X86Subtarget.h @@ -190,6 +190,16 @@ protected: /// True if INC and DEC instructions are slow when writing to flags bool SlowIncDec; + /// Use the RSQRT* instructions to optimize square root calculations. + /// For this to be profitable, the cost of FSQRT and FDIV must be + /// substantially higher than normal FP ops like FADD and FMUL. + bool UseSqrtEst; + + /// Use the RCP* instructions to optimize FP division calculations. 
+ /// For this to be profitable, the cost of FDIV must be + /// substantially higher than normal FP ops like FADD and FMUL. + bool UseReciprocalEst; + /// Processor has AVX-512 PreFetch Instructions bool HasPFI; @@ -367,6 +377,8 @@ public: bool LEAusesAG() const { return LEAUsesAG; } bool slowLEA() const { return SlowLEA; } bool slowIncDec() const { return SlowIncDec; } + bool useSqrtEst() const { return UseSqrtEst; } + bool useReciprocalEst() const { return UseReciprocalEst; } bool hasCDI() const { return HasCDI; } bool hasPFI() const { return HasPFI; } bool hasERI() const { return HasERI; } diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp index 646cff7c5bd..4116cef9b07 100644 --- a/llvm/lib/Target/X86/X86TargetMachine.cpp +++ b/llvm/lib/Target/X86/X86TargetMachine.cpp @@ -105,13 +105,6 @@ X86TargetMachine::X86TargetMachine(const Target &T, StringRef TT, StringRef CPU, if (Subtarget.isTargetWin64()) this->Options.TrapUnreachable = true; - // TODO: By default, all reciprocal estimate operations are off because - // that matches the behavior before TargetRecip was added (except for btver2 - // which used subtarget features to enable this type of codegen). - // We should change this to match GCC behavior where everything but - // scalar division estimates are turned on by default with -ffast-math. - this->Options.Reciprocals.setDefaults("all", false, 1); - initAsmInfo(); } |