author     Sanjay Patel <spatel@rotateright.com>    2015-05-22 21:10:06 +0000
committer  Sanjay Patel <spatel@rotateright.com>    2015-05-22 21:10:06 +0000
commit     ba2ba80302182bcf1773c560b592949cd9a1d501
tree       925620c95192cb142122963aa9512b1676a00243 /llvm/lib/Target
parent     b2f6afb30ac3285f2d2db310ed283398860f12fc
make reciprocal estimate code generation more flexible by adding command-line options
This patch adds a class for processing the many reciprocal estimate codegen possibilities.
The TargetRecip class is intended to handle both command-line options to llc
and options passed in from a front end such as clang with the -mrecip option.
The x86 backend is updated to use the new functionality.
Only -mcpu=btver2 with -ffast-math should see a functional change from this patch.
All other CPUs continue to *not* use reciprocal estimates by default with -ffast-math.
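
As a reference for the option grammar, the parser added below accepts either one global token ("all", "none", or "default") or a list of individual operation keys, each optionally prefixed with '!' to disable that operation and suffixed with ':N' for a single-digit refinement-step count. The sketch that follows is illustration only, not part of the patch; it assumes a driver has already split a comma-separated -recip/-mrecip value into individual tokens (that splitting happens outside this diff):

    #include "llvm/Target/TargetRecip.h"
    #include <cassert>
    #include <string>
    #include <vector>
    using namespace llvm;

    void recipOptionExample() {
      // One global token: enable every estimate with 2 refinement steps.
      TargetRecip EnableAll({"all:2"});
      assert(EnableAll.isEnabled("vec-divf"));
      assert(EnableAll.getRefinementSteps("sqrtd") == 2);

      // Individual tokens: enable vector float division with 1 step and
      // explicitly disable the scalar double sqrt estimate.
      std::vector<std::string> Args = {"vec-divf:1", "!sqrtd"};
      TargetRecip Custom(Args);
      assert(Custom.isEnabled("vec-divf"));
      assert(!Custom.isEnabled("sqrtd"));
    }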
Differential Revision: http://reviews.llvm.org/D8982
llvm-svn: 238051
Diffstat (limited to 'llvm/lib/Target')

 llvm/lib/Target/CMakeLists.txt           |   1
 llvm/lib/Target/TargetRecip.cpp          | 225
 llvm/lib/Target/X86/X86.td               |   6
 llvm/lib/Target/X86/X86ISelLowering.cpp  |  68
 llvm/lib/Target/X86/X86Subtarget.cpp     |   2
 llvm/lib/Target/X86/X86Subtarget.h       |  12
 llvm/lib/Target/X86/X86TargetMachine.cpp |   7
 7 files changed, 267 insertions(+), 54 deletions(-)
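
One design point worth calling out before the patch body: setDefaults() in the new file only writes entries that are still uninitialized, so target defaults can never override values the user set on the command line. A sketch of how a target could seed per-CPU defaults (the function and the btver2-style values are hypothetical; only the final "all off, 1 step" call appears in this patch, in X86TargetMachine.cpp below):

    #include "llvm/Target/TargetRecip.h"
    using namespace llvm;

    void seedRecipDefaults(TargetRecip &Recips) {
      // Hypothetical per-CPU defaults: enable the vector float estimates
      // with one refinement step each...
      Recips.setDefaults("vec-divf", true, 1);
      Recips.setDefaults("vec-sqrtf", true, 1);
      // ...then fill every remaining uninitialized entry with "off, 1 step",
      // as X86TargetMachine does below for all operations. Any -recip or
      // -mrecip values parsed earlier stay in effect because setDefaults()
      // skips entries that are no longer Uninitialized.
      Recips.setDefaults("all", false, 1);
    }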
diff --git a/llvm/lib/Target/CMakeLists.txt b/llvm/lib/Target/CMakeLists.txt
index 1805437b12f..e6d0199952f 100644
--- a/llvm/lib/Target/CMakeLists.txt
+++ b/llvm/lib/Target/CMakeLists.txt
@@ -6,6 +6,7 @@ add_llvm_library(LLVMTarget
   TargetLoweringObjectFile.cpp
   TargetMachine.cpp
   TargetMachineC.cpp
+  TargetRecip.cpp
   TargetSubtargetInfo.cpp
 
   ADDITIONAL_HEADER_DIRS
diff --git a/llvm/lib/Target/TargetRecip.cpp b/llvm/lib/Target/TargetRecip.cpp
new file mode 100644
index 00000000000..42bc487fe6d
--- /dev/null
+++ b/llvm/lib/Target/TargetRecip.cpp
@@ -0,0 +1,225 @@
+//===-------------------------- TargetRecip.cpp ---------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class is used to customize machine-specific reciprocal estimate code
+// generation in a target-independent way.
+// If a target does not support operations in this specification, then code
+// generation will default to using supported operations.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetRecip.h"
+#include <map>
+
+using namespace llvm;
+
+// These are the names of the individual reciprocal operations. These are
+// the key strings for queries and command-line inputs.
+// In addition, the command-line interface recognizes the global parameters
+// "all", "none", and "default".
+static const char *RecipOps[] = {
+  "divd",
+  "divf",
+  "vec-divd",
+  "vec-divf",
+  "sqrtd",
+  "sqrtf",
+  "vec-sqrtd",
+  "vec-sqrtf",
+};
+
+// The uninitialized state is needed for the enabled settings and refinement
+// steps because custom settings may arrive via the command-line before target
+// defaults are set.
+TargetRecip::TargetRecip() {
+  unsigned NumStrings = llvm::array_lengthof(RecipOps);
+  for (unsigned i = 0; i < NumStrings; ++i)
+    RecipMap.insert(std::make_pair(RecipOps[i], RecipParams()));
+}
+
+static bool parseRefinementStep(const StringRef &In, size_t &Position,
+                                uint8_t &Value) {
+  const char RefStepToken = ':';
+  Position = In.find(RefStepToken);
+  if (Position == StringRef::npos)
+    return false;
+
+  StringRef RefStepString = In.substr(Position + 1);
+  // Allow exactly one numeric character for the additional refinement
+  // step parameter.
+  if (RefStepString.size() == 1) {
+    char RefStepChar = RefStepString[0];
+    if (RefStepChar >= '0' && RefStepChar <= '9') {
+      Value = RefStepChar - '0';
+      return true;
+    }
+  }
+  report_fatal_error("Invalid refinement step for -recip.");
+}
+
+bool TargetRecip::parseGlobalParams(const std::string &Arg) {
+  StringRef ArgSub = Arg;
+
+  // Look for an optional setting of the number of refinement steps needed
+  // for this type of reciprocal operation.
+  size_t RefPos;
+  uint8_t RefSteps;
+  StringRef RefStepString;
+  if (parseRefinementStep(ArgSub, RefPos, RefSteps)) {
+    // Split the string for further processing.
+    RefStepString = ArgSub.substr(RefPos + 1);
+    ArgSub = ArgSub.substr(0, RefPos);
+  }
+  bool Enable;
+  bool UseDefaults;
+  if (ArgSub == "all") {
+    UseDefaults = false;
+    Enable = true;
+  } else if (ArgSub == "none") {
+    UseDefaults = false;
+    Enable = false;
+  } else if (ArgSub == "default") {
+    UseDefaults = true;
+  } else {
+    // Any other string is invalid or an individual setting.
+    return false;
+  }
+
+  // All enable values will be initialized to target defaults if 'default' was
+  // specified.
+  if (!UseDefaults)
+    for (auto &KV : RecipMap)
+      KV.second.Enabled = Enable;
+
+  // Custom refinement count was specified with all, none, or default.
+  if (!RefStepString.empty())
+    for (auto &KV : RecipMap)
+      KV.second.RefinementSteps = RefSteps;
+
+  return true;
+}
+
+void TargetRecip::parseIndividualParams(const std::vector<std::string> &Args) {
+  static const char DisabledPrefix = '!';
+  unsigned NumArgs = Args.size();
+
+  for (unsigned i = 0; i != NumArgs; ++i) {
+    StringRef Val = Args[i];
+
+    bool IsDisabled = Val[0] == DisabledPrefix;
+    // Ignore the disablement token for string matching.
+    if (IsDisabled)
+      Val = Val.substr(1);
+
+    size_t RefPos;
+    uint8_t RefSteps;
+    StringRef RefStepString;
+    if (parseRefinementStep(Val, RefPos, RefSteps)) {
+      // Split the string for further processing.
+      RefStepString = Val.substr(RefPos + 1);
+      Val = Val.substr(0, RefPos);
+    }
+
+    RecipIter Iter = RecipMap.find(Val);
+    if (Iter == RecipMap.end()) {
+      // Try again specifying float suffix.
+      Iter = RecipMap.find(Val.str() + 'f');
+      if (Iter == RecipMap.end()) {
+        Iter = RecipMap.find(Val.str() + 'd');
+        assert(Iter == RecipMap.end() && "Float entry missing from map");
+        report_fatal_error("Invalid option for -recip.");
+      }
+
+      // The option was specified without a float or double suffix.
+      if (RecipMap[Val.str() + 'd'].Enabled != Uninitialized) {
+        // Make sure that the double entry was not already specified.
+        // The float entry will be checked below.
+        report_fatal_error("Duplicate option for -recip.");
+      }
+    }
+
+    if (Iter->second.Enabled != Uninitialized)
+      report_fatal_error("Duplicate option for -recip.");
+
+    // Mark the matched option as found. Do not allow duplicate specifiers.
+    Iter->second.Enabled = !IsDisabled;
+    if (!RefStepString.empty())
+      Iter->second.RefinementSteps = RefSteps;
+
+    // If the precision was not specified, the double entry is also initialized.
+    if (Val.back() != 'f' && Val.back() != 'd') {
+      RecipMap[Val.str() + 'd'].Enabled = !IsDisabled;
+      if (!RefStepString.empty())
+        RecipMap[Val.str() + 'd'].RefinementSteps = RefSteps;
+    }
+  }
+}
+
+TargetRecip::TargetRecip(const std::vector<std::string> &Args) :
+  TargetRecip() {
+  unsigned NumArgs = Args.size();
+
+  // Check if "all", "default", or "none" was specified.
+  if (NumArgs == 1 && parseGlobalParams(Args[0]))
+    return;
+
+  parseIndividualParams(Args);
+}
+
+bool TargetRecip::isEnabled(const StringRef &Key) const {
+  ConstRecipIter Iter = RecipMap.find(Key);
+  assert(Iter != RecipMap.end() && "Unknown name for reciprocal map");
+  assert(Iter->second.Enabled != Uninitialized &&
+         "Enablement setting was not initialized");
+  return Iter->second.Enabled;
+}
+
+unsigned TargetRecip::getRefinementSteps(const StringRef &Key) const {
+  ConstRecipIter Iter = RecipMap.find(Key);
+  assert(Iter != RecipMap.end() && "Unknown name for reciprocal map");
+  assert(Iter->second.RefinementSteps != Uninitialized &&
+         "Refinement step setting was not initialized");
+  return Iter->second.RefinementSteps;
+}
+
+/// Custom settings (previously initialized values) override target defaults.
+void TargetRecip::setDefaults(const StringRef &Key, bool Enable,
+                              unsigned RefSteps) {
+  if (Key == "all") {
+    for (auto &KV : RecipMap) {
+      RecipParams &RP = KV.second;
+      if (RP.Enabled == Uninitialized)
+        RP.Enabled = Enable;
+      if (RP.RefinementSteps == Uninitialized)
+        RP.RefinementSteps = RefSteps;
+    }
+  } else {
+    RecipParams &RP = RecipMap[Key];
+    if (RP.Enabled == Uninitialized)
+      RP.Enabled = Enable;
+    if (RP.RefinementSteps == Uninitialized)
+      RP.RefinementSteps = RefSteps;
+  }
+}
+
+bool TargetRecip::operator==(const TargetRecip &Other) const {
+  for (const auto &KV : RecipMap) {
+    const StringRef &Op = KV.first;
+    const RecipParams &RP = KV.second;
+    const RecipParams &OtherRP = Other.RecipMap.find(Op)->second;
+    if (RP.RefinementSteps != OtherRP.RefinementSteps)
+      return false;
+    if (RP.Enabled != OtherRP.Enabled)
+      return false;
+  }
+  return true;
+}
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index c70e2e95463..7001910a61e 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -188,10 +188,6 @@ def FeatureSlowLEA : SubtargetFeature<"slow-lea", "SlowLEA", "true",
                                       "LEA instruction with certain arguments is slow">;
 def FeatureSlowIncDec : SubtargetFeature<"slow-incdec", "SlowIncDec", "true",
                                          "INC and DEC instructions are slower than ADD and SUB">;
-def FeatureUseSqrtEst : SubtargetFeature<"use-sqrt-est", "UseSqrtEst", "true",
-                          "Use RSQRT* to optimize square root calculations">;
-def FeatureUseRecipEst : SubtargetFeature<"use-recip-est", "UseReciprocalEst",
-                          "true", "Use RCP* to optimize division calculations">;
 def FeatureSoftFloat : SubtargetFeature<"soft-float", "UseSoftFloat", "true",
                                         "Use software floating point features.">;
 
@@ -444,7 +440,7 @@ def : ProcessorModel<"btver2", BtVer2Model,
                       FeaturePRFCHW, FeatureAES, FeaturePCLMUL,
                       FeatureBMI, FeatureF16C, FeatureMOVBE,
                       FeatureLZCNT, FeaturePOPCNT, FeatureFastUAMem,
-                      FeatureSlowSHLD, FeatureUseSqrtEst, FeatureUseRecipEst]>;
+                      FeatureSlowSHLD]>;
 
 // TODO: We should probably add 'FeatureFastUAMem' to all of the AMD chips.
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index e850e500db6..609973f8971 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -67,12 +67,6 @@ static cl::opt<bool> ExperimentalVectorWideningLegalization(
              "rather than promotion."),
     cl::Hidden);
 
-static cl::opt<int> ReciprocalEstimateRefinementSteps(
-    "x86-recip-refinement-steps", cl::init(1),
-    cl::desc("Specify the number of Newton-Raphson iterations applied to the "
-             "result of the hardware reciprocal estimate instruction."),
-    cl::NotHidden);
-
 // Forward declarations.
 static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
                        SDValue V2);
 
@@ -12901,29 +12895,31 @@ SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op,
                                             DAGCombinerInfo &DCI,
                                             unsigned &RefinementSteps,
                                             bool &UseOneConstNR) const {
-  // FIXME: We should use instruction latency models to calculate the cost of
-  // each potential sequence, but this is very hard to do reliably because
-  // at least Intel's Core* chips have variable timing based on the number of
-  // significant digits in the divisor and/or sqrt operand.
-  if (!Subtarget->useSqrtEst())
-    return SDValue();
-
   EVT VT = Op.getValueType();
+  const char *RecipOp;
 
-  // SSE1 has rsqrtss and rsqrtps.
+  // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
   // TODO: Add support for AVX512 (v16f32).
   // It is likely not profitable to do this for f64 because a double-precision
   // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
   // instructions: convert to single, rsqrtss, convert back to double, refine
   // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
   // along with FMA, this could be a throughput win.
-  if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
-      (Subtarget->hasAVX() && VT == MVT::v8f32)) {
-    RefinementSteps = 1;
-    UseOneConstNR = false;
-    return DCI.DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
-  }
-  return SDValue();
+  if (VT == MVT::f32 && Subtarget->hasSSE1())
+    RecipOp = "sqrtf";
+  else if ((VT == MVT::v4f32 && Subtarget->hasSSE1()) ||
+           (VT == MVT::v8f32 && Subtarget->hasAVX()))
+    RecipOp = "vec-sqrtf";
+  else
+    return SDValue();
+
+  TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals;
+  if (!Recips.isEnabled(RecipOp))
+    return SDValue();
+
+  RefinementSteps = Recips.getRefinementSteps(RecipOp);
+  UseOneConstNR = false;
+  return DCI.DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
 }
 
 /// The minimum architected relative accuracy is 2^-12. We need one
@@ -12931,15 +12927,9 @@ SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op,
 SDValue X86TargetLowering::getRecipEstimate(SDValue Op,
                                             DAGCombinerInfo &DCI,
                                             unsigned &RefinementSteps) const {
-  // FIXME: We should use instruction latency models to calculate the cost of
-  // each potential sequence, but this is very hard to do reliably because
-  // at least Intel's Core* chips have variable timing based on the number of
-  // significant digits in the divisor.
-  if (!Subtarget->useReciprocalEst())
-    return SDValue();
-
   EVT VT = Op.getValueType();
-
+  const char *RecipOp;
+
+  // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
   // TODO: Add support for AVX512 (v16f32).
   // It is likely not profitable to do this for f64 because a double-precision
@@ -12947,12 +12937,20 @@ SDValue X86TargetLowering::getRecipEstimate(SDValue Op,
   // 15 instructions: convert to single, rcpss, convert back to double, refine
   // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
   // along with FMA, this could be a throughput win.
-  if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
-      (Subtarget->hasAVX() && VT == MVT::v8f32)) {
-    RefinementSteps = ReciprocalEstimateRefinementSteps;
-    return DCI.DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
-  }
-  return SDValue();
+  if (VT == MVT::f32 && Subtarget->hasSSE1())
+    RecipOp = "divf";
+  else if ((VT == MVT::v4f32 && Subtarget->hasSSE1()) ||
+           (VT == MVT::v8f32 && Subtarget->hasAVX()))
+    RecipOp = "vec-divf";
+  else
+    return SDValue();
+
+  TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals;
+  if (!Recips.isEnabled(RecipOp))
+    return SDValue();
+
+  RefinementSteps = Recips.getRefinementSteps(RecipOp);
+  return DCI.DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
 }
 
 /// If we have at least two divisions that use the same divisor, convert to
diff --git a/llvm/lib/Target/X86/X86Subtarget.cpp b/llvm/lib/Target/X86/X86Subtarget.cpp
index 1cdab14e034..65e702e7f35 100644
--- a/llvm/lib/Target/X86/X86Subtarget.cpp
+++ b/llvm/lib/Target/X86/X86Subtarget.cpp
@@ -273,8 +273,6 @@ void X86Subtarget::initializeEnvironment() {
   LEAUsesAG = false;
   SlowLEA = false;
   SlowIncDec = false;
-  UseSqrtEst = false;
-  UseReciprocalEst = false;
   stackAlignment = 4;
   // FIXME: this is a known good value for Yonah. How about others?
   MaxInlineSizeThreshold = 128;
diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h
index 455dd7744d7..27429d050bd 100644
--- a/llvm/lib/Target/X86/X86Subtarget.h
+++ b/llvm/lib/Target/X86/X86Subtarget.h
@@ -190,16 +190,6 @@ protected:
   /// True if INC and DEC instructions are slow when writing to flags
   bool SlowIncDec;
 
-  /// Use the RSQRT* instructions to optimize square root calculations.
-  /// For this to be profitable, the cost of FSQRT and FDIV must be
-  /// substantially higher than normal FP ops like FADD and FMUL.
-  bool UseSqrtEst;
-
-  /// Use the RCP* instructions to optimize FP division calculations.
-  /// For this to be profitable, the cost of FDIV must be
-  /// substantially higher than normal FP ops like FADD and FMUL.
-  bool UseReciprocalEst;
-
   /// Processor has AVX-512 PreFetch Instructions
   bool HasPFI;
 
@@ -377,8 +367,6 @@ public:
   bool LEAusesAG() const { return LEAUsesAG; }
   bool slowLEA() const { return SlowLEA; }
   bool slowIncDec() const { return SlowIncDec; }
-  bool useSqrtEst() const { return UseSqrtEst; }
-  bool useReciprocalEst() const { return UseReciprocalEst; }
   bool hasCDI() const { return HasCDI; }
   bool hasPFI() const { return HasPFI; }
   bool hasERI() const { return HasERI; }
diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp
index 3e5f1d82202..33576f1771f 100644
--- a/llvm/lib/Target/X86/X86TargetMachine.cpp
+++ b/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -105,6 +105,13 @@ X86TargetMachine::X86TargetMachine(const Target &T, StringRef TT, StringRef CPU,
   if (Subtarget.isTargetWin64())
     this->Options.TrapUnreachable = true;
 
+  // TODO: By default, all reciprocal estimate operations are off because
+  // that matches the behavior before TargetRecip was added (except for btver2
+  // which used subtarget features to enable this type of codegen).
+  // We should change this to match GCC behavior where everything but
+  // scalar division estimates are turned on by default with -ffast-math.
+  this->Options.Reciprocals.setDefaults("all", false, 1);
+
   initAsmInfo();
 }
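
For context on what a refinement step buys: the hardware estimate instructions guarantee only about 2^-12 relative accuracy (per the comment above getRecipEstimate), and each Newton-Raphson iteration roughly doubles the number of correct bits, which is why one step suffices for a full-precision float result. A scalar sketch of the standard iterations, for illustration only; this code is not part of the patch, and the DAG combiner materializes the vectorized equivalent:

    // One Newton-Raphson step for a reciprocal estimate x ~ 1/d:
    //   x' = x * (2 - d * x)
    static float refineRecip(float d, float x) {
      return x * (2.0f - d * x);
    }

    // One step for a reciprocal square root estimate x ~ 1/sqrt(d):
    //   x' = x * (1.5 - 0.5 * d * x * x)
    static float refineRsqrt(float d, float x) {
      return x * (1.5f - 0.5f * d * x * x);
    }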