author     Sanjay Patel <spatel@rotateright.com>    2015-05-22 21:10:06 +0000
committer  Sanjay Patel <spatel@rotateright.com>    2015-05-22 21:10:06 +0000
commit     ba2ba80302182bcf1773c560b592949cd9a1d501
tree       925620c95192cb142122963aa9512b1676a00243 /llvm/lib/Target
parent     b2f6afb30ac3285f2d2db310ed283398860f12fc
make reciprocal estimate code generation more flexible by adding command-line options
This patch adds a class for processing the many reciprocal estimate codegen possibilities.
The TargetRecip class is intended to handle both command-line options to llc
and options passed in from a front end such as clang with the -mrecip option.
The x86 backend is updated to use the new functionality.
Only -mcpu=btver2 with -ffast-math should see a functional change from this patch.
All other CPUs continue to *not* use reciprocal estimates by default with -ffast-math.
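
As a reference for the option grammar, the parser added below accepts either one global token ("all", "none", or "default") or a list of individual operation keys, each optionally prefixed with '!' to disable that operation and suffixed with ':N' for a single-digit refinement-step count. The sketch that follows is illustration only, not part of the patch; it assumes a driver has already split a comma-separated -recip/-mrecip value into individual tokens (that splitting happens outside this diff):

    #include "llvm/Target/TargetRecip.h"
    #include <cassert>
    #include <string>
    #include <vector>
    using namespace llvm;

    void recipOptionExample() {
      // One global token: enable every estimate with 2 refinement steps.
      TargetRecip EnableAll({"all:2"});
      assert(EnableAll.isEnabled("vec-divf"));
      assert(EnableAll.getRefinementSteps("sqrtd") == 2);

      // Individual tokens: enable vector float division with 1 step and
      // explicitly disable the scalar double sqrt estimate.
      std::vector<std::string> Args = {"vec-divf:1", "!sqrtd"};
      TargetRecip Custom(Args);
      assert(Custom.isEnabled("vec-divf"));
      assert(!Custom.isEnabled("sqrtd"));
    }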
Differential Revision: http://reviews.llvm.org/D8982
llvm-svn: 238051
Diffstat (limited to 'llvm/lib/Target')

 llvm/lib/Target/CMakeLists.txt           |   1
 llvm/lib/Target/TargetRecip.cpp          | 225
 llvm/lib/Target/X86/X86.td               |   6
 llvm/lib/Target/X86/X86ISelLowering.cpp  |  68
 llvm/lib/Target/X86/X86Subtarget.cpp     |   2
 llvm/lib/Target/X86/X86Subtarget.h       |  12
 llvm/lib/Target/X86/X86TargetMachine.cpp |   7
 7 files changed, 267 insertions(+), 54 deletions(-)
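
One design point worth calling out before the patch body: setDefaults() in the new file only writes entries that are still uninitialized, so target defaults can never override values the user set on the command line. A sketch of how a target could seed per-CPU defaults (the function and the btver2-style values are hypothetical; only the final "all off, 1 step" call appears in this patch, in X86TargetMachine.cpp below):

    #include "llvm/Target/TargetRecip.h"
    using namespace llvm;

    void seedRecipDefaults(TargetRecip &Recips) {
      // Hypothetical per-CPU defaults: enable the vector float estimates
      // with one refinement step each...
      Recips.setDefaults("vec-divf", true, 1);
      Recips.setDefaults("vec-sqrtf", true, 1);
      // ...then fill every remaining uninitialized entry with "off, 1 step",
      // as X86TargetMachine does below for all operations. Any -recip or
      // -mrecip values parsed earlier stay in effect because setDefaults()
      // skips entries that are no longer Uninitialized.
      Recips.setDefaults("all", false, 1);
    }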
diff --git a/llvm/lib/Target/CMakeLists.txt b/llvm/lib/Target/CMakeLists.txt
index 1805437b12f..e6d0199952f 100644
--- a/llvm/lib/Target/CMakeLists.txt
+++ b/llvm/lib/Target/CMakeLists.txt
@@ -6,6 +6,7 @@ add_llvm_library(LLVMTarget
   TargetLoweringObjectFile.cpp
   TargetMachine.cpp
   TargetMachineC.cpp
+  TargetRecip.cpp
   TargetSubtargetInfo.cpp
 
   ADDITIONAL_HEADER_DIRS
diff --git a/llvm/lib/Target/TargetRecip.cpp b/llvm/lib/Target/TargetRecip.cpp
new file mode 100644
index 00000000000..42bc487fe6d
--- /dev/null
+++ b/llvm/lib/Target/TargetRecip.cpp
@@ -0,0 +1,225 @@
+//===-------------------------- TargetRecip.cpp ---------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class is used to customize machine-specific reciprocal estimate code
+// generation in a target-independent way.
+// If a target does not support operations in this specification, then code
+// generation will default to using supported operations.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetRecip.h"
+#include <map>
+
+using namespace llvm;
+
+// These are the names of the individual reciprocal operations. These are
+// the key strings for queries and command-line inputs.
+// In addition, the command-line interface recognizes the global parameters
+// "all", "none", and "default".
+static const char *RecipOps[] = {
+  "divd",
+  "divf",
+  "vec-divd",
+  "vec-divf",
+  "sqrtd",
+  "sqrtf",
+  "vec-sqrtd",
+  "vec-sqrtf",
+};
+
+// The uninitialized state is needed for the enabled settings and refinement
+// steps because custom settings may arrive via the command-line before target
+// defaults are set.
+TargetRecip::TargetRecip() {
+  unsigned NumStrings = llvm::array_lengthof(RecipOps);
+  for (unsigned i = 0; i < NumStrings; ++i)
+    RecipMap.insert(std::make_pair(RecipOps[i], RecipParams()));
+}
+
+static bool parseRefinementStep(const StringRef &In, size_t &Position,
+                                uint8_t &Value) {
+  const char RefStepToken = ':';
+  Position = In.find(RefStepToken);
+  if (Position == StringRef::npos)
+    return false;
+
+  StringRef RefStepString = In.substr(Position + 1);
+  // Allow exactly one numeric character for the additional refinement
+  // step parameter.
+  if (RefStepString.size() == 1) {
+    char RefStepChar = RefStepString[0];
+    if (RefStepChar >= '0' && RefStepChar <= '9') {
+      Value = RefStepChar - '0';
+      return true;
+    }
+  }
+  report_fatal_error("Invalid refinement step for -recip.");
+}
+
+bool TargetRecip::parseGlobalParams(const std::string &Arg) {
+  StringRef ArgSub = Arg;
+
+  // Look for an optional setting of the number of refinement steps needed
+  // for this type of reciprocal operation.
+  size_t RefPos;
+  uint8_t RefSteps;
+  StringRef RefStepString;
+  if (parseRefinementStep(ArgSub, RefPos, RefSteps)) {
+    // Split the string for further processing.
+    RefStepString = ArgSub.substr(RefPos + 1);
+    ArgSub = ArgSub.substr(0, RefPos);
+  }
+  bool Enable;
+  bool UseDefaults;
+  if (ArgSub == "all") {
+    UseDefaults = false;
+    Enable = true;
+  } else if (ArgSub == "none") {
+    UseDefaults = false;
+    Enable = false;
+  } else if (ArgSub == "default") {
+    UseDefaults = true;
+  } else {
+    // Any other string is invalid or an individual setting.
+    return false;
+  }
+
+  // All enable values will be initialized to target defaults if 'default' was
+  // specified.
+  if (!UseDefaults)
+    for (auto &KV : RecipMap)
+      KV.second.Enabled = Enable;
+
+  // Custom refinement count was specified with all, none, or default.
+  if (!RefStepString.empty())
+    for (auto &KV : RecipMap)
+      KV.second.RefinementSteps = RefSteps;
+
+  return true;
+}
+
+void TargetRecip::parseIndividualParams(const std::vector<std::string> &Args) {
+  static const char DisabledPrefix = '!';
+  unsigned NumArgs = Args.size();
+
+  for (unsigned i = 0; i != NumArgs; ++i) {
+    StringRef Val = Args[i];
+
+    bool IsDisabled = Val[0] == DisabledPrefix;
+    // Ignore the disablement token for string matching.
+    if (IsDisabled)
+      Val = Val.substr(1);
+
+    size_t RefPos;
+    uint8_t RefSteps;
+    StringRef RefStepString;
+    if (parseRefinementStep(Val, RefPos, RefSteps)) {
+      // Split the string for further processing.
+      RefStepString = Val.substr(RefPos + 1);
+      Val = Val.substr(0, RefPos);
+    }
+
+    RecipIter Iter = RecipMap.find(Val);
+    if (Iter == RecipMap.end()) {
+      // Try again specifying float suffix.
+      Iter = RecipMap.find(Val.str() + 'f');
+      if (Iter == RecipMap.end()) {
+        Iter = RecipMap.find(Val.str() + 'd');
+        assert(Iter == RecipMap.end() && "Float entry missing from map");
+        report_fatal_error("Invalid option for -recip.");
+      }
+
+      // The option was specified without a float or double suffix.
+      if (RecipMap[Val.str() + 'd'].Enabled != Uninitialized) {
+        // Make sure that the double entry was not already specified.
+        // The float entry will be checked below.
+        report_fatal_error("Duplicate option for -recip.");
+      }
+    }
+
+    if (Iter->second.Enabled != Uninitialized)
+      report_fatal_error("Duplicate option for -recip.");
+
+    // Mark the matched option as found. Do not allow duplicate specifiers.
+    Iter->second.Enabled = !IsDisabled;
+    if (!RefStepString.empty())
+      Iter->second.RefinementSteps = RefSteps;
+
+    // If the precision was not specified, the double entry is also initialized.
+    if (Val.back() != 'f' && Val.back() != 'd') {
+      RecipMap[Val.str() + 'd'].Enabled = !IsDisabled;
+      if (!RefStepString.empty())
+        RecipMap[Val.str() + 'd'].RefinementSteps = RefSteps;
+    }
+  }
+}
+
+TargetRecip::TargetRecip(const std::vector<std::string> &Args) :
+  TargetRecip() {
+  unsigned NumArgs = Args.size();
+
+  // Check if "all", "default", or "none" was specified.
+  if (NumArgs == 1 && parseGlobalParams(Args[0]))
+    return;
+
+  parseIndividualParams(Args);
+}
+
+bool TargetRecip::isEnabled(const StringRef &Key) const {
+  ConstRecipIter Iter = RecipMap.find(Key);
+  assert(Iter != RecipMap.end() && "Unknown name for reciprocal map");
+  assert(Iter->second.Enabled != Uninitialized &&
+         "Enablement setting was not initialized");
+  return Iter->second.Enabled;
+}
+
+unsigned TargetRecip::getRefinementSteps(const StringRef &Key) const {
+  ConstRecipIter Iter = RecipMap.find(Key);
+  assert(Iter != RecipMap.end() && "Unknown name for reciprocal map");
+  assert(Iter->second.RefinementSteps != Uninitialized &&
+         "Refinement step setting was not initialized");
+  return Iter->second.RefinementSteps;
+}
+
+/// Custom settings (previously initialized values) override target defaults.
+void TargetRecip::setDefaults(const StringRef &Key, bool Enable,
+                              unsigned RefSteps) {
+  if (Key == "all") {
+    for (auto &KV : RecipMap) {
+      RecipParams &RP = KV.second;
+      if (RP.Enabled == Uninitialized)
+        RP.Enabled = Enable;
+      if (RP.RefinementSteps == Uninitialized)
+        RP.RefinementSteps = RefSteps;
+    }
+  } else {
+    RecipParams &RP = RecipMap[Key];
+    if (RP.Enabled == Uninitialized)
+      RP.Enabled = Enable;
+    if (RP.RefinementSteps == Uninitialized)
+      RP.RefinementSteps = RefSteps;
+  }
+}
+
+bool TargetRecip::operator==(const TargetRecip &Other) const {
+  for (const auto &KV : RecipMap) {
+    const StringRef &Op = KV.first;
+    const RecipParams &RP = KV.second;
+    const RecipParams &OtherRP = Other.RecipMap.find(Op)->second;
+    if (RP.RefinementSteps != OtherRP.RefinementSteps)
+      return false;
+    if (RP.Enabled != OtherRP.Enabled)
+      return false;
+  }
+  return true;
+}
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index c70e2e95463..7001910a61e 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -188,10 +188,6 @@ def FeatureSlowLEA : SubtargetFeature<"slow-lea", "SlowLEA", "true",
                                       "LEA instruction with certain arguments is slow">;
 def FeatureSlowIncDec : SubtargetFeature<"slow-incdec", "SlowIncDec", "true",
                                          "INC and DEC instructions are slower than ADD and SUB">;
-def FeatureUseSqrtEst : SubtargetFeature<"use-sqrt-est", "UseSqrtEst", "true",
-                          "Use RSQRT* to optimize square root calculations">;
-def FeatureUseRecipEst : SubtargetFeature<"use-recip-est", "UseReciprocalEst",
-                          "true", "Use RCP* to optimize division calculations">;
 def FeatureSoftFloat : SubtargetFeature<"soft-float", "UseSoftFloat", "true",
                                         "Use software floating point features.">;
 
@@ -444,7 +440,7 @@ def : ProcessorModel<"btver2", BtVer2Model,
                       FeaturePRFCHW, FeatureAES, FeaturePCLMUL,
                       FeatureBMI, FeatureF16C, FeatureMOVBE,
                       FeatureLZCNT, FeaturePOPCNT, FeatureFastUAMem,
-                      FeatureSlowSHLD, FeatureUseSqrtEst, FeatureUseRecipEst]>;
+                      FeatureSlowSHLD]>;
 
 // TODO: We should probably add 'FeatureFastUAMem' to all of the AMD chips.
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index e850e500db6..609973f8971 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -67,12 +67,6 @@ static cl::opt<bool> ExperimentalVectorWideningLegalization(
              "rather than promotion."),
     cl::Hidden);
 
-static cl::opt<int> ReciprocalEstimateRefinementSteps(
-    "x86-recip-refinement-steps", cl::init(1),
-    cl::desc("Specify the number of Newton-Raphson iterations applied to the "
-             "result of the hardware reciprocal estimate instruction."),
-    cl::NotHidden);
-
 // Forward declarations.
 static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
                        SDValue V2);
 
@@ -12901,29 +12895,31 @@ SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op,
                                             DAGCombinerInfo &DCI,
                                             unsigned &RefinementSteps,
                                             bool &UseOneConstNR) const {
-  // FIXME: We should use instruction latency models to calculate the cost of
-  // each potential sequence, but this is very hard to do reliably because
-  // at least Intel's Core* chips have variable timing based on the number of
-  // significant digits in the divisor and/or sqrt operand.
-  if (!Subtarget->useSqrtEst())
-    return SDValue();
-
   EVT VT = Op.getValueType();
+  const char *RecipOp;
 
-  // SSE1 has rsqrtss and rsqrtps.
+  // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
   // TODO: Add support for AVX512 (v16f32).
   // It is likely not profitable to do this for f64 because a double-precision
   // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
   // instructions: convert to single, rsqrtss, convert back to double, refine
   // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
   // along with FMA, this could be a throughput win.
-  if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
-      (Subtarget->hasAVX() && VT == MVT::v8f32)) {
-    RefinementSteps = 1;
-    UseOneConstNR = false;
-    return DCI.DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
-  }
-  return SDValue();
+  if (VT == MVT::f32 && Subtarget->hasSSE1())
+    RecipOp = "sqrtf";
+  else if ((VT == MVT::v4f32 && Subtarget->hasSSE1()) ||
+           (VT == MVT::v8f32 && Subtarget->hasAVX()))
+    RecipOp = "vec-sqrtf";
+  else
+    return SDValue();
+
+  TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals;
+  if (!Recips.isEnabled(RecipOp))
+    return SDValue();
+
+  RefinementSteps = Recips.getRefinementSteps(RecipOp);
+  UseOneConstNR = false;
+  return DCI.DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
 }
 
 /// The minimum architected relative accuracy is 2^-12. We need one
@@ -12931,15 +12927,9 @@ SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op,
 SDValue X86TargetLowering::getRecipEstimate(SDValue Op,
                                             DAGCombinerInfo &DCI,
                                             unsigned &RefinementSteps) const {
-  // FIXME: We should use instruction latency models to calculate the cost of
-  // each potential sequence, but this is very hard to do reliably because
-  // at least Intel's Core* chips have variable timing based on the number of
-  // significant digits in the divisor.
-  if (!Subtarget->useReciprocalEst())
-    return SDValue();
-
   EVT VT = Op.getValueType();
-
+  const char *RecipOp;
+
+  // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
   // TODO: Add support for AVX512 (v16f32).
   // It is likely not profitable to do this for f64 because a double-precision
@@ -12947,12 +12937,20 @@ SDValue X86TargetLowering::getRecipEstimate(SDValue Op,
   // 15 instructions: convert to single, rcpss, convert back to double, refine
   // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
   // along with FMA, this could be a throughput win.
-  if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
-      (Subtarget->hasAVX() && VT == MVT::v8f32)) {
-    RefinementSteps = ReciprocalEstimateRefinementSteps;
-    return DCI.DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
-  }
-  return SDValue();
+  if (VT == MVT::f32 && Subtarget->hasSSE1())
+    RecipOp = "divf";
+  else if ((VT == MVT::v4f32 && Subtarget->hasSSE1()) ||
+           (VT == MVT::v8f32 && Subtarget->hasAVX()))
+    RecipOp = "vec-divf";
+  else
+    return SDValue();
+
+  TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals;
+  if (!Recips.isEnabled(RecipOp))
+    return SDValue();
+
+  RefinementSteps = Recips.getRefinementSteps(RecipOp);
+  return DCI.DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
 }
 
 /// If we have at least two divisions that use the same divisor, convert to
diff --git a/llvm/lib/Target/X86/X86Subtarget.cpp b/llvm/lib/Target/X86/X86Subtarget.cpp
index 1cdab14e034..65e702e7f35 100644
--- a/llvm/lib/Target/X86/X86Subtarget.cpp
+++ b/llvm/lib/Target/X86/X86Subtarget.cpp
@@ -273,8 +273,6 @@ void X86Subtarget::initializeEnvironment() {
   LEAUsesAG = false;
   SlowLEA = false;
   SlowIncDec = false;
-  UseSqrtEst = false;
-  UseReciprocalEst = false;
   stackAlignment = 4;
   // FIXME: this is a known good value for Yonah. How about others?
   MaxInlineSizeThreshold = 128;
diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h
index 455dd7744d7..27429d050bd 100644
--- a/llvm/lib/Target/X86/X86Subtarget.h
+++ b/llvm/lib/Target/X86/X86Subtarget.h
@@ -190,16 +190,6 @@ protected:
   /// True if INC and DEC instructions are slow when writing to flags
   bool SlowIncDec;
 
-  /// Use the RSQRT* instructions to optimize square root calculations.
-  /// For this to be profitable, the cost of FSQRT and FDIV must be
-  /// substantially higher than normal FP ops like FADD and FMUL.
-  bool UseSqrtEst;
-
-  /// Use the RCP* instructions to optimize FP division calculations.
-  /// For this to be profitable, the cost of FDIV must be
-  /// substantially higher than normal FP ops like FADD and FMUL.
-  bool UseReciprocalEst;
-
   /// Processor has AVX-512 PreFetch Instructions
   bool HasPFI;
 
@@ -377,8 +367,6 @@ public:
   bool LEAusesAG() const { return LEAUsesAG; }
   bool slowLEA() const { return SlowLEA; }
   bool slowIncDec() const { return SlowIncDec; }
-  bool useSqrtEst() const { return UseSqrtEst; }
-  bool useReciprocalEst() const { return UseReciprocalEst; }
   bool hasCDI() const { return HasCDI; }
   bool hasPFI() const { return HasPFI; }
   bool hasERI() const { return HasERI; }
diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp
index 3e5f1d82202..33576f1771f 100644
--- a/llvm/lib/Target/X86/X86TargetMachine.cpp
+++ b/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -105,6 +105,13 @@ X86TargetMachine::X86TargetMachine(const Target &T, StringRef TT, StringRef CPU,
   if (Subtarget.isTargetWin64())
     this->Options.TrapUnreachable = true;
 
+  // TODO: By default, all reciprocal estimate operations are off because
+  // that matches the behavior before TargetRecip was added (except for btver2
+  // which used subtarget features to enable this type of codegen).
+  // We should change this to match GCC behavior where everything but
+  // scalar division estimates are turned on by default with -ffast-math.
+  this->Options.Reciprocals.setDefaults("all", false, 1);
+
   initAsmInfo();
 }
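
For context on what a refinement step buys: the hardware estimate instructions guarantee only about 2^-12 relative accuracy (per the comment above getRecipEstimate), and each Newton-Raphson iteration roughly doubles the number of correct bits, which is why one step suffices for a full-precision float result. A scalar sketch of the standard iterations, for illustration only; this code is not part of the patch, and the DAG combiner materializes the vectorized equivalent:

    // One Newton-Raphson step for a reciprocal estimate x ~ 1/d:
    //   x' = x * (2 - d * x)
    static float refineRecip(float d, float x) {
      return x * (2.0f - d * x);
    }

    // One step for a reciprocal square root estimate x ~ 1/sqrt(d):
    //   x' = x * (1.5 - 0.5 * d * x * x)
    static float refineRsqrt(float d, float x) {
      return x * (1.5f - 0.5f * d * x * x);
    }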