[X86] Disable BMI BEXTR in X86DAGToDAGISel::matchBEXTRFromAnd unless we're compiling for a CPU with single uop BEXTR

Summary: This function turns (X >> C1) & C2 into a BMI BEXTR or TBM BEXTRI instruction. For BMI BEXTR we have to materialize an immediate into a register to feed to the BEXTR instruction. The BMI BEXTR instruction is 2 uops on Intel CPUs. It looks like on SKL it's one port 0/6 uop and one port 1/5 uop, despite what Agner's tables say. I know one of the uops is a regular shift uop so it would have to go through the port 0/6 shifter unit. So that's the same or worse execution-wise than the shift+and, which is one 0/6 uop and one 0/1/5/6 uop. The move immediate into register is an additional 0/1/5/6 uop. For now I've limited this transform to AMD CPUs which have a single uop BEXTR. It might also make sense if we can fold a load, or if the AND immediate is larger than 32 bits and can't be encoded as a sign extended 32-bit value, or if LICM or CSE can hoist the move immediate and share it. But we'd need to look more carefully at that. In the regression I looked at, it doesn't look like load folding or large immediates were occurring, so the regression isn't caused by the loss of those. So we could try to be smarter here if we find a compelling case. Reviewers: RKSimon, spatel, lebedev.ri, andreadb Reviewed By: RKSimon Subscribers: llvm-commits, andreadb, RKSimon Differential Revision: https://reviews.llvm.org/D52570 llvm-svn: 343399
author: Craig Topper <craig.topper@intel.com> 2018-09-30 03:01:46 +0000
committer: Craig Topper <craig.topper@intel.com> 2018-09-30 03:01:46 +0000
commit: 1709829fede301a390163480ef51f0e140b0d028 (patch)
tree: aed77e311f7ef31b3d20068937e902f9671af3a2 /llvm/lib
parent: 6e6d545d2488b070c32a9c7e21cf4a1e5c6d4683 (diff)
download: bcm5719-llvm-1709829fede301a390163480ef51f0e140b0d028.tar.gz
bcm5719-llvm-1709829fede301a390163480ef51f0e140b0d028.zip
3 files changed, 21 insertions, 1 deletions
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index e5d5d929be9..2c48b54c380 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -400,6 +400,10 @@ def FeatureMOVDIRI  : SubtargetFeature<"movdiri", "HasMOVDIRI", "true",
 def FeatureMOVDIR64B : SubtargetFeature<"movdir64b", "HasMOVDIR64B", "true",
                                         "Support movdir64b instruction">;
 
+def FeatureFastBEXTR : SubtargetFeature<"fast-bextr", "HasFastBEXTR", "true",
+          "Indicates that the BEXTR instruction is implemented as a single uop "
+          "with good throughput.">;
+
 //===----------------------------------------------------------------------===//
 // Register File Description
 //===----------------------------------------------------------------------===//
@@ -987,6 +991,7 @@ def : ProcessorModel<"btver2", BtVer2Model, [
   FeatureSlowSHLD,
   FeatureLAHFSAHF,
   FeatureFast15ByteNOP,
+  FeatureFastBEXTR,
   FeatureFastPartialYMMorZMMWrite
 ]>;
 
@@ -1042,6 +1047,7 @@ def : Proc<"bdver2", [
   FeatureSlowSHLD,
   FeatureLAHFSAHF,
   FeatureFast11ByteNOP,
+  FeatureFastBEXTR,
   FeatureMacroFusion
 ]>;
 
@@ -1074,6 +1080,7 @@ def : Proc<"bdver3", [
   FeatureFSGSBase,
   FeatureLAHFSAHF,
   FeatureFast11ByteNOP,
+  FeatureFastBEXTR,
   FeatureMacroFusion
 ]>;
 
@@ -1105,6 +1112,7 @@ def : Proc<"bdver4", [
   FeatureSlowSHLD,
   FeatureFSGSBase,
   FeatureLAHFSAHF,
+  FeatureFastBEXTR,
   FeatureFast11ByteNOP,
   FeatureMWAITX,
   FeatureMacroFusion
@@ -1130,6 +1138,7 @@ def: ProcessorModel<"znver1", Znver1Model, [
   FeatureFastLZCNT,
   FeatureLAHFSAHF,
   FeatureLZCNT,
+  FeatureFastBEXTR,
   FeatureFast15ByteNOP,
   FeatureMacroFusion,
   FeatureMMX,
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index 32ad262e558..a0ef4b61263 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -2590,7 +2590,14 @@ bool X86DAGToDAGISel::matchBEXTRFromAnd(SDNode *Node) {
   SDValue N0 = Node->getOperand(0);
   SDValue N1 = Node->getOperand(1);
 
-  if (!Subtarget->hasBMI() && !Subtarget->hasTBM())
+  // If we have TBM we can use an immediate for the control. If we have BMI
+  // we should only do this if the BEXTR instruction is implemented well.
+  // Otherwise moving the control into a register makes this more costly.
+  // TODO: Maybe load folding, greater than 32-bit masks, or a guarantee of LICM
+  // hoisting the move immediate would make it worthwhile with a less optimal
+  // BEXTR?
+  if (!Subtarget->hasTBM() &&
+      !(Subtarget->hasBMI() && Subtarget->hasFastBEXTR()))
     return false;
 
   // Must have a shift right.
diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h
index 82ff9420b17..5dd406b1400 100644
--- a/llvm/lib/Target/X86/X86Subtarget.h
+++ b/llvm/lib/Target/X86/X86Subtarget.h
@@ -385,6 +385,9 @@ protected:
   /// Processor supports PCONFIG instruction
   bool HasPCONFIG = false;
 
+  /// Processor has a single uop BEXTR implementation.
+  bool HasFastBEXTR = false;
+
   /// Use a retpoline thunk rather than indirect calls to block speculative
   /// execution.
   bool UseRetpolineIndirectCalls = false;
@@ -629,6 +632,7 @@ public:
   bool hasFastVectorFSQRT() const { return HasFastVectorFSQRT; }
   bool hasFastLZCNT() const { return HasFastLZCNT; }
   bool hasFastSHLDRotate() const { return HasFastSHLDRotate; }
+  bool hasFastBEXTR() const { return HasFastBEXTR; }
   bool hasMacroFusion() const { return HasMacroFusion; }
   bool hasERMSB() const { return HasERMSB; }
   bool hasSlowDivide32() const { return HasSlowDivide32; }
author	Craig Topper <craig.topper@intel.com>	2018-09-30 03:01:46 +0000
committer	Craig Topper <craig.topper@intel.com>	2018-09-30 03:01:46 +0000
commit	1709829fede301a390163480ef51f0e140b0d028 (patch)
tree	aed77e311f7ef31b3d20068937e902f9671af3a2 /llvm/lib
parent	6e6d545d2488b070c32a9c7e21cf4a1e5c6d4683 (diff)
download	bcm5719-llvm-1709829fede301a390163480ef51f0e140b0d028.tar.gz bcm5719-llvm-1709829fede301a390163480ef51f0e140b0d028.zip