| author | Simon Pilgrim <llvm-dev@redking.me.uk> | 2019-04-26 10:49:13 +0000 |
|---|---|---|
| committer | Simon Pilgrim <llvm-dev@redking.me.uk> | 2019-04-26 10:49:13 +0000 |
| commit | 5d6ef94c369a3ea804d8377f16ba8878160bd9a6 | |
| tree | c0e2d5896961dabe274f9d136051971f2b553007 (llvm/lib) | |
| parent | 5e161df9f8999c7570fdf9477d51d33a3e288f5a | |
[X86][SSE] Disable shouldFoldConstantShiftPairToMask for btver1/btver2 targets (PR40758)
As detailed in PR40758, Bobcat/Jaguar can perform vector immediate shifts on the same pipes as vector ANDs, with the same latency, so it doesn't make sense to replace a shl+lshr pair with a shift+and pair: the AND requires an additional constant mask, with the attendant constant-pool, load, and register-pressure costs.
Differential Revision: https://reviews.llvm.org/D61068
llvm-svn: 359293
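A rough sketch of the pattern involved (illustrative IR, not taken from the commit or its tests): with equal shift amounts the shift pair degenerates to a plain AND, so folding is always a win; with unequal amounts the fold still leaves one shift behind and merely trades the second shift for a constant mask, which is the case btver1/btver2 now keep as two immediate shifts.

```llvm
; Hypothetical v4i32 examples (function names are made up).

; Equal amounts: (x << 4) >> 4 just clears the top 4 bits of each lane,
; so it folds to a single AND with 0x0FFFFFFF -- profitable on every target.
define <4 x i32> @equal_amounts(<4 x i32> %x) {
  %s = shl <4 x i32> %x, <i32 4, i32 4, i32 4, i32 4>
  %r = lshr <4 x i32> %s, <i32 4, i32 4, i32 4, i32 4>
  ret <4 x i32> %r
}

; Unequal amounts: (x >> 4) << 2 would fold to (x >> 2) & 0x3FFFFFFC, i.e.
; one shift *plus* an AND with a constant-pool mask; on btver1/btver2 the
; two immediate shifts are now kept instead.
define <4 x i32> @unequal_amounts(<4 x i32> %x) {
  %s = lshr <4 x i32> %x, <i32 4, i32 4, i32 4, i32 4>
  %r = shl <4 x i32> %s, <i32 2, i32 2, i32 2, i32 2>
  ret <4 x i32> %r
}
```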
Diffstat (limited to 'llvm/lib')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 3 |
| -rw-r--r-- | llvm/lib/Target/X86/X86.td | 8 |
| -rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.cpp | 13 |
| -rw-r--r-- | llvm/lib/Target/X86/X86Subtarget.h | 4 |
4 files changed, 26 insertions, 2 deletions
```diff
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 6d351cc57ac..e8bad0413b2 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -6882,6 +6882,8 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
   // (and (srl x, (sub c1, c2), MASK)
   // Only fold this if the inner shift has no other uses -- if it does, folding
   // this will increase the total number of instructions.
+  // TODO - drop hasOneUse requirement if c1 == c2?
+  // TODO - support non-uniform vector shift amounts.
   if (N1C && N0.getOpcode() == ISD::SRL && N0.hasOneUse() &&
       TLI.shouldFoldConstantShiftPairToMask(N, Level)) {
     if (ConstantSDNode *N0C1 = isConstOrConstSplat(N0.getOperand(1))) {
@@ -7188,6 +7190,7 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
   }
 
   // fold (srl (shl x, c), c) -> (and x, cst2)
+  // TODO - (srl (shl x, c1), c2).
   if (N0.getOpcode() == ISD::SHL && N0.getOperand(1) == N1 &&
       isConstantOrConstantVector(N1, /* NoOpaques */ true)) {
     SDLoc DL(N);
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index c054379acf7..fe23a2900d5 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -424,6 +424,11 @@ def FeatureFastHorizontalOps
       "Prefer horizontal vector math instructions (haddp, phsub, etc.) over "
       "normal vector instructions with shuffles", [FeatureSSE3]>;
 
+def FeatureFastVectorShiftMasks
+    : SubtargetFeature<
+        "fast-vector-shift-masks", "HasFastVectorShiftMasks", "true",
+        "Prefer a left/right vector logical shift pair over a shift+and pair">;
+
 // Merge branches using three-way conditional code.
 def FeatureMergeToThreeWayBranch : SubtargetFeature<"merge-to-threeway-branch",
                                        "ThreewayBranchProfitable", "true",
@@ -775,7 +780,8 @@ def ProcessorFeatures {
                                         FeaturePOPCNT,
                                         FeatureSlowSHLD,
                                         FeatureLAHFSAHF,
-                                        FeatureFast15ByteNOP];
+                                        FeatureFast15ByteNOP,
+                                        FeatureFastVectorShiftMasks];
   list<SubtargetFeature> BtVer1Features = BtVer1InheritableFeatures;
 
   // Jaguar
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index e91724c3f32..889a0111b87 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -5013,7 +5013,18 @@ bool X86TargetLowering::hasAndNot(SDValue Y) const {
 
 bool X86TargetLowering::shouldFoldConstantShiftPairToMask(
     const SDNode *N, CombineLevel Level) const {
-  // TODO - some targets prefer immediate vector shifts to shift+mask.
+  assert((N->getOpcode() == ISD::SHL &&
+          N->getOperand(0).getOpcode() == ISD::SRL) ||
+         (N->getOpcode() == ISD::SRL &&
+          N->getOperand(0).getOpcode() == ISD::SHL) &&
+         "Expected shift-shift mask");
+
+  if (Subtarget.hasFastVectorShiftMasks() && N->getValueType(0).isVector()) {
+    // Only fold if the shift values are equal - so it folds to AND.
+    // TODO - we should fold if either is non-uniform but we don't do the
+    // fold for non-splats yet.
+    return N->getOperand(1) == N->getOperand(0).getOperand(1);
+  }
   return TargetLoweringBase::shouldFoldConstantShiftPairToMask(N, Level);
 }
diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h
index fe04f022070..0ff9d544d82 100644
--- a/llvm/lib/Target/X86/X86Subtarget.h
+++ b/llvm/lib/Target/X86/X86Subtarget.h
@@ -393,6 +393,9 @@ protected:
   /// Try harder to combine to horizontal vector ops if they are fast.
   bool HasFastHorizontalOps = false;
 
+  /// Prefer a left/right vector logical shifts pair over a shift+and pair.
+  bool HasFastVectorShiftMasks = false;
+
   /// Use a retpoline thunk rather than indirect calls to block speculative
   /// execution.
   bool UseRetpolineIndirectCalls = false;
@@ -644,6 +647,7 @@ public:
   bool hasFastSHLDRotate() const { return HasFastSHLDRotate; }
   bool hasFastBEXTR() const { return HasFastBEXTR; }
   bool hasFastHorizontalOps() const { return HasFastHorizontalOps; }
+  bool hasFastVectorShiftMasks() const { return HasFastVectorShiftMasks; }
   bool hasMacroFusion() const { return HasMacroFusion; }
   bool hasBranchFusion() const { return HasBranchFusion; }
   bool hasERMSB() const { return HasERMSB; }
```
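The new behavior can be exercised with `-mattr=+fast-vector-shift-masks` (the feature string defined in X86.td above) or via `-mcpu=btver1`/`-mcpu=btver2`. A hypothetical FileCheck-style sketch of what a regression test for the unequal-amount case might look like (this is not the commit's actual test file):

```llvm
; RUN: llc < %s -mtriple=x86_64-- -mcpu=btver2 | FileCheck %s
; Hypothetical sketch: on btver2 the unequal shift pair should survive as
; two immediate shifts, with no constant mask load or AND emitted.
define <4 x i32> @shift_pair(<4 x i32> %x) {
; CHECK-LABEL: shift_pair:
; CHECK:     vpsrld $4
; CHECK:     vpslld $2
; CHECK-NOT: vpand
  %s = lshr <4 x i32> %x, <i32 4, i32 4, i32 4, i32 4>
  %r = shl <4 x i32> %s, <i32 2, i32 2, i32 2, i32 2>
  ret <4 x i32> %r
}
```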

