author     Sanjay Patel <spatel@rotateright.com>  2019-01-14 18:44:02 +0000
committer  Sanjay Patel <spatel@rotateright.com>  2019-01-14 18:44:02 +0000
commit     b23ff7a0e251edffab75a498978224276bc1ebd0 (patch)
tree       85279606a44020e52464f2ff26f1b025f4842b2e /llvm/lib
parent     1b4623176447a8242acbcc509441dd2585f0815d (diff)
[x86] lower extracted add/sub to horizontal vector math
add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0

This is the integer sibling to D56011.

There's an additional restriction to only do this transform in the case where we don't have extra extracts from the source vector. Without that, we can fail to match larger horizontal patterns that are more beneficial than this minimal case. An improvement to the more general h-op lowering may allow us to remove the restriction here in a follow-up.

llvm-svn: 351093
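[Editorial note, not part of the commit: a minimal C++ sketch of the scalar pattern this lowering targets. The function name hsum2 is hypothetical, and the phaddd selection assumes an SSSE3-capable target where shouldUseHorizontalOp agrees.]

#include <immintrin.h>

// Sum the two low i32 lanes of a vector. The intrinsics lower to
// add(extractelt(v, 0), extractelt(v, 1)), which is the exact DAG shape
// handled by lowerAddSubToHorizontalOp in the diff below. With this patch
// the backend may select 'phaddd %xmm0, %xmm0' and read the scalar result
// from element 0 (which is free) instead of two extracts plus a scalar add.
int hsum2(__m128i v) {
  return _mm_extract_epi32(v, 0) + _mm_extract_epi32(v, 1);
}

(Compile with -msse4.1 or later so that both _mm_extract_epi32 and the SSSE3 horizontal-add instructions are available.)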
Diffstat (limited to 'llvm/lib')
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp  |  74
1 file changed, 55 insertions(+), 19 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index b2cd55c61af..e2e63eb4c18 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1013,6 +1013,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
+
+ // These might be better off as horizontal vector ops.
+ setOperationAction(ISD::ADD, MVT::i16, Custom);
+ setOperationAction(ISD::ADD, MVT::i32, Custom);
+ setOperationAction(ISD::SUB, MVT::i16, Custom);
+ setOperationAction(ISD::SUB, MVT::i32, Custom);
}
if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
@@ -18487,21 +18493,28 @@ static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
/// Depending on uarch and/or optimizing for size, we might prefer to use a
/// vector operation in place of the typical scalar operation.
-static SDValue lowerFaddFsub(SDValue Op, SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
- MVT VT = Op.getSimpleValueType();
- assert((VT == MVT::f32 || VT == MVT::f64) && "Only expecting float/double");
-
+static SDValue lowerAddSubToHorizontalOp(SDValue Op, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
// If both operands have other uses, this is probably not profitable.
- // Horizontal FP add/sub were added with SSE3.
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
- if ((!LHS.hasOneUse() && !RHS.hasOneUse()) || !Subtarget.hasSSE3())
+ if (!LHS.hasOneUse() && !RHS.hasOneUse())
return Op;
+ // FP horizontal add/sub were added with SSE3. Integer with SSSE3.
+ bool IsFP = Op.getSimpleValueType().isFloatingPoint();
+ if (IsFP && !Subtarget.hasSSE3())
+ return Op;
+ if (!IsFP && !Subtarget.hasSSSE3())
+ return Op;
+
+ // Defer forming the minimal horizontal op if the vector source has more than
+ // the 2 extract element uses that we're matching here. In that case, we might
+ // form a horizontal op that includes more than 1 add/sub op.
if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
- LHS.getOperand(0) != RHS.getOperand(0))
+ LHS.getOperand(0) != RHS.getOperand(0) ||
+ !LHS.getOperand(0)->hasNUsesOfValue(2, 0))
return Op;
if (!isa<ConstantSDNode>(LHS.getOperand(1)) ||
@@ -18510,13 +18523,23 @@ static SDValue lowerFaddFsub(SDValue Op, SelectionDAG &DAG,
return Op;
// Allow commuted 'hadd' ops.
- // TODO: Allow commuted fsub by negating the result of FHSUB?
- // TODO: This can be extended to handle other adjacent extract pairs.
- auto HOpcode = Op.getOpcode() == ISD::FADD ? X86ISD::FHADD : X86ISD::FHSUB;
+ // TODO: Allow commuted (f)sub by negating the result of (F)HSUB?
+ unsigned HOpcode;
+ switch (Op.getOpcode()) {
+ case ISD::ADD: HOpcode = X86ISD::HADD; break;
+ case ISD::SUB: HOpcode = X86ISD::HSUB; break;
+ case ISD::FADD: HOpcode = X86ISD::FHADD; break;
+ case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
+ default:
+ llvm_unreachable("Trying to lower unsupported opcode to horizontal op");
+ }
unsigned LExtIndex = LHS.getConstantOperandVal(1);
unsigned RExtIndex = RHS.getConstantOperandVal(1);
- if (LExtIndex == 1 && RExtIndex == 0 && HOpcode == X86ISD::FHADD)
+ if (LExtIndex == 1 && RExtIndex == 0 &&
+ (HOpcode == X86ISD::HADD || HOpcode == X86ISD::FHADD))
std::swap(LExtIndex, RExtIndex);
+
+ // TODO: This can be extended to handle other adjacent extract pairs.
if (LExtIndex != 0 || RExtIndex != 1)
return Op;
@@ -18533,15 +18556,23 @@ static SDValue lowerFaddFsub(SDValue Op, SelectionDAG &DAG,
if (BitWidth == 256 || BitWidth == 512)
X = extract128BitVector(X, 0, DAG, DL);
- // fadd (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0
- // fadd (extractelt (X, 1), extractelt (X, 0)) --> extractelt (hadd X, X), 0
- // fsub (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hsub X, X), 0
- // The extract of element 0 is free: the scalar result is element 0.
+ // add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0
+ // add (extractelt (X, 1), extractelt (X, 0)) --> extractelt (hadd X, X), 0
+ // sub (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hsub X, X), 0
SDValue HOp = DAG.getNode(HOpcode, DL, X.getValueType(), X, X);
- return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, HOp,
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getSimpleValueType(), HOp,
DAG.getIntPtrConstant(0, DL));
}
+/// Depending on uarch and/or optimizing for size, we might prefer to use a
+/// vector operation in place of the typical scalar operation.
+static SDValue lowerFaddFsub(SDValue Op, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ MVT VT = Op.getSimpleValueType();
+ assert((VT == MVT::f32 || VT == MVT::f64) && "Only expecting float/double");
+ return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
+}
+
/// The only differences between FABS and FNEG are the mask and the logic op.
/// FNEG also has a folding opportunity for FNEG(FABS(x)).
static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
@@ -23479,11 +23510,16 @@ static SDValue split512IntArith(SDValue Op, SelectionDAG &DAG) {
DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
}
-static SDValue LowerADD_SUB(SDValue Op, SelectionDAG &DAG) {
+static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
MVT VT = Op.getSimpleValueType();
+ if (VT == MVT::i16 || VT == MVT::i32)
+ return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
+
if (VT.getScalarType() == MVT::i1)
return DAG.getNode(ISD::XOR, SDLoc(Op), VT,
Op.getOperand(0), Op.getOperand(1));
+
assert(Op.getSimpleValueType().is256BitVector() &&
Op.getSimpleValueType().isInteger() &&
"Only handle AVX 256-bit vector integer operation");
@@ -26219,7 +26255,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::ADDCARRY:
case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG);
case ISD::ADD:
- case ISD::SUB: return LowerADD_SUB(Op, DAG);
+ case ISD::SUB: return lowerAddSub(Op, DAG, Subtarget);
case ISD::UADDSAT:
case ISD::SADDSAT:
case ISD::USUBSAT:
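
[Editorial sketches follow; they are not part of the commit. First, the hasNUsesOfValue(2, 0) check above: the transform is deferred whenever the source vector has extract uses beyond the matched pair, so that a larger horizontal pattern can still be formed. A hypothetical case it declines:]

// Four extract uses of v: forming a minimal phaddd for just the first pair
// could block matching the wider horizontal-add pattern covering all four
// lanes, so the new code returns Op unchanged and leaves this to the more
// general h-op lowering mentioned in the commit message.
int hsum4(__m128i v) {
  return _mm_extract_epi32(v, 0) + _mm_extract_epi32(v, 1) +
         _mm_extract_epi32(v, 2) + _mm_extract_epi32(v, 3);
}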
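[Second, the commuted-operand handling: addition is commutative, so extract(1) + extract(0) is swapped into the canonical extract(0), extract(1) order for HADD/FHADD; subtraction is not, which is why only the in-order pair maps to HSUB (the TODO notes that the commuted form would require negating an (F)HSUB result). Hypothetical examples of each, function names made up:]

// Commuted add: still forms hadd after the std::swap of extract indices.
int hadd_commuted(__m128i v) {
  return _mm_extract_epi32(v, 1) + _mm_extract_epi32(v, 0);
}

// In-order sub: maps to hsub. The commuted form (element 1 minus element 0)
// is not handled by this patch.
int hsub_lo(__m128i v) {
  return _mm_extract_epi32(v, 0) - _mm_extract_epi32(v, 1);
}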