summaryrefslogtreecommitdiffstats
path: root/llvm/lib
diff options
context:
space:
mode:
authorSimon Pilgrim <llvm-dev@redking.me.uk>2019-05-01 17:13:35 +0000
committerSimon Pilgrim <llvm-dev@redking.me.uk>2019-05-01 17:13:35 +0000
commit9f04d97cd71a4248e3d064d6d194699e688d1358 (patch)
tree496a313f5bfb2e3492817b744721fc8bca109244 /llvm/lib
parentfa78ad57edfd2fec2882c0ca1dbd7eec34051844 (diff)
downloadbcm5719-llvm-9f04d97cd71a4248e3d064d6d194699e688d1358.tar.gz
bcm5719-llvm-9f04d97cd71a4248e3d064d6d194699e688d1358.zip
[X86][SSE] Fold scalar horizontal add/sub for non-0/1 element extractions
We already perform horizontal add/sub if we extract from elements 0 and 1; this patch extends it to non-0/1 element extraction indices (as long as they are from the lowest 128-bit vector). Differential Revision: https://reviews.llvm.org/D61263 llvm-svn: 359707
Diffstat (limited to 'llvm/lib')
-rw-r--r--llvm/lib/Target/X86/X86ISelLowering.cpp17
1 file changed, 11 insertions, 6 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 7f92e5138d1..aed2c0745af 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -19020,33 +19020,38 @@ static SDValue lowerAddSubToHorizontalOp(SDValue Op, SelectionDAG &DAG,
}
unsigned LExtIndex = LHS.getConstantOperandVal(1);
unsigned RExtIndex = RHS.getConstantOperandVal(1);
- if (LExtIndex == 1 && RExtIndex == 0 &&
+ if ((LExtIndex & 1) == 1 && (RExtIndex & 1) == 0 &&
(HOpcode == X86ISD::HADD || HOpcode == X86ISD::FHADD))
std::swap(LExtIndex, RExtIndex);
- // TODO: This can be extended to handle other adjacent extract pairs.
- if (LExtIndex != 0 || RExtIndex != 1)
+ if ((LExtIndex & 1) != 0 || RExtIndex != (LExtIndex + 1))
return Op;
SDValue X = LHS.getOperand(0);
EVT VecVT = X.getValueType();
unsigned BitWidth = VecVT.getSizeInBits();
+ unsigned NumLanes = BitWidth / 128;
+ unsigned NumEltsPerLane = VecVT.getVectorNumElements() / NumLanes;
assert((BitWidth == 128 || BitWidth == 256 || BitWidth == 512) &&
"Not expecting illegal vector widths here");
// Creating a 256-bit horizontal op would be wasteful, and there is no 512-bit
- // equivalent, so extract the 256/512-bit source op to 128-bit.
- // This is free: ymm/zmm -> xmm.
+ // equivalent, so extract the 256/512-bit source op to 128-bit if we can.
+ // This is free if we're extracting from the bottom lane: ymm/zmm -> xmm.
+ if (NumEltsPerLane <= LExtIndex)
+ return Op;
+
SDLoc DL(Op);
if (BitWidth == 256 || BitWidth == 512)
X = extract128BitVector(X, 0, DAG, DL);
// add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0
// add (extractelt (X, 1), extractelt (X, 0)) --> extractelt (hadd X, X), 0
+ // add (extractelt (X, 2), extractelt (X, 3)) --> extractelt (hadd X, X), 1
// sub (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hsub X, X), 0
SDValue HOp = DAG.getNode(HOpcode, DL, X.getValueType(), X, X);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getSimpleValueType(), HOp,
- DAG.getIntPtrConstant(0, DL));
+ DAG.getIntPtrConstant(LExtIndex / 2, DL));
}
/// Depending on uarch and/or optimizing for size, we might prefer to use a
OpenPOWER on IntegriCloud