summaryrefslogtreecommitdiffstats
path: root/llvm/lib
diff options
context:
space:
mode:
authorSimon Pilgrim <llvm-dev@redking.me.uk>2017-04-03 21:06:51 +0000
committerSimon Pilgrim <llvm-dev@redking.me.uk>2017-04-03 21:06:51 +0000
commitaf33757b5dec5f99bc78f724a2eb2cd822c14b73 (patch)
treeb79baa363e9ca2afbdb8470ff7b522eba00312f8 /llvm/lib
parent3b392bb8d85d3cd6cf265e940884394d5f25d641 (diff)
downloadbcm5719-llvm-af33757b5dec5f99bc78f724a2eb2cd822c14b73.tar.gz
bcm5719-llvm-af33757b5dec5f99bc78f724a2eb2cd822c14b73.zip
[X86][SSE] Lower BUILD_VECTOR with repeated elts as BUILD_VECTOR + VECTOR_SHUFFLE
It can be costly to transfer from the gprs to the xmm registers and can prevent loads merging. This patch splits vXi16/vXi32/vXi64 BUILD_VECTORS that use the same operand in multiple elements into a BUILD_VECTOR with only a single insertion of each of those elements and then performs a unary shuffle to duplicate the values. There are a couple of minor regressions this patch unearths due to some missing MOVDDUP/BROADCAST folds that I will address in a future patch. Note: Now that vector shuffle lowering and combining is pretty good we should be reusing that instead of duplicating so much in LowerBUILD_VECTOR - this is the first of several patches to address this. Differential Revision: https://reviews.llvm.org/D31373 llvm-svn: 299387
Diffstat (limited to 'llvm/lib')
-rw-r--r--llvm/lib/Target/X86/X86ISelLowering.cpp56
1 files changed, 55 insertions, 1 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index fc39d9bff71..50cd8ab1dbc 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -6120,6 +6120,54 @@ static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
return SDValue();
}
+// Attempt to lower a build vector of repeated elts as a build vector of unique
+// ops followed by a shuffle.
+// Returns SDValue() (no change) when every scalar size is < 16 bits or when
+// no element is actually repeated; otherwise returns a BUILD_VECTOR holding
+// only the first occurrence of each value, followed by a unary
+// VECTOR_SHUFFLE that duplicates it into the remaining lanes.
+// NOTE(review): Subtarget is currently unused in this body.
+static SDValue
+lowerBuildVectorWithRepeatedEltsUsingShuffle(SDValue V, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ MVT VT = V.getSimpleValueType();
+ unsigned NumElts = VT.getVectorNumElements();
+
+ // TODO - vXi8 insertions+shuffles often cause PSHUFBs which can lead to
+ // excessive/bulky shuffle mask creation.
+ if (VT.getScalarSizeInBits() < 16)
+ return SDValue();
+
+ // Create list of unique operands to be passed to a build vector and a shuffle
+ // mask describing the repetitions.
+ // TODO - we currently insert the first occurrences in place - sometimes it
+ // might be better to insert them in other locations for shuffle efficiency.
+ bool HasRepeatedElts = false;
+ SmallVector<int, 16> Mask(NumElts, SM_SentinelUndef);
+ SmallVector<SDValue, 16> Uniques(V->op_begin(), V->op_end());
+ for (unsigned i = 0; i != NumElts; ++i) {
+ SDValue Op = Uniques[i];
+ // Undef lanes stay SM_SentinelUndef in the mask (repeats of an earlier
+ // value were turned into undef by the inner loop below, so they are
+ // skipped here too).
+ if (Op.isUndef())
+ continue;
+ Mask[i] = i;
+
+ // Zeros can be efficiently repeated, so don't shuffle these.
+ if (X86::isZeroNode(Op))
+ continue;
+
+ // If any repeated operands are found then mark the build vector entry as
+ // undef and set up a copy in the shuffle mask.
+ for (unsigned j = i + 1; j != NumElts; ++j)
+ if (Op == Uniques[j]) {
+ HasRepeatedElts = true;
+ Mask[j] = i;
+ Uniques[j] = DAG.getUNDEF(VT.getScalarType());
+ }
+ }
+
+ // Nothing repeated - let the normal BUILD_VECTOR lowering handle it.
+ if (!HasRepeatedElts)
+ return SDValue();
+
+ // Build the de-duplicated vector, then broadcast the repeats via a unary
+ // shuffle (second shuffle input is undef).
+ SDLoc DL(V);
+ return DAG.getVectorShuffle(VT, DL, DAG.getBuildVector(VT, DL, Uniques),
+ DAG.getUNDEF(VT), Mask);
+}
+
/// Custom lower build_vector of v16i8.
static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
unsigned NumNonZero, unsigned NumZero,
@@ -7752,11 +7800,17 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
if (IsAllConstants)
return SDValue();
- // See if we can use a vector load to get all of the elements.
if (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) {
+ // See if we can use a vector load to get all of the elements.
SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
if (SDValue LD = EltsFromConsecutiveLoads(VT, Ops, dl, DAG, false))
return LD;
+
+ // Attempt to lower a build vector of repeated elts as single insertions
+ // followed by a shuffle.
+ if (SDValue V =
+ lowerBuildVectorWithRepeatedEltsUsingShuffle(Op, DAG, Subtarget))
+ return V;
}
// For AVX-length vectors, build the individual 128-bit pieces and use
OpenPOWER on IntegriCloud