summaryrefslogtreecommitdiffstats
path: root/llvm/lib
diff options
context:
space:
mode:
authorSimon Pilgrim <llvm-dev@redking.me.uk>2017-04-03 21:06:51 +0000
committerSimon Pilgrim <llvm-dev@redking.me.uk>2017-04-03 21:06:51 +0000
commitaf33757b5dec5f99bc78f724a2eb2cd822c14b73 (patch)
treeb79baa363e9ca2afbdb8470ff7b522eba00312f8 /llvm/lib
parent3b392bb8d85d3cd6cf265e940884394d5f25d641 (diff)
downloadbcm5719-llvm-af33757b5dec5f99bc78f724a2eb2cd822c14b73.tar.gz
bcm5719-llvm-af33757b5dec5f99bc78f724a2eb2cd822c14b73.zip
[X86][SSE] Lower BUILD_VECTOR with repeated elts as BUILD_VECTOR + VECTOR_SHUFFLE
It can be costly to transfer from the gprs to the xmm registers and can prevent loads merging. This patch splits vXi16/vXi32/vXi64 BUILD_VECTORS that use the same operand in multiple elements into a BUILD_VECTOR with only a single insertion of each of those elements and then performs a unary shuffle to duplicate the values. There are a couple of minor regressions this patch unearths due to some missing MOVDDUP/BROADCAST folds that I will address in a future patch. Note: Now that vector shuffle lowering and combining is pretty good we should be reusing that instead of duplicating so much in LowerBUILD_VECTOR - this is the first of several patches to address this. Differential Revision: https://reviews.llvm.org/D31373 llvm-svn: 299387
Diffstat (limited to 'llvm/lib')
-rw-r--r--llvm/lib/Target/X86/X86ISelLowering.cpp56
1 files changed, 55 insertions, 1 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index fc39d9bff71..50cd8ab1dbc 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -6120,6 +6120,54 @@ static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
return SDValue();
}
+// Attempt to lower a build vector of repeated elts as a build vector of unique
+// ops followed by a shuffle.
+// Returns SDValue() (no change) when every scalar size is < 16 bits or when
+// no element is actually repeated; otherwise returns a BUILD_VECTOR holding
+// only the first occurrence of each value, followed by a unary
+// VECTOR_SHUFFLE that duplicates it into the remaining lanes.
+// NOTE(review): Subtarget is currently unused in this body.
+static SDValue
+lowerBuildVectorWithRepeatedEltsUsingShuffle(SDValue V, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ MVT VT = V.getSimpleValueType();
+ unsigned NumElts = VT.getVectorNumElements();
+
+ // TODO - vXi8 insertions+shuffles often cause PSHUFBs which can lead to
+ // excessive/bulky shuffle mask creation.
+ if (VT.getScalarSizeInBits() < 16)
+ return SDValue();
+
+ // Create list of unique operands to be passed to a build vector and a shuffle
+ // mask describing the repetitions.
+ // TODO - we currently insert the first occurrences in place - sometimes it
+ // might be better to insert them in other locations for shuffle efficiency.
+ bool HasRepeatedElts = false;
+ SmallVector<int, 16> Mask(NumElts, SM_SentinelUndef);
+ SmallVector<SDValue, 16> Uniques(V->op_begin(), V->op_end());
+ for (unsigned i = 0; i != NumElts; ++i) {
+ SDValue Op = Uniques[i];
+ // Undef lanes stay SM_SentinelUndef in the mask (repeats of an earlier
+ // value were turned into undef by the inner loop below, so they are
+ // skipped here too).
+ if (Op.isUndef())
+ continue;
+ Mask[i] = i;
+
+ // Zeros can be efficiently repeated, so don't shuffle these.
+ if (X86::isZeroNode(Op))
+ continue;
+
+ // If any repeated operands are found then mark the build vector entry as
+ // undef and set up a copy in the shuffle mask.
+ for (unsigned j = i + 1; j != NumElts; ++j)
+ if (Op == Uniques[j]) {
+ HasRepeatedElts = true;
+ Mask[j] = i;
+ Uniques[j] = DAG.getUNDEF(VT.getScalarType());
+ }
+ }
+
+ // Nothing repeated - let the normal BUILD_VECTOR lowering handle it.
+ if (!HasRepeatedElts)
+ return SDValue();
+
+ // Build the de-duplicated vector, then broadcast the repeats via a unary
+ // shuffle (second shuffle input is undef).
+ SDLoc DL(V);
+ return DAG.getVectorShuffle(VT, DL, DAG.getBuildVector(VT, DL, Uniques),
+ DAG.getUNDEF(VT), Mask);
+}
+
/// Custom lower build_vector of v16i8.
static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
unsigned NumNonZero, unsigned NumZero,
@@ -7752,11 +7800,17 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
if (IsAllConstants)
return SDValue();
- // See if we can use a vector load to get all of the elements.
if (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) {
+ // See if we can use a vector load to get all of the elements.
SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
if (SDValue LD = EltsFromConsecutiveLoads(VT, Ops, dl, DAG, false))
return LD;
+
+ // Attempt to lower a build vector of repeated elts as single insertions
+ // followed by a shuffle.
+ if (SDValue V =
+ lowerBuildVectorWithRepeatedEltsUsingShuffle(Op, DAG, Subtarget))
+ return V;
}
// For AVX-length vectors, build the individual 128-bit pieces and use
OpenPOWER on IntegriCloud