summaryrefslogtreecommitdiffstats
path: root/llvm/lib
diff options
context:
space:
mode:
authorChandler Carruth <chandlerc@gmail.com>2014-09-27 04:42:44 +0000
committerChandler Carruth <chandlerc@gmail.com>2014-09-27 04:42:44 +0000
commit4d03be17179c8a1a83716514f32b70b79a10e054 (patch)
treef75b68778957e4a1ceaf22cb4f102d64c7c245d9 /llvm/lib
parent81e6b29f03af3b7b33a6fb01500488c3a8ced811 (diff)
downloadbcm5719-llvm-4d03be17179c8a1a83716514f32b70b79a10e054.tar.gz
bcm5719-llvm-4d03be17179c8a1a83716514f32b70b79a10e054.zip
[x86] Fix terrible bugs everywhere in the new vector shuffle lowering
and in the target shuffle combining when trying to widen vector elements. Previously only one of these was correct, and we didn't correctly propagate zeroing target shuffle masks (which have a different sentinel value from undef in non- target shuffle masks now). This isn't just a missed optimization, this caused us to drop zeroing shuffles on the floor and miscompile code. The added test case is one example of that. There are other fixes to the test suite as a consequence of this as well as restoring the undef elements in some of the masks that were lost when I brought sanity to the actual *value* of the undef and zero sentinels. I've also just cleaned up some of the PSHUFD and PSHUFLW and PSHUFHW combining code, but that code really needs to go. It was a nice initial attempt, but it isn't very principled and the recursive shuffle combiner is much more powerful. llvm-svn: 218562
Diffstat (limited to 'llvm/lib')
-rw-r--r--llvm/lib/Target/X86/X86ISelLowering.cpp77
1 files changed, 54 insertions, 23 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 4d090529784..17f925bd3e8 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -9863,14 +9863,48 @@ static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
}
}
-/// \brief Tiny helper function to test whether a shuffle mask could be
+/// \brief Helper function to test whether a shuffle mask could be
/// simplified by widening the elements being shuffled.
-static bool canWidenShuffleElements(ArrayRef<int> Mask) {
- for (int i = 0, Size = Mask.size(); i < Size; i += 2)
- if ((Mask[i] != -1 && Mask[i] % 2 != 0) ||
- (Mask[i + 1] != -1 && (Mask[i + 1] % 2 != 1 ||
- (Mask[i] != -1 && Mask[i] + 1 != Mask[i + 1]))))
- return false;
+///
+/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
+/// leaves it in an unspecified state.
+///
+/// NOTE: This must handle normal vector shuffle masks and *target* vector
+/// shuffle masks. The latter have the special property of a '-2' representing
+/// a zero-ed lane of a vector.
+static bool canWidenShuffleElements(ArrayRef<int> Mask,
+ SmallVectorImpl<int> &WidenedMask) {
+ for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
+ // Check for any of the sentinel values (negative) and if they are the same,
+ // we can widen to that.
+ if (Mask[i] < 0 && Mask[i] == Mask[i + 1]) {
+ WidenedMask.push_back(Mask[i]);
+ continue;
+ }
+
+ // Check for an undef mask and a mask value properly aligned to fit with
+ // a pair of values. If we find such a case, use the non-undef mask's value.
+ if (Mask[i] == -1 && Mask[i + 1] % 2 == 1) {
+ WidenedMask.push_back(Mask[i + 1] / 2);
+ continue;
+ }
+ if (Mask[i + 1] == -1 && Mask[i] % 2 == 0) {
+ WidenedMask.push_back(Mask[i] / 2);
+ continue;
+ }
+
+ // Finally check if the two mask values are adjacent and aligned with
+ // a pair.
+ if (Mask[i] != -1 && Mask[i] % 2 == 0 && Mask[i] + 1 == Mask[i + 1]) {
+ WidenedMask.push_back(Mask[i] / 2);
+ continue;
+ }
+
+ // Otherwise we can't safely widen the elements used in this shuffle.
+ return false;
+ }
+ assert(WidenedMask.size() == Mask.size() / 2 &&
+ "Incorrect size of mask after widening the elements!");
return true;
}
@@ -9922,20 +9956,16 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
// lanes but wider integers. We cap this to not form integers larger than i64
// but it might be interesting to form i128 integers to handle flipping the
// low and high halves of AVX 256-bit vectors.
+ SmallVector<int, 16> WidenedMask;
if (VT.isInteger() && VT.getScalarSizeInBits() < 64 &&
- canWidenShuffleElements(Mask)) {
- SmallVector<int, 8> NewMask;
- for (int i = 0, Size = Mask.size(); i < Size; i += 2)
- NewMask.push_back(Mask[i] != -1
- ? Mask[i] / 2
- : (Mask[i + 1] != -1 ? Mask[i + 1] / 2 : -1));
+ canWidenShuffleElements(Mask, WidenedMask)) {
MVT NewVT =
MVT::getVectorVT(MVT::getIntegerVT(VT.getScalarSizeInBits() * 2),
VT.getVectorNumElements() / 2);
V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, V1);
V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, V2);
return DAG.getNode(ISD::BITCAST, dl, VT,
- DAG.getVectorShuffle(NewVT, dl, V1, V2, NewMask));
+ DAG.getVectorShuffle(NewVT, dl, V1, V2, WidenedMask));
}
int NumV1Elements = 0, NumUndefElements = 0, NumV2Elements = 0;
@@ -20697,10 +20727,10 @@ static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,
// elements, and shrink them to the half-width mask. It does this in a loop
// so it will reduce the size of the mask to the minimal width mask which
// performs an equivalent shuffle.
- while (Mask.size() > 1 && canWidenShuffleElements(Mask)) {
- for (int i = 0, e = Mask.size() / 2; i < e; ++i)
- Mask[i] = Mask[2 * i] / 2;
- Mask.resize(Mask.size() / 2);
+ SmallVector<int, 16> WidenedMask;
+ while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
+ Mask = std::move(WidenedMask);
+ WidenedMask.clear();
}
return combineX86ShuffleChain(Op, Root, Mask, Depth, HasPSHUFB, DAG, DCI,
@@ -20971,12 +21001,13 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
return SDValue(); // We combined away this shuffle, so we're done.
// See if this reduces to a PSHUFD which is no more expensive and can
- // combine with more operations.
- if (canWidenShuffleElements(Mask)) {
- int DMask[] = {-1, -1, -1, -1};
+ // combine with more operations. Note that it has to at least flip the
+ // dwords as otherwise it would have been removed as a no-op.
+ if (Mask[0] == 2 && Mask[1] == 3 && Mask[2] == 0 && Mask[3]) {
+ int DMask[] = {0, 1, 2, 3};
int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
- DMask[DOffset + 0] = DOffset + Mask[0] / 2;
- DMask[DOffset + 1] = DOffset + Mask[2] / 2;
+ DMask[DOffset + 0] = DOffset + 1;
+ DMask[DOffset + 1] = DOffset + 0;
V = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V);
DCI.AddToWorklist(V.getNode());
V = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V,
OpenPOWER on IntegriCloud