| author | Chandler Carruth <chandlerc@gmail.com> | 2015-02-19 12:10:37 +0000 |
|---|---|---|
| committer | Chandler Carruth <chandlerc@gmail.com> | 2015-02-19 12:10:37 +0000 |
| commit | 8817e5e01be2b15a0b2e4a101491a0fd5deb814b (patch) | |
| tree | 51a64fdf199ffad021fbc1fa6feeb8b2572bd5b6 | /llvm/lib |
| parent | 5d171fc291785f88f2da929e277c8edfc65cae75 (diff) | |
| download | bcm5719-llvm-8817e5e01be2b15a0b2e4a101491a0fd5deb814b.tar.gz | bcm5719-llvm-8817e5e01be2b15a0b2e4a101491a0fd5deb814b.zip |
[x86] Remove the insanely over-aggressive unpack lowering strategy for
v16i8 shuffles, and replace it with new facilities.
This uses precise patterns to match exact unpacks, and uses the new
generalized unpack lowering only when we detect a case where we will
have to shuffle both inputs anyway and they terminate in exactly
a blend.
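For reference, the "precise patterns" here are shuffle masks that are
literally unpcklbw/unpckhbw. Below is a minimal standalone model of the
check, assuming only the mask convention visible in the diff (V2's bytes are
numbered 16-31); `matchesUnpackLo` is a hypothetical stand-in, not LLVM's
`isShuffleEquivalent`, though it mirrors that helper's treatment of -1
entries as wildcards:

```cpp
#include <array>
#include <cstdio>

// Sketch only: a v16i8 mask is an exact unpcklbw when result byte i takes
// byte i/2 of V1 for even i and byte i/2 of V2 (indices 16-31) for odd i;
// -1 ("undef") entries match anything.
static bool matchesUnpackLo(const std::array<int, 16> &Mask) {
  for (int i = 0; i < 16; ++i) {
    int Expected = (i % 2 == 0) ? i / 2 : 16 + i / 2;
    if (Mask[i] != -1 && Mask[i] != Expected)
      return false;
  }
  return true;
}

int main() {
  // The exact UNPCKL pattern from the patch: 0,16,1,17,...,7,23.
  std::array<int, 16> Lo = {0, 16, 1, 17, 2, 18, 3, 19,
                            4, 20, 5, 21, 6, 22, 7, 23};
  std::printf("unpcklbw: %s\n", matchesUnpackLo(Lo) ? "match" : "no match");
}
```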
This fixes all of the blend horrors that I uncovered by always lowering
blends through the vector shuffle lowering. It also removes *sooooo*
much of the crazy instruction sequences previously required for v16i8
lowering. Much cleaner now.
The only "meh" aspect is that we sometimes use pshufb+pshufb+unpck when
it would be marginally nicer to use pshufb+pshufb+por. However, the
difference there is *tiny*. In many cases it's a win because we re-use
the pshufb mask. In others, we get to avoid the pshufb entirely. I've
left a FIXME, but I'm dubious we can really do better than this. I'm
actually pretty happy with this lowering now.
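To make the pshufb+pshufb+unpck versus pshufb+pshufb+por tradeoff concrete,
here is a sketch in SSE intrinsics, assuming SSSE3 (compile with -mssse3);
the example and helper names are mine, not code from the patch. Both
sequences build r = a7,b7,a6,b6,...,a0,b0 in three instructions, but the
unpack version re-uses a single pshufb mask for both inputs, while the por
version needs two masks with explicit zeroing lanes:

```cpp
#include <immintrin.h>
#include <cassert>
#include <cstring>

// pshufb + pshufb + punpcklbw: one shared mask; the unpack does the blend.
static __m128i viaUnpack(__m128i a, __m128i b) {
  const __m128i rev = _mm_setr_epi8(7, 6, 5, 4, 3, 2, 1, 0,
                                    -1, -1, -1, -1, -1, -1, -1, -1);
  return _mm_unpacklo_epi8(_mm_shuffle_epi8(a, rev),
                           _mm_shuffle_epi8(b, rev));
}

// pshufb + pshufb + por: each input gets its own mask, with -1 (high bit
// set) zeroing the lanes the other input will fill, then OR merges them.
static __m128i viaPor(__m128i a, __m128i b) {
  const __m128i ma = _mm_setr_epi8(7, -1, 6, -1, 5, -1, 4, -1,
                                   3, -1, 2, -1, 1, -1, 0, -1);
  const __m128i mb = _mm_setr_epi8(-1, 7, -1, 6, -1, 5, -1, 4,
                                   -1, 3, -1, 2, -1, 1, -1, 0);
  return _mm_or_si128(_mm_shuffle_epi8(a, ma), _mm_shuffle_epi8(b, mb));
}

int main() {
  unsigned char x[16], y[16];
  for (int i = 0; i < 16; ++i) { x[i] = (unsigned char)i; y[i] = (unsigned char)(100 + i); }
  __m128i a = _mm_loadu_si128((const __m128i *)x);
  __m128i b = _mm_loadu_si128((const __m128i *)y);
  __m128i u = viaUnpack(a, b), o = viaPor(a, b);
  assert(std::memcmp(&u, &o, 16) == 0); // both lowerings agree
}
```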
For SSE2 this exposes some horrors that were really already there. Those
will have to be fixed by changing a different path through the v16i8
lowering.
llvm-svn: 229846
Diffstat (limited to 'llvm/lib')
| -rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.cpp | 68 |
1 file changed, 30 insertions, 38 deletions
```diff
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 7e5fa84f75c..d1d0a434980 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -9968,36 +9968,15 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
       return V;
   }
 
-  // Check whether an interleaving lowering is likely to be more efficient.
-  // This isn't perfect but it is a strong heuristic that tends to work well on
-  // the kinds of shuffles that show up in practice.
-  //
-  // FIXME: We need to handle other interleaving widths (i16, i32, ...).
-  if (shouldLowerAsInterleaving(Mask)) {
-    int NumLoHalf = std::count_if(Mask.begin(), Mask.end(), [](int M) {
-      return (M >= 0 && M < 8) || (M >= 16 && M < 24);
-    });
-    int NumHiHalf = std::count_if(Mask.begin(), Mask.end(), [](int M) {
-      return (M >= 8 && M < 16) || M >= 24;
-    });
-    int EMask[16] = {-1, -1, -1, -1, -1, -1, -1, -1,
-                     -1, -1, -1, -1, -1, -1, -1, -1};
-    int OMask[16] = {-1, -1, -1, -1, -1, -1, -1, -1,
-                     -1, -1, -1, -1, -1, -1, -1, -1};
-    bool UnpackLo = NumLoHalf >= NumHiHalf;
-    MutableArrayRef<int> TargetEMask(UnpackLo ? EMask : EMask + 8, 8);
-    MutableArrayRef<int> TargetOMask(UnpackLo ? OMask : OMask + 8, 8);
-    for (int i = 0; i < 8; ++i) {
-      TargetEMask[i] = Mask[2 * i];
-      TargetOMask[i] = Mask[2 * i + 1];
-    }
-
-    SDValue Evens = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, EMask);
-    SDValue Odds = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, OMask);
-
-    return DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
-                       MVT::v16i8, Evens, Odds);
-  }
+  // Use dedicated unpack instructions for masks that match their pattern.
+  if (isShuffleEquivalent(V1, V2, Mask,
+                          0, 16, 1, 17, 2, 18, 3, 19,
+                          4, 20, 5, 21, 6, 22, 7, 23))
+    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V1, V2);
+  if (isShuffleEquivalent(V1, V2, Mask,
+                          8, 24, 9, 25, 10, 26, 11, 27,
+                          12, 28, 13, 29, 14, 30, 15, 31))
+    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V1, V2);
 
   // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
   // with PSHUFB. It is important to do this before we attempt to generate any
@@ -10035,14 +10014,27 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
     }
   }
 
-  // If both V1 and V2 are in use and we can use a direct blend, do so. This
-  // avoids using blends to handle blends-with-zero which is important as
-  // a single pshufb is significantly faster for that.
-  if (V1InUse && V2InUse && Subtarget->hasSSE41())
-    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask,
-                                                  Subtarget, DAG))
-      return Blend;
-
+  // If both V1 and V2 are in use and we can use a direct blend or an unpack,
+  // do so. This avoids using them to handle blends-with-zero which is
+  // important as a single pshufb is significantly faster for that.
+  if (V1InUse && V2InUse) {
+    if (Subtarget->hasSSE41())
+      if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i8, V1, V2,
+                                                    Mask, Subtarget, DAG))
+        return Blend;
+
+    // We can use an unpack to do the blending rather than an or in some
+    // cases. Even though the or may be (very minorly) more efficient, we
+    // preference this lowering because there are common cases where part of
+    // the complexity of the shuffles goes away when we do the final blend as
+    // an unpack.
+    // FIXME: It might be worth trying to detect if the unpack-feeding
+    // shuffles will both be pshufb, in which case we shouldn't bother with
+    // this.
+    if (SDValue Unpack =
+            lowerVectorShuffleAsUnpack(MVT::v16i8, DL, V1, V2, Mask, DAG))
+      return Unpack;
+  }
 
   if (V1InUse)
     V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, V1,
```
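The body of lowerVectorShuffleAsUnpack is not part of this diff, but the
removed code above shows the core decomposition it generalizes: split the
mask into the even and odd sub-shuffles that an unpack would interleave.
Below is a minimal model of that split; the function name is mine, and it
handles only the unpack-lo case, mirroring the EMask/OMask construction in
the removed code:

```cpp
#include <array>
#include <cstdio>

// Sketch of the decomposition: punpcklbw takes byte i of its first operand
// to result byte 2i and byte i of its second operand to result byte 2i+1,
// so punpcklbw(shuffle(V1,V2,EMask), shuffle(V1,V2,OMask)) reproduces Mask
// whenever EMask[i] = Mask[2i] and OMask[i] = Mask[2i+1].
static void splitForUnpackLo(const std::array<int, 16> &Mask,
                             std::array<int, 16> &EMask,
                             std::array<int, 16> &OMask) {
  EMask.fill(-1); // -1 is "undef"; only the low 8 lanes feed punpcklbw
  OMask.fill(-1);
  for (int i = 0; i < 8; ++i) {
    EMask[i] = Mask[2 * i];     // feeds result byte 2i via operand 0
    OMask[i] = Mask[2 * i + 1]; // feeds result byte 2i+1 via operand 1
  }
}

int main() {
  // A mask that reverses each input's low half and interleaves them.
  std::array<int, 16> Mask = {7, 23, 6, 22, 5, 21, 4, 20,
                              3, 19, 2, 18, 1, 17, 0, 16};
  std::array<int, 16> EMask, OMask;
  splitForUnpackLo(Mask, EMask, OMask);
  for (int i = 0; i < 8; ++i)
    std::printf("E[%d]=%d O[%d]=%d\n", i, EMask[i], i, OMask[i]);
}
```

The old code applied this split whenever its heuristic fired; the new code
reaches it only when both inputs already need a shuffle and the shuffles
terminate in exactly a blend, so the two feeding shuffles cost nothing extra
and the unpack replaces the final blend.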

