2 files changed, 135 insertions, 14 deletions
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index eab96a4f47e..0f60bf8a7b3 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -13,6 +13,7 @@
 
 #include "PPCISelLowering.h"
 #include "PPCTargetMachine.h"
+#include "PPCPerfectShuffle.h"
 #include "llvm/ADT/VectorExtras.h"
 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
@@ -1123,6 +1124,88 @@ static SDOperand LowerBUILD_VECTOR(SDOperand Op, SelectionDAG &DAG) {
   return SDOperand();
 }
 
+/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
+/// the specified operations to build the shuffle.
+static SDOperand GeneratePerfectShuffle(unsigned PFEntry, SDOperand LHS,
+                                        SDOperand RHS, SelectionDAG &DAG) {
+  unsigned OpNum = (PFEntry >> 26) & 0x0F;
+  unsigned LHSID  = (PFEntry >> 13) & ((1 << 13)-1);
+  unsigned RHSID = (PFEntry >>  0) & ((1 << 13)-1);
+  
+  enum {
+    OP_COPY = 0,   // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
+    OP_VMRGHW,
+    OP_VMRGLW,
+    OP_VSPLTISW0,
+    OP_VSPLTISW1,
+    OP_VSPLTISW2,
+    OP_VSPLTISW3,
+    OP_VSLDOI4,
+    OP_VSLDOI8,
+    OP_VSLDOI12,
+  };
+  
+  if (OpNum == OP_COPY) {
+    if (LHSID == (1*9+2)*9+3) return LHS;
+    assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!");
+    return RHS;
+  }
+  
+  unsigned ShufIdxs[16];
+  switch (OpNum) {
+  default: assert(0 && "Unknown i32 permute!");
+  case OP_VMRGHW:
+    ShufIdxs[ 0] =  0; ShufIdxs[ 1] =  1; ShufIdxs[ 2] =  2; ShufIdxs[ 3] =  3;
+    ShufIdxs[ 4] = 16; ShufIdxs[ 5] = 17; ShufIdxs[ 6] = 18; ShufIdxs[ 7] = 19;
+    ShufIdxs[ 8] =  4; ShufIdxs[ 9] =  5; ShufIdxs[10] =  6; ShufIdxs[11] =  7;
+    ShufIdxs[12] = 20; ShufIdxs[13] = 21; ShufIdxs[14] = 22; ShufIdxs[15] = 23;
+    break;
+  case OP_VMRGLW:
+    ShufIdxs[ 0] =  8; ShufIdxs[ 1] =  9; ShufIdxs[ 2] = 10; ShufIdxs[ 3] = 11;
+    ShufIdxs[ 4] = 24; ShufIdxs[ 5] = 25; ShufIdxs[ 6] = 26; ShufIdxs[ 7] = 27;
+    ShufIdxs[ 8] = 12; ShufIdxs[ 9] = 13; ShufIdxs[10] = 14; ShufIdxs[11] = 15;
+    ShufIdxs[12] = 28; ShufIdxs[13] = 29; ShufIdxs[14] = 30; ShufIdxs[15] = 31;
+    break;
+  case OP_VSPLTISW0:
+    for (unsigned i = 0; i != 16; ++i)
+      ShufIdxs[i] = (i&3)+0;
+    break;
+  case OP_VSPLTISW1:
+    for (unsigned i = 0; i != 16; ++i)
+      ShufIdxs[i] = (i&3)+4;
+    break;
+  case OP_VSPLTISW2:
+    for (unsigned i = 0; i != 16; ++i)
+      ShufIdxs[i] = (i&3)+8;
+    break;
+  case OP_VSPLTISW3:
+    for (unsigned i = 0; i != 16; ++i)
+      ShufIdxs[i] = (i&3)+12;
+    break;
+  case OP_VSLDOI4:
+    for (unsigned i = 0; i != 16; ++i)
+      ShufIdxs[i] = i+4;
+    break;
+  case OP_VSLDOI8:
+    for (unsigned i = 0; i != 16; ++i)
+      ShufIdxs[i] = i+8;
+    break;
+  case OP_VSLDOI12:
+    for (unsigned i = 0; i != 16; ++i)
+      ShufIdxs[i] = i+12;
+    break;
+  }
+  std::vector<SDOperand> Ops;
+  for (unsigned i = 0; i != 16; ++i)
+    Ops.push_back(DAG.getConstant(ShufIdxs[i], MVT::i32));
+  SDOperand OpLHS, OpRHS;
+  OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG);
+  OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG);
+  
+  return DAG.getNode(ISD::VECTOR_SHUFFLE, OpLHS.getValueType(), OpLHS, OpRHS,
+                     DAG.getNode(ISD::BUILD_VECTOR, MVT::v16i8, Ops));
+}
+
 /// LowerVECTOR_SHUFFLE - Return the code we lower for VECTOR_SHUFFLE.  If this
 /// is a shuffle we can handle in a single instruction, return it.  Otherwise,
 /// return the code it can be lowered into.  Worst case, it can always be
@@ -1166,8 +1249,58 @@ static SDOperand LowerVECTOR_SHUFFLE(SDOperand Op, SelectionDAG &DAG) {
       PPC::isVMRGHShuffleMask(PermMask.Val, 4, false))
     return Op;
   
-  // TODO: Handle more cases, and also handle cases that are cheaper to do as
-  // multiple such instructions than as a constant pool load/vperm pair.
+  // Check to see if this is a shuffle of 4-byte values.  If so, we can use our
+  // perfect shuffle table to emit an optimal matching sequence.
+  unsigned PFIndexes[4];
+  bool isFourElementShuffle = true;
+  for (unsigned i = 0; i != 4 && isFourElementShuffle; ++i) { // Element number
+    unsigned EltNo = 8;   // Start out undef.
+    for (unsigned j = 0; j != 4; ++j) {  // Intra-element byte.
+      if (PermMask.getOperand(i*4+j).getOpcode() == ISD::UNDEF)
+        continue;   // Undef, ignore it.
+      
+      unsigned ByteSource = 
+        cast<ConstantSDNode>(PermMask.getOperand(i*4+j))->getValue();
+      if ((ByteSource & 3) != j) {
+        isFourElementShuffle = false;
+        break;
+      }
+      
+      if (EltNo == 8) {
+        EltNo = ByteSource/4;
+      } else if (EltNo != ByteSource/4) {
+        isFourElementShuffle = false;
+        break;
+      }
+    }
+    PFIndexes[i] = EltNo;
+  }
+    
+  // If this shuffle can be expressed as a shuffle of 4-byte elements, use the 
+  // perfect shuffle vector to determine if it is cost effective to do this as
+  // discrete instructions, or whether we should use a vperm.
+  if (isFourElementShuffle) {
+    // Compute the index in the perfect shuffle table.
+    unsigned PFTableIndex = 
+      PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
+    
+    unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
+    unsigned Cost  = (PFEntry >> 30);
+    
+    // Determining when to avoid vperm is tricky.  Many things affect the cost
+    // of vperm, particularly how many times the perm mask needs to be computed.
+    // For example, if the perm mask can be hoisted out of a loop or is already
+    // used (perhaps because there are multiple permutes with the same shuffle
+    // mask?) the vperm has a cost of 1.  OTOH, hoisting the permute mask out of
+    // the loop requires an extra register.
+    //
+    // As a compromise, we only emit discrete instructions if the shuffle can be
+    // generated in 3 or fewer operations.  When we have loop information 
+    // available, if this block is within a loop, we should avoid using vperm
+    // for 3-operation perms and use a constant pool load instead.
+    if (Cost < 3) 
+      return GeneratePerfectShuffle(PFEntry, V1, V2, DAG);
+  }
   
   // Lower this to a VPERM(V1, V2, V3) expression, where V3 is a constant
   // vector that will get spilled to the constant pool.
diff --git a/llvm/lib/Target/PowerPC/README_ALTIVEC.txt b/llvm/lib/Target/PowerPC/README_ALTIVEC.txt
index f5a7c173691..5d7ecd5691b 100644
--- a/llvm/lib/Target/PowerPC/README_ALTIVEC.txt
+++ b/llvm/lib/Target/PowerPC/README_ALTIVEC.txt
@@ -101,18 +101,6 @@ void test(vector int *X, vector int *Y) {
 
 //===----------------------------------------------------------------------===//
 
-There are a wide variety of vector_shuffle operations that we can do with a pair
-of instructions (e.g. a vsldoi + vpkuhum).  We should pattern match these, but
-there are a huge number of these.
-
-Specific examples:
-
-C = vector_shuffle A, B, <0, 1, 2, 4>
-->  t = vsldoi A, A, 12
-->  C = vsldoi A, B, 4
-
-//===----------------------------------------------------------------------===//
-
 extract_vector_elt of an arbitrary constant vector can be done with the 
 following instructions: