 llvm/lib/Target/X86/X86ISelDAGToDAG.cpp |  11
 llvm/lib/Target/X86/X86ISelLowering.cpp | 115
 llvm/lib/Target/X86/X86InstrMMX.td      |  29
 llvm/lib/Target/X86/X86InstrSSE.td      |  35
 llvm/test/CodeGen/X86/vec_zero_cse.ll   |  35
5 files changed, 128 insertions, 97 deletions
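
The heart of the patch is the pair of helpers added to X86ISelLowering.cpp below: every all-zeros (and now all-ones) BUILD_VECTOR is materialized in one canonical integer type per register class, v4i32 for SSE and v2i32 for MMX, and then bitcast to the requested type. Because SelectionDAG folds identical nodes, every zero vector in a function collapses to a single node no matter which element type asked for it. A condensed sketch of the idea, using the same 2007-era API the patch uses (illustrative, not compilable on its own):

    // Before the patch, a v16i8 zero and a v2i64 zero were two different
    // BUILD_VECTOR nodes and could never be CSE'd.  After it, both requests
    // build the identical canonical node, so the second getNode() call just
    // returns the node made by the first; only the cheap BIT_CONVERTs differ.
    SDOperand Cst  = DAG.getTargetConstant(0, MVT::i32);
    SDOperand Zero = DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32,
                                 Cst, Cst, Cst, Cst);   // shared by all users
    SDOperand AsV16i8 = DAG.getNode(ISD::BIT_CONVERT, MVT::v16i8, Zero);
    SDOperand AsV2i64 = DAG.getNode(ISD::BIT_CONVERT, MVT::v2i64, Zero);
    // 64-bit (MMX) types get the same treatment through MVT::v2i32.
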
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index d43ac19e3f3..73cecdb967e 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -842,20 +842,15 @@ bool X86DAGToDAGISel::SelectScalarSSELoad(SDOperand Op, SDOperand Pred,
   // Also handle the case where we explicitly require zeros in the top
   // elements.  This is a vector shuffle from the zero vector.
   if (N.getOpcode() == ISD::VECTOR_SHUFFLE && N.Val->hasOneUse() &&
-      N.getOperand(0).getOpcode() == ISD::BUILD_VECTOR &&
+      // Check to see if the top elements are all zeros (or bitcast of zeros).
+      ISD::isBuildVectorAllZeros(N.getOperand(0).Val) &&
       N.getOperand(1).getOpcode() == ISD::SCALAR_TO_VECTOR &&
       N.getOperand(1).Val->hasOneUse() &&
       ISD::isNON_EXTLoad(N.getOperand(1).getOperand(0).Val) &&
       N.getOperand(1).getOperand(0).hasOneUse()) {
-    // Check to see if the BUILD_VECTOR is building a zero vector.
-    SDOperand BV = N.getOperand(0);
-    for (unsigned i = 0, e = BV.getNumOperands(); i != e; ++i)
-      if (!isZeroNode(BV.getOperand(i)) &&
-          BV.getOperand(i).getOpcode() != ISD::UNDEF)
-        return false;  // Not a zero/undef vector.
     // Check to see if the shuffle mask is 4/L/L/L or 2/L, where L is something
     // from the LHS.
-    unsigned VecWidth = BV.getNumOperands();
+    unsigned VecWidth=MVT::getVectorNumElements(N.getOperand(0).getValueType());
     SDOperand ShufMask = N.getOperand(2);
     assert(ShufMask.getOpcode() == ISD::BUILD_VECTOR && "Invalid shuf mask!");
     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(ShufMask.getOperand(0))) {
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index d3c89f6de96..3fcae95e459 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2728,7 +2728,7 @@ static bool isPSHUFHW_PSHUFLWMask(SDNode *N) {
   return true;
 }
 
-/// CommuteVectorShuffle - Swap vector_shuffle operandsas well as
+/// CommuteVectorShuffle - Swap vector_shuffle operands as well as
 /// values in ther permute mask.
 static SDOperand CommuteVectorShuffle(SDOperand Op, SDOperand &V1,
                                       SDOperand &V2, SDOperand &Mask,
@@ -2867,23 +2867,24 @@ static bool isZeroShuffle(SDNode *N) {
   unsigned NumElems = Mask.getNumOperands();
   for (unsigned i = 0; i != NumElems; ++i) {
     SDOperand Arg = Mask.getOperand(i);
-    if (Arg.getOpcode() != ISD::UNDEF) {
-      unsigned Idx = cast<ConstantSDNode>(Arg)->getValue();
-      if (Idx < NumElems) {
-        unsigned Opc = V1.Val->getOpcode();
-        if (Opc == ISD::UNDEF)
-          continue;
-        if (Opc != ISD::BUILD_VECTOR ||
-            !isZeroNode(V1.Val->getOperand(Idx)))
-          return false;
-      } else if (Idx >= NumElems) {
-        unsigned Opc = V2.Val->getOpcode();
-        if (Opc == ISD::UNDEF)
-          continue;
-        if (Opc != ISD::BUILD_VECTOR ||
-            !isZeroNode(V2.Val->getOperand(Idx - NumElems)))
-          return false;
-      }
+    if (Arg.getOpcode() == ISD::UNDEF)
+      continue;
+
+    unsigned Idx = cast<ConstantSDNode>(Arg)->getValue();
+    if (Idx < NumElems) {
+      unsigned Opc = V1.Val->getOpcode();
+      if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.Val))
+        continue;
+      if (Opc != ISD::BUILD_VECTOR ||
+          !isZeroNode(V1.Val->getOperand(Idx)))
+        return false;
+    } else if (Idx >= NumElems) {
+      unsigned Opc = V2.Val->getOpcode();
+      if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.Val))
+        continue;
+      if (Opc != ISD::BUILD_VECTOR ||
+          !isZeroNode(V2.Val->getOperand(Idx - NumElems)))
+        return false;
     }
   }
   return true;
@@ -2893,14 +2894,35 @@ static bool isZeroShuffle(SDNode *N) {
 ///
 static SDOperand getZeroVector(MVT::ValueType VT, SelectionDAG &DAG) {
   assert(MVT::isVector(VT) && "Expected a vector type");
-  unsigned NumElems = MVT::getVectorNumElements(VT);
-  MVT::ValueType EVT = MVT::getVectorElementType(VT);
-  bool isFP = MVT::isFloatingPoint(EVT);
-  SDOperand Zero = isFP ? DAG.getConstantFP(0.0, EVT) : DAG.getConstant(0, EVT);
-  SmallVector<SDOperand, 8> ZeroVec(NumElems, Zero);
-  return DAG.getNode(ISD::BUILD_VECTOR, VT, &ZeroVec[0], ZeroVec.size());
+
+  // Always build zero vectors as <4 x i32> or <2 x i32> bitcasted to their dest
+  // type.  This ensures they get CSE'd.
+  SDOperand Cst = DAG.getTargetConstant(0, MVT::i32);
+  SDOperand Vec;
+  if (MVT::getSizeInBits(VT) == 64)  // MMX
+    Vec = DAG.getNode(ISD::BUILD_VECTOR, MVT::v2i32, Cst, Cst);
+  else                               // SSE
+    Vec = DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32, Cst, Cst, Cst, Cst);
+  return DAG.getNode(ISD::BIT_CONVERT, VT, Vec);
+}
+
+/// getOnesVector - Returns a vector of specified type with all bits set.
+///
+static SDOperand getOnesVector(MVT::ValueType VT, SelectionDAG &DAG) {
+  assert(MVT::isVector(VT) && "Expected a vector type");
+
+  // Always build ones vectors as <4 x i32> or <2 x i32> bitcasted to their dest
+  // type.  This ensures they get CSE'd.
+  SDOperand Cst = DAG.getTargetConstant(~0U, MVT::i32);
+  SDOperand Vec;
+  if (MVT::getSizeInBits(VT) == 64)  // MMX
+    Vec = DAG.getNode(ISD::BUILD_VECTOR, MVT::v2i32, Cst, Cst);
+  else                               // SSE
+    Vec = DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32, Cst, Cst, Cst, Cst);
+  return DAG.getNode(ISD::BIT_CONVERT, VT, Vec);
 }
 
+
 /// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements
 /// that point to V2 points to its first element.
 static SDOperand NormalizeMask(SDOperand Mask, SelectionDAG &DAG) {
@@ -2981,24 +3003,28 @@ static SDOperand PromoteSplat(SDOperand Op, SelectionDAG &DAG) {
   }
 
   V1 = DAG.getNode(ISD::BIT_CONVERT, MVT::v4i32, V1);
-  MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(4);
-  Mask = getZeroVector(MaskVT, DAG);
+  Mask = getZeroVector(MVT::v4i32, DAG);
   SDOperand Shuffle = DAG.getNode(ISD::VECTOR_SHUFFLE, MVT::v4i32, V1,
                                   DAG.getNode(ISD::UNDEF, MVT::v4i32), Mask);
   return DAG.getNode(ISD::BIT_CONVERT, VT, Shuffle);
 }
 
 /// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
-/// vector of zero or undef vector.
+/// vector of zero or undef vector.  This produces a shuffle where the low
+/// element of V2 is swizzled into the zero/undef vector, landing at element
+/// Idx.  This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
 static SDOperand getShuffleVectorZeroOrUndef(SDOperand V2, MVT::ValueType VT,
                                              unsigned NumElems, unsigned Idx,
                                              bool isZero, SelectionDAG &DAG) {
   SDOperand V1 = isZero ? getZeroVector(VT, DAG) : DAG.getNode(ISD::UNDEF, VT);
   MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(NumElems);
   MVT::ValueType EVT = MVT::getVectorElementType(MaskVT);
-  SDOperand Zero = DAG.getConstant(0, EVT);
-  SmallVector<SDOperand, 8> MaskVec(NumElems, Zero);
-  MaskVec[Idx] = DAG.getConstant(NumElems, EVT);
+  SmallVector<SDOperand, 16> MaskVec;
+  for (unsigned i = 0; i != NumElems; ++i)
+    if (i == Idx)  // If this is the insertion idx, put the low elt of V2 here.
+      MaskVec.push_back(DAG.getConstant(NumElems, EVT));
+    else
+      MaskVec.push_back(DAG.getConstant(i, EVT));
   SDOperand Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT,
                                &MaskVec[0], MaskVec.size());
   return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2, Mask);
@@ -3078,13 +3104,18 @@ static SDOperand LowerBuildVectorv8i16(SDOperand Op, unsigned NonZeros,
 
 SDOperand
 X86TargetLowering::LowerBUILD_VECTOR(SDOperand Op, SelectionDAG &DAG) {
-  // All zero's are handled with pxor.
-  if (ISD::isBuildVectorAllZeros(Op.Val))
-    return Op;
+  // All zero's are handled with pxor, all one's are handled with pcmpeqd.
+  if (ISD::isBuildVectorAllZeros(Op.Val) || ISD::isBuildVectorAllOnes(Op.Val)) {
+    // Canonicalize this to either <4 x i32> or <2 x i32> (SSE vs MMX) to
+    // 1) ensure the zero vectors are CSE'd, and 2) ensure that i64 scalars are
+    // eliminated on x86-32 hosts.
+    if (Op.getValueType() == MVT::v4i32 || Op.getValueType() == MVT::v2i32)
+      return Op;
 
-  // All one's are handled with pcmpeqd.
-  if (ISD::isBuildVectorAllOnes(Op.Val))
-    return Op;
+    if (ISD::isBuildVectorAllOnes(Op.Val))
+      return getOnesVector(Op.getValueType(), DAG);
+    return getZeroVector(Op.getValueType(), DAG);
+  }
 
   MVT::ValueType VT = Op.getValueType();
   MVT::ValueType EVT = MVT::getVectorElementType(VT);
@@ -3113,12 +3144,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDOperand Op, SelectionDAG &DAG) {
   }
 
   if (NumNonZero == 0) {
-    if (NumZero == 0)
-      // All undef vector. Return an UNDEF.
-      return DAG.getNode(ISD::UNDEF, VT);
-    else
-      // A mix of zero and undef. Return a zero vector.
-      return getZeroVector(VT, DAG);
+    // All undef vector. Return an UNDEF.  All zero vectors were handled above.
+    return DAG.getNode(ISD::UNDEF, VT);
   }
 
   // Splat is obviously ok. Let legalizer expand it to a shuffle.
@@ -3299,8 +3326,12 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDOperand Op, SelectionDAG &DAG) {
     return CommuteVectorShuffle(Op, V1, V2, PermMask, DAG);
 
   bool Commuted = false;
+  // FIXME: This should also accept a bitcast of a splat?  Be careful, not
+  // 1,1,1,1 -> v8i16 though.
   V1IsSplat = isSplatVector(V1.Val);
   V2IsSplat = isSplatVector(V2.Val);
+
+  // Canonicalize the splat or undef, if present, to be on the RHS.
   if ((V1IsSplat || V1IsUndef) && !(V2IsSplat || V2IsUndef)) {
     Op = CommuteVectorShuffle(Op, V1, V2, PermMask, DAG);
     std::swap(V1IsSplat, V2IsSplat);
diff --git a/llvm/lib/Target/X86/X86InstrMMX.td b/llvm/lib/Target/X86/X86InstrMMX.td
index b7024bc8766..c892c342334 100644
--- a/llvm/lib/Target/X86/X86InstrMMX.td
+++ b/llvm/lib/Target/X86/X86InstrMMX.td
@@ -486,14 +486,13 @@ def MMX_MASKMOVQ : MMXI<0xF7, MRMDestMem, (outs), (ins VR64:$src, VR64:$mask),
 //===----------------------------------------------------------------------===//
 
 // Alias instructions that map zero vector to pxor.
-// FIXME: remove when we can teach regalloc that xor reg, reg is ok.
 let isReMaterializable = 1 in {
   def MMX_V_SET0 : MMXI<0xEF, MRMInitReg, (outs VR64:$dst), (ins),
                         "pxor\t$dst, $dst",
-                        [(set VR64:$dst, (v1i64 immAllZerosV))]>;
+                        [(set VR64:$dst, (v2i32 immAllZerosV))]>;
   def MMX_V_SETALLONES : MMXI<0x76, MRMInitReg, (outs VR64:$dst), (ins),
                               "pcmpeqd\t$dst, $dst",
-                              [(set VR64:$dst, (v1i64 immAllOnesV))]>;
+                              [(set VR64:$dst, (v2i32 immAllOnesV))]>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -510,18 +509,6 @@ def : Pat<(store (v2i32 VR64:$src), addr:$dst),
 def : Pat<(store (v1i64 VR64:$src), addr:$dst),
           (MMX_MOVQ64mr addr:$dst, VR64:$src)>;
 
-// 64-bit vector all zero's.
-def : Pat<(v8i8 immAllZerosV), (MMX_V_SET0)>;
-def : Pat<(v4i16 immAllZerosV), (MMX_V_SET0)>;
-def : Pat<(v2i32 immAllZerosV), (MMX_V_SET0)>;
-def : Pat<(v1i64 immAllZerosV), (MMX_V_SET0)>;
-
-// 64-bit vector all one's.
-def : Pat<(v8i8 immAllOnesV), (MMX_V_SETALLONES)>;
-def : Pat<(v4i16 immAllOnesV), (MMX_V_SETALLONES)>;
-def : Pat<(v2i32 immAllOnesV), (MMX_V_SETALLONES)>;
-def : Pat<(v1i64 immAllOnesV), (MMX_V_SETALLONES)>;
-
 // Bit convert.
 def : Pat<(v8i8 (bitconvert (v1i64 VR64:$src))), (v8i8 VR64:$src)>;
 def : Pat<(v8i8 (bitconvert (v2i32 VR64:$src))), (v8i8 VR64:$src)>;
@@ -551,10 +538,10 @@ def MMX_X86s2vec : SDNode<"X86ISD::S2VEC", SDTypeProfile<1, 1, []>, []>;
 // Move scalar to XMM zero-extended
 // movd to XMM register zero-extends
 let AddedComplexity = 15 in {
-  def : Pat<(v8i8 (vector_shuffle immAllZerosV,
+  def : Pat<(v8i8 (vector_shuffle immAllZerosV_bc,
                    (v8i8 (MMX_X86s2vec GR32:$src)), MMX_MOVL_shuffle_mask)),
             (MMX_MOVZDI2PDIrr GR32:$src)>;
-  def : Pat<(v4i16 (vector_shuffle immAllZerosV,
+  def : Pat<(v4i16 (vector_shuffle immAllZerosV_bc,
                     (v4i16 (MMX_X86s2vec GR32:$src)), MMX_MOVL_shuffle_mask)),
             (MMX_MOVZDI2PDIrr GR32:$src)>;
   def : Pat<(v2i32 (vector_shuffle immAllZerosV,
@@ -606,19 +593,19 @@ let AddedComplexity = 20 in {
 def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v2i32 immAllOnesV))),
                       VR64:$src2)),
           (MMX_PANDNrr VR64:$src1, VR64:$src2)>;
-def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v4i16 immAllOnesV))),
+def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v4i16 immAllOnesV_bc))),
                       VR64:$src2)),
          (MMX_PANDNrr VR64:$src1, VR64:$src2)>;
-def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v8i8 immAllOnesV))),
+def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v8i8 immAllOnesV_bc))),
                       VR64:$src2)),
          (MMX_PANDNrr VR64:$src1, VR64:$src2)>;
 
 def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v2i32 immAllOnesV))),
                       (load addr:$src2))),
          (MMX_PANDNrm VR64:$src1, addr:$src2)>;
-def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v4i16 immAllOnesV))),
+def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v4i16 immAllOnesV_bc))),
                       (load addr:$src2))),
          (MMX_PANDNrm VR64:$src1, addr:$src2)>;
-def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v8i8 immAllOnesV))),
+def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v8i8 immAllOnesV_bc))),
                       (load addr:$src2))),
          (MMX_PANDNrm VR64:$src1, addr:$src2)>;
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 2c86e8d1c33..da23ccbaa09 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -939,11 +939,10 @@ def STMXCSR : PSI<0xAE, MRM3m, (outs), (ins i32mem:$dst),
                   "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>;
 
 // Alias instructions that map zero vector to pxor / xorp* for sse.
-// FIXME: remove when we can teach regalloc that xor reg, reg is ok.
 let isReMaterializable = 1 in
 def V_SET0 : PSI<0x57, MRMInitReg, (outs VR128:$dst), (ins),
                  "xorps\t$dst, $dst",
-                 [(set VR128:$dst, (v4f32 immAllZerosV))]>;
+                 [(set VR128:$dst, (v4i32 immAllZerosV))]>;
 
 // FR32 to 128-bit vector conversion.
 def MOVSS2PSrr : SSI<0x10, MRMSrcReg, (outs VR128:$dst), (ins FR32:$src),
@@ -991,7 +990,7 @@ let isTwoAddress = 1 in {
 let AddedComplexity = 20 in
 def MOVZSS2PSrm : SSI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f32mem:$src),
                       "movss\t{$src, $dst|$dst, $src}",
-                      [(set VR128:$dst, (v4f32 (vector_shuffle immAllZerosV,
+                      [(set VR128:$dst, (v4f32 (vector_shuffle immAllZerosV_bc,
                                  (v4f32 (scalar_to_vector (loadf32 addr:$src))),
                                  MOVL_shuffle_mask)))]>;
 
@@ -2119,11 +2118,10 @@ def MFENCE : I<0xAE, MRM6m, (outs), (ins),
 
 // Alias instructions that map zero vector to pxor / xorp* for sse.
-// FIXME: remove when we can teach regalloc that xor reg, reg is ok.
 let isReMaterializable = 1 in
 def V_SETALLONES : PDI<0x76, MRMInitReg, (outs VR128:$dst), (ins),
                        "pcmpeqd\t$dst, $dst",
-                       [(set VR128:$dst, (v2f64 immAllOnesV))]>;
+                       [(set VR128:$dst, (v4i32 immAllOnesV))]>;
 
 // FR64 to 128-bit vector conversion.
 def MOVSD2PDrr : SDI<0x10, MRMSrcReg, (outs VR128:$dst), (ins FR64:$src),
@@ -2220,7 +2218,7 @@ let AddedComplexity = 20 in
 def MOVZSD2PDrm : SDI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
                       "movsd\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
-                        (v2f64 (vector_shuffle immAllZerosV,
+                        (v2f64 (vector_shuffle immAllZerosV_bc,
                                 (v2f64 (scalar_to_vector (loadf64 addr:$src))),
                                 MOVL_shuffle_mask)))]>;
 
@@ -2692,21 +2690,6 @@ def : Pat<(v8i16 (undef)), (IMPLICIT_DEF_VR128)>, Requires<[HasSSE2]>;
 def : Pat<(v4i32 (undef)), (IMPLICIT_DEF_VR128)>, Requires<[HasSSE2]>;
 def : Pat<(v2i64 (undef)), (IMPLICIT_DEF_VR128)>, Requires<[HasSSE2]>;
 
-// 128-bit vector all zero's.
-def : Pat<(v16i8 immAllZerosV), (V_SET0)>, Requires<[HasSSE2]>;
-def : Pat<(v8i16 immAllZerosV), (V_SET0)>, Requires<[HasSSE2]>;
-def : Pat<(v4i32 immAllZerosV), (V_SET0)>, Requires<[HasSSE2]>;
-def : Pat<(v2i64 immAllZerosV), (V_SET0)>, Requires<[HasSSE2]>;
-def : Pat<(v2f64 immAllZerosV), (V_SET0)>, Requires<[HasSSE2]>;
-
-// 128-bit vector all one's.
-def : Pat<(v16i8 immAllOnesV), (V_SETALLONES)>, Requires<[HasSSE2]>;
-def : Pat<(v8i16 immAllOnesV), (V_SETALLONES)>, Requires<[HasSSE2]>;
-def : Pat<(v4i32 immAllOnesV), (V_SETALLONES)>, Requires<[HasSSE2]>;
-def : Pat<(v2i64 immAllOnesV), (V_SETALLONES)>, Requires<[HasSSE2]>;
-def : Pat<(v4f32 immAllOnesV), (V_SETALLONES)>, Requires<[HasSSE1]>;
-
-
 // Scalar to v8i16 / v16i8. The source may be a GR32, but only the lower 8 or
 // 16-bits matter.
 def : Pat<(v8i16 (X86s2vec GR32:$src)), (MOVDI2PDIrr GR32:$src)>,
@@ -2751,17 +2734,17 @@ let Predicates = [HasSSE2] in {
 // Move scalar to XMM zero-extended
 // movd to XMM register zero-extends
 let AddedComplexity = 15 in {
-def : Pat<(v8i16 (vector_shuffle immAllZerosV,
+def : Pat<(v8i16 (vector_shuffle immAllZerosV_bc,
                   (v8i16 (X86s2vec GR32:$src)), MOVL_shuffle_mask)),
          (MOVZDI2PDIrr GR32:$src)>, Requires<[HasSSE2]>;
-def : Pat<(v16i8 (vector_shuffle immAllZerosV,
+def : Pat<(v16i8 (vector_shuffle immAllZerosV_bc,
                   (v16i8 (X86s2vec GR32:$src)), MOVL_shuffle_mask)),
         (MOVZDI2PDIrr GR32:$src)>, Requires<[HasSSE2]>;
 
 // Zeroing a VR128 then do a MOVS{S|D} to the lower bits.
-def : Pat<(v2f64 (vector_shuffle immAllZerosV,
+def : Pat<(v2f64 (vector_shuffle immAllZerosV_bc,
                   (v2f64 (scalar_to_vector FR64:$src)), MOVL_shuffle_mask)),
         (MOVLSD2PDrr (V_SET0), FR64:$src)>, Requires<[HasSSE2]>;
-def : Pat<(v4f32 (vector_shuffle immAllZerosV,
+def : Pat<(v4f32 (vector_shuffle immAllZerosV_bc,
                   (v4f32 (scalar_to_vector FR32:$src)), MOVL_shuffle_mask)),
         (MOVLSS2PSrr (V_SET0), FR32:$src)>, Requires<[HasSSE2]>;
 }
@@ -2911,7 +2894,7 @@ def : Pat<(v4i32 (vector_shuffle VR128:$src1, VR128:$src2,
 
 // Set lowest element and zero upper elements.
 let AddedComplexity = 20 in
-def : Pat<(bc_v2i64 (vector_shuffle immAllZerosV,
+def : Pat<(bc_v2i64 (vector_shuffle immAllZerosV_bc,
                      (v2f64 (scalar_to_vector (loadf64 addr:$src))),
                      MOVL_shuffle_mask)),
           (MOVZQI2PQIrm addr:$src)>, Requires<[HasSSE2]>;
diff --git a/llvm/test/CodeGen/X86/vec_zero_cse.ll b/llvm/test/CodeGen/X86/vec_zero_cse.ll
new file mode 100644
index 00000000000..b882bad1aff
--- /dev/null
+++ b/llvm/test/CodeGen/X86/vec_zero_cse.ll
@@ -0,0 +1,35 @@
+; RUN: llvm-as < %s | llc -relocation-model=static -mcpu=yonah | grep pxor | count 1
+; RUN: llvm-as < %s | llc -relocation-model=static -mcpu=yonah | grep xorps | count 1
+; RUN: llvm-as < %s | llc -relocation-model=static -mcpu=yonah | grep pcmpeqd | count 2
+
+@M1 = external global <1 x i64>
+@M2 = external global <2 x i32>
+
+@S1 = external global <2 x i64>
+@S2 = external global <4 x i32>
+
+define void @test() {
+  store <1 x i64> zeroinitializer, <1 x i64>* @M1
+  store <2 x i32> zeroinitializer, <2 x i32>* @M2
+  ret void
+}
+
+define void @test2() {
+  store <1 x i64> < i64 -1 >, <1 x i64>* @M1
+  store <2 x i32> < i32 -1, i32 -1 >, <2 x i32>* @M2
+  ret void
+}
+
+define void @test3() {
+  store <2 x i64> zeroinitializer, <2 x i64>* @S1
+  store <4 x i32> zeroinitializer, <4 x i32>* @S2
+  ret void
+}
+
+define void @test4() {
+  store <2 x i64> < i64 -1, i64 -1>, <2 x i64>* @S1
+  store <4 x i32> < i32 -1, i32 -1, i32 -1, i32 -1 >, <4 x i32>* @S2
+  ret void
+}
+
+
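
The new test pins down exactly the CSE behavior the patch is after: test() and test3() each store zero to two globals of different vector types, yet the RUN lines accept only one pxor (the MMX_V_SET0 both MMX stores now share) and one xorps (the V_SET0 both SSE stores share). The all-ones stores in test2() and test4() must produce exactly two pcmpeqd, one MMX_V_SETALLONES and one V_SETALLONES, since VR64 and VR128 are separate register files and cannot share a register. At the DAG level the sharing falls out of getZeroVector() directly; a hedged sketch for the two stores in test() (the variable names here are hypothetical):

    // Both stores in test() request a 64-bit zero vector.  getZeroVector()
    // builds the same (v2i32 BUILD_VECTOR 0, 0) node for each request, so
    // the DAG folds them and a single pxor feeds both movq stores.
    SDOperand ZeroM1 = getZeroVector(MVT::v1i64, DAG);  // store to @M1
    SDOperand ZeroM2 = getZeroVector(MVT::v2i32, DAG);  // store to @M2
    // ZeroM1 and ZeroM2 share one BUILD_VECTOR node; ZeroM1 merely wraps it
    // in a BIT_CONVERT to v1i64.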