Diffstat (limited to 'llvm')
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp        | 25
-rw-r--r--  llvm/test/CodeGen/X86/sse3.ll                           | 12
-rw-r--r--  llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll          | 16
-rw-r--r--  llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll  | 58
4 files changed, 56 insertions, 55 deletions
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 8eef9b58c57..d82f90e7311 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -1458,6 +1458,31 @@ bool TargetLowering::SimplifyDemandedVectorElts(
                                    ZeroRHS, TLO, Depth + 1))
       return true;
 
+    // Simplify mask using undef elements from LHS/RHS.
+    bool Updated = false;
+    bool IdentityLHS = true, IdentityRHS = true;
+    SmallVector<int, 32> NewMask(ShuffleMask.begin(), ShuffleMask.end());
+    for (int i = 0; i != NumElts; ++i) {
+      int &M = NewMask[i];
+      if (M < 0)
+        continue;
+      if (!DemandedElts[i] || (M < (int)NumElts && UndefLHS[M]) ||
+          (M >= (int)NumElts && UndefRHS[M - NumElts])) {
+        Updated = true;
+        M = -1;
+      }
+      IdentityLHS &= (M < 0) || (M == i);
+      IdentityRHS &= (M < 0) || ((M - NumElts) == i);
+    }
+
+    // Update legal shuffle masks based on demanded elements if it won't reduce
+    // to Identity which can cause premature removal of the shuffle mask.
+    if (Updated && !IdentityLHS && !IdentityRHS && !TLO.LegalOps &&
+        isShuffleMaskLegal(NewMask, VT))
+      return TLO.CombineTo(Op,
+                           TLO.DAG.getVectorShuffle(VT, DL, Op.getOperand(0),
+                                                    Op.getOperand(1), NewMask));
+
     // Propagate undef/zero elements from LHS/RHS.
     for (unsigned i = 0; i != NumElts; ++i) {
       int M = ShuffleMask[i];
diff --git a/llvm/test/CodeGen/X86/sse3.ll b/llvm/test/CodeGen/X86/sse3.ll
index 97cc18e296f..1761567f791 100644
--- a/llvm/test/CodeGen/X86/sse3.ll
+++ b/llvm/test/CodeGen/X86/sse3.ll
@@ -396,18 +396,14 @@ entry:
 define <4 x i32> @t17() nounwind {
 ; X86-LABEL: t17:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movaps (%eax), %xmm0
-; X86-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; X86-NEXT:    xorps %xmm1, %xmm1
-; X86-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-NEXT:    movddup {{.*#+}} xmm0 = mem[0,0]
+; X86-NEXT:    andpd {{\.LCPI.*}}, %xmm0
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: t17:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movaps (%rax), %xmm0
-; X64-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; X64-NEXT:    xorps %xmm1, %xmm1
-; X64-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X64-NEXT:    movddup {{.*#+}} xmm0 = mem[0,0]
+; X64-NEXT:    andpd {{.*}}(%rip), %xmm0
 ; X64-NEXT:    retq
 entry:
   %tmp1 = load <4 x float>, <4 x float>* undef, align 16
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
index ba162e63d9f..aff1d49063f 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
@@ -1563,22 +1563,22 @@ define <4 x i32> @shuffle_v4i32_2456(<4 x i32> %a, <4 x i32> %b) {
 ;
 ; SSSE3-LABEL: shuffle_v4i32_2456:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,0]
-; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,2]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,2,2]
+; SSSE3-NEXT:    palignr {{.*#+}} xmm1 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: shuffle_v4i32_2456:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,0,1,2]
-; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3,4,5,6,7]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,2,2]
+; SSE41-NEXT:    palignr {{.*#+}} xmm1 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v4i32_2456:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,2]
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,2]
+; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
 ; AVX-NEXT:    retq
   %s1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
   %s2 = shufflevector <4 x i32> %s1, <4 x i32> %b, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
index 3ae044b0064..a2fa29ef5b2 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
@@ -973,45 +973,25 @@ define <32 x i8> @PR27320(<8 x i32> %a0) {
 }
 
 define internal fastcc <8 x float> @PR34577(<8 x float> %inp0, <8 x float> %inp1, <8 x float> %inp2) {
-; X32-AVX2-LABEL: PR34577:
-; X32-AVX2:       # %bb.0: # %entry
-; X32-AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; X32-AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; X32-AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; X32-AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,u,7,2,u,u,3,2>
-; X32-AVX2-NEXT:    vpermd %ymm1, %ymm2, %ymm1
-; X32-AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
-; X32-AVX2-NEXT:    retl
-;
-; X32-AVX512-LABEL: PR34577:
-; X32-AVX512:       # %bb.0: # %entry
-; X32-AVX512-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; X32-AVX512-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
-; X32-AVX512-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; X32-AVX512-NEXT:    vmovaps {{.*#+}} ymm2 = <u,u,7,2,u,u,3,2>
-; X32-AVX512-NEXT:    vpermps %ymm1, %ymm2, %ymm1
-; X32-AVX512-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
-; X32-AVX512-NEXT:    retl
-;
-; X64-AVX2-LABEL: PR34577:
-; X64-AVX2:       # %bb.0: # %entry
-; X64-AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; X64-AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; X64-AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; X64-AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,u,7,2,u,u,3,2>
-; X64-AVX2-NEXT:    vpermd %ymm1, %ymm2, %ymm1
-; X64-AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
-; X64-AVX2-NEXT:    retq
-;
-; X64-AVX512-LABEL: PR34577:
-; X64-AVX512:       # %bb.0: # %entry
-; X64-AVX512-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; X64-AVX512-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
-; X64-AVX512-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; X64-AVX512-NEXT:    vmovaps {{.*#+}} ymm2 = <u,u,7,2,u,u,3,2>
-; X64-AVX512-NEXT:    vpermps %ymm1, %ymm2, %ymm1
-; X64-AVX512-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
-; X64-AVX512-NEXT:    retq
+; X32-LABEL: PR34577:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; X32-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; X32-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; X32-NEXT:    vmovaps {{.*#+}} ymm2 = <u,u,7,2,u,u,3,2>
+; X32-NEXT:    vpermps %ymm1, %ymm2, %ymm1
+; X32-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; X32-NEXT:    retl
+;
+; X64-LABEL: PR34577:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; X64-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; X64-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; X64-NEXT:    vmovaps {{.*#+}} ymm2 = <u,u,7,2,u,u,3,2>
+; X64-NEXT:    vpermps %ymm1, %ymm2, %ymm1
+; X64-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; X64-NEXT:    retq
 entry:
   %shuf0 = shufflevector <8 x float> %inp0, <8 x float> %inp2, <8 x i32> <i32 1, i32 10, i32 11, i32 13, i32 2, i32 13, i32 5, i32 0>
   %sel = select <8 x i1> <i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>, <8 x float> %shuf0, <8 x float> zeroinitializer
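
The substance of the TargetLowering.cpp change is the mask-clearing loop: any shuffle-mask entry that is not demanded, or that reads an element its source operand leaves undef, is reset to -1, and the shuffle is only rebuilt when the cleaned mask is still legal and has not collapsed into an identity of either operand. Below is a minimal standalone C++ sketch of that logic, not the LLVM code itself: std::vector stands in for SmallVector and the APInt bit vectors, and the SimplifiedMask/simplifyShuffleMask names are invented for illustration.

#include <cstdio>
#include <vector>

// Sketch of the new SimplifyDemandedVectorElts step: clear every shuffle-mask
// entry that is either not demanded or reads an undef element of its source
// operand, and record whether the result degenerates into an identity shuffle
// of either operand.
struct SimplifiedMask {
  std::vector<int> Mask;   // -1 marks an undef / don't-care lane
  bool Updated = false;    // true if any entry was cleared
  bool IdentityLHS = true; // mask now merely forwards operand 0
  bool IdentityRHS = true; // mask now merely forwards operand 1
};

static SimplifiedMask simplifyShuffleMask(const std::vector<int> &ShuffleMask,
                                          const std::vector<bool> &DemandedElts,
                                          const std::vector<bool> &UndefLHS,
                                          const std::vector<bool> &UndefRHS) {
  const int NumElts = static_cast<int>(ShuffleMask.size());
  SimplifiedMask R;
  R.Mask = ShuffleMask;
  for (int i = 0; i != NumElts; ++i) {
    int &M = R.Mask[i];
    if (M < 0)
      continue;
    // Indices 0..NumElts-1 select from the LHS, NumElts and above from the RHS.
    if (!DemandedElts[i] || (M < NumElts && UndefLHS[M]) ||
        (M >= NumElts && UndefRHS[M - NumElts])) {
      R.Updated = true;
      M = -1;
    }
    R.IdentityLHS &= (M < 0) || (M == i);
    R.IdentityRHS &= (M < 0) || ((M - NumElts) == i);
  }
  return R;
}

int main() {
  // <4 x i32> two-operand shuffle: lane 3 is not demanded and mask index 4
  // (element 0 of the RHS) is known undef, so both entries get cleared.
  SimplifiedMask R =
      simplifyShuffleMask(/*ShuffleMask=*/{2, 4, 1, 5},
                          /*DemandedElts=*/{true, true, true, false},
                          /*UndefLHS=*/{false, false, false, false},
                          /*UndefRHS=*/{true, false, false, false});
  for (int M : R.Mask)
    std::printf("%d ", M); // prints: 2 -1 1 -1
  std::printf("\nUpdated=%d IdentityLHS=%d IdentityRHS=%d\n", R.Updated,
              R.IdentityLHS, R.IdentityRHS);
  return 0;
}

In this example neither identity flag is set, so the real code would re-emit the shuffle with the simplified mask via TLO.CombineTo; if either identity held, it leaves the mask alone so later combines can remove the shuffle outright.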