summaryrefslogtreecommitdiffstats
path: root/llvm/test
diff options
context:
space:
mode:
authorSimon Pilgrim <llvm-dev@redking.me.uk>2015-09-22 08:16:08 +0000
committerSimon Pilgrim <llvm-dev@redking.me.uk>2015-09-22 08:16:08 +0000
commit1cad0cd3ce0beffc4819ec3a9518657d35e0c9ae (patch)
tree3c580e2c5fe83a91e2a3ce88a5a992a23b409ee2 /llvm/test
parent67d01489b8ca7e07480a5232baa785489cc813dd (diff)
downloadbcm5719-llvm-1cad0cd3ce0beffc4819ec3a9518657d35e0c9ae.tar.gz
bcm5719-llvm-1cad0cd3ce0beffc4819ec3a9518657d35e0c9ae.zip
[X86][SSE] Match zero/any extension shuffles that don't start from the first element
This patch generalizes the lowering of shuffles as zero extensions to allow extensions that don't start from the first element. It now recognises extensions starting anywhere in the lower 128-bits or at the start of any higher 128-bit lane. The motivation was to reduce the number of high cost pshufb calls, but it also improves the SSE2 case as well. Differential Revision: http://reviews.llvm.org/D12561 llvm-svn: 248250
Diffstat (limited to 'llvm/test')
-rw-r--r--llvm/test/CodeGen/X86/machine-cp.ll32
-rw-r--r--llvm/test/CodeGen/X86/vec_cast2.ll3
-rw-r--r--llvm/test/CodeGen/X86/vec_int_to_fp.ll12
-rw-r--r--llvm/test/CodeGen/X86/vector-sext.ll12
-rw-r--r--llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll7
-rw-r--r--llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll6
-rw-r--r--llvm/test/CodeGen/X86/vector-shuffle-sse4a.ll18
-rw-r--r--llvm/test/CodeGen/X86/vector-zext.ll310
8 files changed, 237 insertions, 163 deletions
diff --git a/llvm/test/CodeGen/X86/machine-cp.ll b/llvm/test/CodeGen/X86/machine-cp.ll
index d3b305b3cb8..143a1c3787a 100644
--- a/llvm/test/CodeGen/X86/machine-cp.ll
+++ b/llvm/test/CodeGen/X86/machine-cp.ll
@@ -66,25 +66,23 @@ while.end: ; preds = %while.body, %entry
;
; CHECK-LABEL: foo:
; CHECK: psllw $7,
-; CHECK: psllw $7,
-; CHECK-NEXT: pand
-; CHECK-NEXT: pcmpgtb
-; CHECK-NEXT: pand %xmm{{[0-9]+}}, [[SRC:%xmm[0-9]+]]
-; Machine propagation used to delete the first copy as the
-; first few uses were <undef>.
-; CHECK-NEXT: movdqa [[SRC]], [[CPY1:%xmm[0-9]+]]
-; CHECK: punpcklbw [[CPY1]], [[CPY1]]
-; CHECK-NEXT: punpcklwd [[CPY1]], [[CPY1]]
+; CHECK: psllw $7, [[SRC1:%xmm[0-9]+]]
+; CHECK-NEXT: pand {{.*}}(%rip), [[SRC1]]
+; CHECK-NEXT: pcmpgtb [[SRC1]], [[SRC2:%xmm[0-9]+]]
+; CHECK-NEXT: pand %xmm{{[0-9]+}}, [[SRC2]]
+; CHECK-NEXT: movdqa [[SRC2]], [[CPY1:%xmm[0-9]+]]
+; CHECK-NEXT: punpcklbw %xmm{{[0-9]+}}, [[CPY1]]
+; Check that CPY1 is not redefined.
+; CHECK-NOT: , [[CPY1]]
+; CHECK: punpckhwd %xmm{{[0-9]+}}, [[CPY1]]
; CHECK-NEXT: pslld $31, [[CPY1]]
-; CHECK: movdqa [[SRC]], [[CPY2:%xmm[0-9]+]]
-; CHECK: punpcklbw [[CPY2]], [[CPY2]]
-; CHECK-NEXT: punpckhwd [[CPY2]], [[CPY2]]
+; CHECK-NEXT: psrad $31, [[CPY1]]
+; CHECK: punpckhbw %xmm{{[0-9]+}}, [[CPY2:%xmm[0-9]+]]
+; Check that CPY2 is not redefined.
+; CHECK-NOT: , [[CPY2]]
+; CHECK: punpckhwd %xmm{{[0-9]+}}, [[CPY2]]
; CHECK-NEXT: pslld $31, [[CPY2]]
-; CHECK: punpckhbw [[SRC]],
-; Check that CPY1 is not redefined.
-; CHECK-NOT: , [[CPY1]]
-; undef use, we do not care.
-; CHECK: punpcklwd [[CPY1]],
+; CHECK-NEXT: psrad $31, [[CPY2]]
define <16 x float> @foo(<16 x float> %x) {
bb:
%v3 = icmp slt <16 x i32> undef, zeroinitializer
diff --git a/llvm/test/CodeGen/X86/vec_cast2.ll b/llvm/test/CodeGen/X86/vec_cast2.ll
index 311c7870b4b..ec766044fab 100644
--- a/llvm/test/CodeGen/X86/vec_cast2.ll
+++ b/llvm/test/CodeGen/X86/vec_cast2.ll
@@ -57,7 +57,8 @@ define <8 x float> @foo2_8(<8 x i8> %src) {
; CHECK-WIDE-LABEL: foo2_8:
; CHECK-WIDE: ## BB#0:
; CHECK-WIDE-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; CHECK-WIDE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; CHECK-WIDE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; CHECK-WIDE-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; CHECK-WIDE-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; CHECK-WIDE-NEXT: vcvtdq2ps %ymm0, %ymm0
; CHECK-WIDE-NEXT: retl
diff --git a/llvm/test/CodeGen/X86/vec_int_to_fp.ll b/llvm/test/CodeGen/X86/vec_int_to_fp.ll
index 3def8032d9d..0f483d3d8c1 100644
--- a/llvm/test/CodeGen/X86/vec_int_to_fp.ll
+++ b/llvm/test/CodeGen/X86/vec_int_to_fp.ll
@@ -1744,13 +1744,11 @@ define <8 x float> @uitofp_8i8_to_8f32(<16 x i8> %a) {
; SSE-LABEL: uitofp_8i8_to_8f32:
; SSE: # BB#0:
; SSE-NEXT: pxor %xmm1, %xmm1
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE-NEXT: cvtdq2ps %xmm2, %xmm2
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
-; SSE-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT: cvtdq2ps %xmm0, %xmm1
; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: retq
@@ -1782,13 +1780,11 @@ define <8 x float> @uitofp_16i8_to_8f32(<16 x i8> %a) {
; SSE-LABEL: uitofp_16i8_to_8f32:
; SSE: # BB#0:
; SSE-NEXT: pxor %xmm1, %xmm1
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE-NEXT: cvtdq2ps %xmm2, %xmm2
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
-; SSE-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT: cvtdq2ps %xmm0, %xmm1
; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-sext.ll b/llvm/test/CodeGen/X86/vector-sext.ll
index cab50c8ce5c..fb95138b6ce 100644
--- a/llvm/test/CodeGen/X86/vector-sext.ll
+++ b/llvm/test/CodeGen/X86/vector-sext.ll
@@ -861,7 +861,7 @@ define <4 x i64> @load_sext_4i1_to_4i64(<4 x i1> *%ptr) {
; SSE2-NEXT: psllq $63, %xmm0
; SSE2-NEXT: psrad $31, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,2,3,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,1,3,3]
; SSE2-NEXT: psllq $63, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
@@ -891,7 +891,7 @@ define <4 x i64> @load_sext_4i1_to_4i64(<4 x i1> *%ptr) {
; SSSE3-NEXT: psllq $63, %xmm0
; SSSE3-NEXT: psrad $31, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,2,3,3]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,1,3,3]
; SSSE3-NEXT: psllq $63, %xmm1
; SSSE3-NEXT: psrad $31, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
@@ -1347,7 +1347,7 @@ define <8 x i32> @load_sext_8i1_to_8i32(<8 x i1> *%ptr) {
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT: pslld $31, %xmm0
; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT: pslld $31, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: retq
@@ -1396,7 +1396,7 @@ define <8 x i32> @load_sext_8i1_to_8i32(<8 x i1> *%ptr) {
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSSE3-NEXT: pslld $31, %xmm0
; SSSE3-NEXT: psrad $31, %xmm0
-; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSSE3-NEXT: pslld $31, %xmm1
; SSSE3-NEXT: psrad $31, %xmm1
; SSSE3-NEXT: retq
@@ -1704,7 +1704,7 @@ define <16 x i16> @load_sext_16i1_to_16i16(<16 x i1> *%ptr) {
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: psllw $15, %xmm0
; SSE2-NEXT: psraw $15, %xmm0
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; SSE2-NEXT: psllw $15, %xmm1
; SSE2-NEXT: psraw $15, %xmm1
; SSE2-NEXT: retq
@@ -1793,7 +1793,7 @@ define <16 x i16> @load_sext_16i1_to_16i16(<16 x i1> *%ptr) {
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT: psllw $15, %xmm0
; SSSE3-NEXT: psraw $15, %xmm0
-; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; SSSE3-NEXT: psllw $15, %xmm1
; SSSE3-NEXT: psraw $15, %xmm1
; SSSE3-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
index 37ed743ea1d..1de75bcf562 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -1439,11 +1439,10 @@ define <16 x i16> @shuffle_v16i16_02_03_zz_zz_06_07_zz_zz_10_11_zz_zz_14_15_zz_z
define <16 x i16> @shuffle_v16i16_16_zz_zz_zz_17_zz_zz_zz_18_zz_zz_zz_19_zz_zz_zz(<16 x i16> %a) {
; AVX1-LABEL: shuffle_v16i16_16_zz_zz_zz_17_zz_zz_zz_18_zz_zz_zz_19_zz_zz_zz:
; AVX1: # BB#0:
-; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[4,5,2,3,4,5,6,7,6,7,10,11,4,5,6,7]
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
+; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_16_zz_zz_zz_17_zz_zz_zz_18_zz_zz_zz_19_zz_zz_zz:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
index b9686dbbf64..ae4407916fa 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -1755,7 +1755,8 @@ define <32 x i8> @shuffle_v32i8_32_zz_zz_zz_zz_zz_zz_zz_33_zz_zz_zz_zz_zz_zz_zz_
; AVX1-LABEL: shuffle_v32i8_32_zz_zz_zz_zz_zz_zz_zz_33_zz_zz_zz_zz_zz_zz_zz_34_zz_zz_zz_zz_zz_zz_zz_35_zz_zz_zz_zz_zz_zz_zz:
; AVX1: # BB#0:
; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
+; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
@@ -1772,7 +1773,8 @@ define <32 x i8> @shuffle_v32i8_32_zz_zz_zz_33_zz_zz_zz_34_zz_zz_zz_35_zz_zz_zz_
; AVX1-LABEL: shuffle_v32i8_32_zz_zz_zz_33_zz_zz_zz_34_zz_zz_zz_35_zz_zz_zz_36_zz_zz_zz_37_zz_zz_zz_38_zz_zz_zz_39_zz_zz_zz:
; AVX1: # BB#0:
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-sse4a.ll b/llvm/test/CodeGen/X86/vector-shuffle-sse4a.ll
index 26062335cc1..3b3e7d8e1ae 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-sse4a.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-sse4a.ll
@@ -36,6 +36,24 @@ define <16 x i8> @shuf_0zzzzzzz1zzzzzzz(<16 x i8> %a0) {
ret <16 x i8> %s
}
+define <16 x i8> @shuf_2zzzzzzz3zzzzzzz(<16 x i8> %a0) {
+; BTVER1-LABEL: shuf_2zzzzzzz3zzzzzzz:
+; BTVER1: # BB#0:
+; BTVER1-NEXT: movaps %xmm0, %xmm1
+; BTVER1-NEXT: extrq {{.*#+}} xmm1 = xmm1[3],zero,zero,zero,zero,zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
+; BTVER1-NEXT: extrq {{.*#+}} xmm0 = xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; BTVER1-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; BTVER1-NEXT: retq
+;
+; BTVER2-LABEL: shuf_2zzzzzzz3zzzzzzz:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vpsrld $16, %xmm0, %xmm0
+; BTVER2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; BTVER2-NEXT: retq
+ %s = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 2, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 3, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+ ret <16 x i8> %s
+}
+
define <16 x i8> @shuf_01zzuuuuuuuuuuuu(<16 x i8> %a0) {
; BTVER1-LABEL: shuf_01zzuuuuuuuuuuuu:
; BTVER1: # BB#0:
diff --git a/llvm/test/CodeGen/X86/vector-zext.ll b/llvm/test/CodeGen/X86/vector-zext.ll
index e7ac8e40ab0..eb8fd16e460 100644
--- a/llvm/test/CodeGen/X86/vector-zext.ll
+++ b/llvm/test/CodeGen/X86/vector-zext.ll
@@ -110,27 +110,28 @@ define <8 x i32> @zext_16i8_to_8i32(<16 x i8> %A) nounwind uwtable readnone ssp
; SSE2: # BB#0: # %entry
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: zext_16i8_to_8i32:
; SSSE3: # BB#0: # %entry
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: pxor %xmm2, %xmm2
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
+; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: zext_16i8_to_8i32:
; SSE41: # BB#0: # %entry
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
+; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: zext_16i8_to_8i32:
@@ -187,17 +188,13 @@ entry:
define <4 x i64> @zext_16i8_to_4i64(<16 x i8> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: zext_16i8_to_4i64:
; SSE2: # BB#0: # %entry
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,7,5,6,7]
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
-; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: zext_16i8_to_4i64:
@@ -209,9 +206,10 @@ define <4 x i64> @zext_16i8_to_4i64(<16 x i8> %A) nounwind uwtable readnone ssp
;
; SSE41-LABEL: zext_16i8_to_4i64:
; SSE41: # BB#0: # %entry
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[2],zero,zero,zero,zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT: pmovzxbq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT: psrld $16, %xmm0
+; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: zext_16i8_to_4i64:
@@ -339,30 +337,29 @@ entry:
define <4 x i64> @zext_8i16_to_4i64(<8 x i16> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: zext_8i16_to_4i64:
; SSE2: # BB#0: # %entry
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,2,1]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[2,1,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7]
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: zext_8i16_to_4i64:
; SSSE3: # BB#0: # %entry
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: pxor %xmm2, %xmm2
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[4,5],zero,zero,zero,zero,zero,zero,xmm1[6,7],zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: zext_8i16_to_4i64:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5,2,3,4,5,6,7,6,7,10,11,4,5,6,7]
-; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
@@ -533,7 +530,7 @@ define <4 x i64> @load_zext_4i8_to_4i64(<4 x i8> *%ptr) {
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: retq
;
@@ -609,7 +606,7 @@ define <8 x i32> @load_zext_8i8_to_8i32(<8 x i8> *%ptr) {
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: retq
;
@@ -757,7 +754,7 @@ define <4 x i64> @load_zext_4i16_to_4i64(<4 x i16> *%ptr) {
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,0,0,65535,0,0,0]
; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: retq
;
@@ -1050,12 +1047,10 @@ define <8 x i32> @shuf_zext_8i8_to_8i32(<8 x i8> %A) {
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: packuswb %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuf_zext_8i8_to_8i32:
@@ -1063,25 +1058,27 @@ define <8 x i32> @shuf_zext_8i8_to_8i32(<8 x i8> %A) {
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; SSSE3-NEXT: pxor %xmm2, %xmm2
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
+; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuf_zext_8i8_to_8i32:
; SSE41: # BB#0: # %entry
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: shuf_zext_8i8_to_8i32:
; AVX1: # BB#0: # %entry
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
@@ -1099,11 +1096,10 @@ entry:
define <2 x i64> @shuf_zext_16i8_to_2i64_offset6(<16 x i8> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: shuf_zext_16i8_to_2i64_offset6:
; SSE2: # BB#0: # %entry
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuf_zext_16i8_to_2i64_offset6:
@@ -1113,12 +1109,14 @@ define <2 x i64> @shuf_zext_16i8_to_2i64_offset6(<16 x i8> %A) nounwind uwtable
;
; SSE41-LABEL: shuf_zext_16i8_to_2i64_offset6:
; SSE41: # BB#0: # %entry
-; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT: psrlq $48, %xmm0
+; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; SSE41-NEXT: retq
;
; AVX-LABEL: shuf_zext_16i8_to_2i64_offset6:
; AVX: # BB#0: # %entry
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT: vpsrlq $48, %xmm0, %xmm0
+; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT: retq
entry:
%B = shufflevector <16 x i8> %A, <16 x i8> zeroinitializer, <16 x i32> <i32 6, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 7, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
@@ -1129,15 +1127,15 @@ entry:
define <4 x i64> @shuf_zext_16i8_to_4i64_offset11(<16 x i8> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: shuf_zext_16i8_to_4i64_offset11:
; SSE2: # BB#0: # %entry
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[3,1,2,3,4,5,6,7]
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; SSE2-NEXT: pand %xmm3, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,6,5,6,7]
-; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuf_zext_16i8_to_4i64_offset11:
@@ -1150,22 +1148,26 @@ define <4 x i64> @shuf_zext_16i8_to_4i64_offset11(<16 x i8> %A) nounwind uwtable
; SSE41-LABEL: shuf_zext_16i8_to_4i64_offset11:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[11],zero,zero,zero,zero,zero,zero,zero,xmm0[12],zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[13],zero,zero,zero,zero,zero,zero,zero,xmm1[14],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT: psrldq {{.*#+}} xmm1 = xmm1[11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT: pmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT: psrldq {{.*#+}} xmm0 = xmm0[13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: shuf_zext_16i8_to_4i64_offset11:
; AVX1: # BB#0: # %entry
-; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[11],zero,zero,zero,zero,zero,zero,zero,xmm0[12],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[13],zero,zero,zero,zero,zero,zero,zero,xmm0[14],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpsrldq {{.*#+}} xmm1 = xmm0[11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuf_zext_16i8_to_4i64_offset11:
; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[11],zero,zero,zero,zero,zero,zero,zero,xmm0[12],zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[13],zero,zero,zero,zero,zero,zero,zero,xmm0[14],zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: retq
entry:
%B = shufflevector <16 x i8> %A, <16 x i8> zeroinitializer, <32 x i32> <i32 11, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 12, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 13, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 14, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
@@ -1176,27 +1178,30 @@ entry:
define <2 x i64> @shuf_zext_8i16_to_2i64_offset6(<8 x i16> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: shuf_zext_8i16_to_2i64_offset6:
; SSE2: # BB#0: # %entry
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuf_zext_8i16_to_2i64_offset6:
; SSSE3: # BB#0: # %entry
-; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[6,7],zero,zero,zero,zero,zero,zero,xmm0[8,9],zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
+; SSSE3-NEXT: pxor %xmm1, %xmm1
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSSE3-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuf_zext_8i16_to_2i64_offset6:
; SSE41: # BB#0: # %entry
-; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[3,1,2,3,4,5,6,7]
-; SSE41-NEXT: pxor %xmm0, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3],xmm1[4],xmm0[5,6,7]
+; SSE41-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; SSE41-NEXT: retq
;
; AVX-LABEL: shuf_zext_8i16_to_2i64_offset6:
; AVX: # BB#0: # %entry
-; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
-; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
+; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX-NEXT: retq
entry:
%B = shufflevector <8 x i16> %A, <8 x i16> zeroinitializer, <8 x i32> <i32 3, i32 8, i32 8, i32 8, i32 4, i32 8, i32 8, i32 8>
@@ -1207,51 +1212,46 @@ entry:
define <4 x i64> @shuf_zext_8i16_to_4i64_offset2(<8 x i16> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: shuf_zext_8i16_to_4i64_offset2:
; SSE2: # BB#0: # %entry
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,1,2,3]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,6,7]
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,0,0,65535,0,0,0]
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
-; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuf_zext_8i16_to_4i64_offset2:
; SSSE3: # BB#0: # %entry
; SSSE3-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5],zero,zero,zero,zero,zero,zero,xmm0[6,7],zero,zero,zero,zero,zero,zero
-; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[8,9],zero,zero,zero,zero,zero,zero,xmm1[10,11],zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT: pxor %xmm2, %xmm2
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSSE3-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuf_zext_8i16_to_4i64_offset2:
; SSE41: # BB#0: # %entry
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,1,2,3]
-; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5,2,3,4,5,6,7,6,7,10,11,4,5,6,7]
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
-; SSE41-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: shuf_zext_8i16_to_4i64_offset2:
; AVX1: # BB#0: # %entry
-; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[4,5,2,3,4,5,6,7,6,7,10,11,4,5,6,7]
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,0,1,0,1,0,1,10,11,0,1,0,1,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuf_zext_8i16_to_4i64_offset2:
; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[4,5,2,3,4,5,6,7,6,7,10,11,4,5,6,7]
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,0,1,0,1,0,1,10,11,0,1,0,1,0,1]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,2,2,3,5,6,6,7]
+; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX2-NEXT: retq
entry:
%B = shufflevector <8 x i16> %A, <8 x i16> zeroinitializer, <16 x i32> <i32 2, i32 8, i32 8, i32 8, i32 3, i32 8, i32 8, i32 8, i32 4, i32 8, i32 8, i32 8, i32 5, i32 8, i32 8, i32 8>
@@ -1280,14 +1280,32 @@ entry:
}
define <8 x i32> @shuf_zext_8i16_to_8i32_offset3(<8 x i16> %A) nounwind uwtable readnone ssp {
-; SSE-LABEL: shuf_zext_8i16_to_8i32_offset3:
-; SSE: # BB#0: # %entry
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
-; SSE-NEXT: pxor %xmm2, %xmm2
-; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE-NEXT: retq
+; SSE2-LABEL: shuf_zext_8i16_to_8i32_offset3:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuf_zext_8i16_to_8i32_offset3:
+; SSSE3: # BB#0: # %entry
+; SSSE3-NEXT: movdqa %xmm0, %xmm1
+; SSSE3-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
+; SSSE3-NEXT: pxor %xmm2, %xmm2
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSSE3-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuf_zext_8i16_to_8i32_offset3:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; SSE41-NEXT: pxor %xmm2, %xmm2
+; SSE41-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE41-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT: retq
;
; AVX1-LABEL: shuf_zext_8i16_to_8i32_offset3:
; AVX1: # BB#0: # %entry
@@ -1300,11 +1318,8 @@ define <8 x i32> @shuf_zext_8i16_to_8i32_offset3(<8 x i16> %A) nounwind uwtable
;
; AVX2-LABEL: shuf_zext_8i16_to_8i32_offset3:
; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; AVX2-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm0[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: retq
entry:
%B = shufflevector <8 x i16> %A, <8 x i16> zeroinitializer, <16 x i32> <i32 3, i32 8, i32 4, i32 8, i32 5, i32 8, i32 6, i32 8, i32 7, i32 8, i32 undef, i32 8, i32 undef, i32 8, i32 undef, i32 8>
@@ -1312,6 +1327,53 @@ entry:
ret <8 x i32> %Z
}
+define <8 x i32> @shuf_zext_16i16_to_8i32_offset8(<16 x i16> %A) nounwind uwtable readnone ssp {
+; SSE2-LABEL: shuf_zext_16i16_to_8i32_offset8:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuf_zext_16i16_to_8i32_offset8:
+; SSSE3: # BB#0: # %entry
+; SSSE3-NEXT: pxor %xmm2, %xmm2
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuf_zext_16i16_to_8i32_offset8:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,3,3]
+; SSE41-NEXT: pxor %xmm2, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
+; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; SSE41-NEXT: movdqa %xmm2, %xmm1
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: shuf_zext_16i16_to_8i32_offset8:
+; AVX1: # BB#0: # %entry
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuf_zext_16i16_to_8i32_offset8:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT: retq
+entry:
+ %B = shufflevector <16 x i16> %A, <16 x i16> zeroinitializer, <16 x i32> <i32 8, i32 16, i32 9, i32 16, i32 10, i32 16, i32 11, i32 16, i32 12, i32 16, i32 undef, i32 16, i32 14, i32 16, i32 undef, i32 16>
+ %Z = bitcast <16 x i16> %B to <8 x i32>
+ ret <8 x i32> %Z
+}
+
define <2 x i64> @shuf_zext_4i32_to_2i64_offset2(<4 x i32> %A) nounwind uwtable readnone ssp {
; SSE-LABEL: shuf_zext_4i32_to_2i64_offset2:
; SSE: # BB#0: # %entry
@@ -1365,10 +1427,8 @@ define <4 x i64> @shuf_zext_4i32_to_4i64_offset1(<4 x i32> %A) nounwind uwtable
;
; AVX2-LABEL: shuf_zext_4i32_to_4i64_offset1:
; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = <u,u,2,u,3,u,u,u>
-; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6,7]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,2,3,3,5,6,7,7]
+; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX2-NEXT: retq
entry:
%B = shufflevector <4 x i32> %A, <4 x i32> zeroinitializer, <8 x i32> <i32 undef, i32 4, i32 2, i32 4, i32 3, i32 4, i32 undef, i32 4>
OpenPOWER on IntegriCloud