diff options
| author | Craig Topper <craig.topper@intel.com> | 2018-11-18 21:28:47 +0000 |
|---|---|---|
| committer | Craig Topper <craig.topper@intel.com> | 2018-11-18 21:28:47 +0000 |
| commit | 950f3842ccaf0ccc2e52928fb49a9d3b3164e96e (patch) | |
| tree | 3783f9a0614027c243440db8fd6ba322f230fcb4 | |
| parent | f8610fc4e72d15c5a412517039b14864e22e0c22 (diff) | |
| download | bcm5719-llvm-950f3842ccaf0ccc2e52928fb49a9d3b3164e96e.tar.gz bcm5719-llvm-950f3842ccaf0ccc2e52928fb49a9d3b3164e96e.zip | |
[X86] Add a 32-bit command line with only sse2 to vector-sext-widen.ll and vector-sext.ll to show some of the scalarized load sequences without 64-bit scalar support.
Some of these sequences look pretty bad since we have to copy the sign bit from a 32-bit register to a 64-bit register to finish a sign extend.
llvm-svn: 347175
| -rw-r--r-- | llvm/test/CodeGen/X86/vector-sext-widen.ll | 1036 | ||||
| -rw-r--r-- | llvm/test/CodeGen/X86/vector-sext.ll | 1039 |
2 files changed, 2073 insertions, 2 deletions
diff --git a/llvm/test/CodeGen/X86/vector-sext-widen.ll b/llvm/test/CodeGen/X86/vector-sext-widen.ll index 44d9ee5b2f9..85334ccd835 100644 --- a/llvm/test/CodeGen/X86/vector-sext-widen.ll +++ b/llvm/test/CodeGen/X86/vector-sext-widen.ll @@ -7,7 +7,8 @@ ; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F ; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW ; -; Just one 32-bit run to make sure we do reasonable things there. +; Just two 32-bit runs to make sure we do reasonable things there. +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X32-SSE2 ; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=X32-SSE41 define <8 x i16> @sext_16i8_to_8i16(<16 x i8> %A) nounwind uwtable readnone ssp { @@ -33,6 +34,12 @@ define <8 x i16> @sext_16i8_to_8i16(<16 x i8> %A) nounwind uwtable readnone ssp ; AVX-NEXT: vpmovsxbw %xmm0, %xmm0 ; AVX-NEXT: retq ; +; X32-SSE2-LABEL: sext_16i8_to_8i16: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE2-NEXT: psraw $8, %xmm0 +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: sext_16i8_to_8i16: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: pmovsxbw %xmm0, %xmm0 @@ -88,6 +95,15 @@ define <16 x i16> @sext_16i8_to_16i16(<16 x i8> %A) nounwind uwtable readnone ss ; AVX512-NEXT: vpmovsxbw %xmm0, %ymm0 ; AVX512-NEXT: retq ; +; X32-SSE2-LABEL: sext_16i8_to_16i16: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE2-NEXT: 
psraw $8, %xmm0 +; X32-SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; X32-SSE2-NEXT: psraw $8, %xmm1 +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: sext_16i8_to_16i16: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: pmovsxbw %xmm0, %xmm2 @@ -178,6 +194,21 @@ define <32 x i16> @sext_32i8_to_32i16(<32 x i8> %A) nounwind uwtable readnone ss ; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0 ; AVX512BW-NEXT: retq ; +; X32-SSE2-LABEL: sext_32i8_to_32i16: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: movdqa %xmm1, %xmm3 +; X32-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE2-NEXT: psraw $8, %xmm0 +; X32-SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; X32-SSE2-NEXT: psraw $8, %xmm1 +; X32-SSE2-NEXT: movdqa %xmm3, %xmm2 +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; X32-SSE2-NEXT: psraw $8, %xmm2 +; X32-SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; X32-SSE2-NEXT: psraw $8, %xmm3 +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: sext_32i8_to_32i16: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: pmovsxbw %xmm0, %xmm5 @@ -219,6 +250,13 @@ define <4 x i32> @sext_16i8_to_4i32(<16 x i8> %A) nounwind uwtable readnone ssp ; AVX-NEXT: vpmovsxbd %xmm0, %xmm0 ; AVX-NEXT: retq ; +; X32-SSE2-LABEL: sext_16i8_to_4i32: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; X32-SSE2-NEXT: psrad $24, %xmm0 +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: sext_16i8_to_4i32: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: pmovsxbd %xmm0, %xmm0 @@ -276,6 +314,16 @@ define <8 x i32> @sext_16i8_to_8i32(<16 x i8> %A) nounwind uwtable readnone ssp ; 
AVX512-NEXT: vpmovsxbd %xmm0, %ymm0 ; AVX512-NEXT: retq ; +; X32-SSE2-LABEL: sext_16i8_to_8i32: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; X32-SSE2-NEXT: psrad $24, %xmm2 +; X32-SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; X32-SSE2-NEXT: psrad $24, %xmm1 +; X32-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: sext_16i8_to_8i32: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: pmovsxbd %xmm0, %xmm2 @@ -361,6 +409,22 @@ define <16 x i32> @sext_16i8_to_16i32(<16 x i8> %A) nounwind uwtable readnone ss ; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0 ; AVX512-NEXT: retq ; +; X32-SSE2-LABEL: sext_16i8_to_16i32: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; X32-SSE2-NEXT: psrad $24, %xmm4 +; X32-SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; X32-SSE2-NEXT: psrad $24, %xmm1 +; X32-SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; X32-SSE2-NEXT: psrad $24, %xmm2 +; X32-SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; X32-SSE2-NEXT: psrad $24, %xmm3 +; X32-SSE2-NEXT: movdqa %xmm4, %xmm0 +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: sext_16i8_to_16i32: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: pmovsxbd %xmm0, %xmm4 @@ -408,6 +472,16 @@ define <2 x i64> @sext_16i8_to_2i64(<16 x i8> %A) nounwind uwtable 
readnone ssp ; AVX-NEXT: vpmovsxbq %xmm0, %xmm0 ; AVX-NEXT: retq ; +; X32-SSE2-LABEL: sext_16i8_to_2i64: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; X32-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X32-SSE2-NEXT: psrad $31, %xmm1 +; X32-SSE2-NEXT: psrad $24, %xmm0 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: sext_16i8_to_2i64: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: pmovsxbq %xmm0, %xmm0 @@ -479,6 +553,23 @@ define <4 x i64> @sext_16i8_to_4i64(<16 x i8> %A) nounwind uwtable readnone ssp ; AVX512-NEXT: vpmovsxbq %xmm0, %ymm0 ; AVX512-NEXT: retq ; +; X32-SSE2-LABEL: sext_16i8_to_4i64: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; X32-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X32-SSE2-NEXT: psrad $31, %xmm1 +; X32-SSE2-NEXT: psrad $24, %xmm2 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; X32-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; X32-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X32-SSE2-NEXT: psrad $31, %xmm0 +; X32-SSE2-NEXT: psrad $24, %xmm1 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X32-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: sext_16i8_to_4i64: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: pmovsxbq %xmm0, %xmm2 @@ -589,6 +680,34 @@ define <8 x i64> @sext_16i8_to_8i64(<16 x i8> %A) nounwind uwtable readnone ssp ; AVX512-NEXT: vpmovsxbq %xmm0, %zmm0 ; AVX512-NEXT: retq ; +; X32-SSE2-LABEL: sext_16i8_to_8i64: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 
= xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; X32-SSE2-NEXT: movdqa %xmm4, %xmm1 +; X32-SSE2-NEXT: psrad $31, %xmm1 +; X32-SSE2-NEXT: psrad $24, %xmm4 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; X32-SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; X32-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X32-SSE2-NEXT: psrad $31, %xmm1 +; X32-SSE2-NEXT: psrad $24, %xmm2 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] +; X32-SSE2-NEXT: movdqa %xmm1, %xmm3 +; X32-SSE2-NEXT: psrad $31, %xmm3 +; X32-SSE2-NEXT: psrad $24, %xmm1 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; X32-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; X32-SSE2-NEXT: movdqa %xmm3, %xmm0 +; X32-SSE2-NEXT: psrad $31, %xmm0 +; X32-SSE2-NEXT: psrad $24, %xmm3 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; X32-SSE2-NEXT: movdqa %xmm4, %xmm0 +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: sext_16i8_to_8i64: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: pmovsxbq %xmm0, %xmm4 @@ -630,6 +749,12 @@ define <4 x i32> @sext_8i16_to_4i32(<8 x i16> %A) nounwind uwtable readnone ssp ; AVX-NEXT: vpmovsxwd %xmm0, %xmm0 ; AVX-NEXT: retq ; +; X32-SSE2-LABEL: sext_8i16_to_4i32: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; X32-SSE2-NEXT: psrad $16, %xmm0 +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: sext_8i16_to_4i32: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: pmovsxwd %xmm0, %xmm0 @@ -685,6 +810,15 @@ define <8 x i32> @sext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable 
readnone ssp ; AVX512-NEXT: vpmovsxwd %xmm0, %ymm0 ; AVX512-NEXT: retq ; +; X32-SSE2-LABEL: sext_8i16_to_8i32: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; X32-SSE2-NEXT: psrad $16, %xmm2 +; X32-SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; X32-SSE2-NEXT: psrad $16, %xmm1 +; X32-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: sext_8i16_to_8i32: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: pmovsxwd %xmm0, %xmm2 @@ -765,6 +899,20 @@ define <16 x i32> @sext_16i16_to_16i32(<16 x i16> %A) nounwind uwtable readnone ; AVX512-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512-NEXT: retq ; +; X32-SSE2-LABEL: sext_16i16_to_16i32: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; X32-SSE2-NEXT: psrad $16, %xmm4 +; X32-SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; X32-SSE2-NEXT: psrad $16, %xmm5 +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; X32-SSE2-NEXT: psrad $16, %xmm2 +; X32-SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; X32-SSE2-NEXT: psrad $16, %xmm3 +; X32-SSE2-NEXT: movdqa %xmm4, %xmm0 +; X32-SSE2-NEXT: movdqa %xmm5, %xmm1 +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: sext_16i16_to_16i32: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: pmovsxwd %xmm0, %xmm5 @@ -810,6 +958,15 @@ define <2 x i64> @sext_8i16_to_2i64(<8 x i16> %A) nounwind uwtable readnone ssp ; AVX-NEXT: vpmovsxwq %xmm0, %xmm0 ; AVX-NEXT: retq ; +; X32-SSE2-LABEL: sext_8i16_to_2i64: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; X32-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X32-SSE2-NEXT: psrad $31, %xmm1 +; X32-SSE2-NEXT: 
psrad $16, %xmm0 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: sext_8i16_to_2i64: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: pmovsxwq %xmm0, %xmm0 @@ -877,6 +1034,21 @@ define <4 x i64> @sext_8i16_to_4i64(<8 x i16> %A) nounwind uwtable readnone ssp ; AVX512-NEXT: vpmovsxwq %xmm0, %ymm0 ; AVX512-NEXT: retq ; +; X32-SSE2-LABEL: sext_8i16_to_4i64: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; X32-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X32-SSE2-NEXT: psrad $31, %xmm1 +; X32-SSE2-NEXT: psrad $16, %xmm2 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; X32-SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7] +; X32-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X32-SSE2-NEXT: psrad $31, %xmm0 +; X32-SSE2-NEXT: psrad $16, %xmm1 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X32-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: sext_8i16_to_4i64: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: pmovsxwq %xmm0, %xmm2 @@ -982,6 +1154,32 @@ define <8 x i64> @sext_8i16_to_8i64(<8 x i16> %A) nounwind uwtable readnone ssp ; AVX512-NEXT: vpmovsxwq %xmm0, %zmm0 ; AVX512-NEXT: retq ; +; X32-SSE2-LABEL: sext_8i16_to_8i64: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; X32-SSE2-NEXT: movdqa %xmm4, %xmm1 +; X32-SSE2-NEXT: psrad $31, %xmm1 +; X32-SSE2-NEXT: psrad $16, %xmm4 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; X32-SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; X32-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X32-SSE2-NEXT: psrad $31, %xmm1 +; X32-SSE2-NEXT: psrad $16, %xmm2 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; 
X32-SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7] +; X32-SSE2-NEXT: movdqa %xmm1, %xmm3 +; X32-SSE2-NEXT: psrad $31, %xmm3 +; X32-SSE2-NEXT: psrad $16, %xmm1 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; X32-SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; X32-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] +; X32-SSE2-NEXT: movdqa %xmm3, %xmm0 +; X32-SSE2-NEXT: psrad $31, %xmm0 +; X32-SSE2-NEXT: psrad $16, %xmm3 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; X32-SSE2-NEXT: movdqa %xmm4, %xmm0 +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: sext_8i16_to_8i64: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: pmovsxwq %xmm0, %xmm4 @@ -1023,6 +1221,13 @@ define <2 x i64> @sext_4i32_to_2i64(<4 x i32> %A) nounwind uwtable readnone ssp ; AVX-NEXT: vpmovsxdq %xmm0, %xmm0 ; AVX-NEXT: retq ; +; X32-SSE2-LABEL: sext_4i32_to_2i64: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X32-SSE2-NEXT: psrad $31, %xmm1 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: sext_4i32_to_2i64: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm0 @@ -1082,6 +1287,17 @@ define <4 x i64> @sext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp ; AVX512-NEXT: vpmovsxdq %xmm0, %ymm0 ; AVX512-NEXT: retq ; +; X32-SSE2-LABEL: sext_4i32_to_4i64: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X32-SSE2-NEXT: psrad $31, %xmm2 +; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X32-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X32-SSE2-NEXT: psrad $31, %xmm2 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: sext_4i32_to_4i64: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm2 @@ -1172,6 +1388,25 @@ define <8 x i64> 
@sext_8i32_to_8i64(<8 x i32> %A) nounwind uwtable readnone ssp ; AVX512-NEXT: vpmovsxdq %ymm0, %zmm0 ; AVX512-NEXT: retq ; +; X32-SSE2-LABEL: sext_8i32_to_8i64: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X32-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X32-SSE2-NEXT: psrad $31, %xmm3 +; X32-SSE2-NEXT: movdqa %xmm1, %xmm4 +; X32-SSE2-NEXT: psrad $31, %xmm4 +; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; X32-SSE2-NEXT: movdqa %xmm1, %xmm3 +; X32-SSE2-NEXT: psrad $31, %xmm3 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; X32-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1] +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; X32-SSE2-NEXT: movdqa %xmm3, %xmm4 +; X32-SSE2-NEXT: psrad $31, %xmm4 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: sext_8i32_to_8i64: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm5 @@ -1246,6 +1481,22 @@ define <2 x i64> @load_sext_2i1_to_2i64(<2 x i1> *%ptr) { ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; +; X32-SSE2-LABEL: load_sext_2i1_to_2i64: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE2-NEXT: movzbl (%eax), %eax +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: shll $30, %ecx +; X32-SSE2-NEXT: sarl $31, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm0 +; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] +; X32-SSE2-NEXT: shll $31, %eax +; X32-SSE2-NEXT: sarl $31, %eax +; X32-SSE2-NEXT: movd %eax, %xmm0 +; X32-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; X32-SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: load_sext_2i1_to_2i64: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -1301,6 +1552,19 @@ define <2 x i64> @load_sext_2i8_to_2i64(<2 x i8> *%ptr) { ; 
AVX-NEXT: vpmovsxbq (%rdi), %xmm0 ; AVX-NEXT: retq ; +; X32-SSE2-LABEL: load_sext_2i8_to_2i64: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE2-NEXT: movzwl (%eax), %eax +; X32-SSE2-NEXT: movd %eax, %xmm0 +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; X32-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X32-SSE2-NEXT: psrad $31, %xmm1 +; X32-SSE2-NEXT: psrad $24, %xmm0 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: load_sext_2i8_to_2i64: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -1437,6 +1701,30 @@ define <4 x i32> @load_sext_4i1_to_4i32(<4 x i1> *%ptr) { ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; +; X32-SSE2-LABEL: load_sext_4i1_to_4i32: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE2-NEXT: movl (%eax), %eax +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: shll $28, %ecx +; X32-SSE2-NEXT: sarl $31, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm0 +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: shll $29, %ecx +; X32-SSE2-NEXT: sarl $31, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm1 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: shll $30, %ecx +; X32-SSE2-NEXT: sarl $31, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm2 +; X32-SSE2-NEXT: shll $31, %eax +; X32-SSE2-NEXT: sarl $31, %eax +; X32-SSE2-NEXT: movd %eax, %xmm0 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X32-SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: load_sext_4i1_to_4i32: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -1490,6 +1778,15 @@ define <4 x i32> @load_sext_4i8_to_4i32(<4 x i8> *%ptr) { ; AVX-NEXT: vpmovsxbd (%rdi), %xmm0 ; 
AVX-NEXT: retq ; +; X32-SSE2-LABEL: load_sext_4i8_to_4i32: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; X32-SSE2-NEXT: psrad $24, %xmm0 +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: load_sext_4i8_to_4i32: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -1640,6 +1937,33 @@ define <4 x i64> @load_sext_4i1_to_4i64(<4 x i1> *%ptr) { ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512BW-NEXT: retq ; +; X32-SSE2-LABEL: load_sext_4i1_to_4i64: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE2-NEXT: movzbl (%eax), %eax +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: shrl $3, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm0 +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: shrl $2, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm1 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X32-SSE2-NEXT: movd %eax, %xmm2 +; X32-SSE2-NEXT: shrl %eax +; X32-SSE2-NEXT: movd %eax, %xmm0 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; X32-SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; X32-SSE2-NEXT: pand {{\.LCPI.*}}, %xmm2 +; X32-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,1,3] +; X32-SSE2-NEXT: psllq $63, %xmm0 +; X32-SSE2-NEXT: psrad $31, %xmm0 +; X32-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,1,3,3] +; X32-SSE2-NEXT: psllq $63, %xmm1 +; X32-SSE2-NEXT: psrad $31, %xmm1 +; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: load_sext_4i1_to_4i64: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -1723,6 +2047,33 @@ define <4 x i64> @load_sext_4i8_to_4i64(<4 x i8> *%ptr) { ; 
AVX512-NEXT: vpmovsxbq (%rdi), %ymm0 ; AVX512-NEXT: retq ; +; X32-SSE2-LABEL: load_sext_4i8_to_4i64: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE2-NEXT: movsbl 1(%eax), %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm1 +; X32-SSE2-NEXT: sarl $31, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm0 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X32-SSE2-NEXT: movsbl (%eax), %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm0 +; X32-SSE2-NEXT: sarl $31, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm2 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X32-SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X32-SSE2-NEXT: movsbl 3(%eax), %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm2 +; X32-SSE2-NEXT: sarl $31, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm1 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; X32-SSE2-NEXT: movsbl 2(%eax), %eax +; X32-SSE2-NEXT: movd %eax, %xmm1 +; X32-SSE2-NEXT: sarl $31, %eax +; X32-SSE2-NEXT: movd %eax, %xmm3 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; X32-SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: load_sext_4i8_to_4i64: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -1780,6 +2131,22 @@ define <2 x i64> @load_sext_4i8_to_4i64_extract(<4 x i8> *%ptr) { ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; +; X32-SSE2-LABEL: load_sext_4i8_to_4i64_extract: +; X32-SSE2: # %bb.0: +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE2-NEXT: movsbl 3(%eax), %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm1 +; X32-SSE2-NEXT: sarl $31, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm0 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X32-SSE2-NEXT: movsbl 2(%eax), %eax +; X32-SSE2-NEXT: movd %eax, %xmm0 +; X32-SSE2-NEXT: sarl $31, %eax +; X32-SSE2-NEXT: movd %eax, %xmm2 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = 
xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X32-SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: load_sext_4i8_to_4i64_extract: ; X32-SSE41: # %bb.0: ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -2000,6 +2367,49 @@ define <8 x i16> @load_sext_8i1_to_8i16(<8 x i1> *%ptr) { ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; +; X32-SSE2-LABEL: load_sext_8i1_to_8i16: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE2-NEXT: movsbl (%eax), %eax +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: shrl $7, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm0 +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: shll $25, %ecx +; X32-SSE2-NEXT: sarl $31, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm1 +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: shll $26, %ecx +; X32-SSE2-NEXT: sarl $31, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm0 +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: shll $27, %ecx +; X32-SSE2-NEXT: sarl $31, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm2 +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: shll $28, %ecx +; X32-SSE2-NEXT: sarl $31, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm0 +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: shll $29, %ecx +; X32-SSE2-NEXT: sarl $31, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm1 +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: shll $30, %ecx +; X32-SSE2-NEXT: sarl $31, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm3 +; X32-SSE2-NEXT: shll $31, %eax +; X32-SSE2-NEXT: sarl $31, %eax +; X32-SSE2-NEXT: movd %eax, %xmm0 +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = 
xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X32-SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: load_sext_8i1_to_8i16: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -2066,6 +2476,14 @@ define <8 x i16> @load_sext_8i8_to_8i16(<8 x i8> *%ptr) { ; AVX-NEXT: vpmovsxbw (%rdi), %xmm0 ; AVX-NEXT: retq ; +; X32-SSE2-LABEL: load_sext_8i8_to_8i16: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE2-NEXT: psraw $8, %xmm0 +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: load_sext_8i8_to_8i16: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -2159,6 +2577,55 @@ define <8 x i64> @load_sext_8i8_to_8i64(<8 x i8> *%ptr) { ; AVX512-NEXT: vpmovsxbq (%rdi), %zmm0 ; AVX512-NEXT: retq ; +; X32-SSE2-LABEL: load_sext_8i8_to_8i64: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE2-NEXT: movsbl 1(%eax), %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm1 +; X32-SSE2-NEXT: sarl $31, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm0 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X32-SSE2-NEXT: movsbl (%eax), %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm0 +; X32-SSE2-NEXT: sarl $31, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm2 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X32-SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X32-SSE2-NEXT: movsbl 3(%eax), %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm2 +; X32-SSE2-NEXT: sarl $31, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm1 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; X32-SSE2-NEXT: movsbl 2(%eax), %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm1 +; X32-SSE2-NEXT: sarl $31, %ecx +; 
X32-SSE2-NEXT: movd %ecx, %xmm3 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; X32-SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; X32-SSE2-NEXT: movsbl 5(%eax), %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm3 +; X32-SSE2-NEXT: sarl $31, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm2 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; X32-SSE2-NEXT: movsbl 4(%eax), %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm2 +; X32-SSE2-NEXT: sarl $31, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm4 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; X32-SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; X32-SSE2-NEXT: movsbl 7(%eax), %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm4 +; X32-SSE2-NEXT: sarl $31, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm3 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; X32-SSE2-NEXT: movsbl 6(%eax), %eax +; X32-SSE2-NEXT: movd %eax, %xmm3 +; X32-SSE2-NEXT: sarl $31, %eax +; X32-SSE2-NEXT: movd %eax, %xmm5 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; X32-SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: load_sext_8i8_to_8i64: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -2394,6 +2861,53 @@ define <8 x i32> @load_sext_8i1_to_8i32(<8 x i1> *%ptr) { ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512BW-NEXT: retq ; +; X32-SSE2-LABEL: load_sext_8i1_to_8i32: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE2-NEXT: movzbl (%eax), %eax +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: shrl $7, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm0 +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: shrl $6, %ecx +; X32-SSE2-NEXT: andl $1, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm2 +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; X32-SSE2-NEXT: movl 
%eax, %ecx +; X32-SSE2-NEXT: shrl $5, %ecx +; X32-SSE2-NEXT: andl $1, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm0 +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: shrl $4, %ecx +; X32-SSE2-NEXT: andl $1, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm1 +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: shrl $3, %ecx +; X32-SSE2-NEXT: andl $1, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm0 +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: shrl $2, %ecx +; X32-SSE2-NEXT: andl $1, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm2 +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: andl $1, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm0 +; X32-SSE2-NEXT: shrl %eax +; X32-SSE2-NEXT: andl $1, %eax +; X32-SSE2-NEXT: movd %eax, %xmm3 +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; X32-SSE2-NEXT: pslld $31, %xmm0 +; X32-SSE2-NEXT: psrad $31, %xmm0 +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; X32-SSE2-NEXT: pslld $31, %xmm1 +; X32-SSE2-NEXT: psrad $31, %xmm1 +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: load_sext_8i1_to_8i32: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -2490,6 +3004,19 @@ define <8 x i32> @load_sext_8i8_to_8i32(<8 x i8> *%ptr) { ; AVX512-NEXT: vpmovsxbd (%rdi), %ymm0 ; AVX512-NEXT: retq ; +; X32-SSE2-LABEL: load_sext_8i8_to_8i32: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X32-SSE2-NEXT: punpcklbw {{.*#+}} 
xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; X32-SSE2-NEXT: psrad $24, %xmm0 +; X32-SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] +; X32-SSE2-NEXT: psrad $24, %xmm1 +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: load_sext_8i8_to_8i32: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -2905,6 +3432,97 @@ define <16 x i8> @load_sext_16i1_to_16i8(<16 x i1> *%ptr) nounwind readnone { ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; +; X32-SSE2-LABEL: load_sext_16i1_to_16i8: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: pushl %ebp +; X32-SSE2-NEXT: pushl %ebx +; X32-SSE2-NEXT: pushl %edi +; X32-SSE2-NEXT: pushl %esi +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE2-NEXT: movswl (%eax), %eax +; X32-SSE2-NEXT: movl %eax, %edx +; X32-SSE2-NEXT: movl %eax, %ebp +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: movl %eax, %esi +; X32-SSE2-NEXT: movl %eax, %edi +; X32-SSE2-NEXT: movl %eax, %ebx +; X32-SSE2-NEXT: shrl $15, %ebx +; X32-SSE2-NEXT: movd %ebx, %xmm2 +; X32-SSE2-NEXT: movl %eax, %ebx +; X32-SSE2-NEXT: shll $17, %edx +; X32-SSE2-NEXT: sarl $31, %edx +; X32-SSE2-NEXT: movd %edx, %xmm0 +; X32-SSE2-NEXT: movl %eax, %edx +; X32-SSE2-NEXT: shll $18, %ebp +; X32-SSE2-NEXT: sarl $31, %ebp +; X32-SSE2-NEXT: movd %ebp, %xmm1 +; X32-SSE2-NEXT: movl %eax, %ebp +; X32-SSE2-NEXT: shll $19, %ecx +; X32-SSE2-NEXT: sarl $31, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm3 +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: shll $20, %esi +; X32-SSE2-NEXT: sarl $31, %esi +; X32-SSE2-NEXT: movd %esi, %xmm4 +; X32-SSE2-NEXT: movl %eax, %esi +; X32-SSE2-NEXT: shll $21, %edi +; X32-SSE2-NEXT: sarl $31, %edi +; X32-SSE2-NEXT: movd %edi, %xmm6 +; X32-SSE2-NEXT: movl %eax, %edi +; X32-SSE2-NEXT: shll $22, %ebx +; X32-SSE2-NEXT: sarl 
$31, %ebx +; X32-SSE2-NEXT: movd %ebx, %xmm7 +; X32-SSE2-NEXT: movl %eax, %ebx +; X32-SSE2-NEXT: shll $23, %edx +; X32-SSE2-NEXT: sarl $31, %edx +; X32-SSE2-NEXT: movd %edx, %xmm5 +; X32-SSE2-NEXT: movl %eax, %edx +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; X32-SSE2-NEXT: shll $28, %ebp +; X32-SSE2-NEXT: sarl $31, %ebp +; X32-SSE2-NEXT: movd %ebp, %xmm2 +; X32-SSE2-NEXT: movl %eax, %ebp +; X32-SSE2-NEXT: movsbl %al, %eax +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] +; X32-SSE2-NEXT: shll $29, %ecx +; X32-SSE2-NEXT: sarl $31, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm1 +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; X32-SSE2-NEXT: shll $30, %esi +; X32-SSE2-NEXT: sarl $31, %esi +; X32-SSE2-NEXT: movd %esi, %xmm4 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; X32-SSE2-NEXT: shll $31, %edi +; X32-SSE2-NEXT: sarl $31, %edi +; X32-SSE2-NEXT: movd %edi, %xmm0 +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; X32-SSE2-NEXT: shll $26, %ebx +; X32-SSE2-NEXT: sarl $31, %ebx +; X32-SSE2-NEXT: movd %ebx, %xmm2 +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = 
xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; X32-SSE2-NEXT: shll $27, %edx +; X32-SSE2-NEXT: sarl $31, %edx +; X32-SSE2-NEXT: movd %edx, %xmm3 +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; X32-SSE2-NEXT: shll $25, %ebp +; X32-SSE2-NEXT: sarl $31, %ebp +; X32-SSE2-NEXT: movd %ebp, %xmm1 +; X32-SSE2-NEXT: shrl $7, %eax +; X32-SSE2-NEXT: movd %eax, %xmm2 +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; X32-SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0] +; X32-SSE2-NEXT: popl %esi +; X32-SSE2-NEXT: popl %edi +; X32-SSE2-NEXT: popl %ebx +; X32-SSE2-NEXT: popl %ebp +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: load_sext_16i1_to_16i8: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -3430,6 +4048,93 @@ define <16 x i16> @load_sext_16i1_to_16i16(<16 x i1> *%ptr) { ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512BW-NEXT: retq ; +; X32-SSE2-LABEL: load_sext_16i1_to_16i16: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE2-NEXT: movzwl (%eax), %eax +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: shrl $15, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm0 +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: shrl $14, %ecx +; X32-SSE2-NEXT: andl $1, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm1 +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: shrl $13, %ecx +; X32-SSE2-NEXT: andl $1, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm0 +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: shrl $12, %ecx +; X32-SSE2-NEXT: andl $1, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm2 +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: shrl $11, %ecx +; X32-SSE2-NEXT: andl $1, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm0 +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: shrl $10, %ecx +; X32-SSE2-NEXT: andl $1, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm3 +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: shrl $9, %ecx +; X32-SSE2-NEXT: andl $1, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm0 +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: shrl $8, %ecx +; X32-SSE2-NEXT: andl $1, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm1 +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: shrl $7, %ecx +; X32-SSE2-NEXT: andl $1, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm0 +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: shrl $6, %ecx +; X32-SSE2-NEXT: andl $1, %ecx +; X32-SSE2-NEXT: movd %ecx, 
%xmm2 +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: shrl $5, %ecx +; X32-SSE2-NEXT: andl $1, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm0 +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: shrl $4, %ecx +; X32-SSE2-NEXT: andl $1, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm3 +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: shrl $3, %ecx +; X32-SSE2-NEXT: andl $1, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm0 +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: shrl $2, %ecx +; X32-SSE2-NEXT: andl $1, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm2 +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: andl $1, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm0 +; X32-SSE2-NEXT: shrl %eax +; X32-SSE2-NEXT: andl $1, %eax +; X32-SSE2-NEXT: movd %eax, %xmm4 +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE2-NEXT: psllw $15, %xmm0 +; X32-SSE2-NEXT: psraw $15, %xmm0 +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; X32-SSE2-NEXT: psllw $15, %xmm1 +; X32-SSE2-NEXT: psraw $15, %xmm1 +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: load_sext_16i1_to_16i16: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -4285,6 +4990,179 @@ define <32 x i8> @load_sext_32i1_to_32i8(<32 x i1> *%ptr) nounwind readnone { ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512BW-NEXT: retq ; +; X32-SSE2-LABEL: load_sext_32i1_to_32i8: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: pushl %ebp +; X32-SSE2-NEXT: pushl %ebx +; X32-SSE2-NEXT: pushl %edi +; X32-SSE2-NEXT: pushl %esi +; X32-SSE2-NEXT: subl $28, %esp +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE2-NEXT: movswl (%eax), %edx +; X32-SSE2-NEXT: movl %edx, %ebp +; X32-SSE2-NEXT: movl %edx, %esi +; X32-SSE2-NEXT: movl %edx, %edi +; X32-SSE2-NEXT: movl %edx, %ebx +; X32-SSE2-NEXT: movl %edx, %ecx +; X32-SSE2-NEXT: shrl $15, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm0 +; X32-SSE2-NEXT: movl %edx, %ecx +; X32-SSE2-NEXT: shll $17, %ebp +; X32-SSE2-NEXT: sarl $31, %ebp +; X32-SSE2-NEXT: movd %ebp, %xmm4 +; X32-SSE2-NEXT: movl %edx, %ebp +; X32-SSE2-NEXT: shll $18, %esi +; X32-SSE2-NEXT: sarl $31, %esi +; X32-SSE2-NEXT: movd %esi, %xmm1 +; X32-SSE2-NEXT: movl %edx, %esi +; X32-SSE2-NEXT: shll $19, %edi +; X32-SSE2-NEXT: sarl $31, %edi +; X32-SSE2-NEXT: movd %edi, %xmm2 +; X32-SSE2-NEXT: movl %edx, %edi +; X32-SSE2-NEXT: shll $20, %ebx +; X32-SSE2-NEXT: sarl $31, %ebx +; X32-SSE2-NEXT: movd %ebx, %xmm5 +; X32-SSE2-NEXT: movl %edx, %ebx +; X32-SSE2-NEXT: shll $21, %ecx +; X32-SSE2-NEXT: sarl $31, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm6 +; X32-SSE2-NEXT: movl %edx, %ecx +; X32-SSE2-NEXT: shll $22, %ebp +; X32-SSE2-NEXT: sarl $31, %ebp +; X32-SSE2-NEXT: movd %ebp, %xmm7 +; X32-SSE2-NEXT: movl %edx, %ebp +; X32-SSE2-NEXT: shll $23, %esi +; X32-SSE2-NEXT: sarl $31, %esi +; 
X32-SSE2-NEXT: movd %esi, %xmm3 +; X32-SSE2-NEXT: movl %edx, %esi +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; X32-SSE2-NEXT: shll $28, %edi +; X32-SSE2-NEXT: sarl $31, %edi +; X32-SSE2-NEXT: movd %edi, %xmm0 +; X32-SSE2-NEXT: movl %edx, %edi +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; X32-SSE2-NEXT: shll $29, %ebx +; X32-SSE2-NEXT: sarl $31, %ebx +; X32-SSE2-NEXT: movd %ebx, %xmm1 +; X32-SSE2-NEXT: movl %edx, %ebx +; X32-SSE2-NEXT: movsbl %dl, %edx +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; X32-SSE2-NEXT: shll $30, %ecx +; X32-SSE2-NEXT: sarl $31, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm2 +; X32-SSE2-NEXT: shll $31, %ebp +; X32-SSE2-NEXT: sarl $31, %ebp +; X32-SSE2-NEXT: movd %ebp, %xmm0 +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; X32-SSE2-NEXT: shll $26, %esi +; X32-SSE2-NEXT: sarl $31, %esi +; X32-SSE2-NEXT: movd %esi, %xmm7 
+; X32-SSE2-NEXT: shll $27, %edi +; X32-SSE2-NEXT: sarl $31, %edi +; X32-SSE2-NEXT: movd %edi, %xmm2 +; X32-SSE2-NEXT: shll $25, %ebx +; X32-SSE2-NEXT: sarl $31, %ebx +; X32-SSE2-NEXT: movd %ebx, %xmm6 +; X32-SSE2-NEXT: shrl $7, %edx +; X32-SSE2-NEXT: movd %edx, %xmm5 +; X32-SSE2-NEXT: movswl 2(%eax), %eax +; X32-SSE2-NEXT: movl %eax, %edx +; X32-SSE2-NEXT: movl %eax, %ebp +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: movl %eax, %esi +; X32-SSE2-NEXT: movl %eax, %edi +; X32-SSE2-NEXT: movl %eax, %ebx +; X32-SSE2-NEXT: shrl $15, %ebx +; X32-SSE2-NEXT: movd %ebx, %xmm4 +; X32-SSE2-NEXT: movdqu %xmm4, (%esp) # 16-byte Spill +; X32-SSE2-NEXT: movl %eax, %ebx +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X32-SSE2-NEXT: shll $17, %edx +; X32-SSE2-NEXT: sarl $31, %edx +; X32-SSE2-NEXT: movd %edx, %xmm4 +; X32-SSE2-NEXT: movl %eax, %edx +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] +; X32-SSE2-NEXT: shll $18, %ebp +; X32-SSE2-NEXT: sarl $31, %ebp +; X32-SSE2-NEXT: movd %ebp, %xmm7 +; X32-SSE2-NEXT: movl %eax, %ebp +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; X32-SSE2-NEXT: shll $19, %ecx +; X32-SSE2-NEXT: sarl $31, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm5 +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] +; X32-SSE2-NEXT: shll $20, %esi +; X32-SSE2-NEXT: sarl $31, %esi +; X32-SSE2-NEXT: movd %esi, %xmm6 +; X32-SSE2-NEXT: movl %eax, %esi +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X32-SSE2-NEXT: shll $21, %edi +; X32-SSE2-NEXT: sarl $31, %edi +; X32-SSE2-NEXT: movd %edi, %xmm1 +; X32-SSE2-NEXT: movl %eax, %edi +; X32-SSE2-NEXT: 
punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] +; X32-SSE2-NEXT: shll $22, %ebx +; X32-SSE2-NEXT: sarl $31, %ebx +; X32-SSE2-NEXT: movd %ebx, %xmm3 +; X32-SSE2-NEXT: movl %eax, %ebx +; X32-SSE2-NEXT: movdqu (%esp), %xmm2 # 16-byte Reload +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; X32-SSE2-NEXT: shll $23, %edx +; X32-SSE2-NEXT: sarl $31, %edx +; X32-SSE2-NEXT: movd %edx, %xmm2 +; X32-SSE2-NEXT: movl %eax, %edx +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] +; X32-SSE2-NEXT: shll $28, %ebp +; X32-SSE2-NEXT: sarl $31, %ebp +; X32-SSE2-NEXT: movd %ebp, %xmm7 +; X32-SSE2-NEXT: movl %eax, %ebp +; X32-SSE2-NEXT: movsbl %al, %eax +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; X32-SSE2-NEXT: shll $29, %ecx +; X32-SSE2-NEXT: sarl $31, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm3 +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; X32-SSE2-NEXT: shll $30, %esi +; X32-SSE2-NEXT: sarl $31, %esi +; X32-SSE2-NEXT: movd %esi, %xmm4 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; X32-SSE2-NEXT: shll $31, %edi +; X32-SSE2-NEXT: sarl $31, %edi +; X32-SSE2-NEXT: movd %edi, %xmm1 +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] +; 
X32-SSE2-NEXT: shll $26, %ebx +; X32-SSE2-NEXT: sarl $31, %ebx +; X32-SSE2-NEXT: movd %ebx, %xmm5 +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; X32-SSE2-NEXT: shll $27, %edx +; X32-SSE2-NEXT: sarl $31, %edx +; X32-SSE2-NEXT: movd %edx, %xmm3 +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] +; X32-SSE2-NEXT: shll $25, %ebp +; X32-SSE2-NEXT: sarl $31, %ebp +; X32-SSE2-NEXT: movd %ebp, %xmm4 +; X32-SSE2-NEXT: shrl $7, %eax +; X32-SSE2-NEXT: movd %eax, %xmm5 +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; X32-SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; X32-SSE2-NEXT: addl $28, %esp +; X32-SSE2-NEXT: popl %esi +; X32-SSE2-NEXT: popl %edi +; X32-SSE2-NEXT: popl %ebx +; X32-SSE2-NEXT: popl %ebp +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: load_sext_32i1_to_32i8: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: pushl %esi @@ -4465,6 +5343,17 @@ define <16 x i16> @load_sext_16i8_to_16i16(<16 x i8> *%ptr) { ; AVX512-NEXT: vpmovsxbw (%rdi), %ymm0 ; AVX512-NEXT: retq ; +; X32-SSE2-LABEL: load_sext_16i8_to_16i16: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X32-SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE2-NEXT: psraw $8, %xmm0 +; 
X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE2-NEXT: psraw $8, %xmm1 +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: load_sext_16i8_to_16i16: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -4508,6 +5397,17 @@ define <2 x i64> @load_sext_2i16_to_2i64(<2 x i16> *%ptr) { ; AVX-NEXT: vpmovsxwq (%rdi), %xmm0 ; AVX-NEXT: retq ; +; X32-SSE2-LABEL: load_sext_2i16_to_2i64: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X32-SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] +; X32-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X32-SSE2-NEXT: psrad $31, %xmm1 +; X32-SSE2-NEXT: psrad $16, %xmm0 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: load_sext_2i16_to_2i64: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -4544,6 +5444,14 @@ define <4 x i32> @load_sext_4i16_to_4i32(<4 x i16> *%ptr) { ; AVX-NEXT: vpmovsxwd (%rdi), %xmm0 ; AVX-NEXT: retq ; +; X32-SSE2-LABEL: load_sext_4i16_to_4i32: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; X32-SSE2-NEXT: psrad $16, %xmm0 +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: load_sext_4i16_to_4i32: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -4609,6 +5517,33 @@ define <4 x i64> @load_sext_4i16_to_4i64(<4 x i16> *%ptr) { ; AVX512-NEXT: vpmovsxwq (%rdi), %ymm0 ; AVX512-NEXT: retq ; +; X32-SSE2-LABEL: load_sext_4i16_to_4i64: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE2-NEXT: movswl 2(%eax), %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm1 +; X32-SSE2-NEXT: sarl $31, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm0 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X32-SSE2-NEXT: movswl (%eax), %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm0 +; X32-SSE2-NEXT: sarl $31, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm2 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X32-SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X32-SSE2-NEXT: movswl 6(%eax), %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm2 +; X32-SSE2-NEXT: sarl $31, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm1 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; X32-SSE2-NEXT: movswl 4(%eax), %eax +; X32-SSE2-NEXT: movd %eax, %xmm1 +; X32-SSE2-NEXT: sarl $31, %eax +; X32-SSE2-NEXT: movd %eax, %xmm3 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; X32-SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: load_sext_4i16_to_4i64: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -4665,6 +5600,17 @@ define <8 x i32> @load_sext_8i16_to_8i32(<8 x i16> *%ptr) { ; AVX512-NEXT: vpmovsxwd (%rdi), %ymm0 ; AVX512-NEXT: retq ; +; X32-SSE2-LABEL: load_sext_8i16_to_8i32: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X32-SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; X32-SSE2-NEXT: psrad $16, %xmm0 +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] +; X32-SSE2-NEXT: psrad $16, %xmm1 +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: load_sext_8i16_to_8i32: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -4704,6 +5650,15 @@ define <2 x i64> @load_sext_2i32_to_2i64(<2 x i32> *%ptr) { ; AVX-NEXT: vpmovsxdq (%rdi), %xmm0 ; AVX-NEXT: retq ; +; X32-SSE2-LABEL: load_sext_2i32_to_2i64: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X32-SSE2-NEXT: 
movdqa %xmm0, %xmm1 +; X32-SSE2-NEXT: psrad $31, %xmm1 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: load_sext_2i32_to_2i64: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -4763,6 +5718,19 @@ define <4 x i64> @load_sext_4i32_to_4i64(<4 x i32> *%ptr) { ; AVX512-NEXT: vpmovsxdq (%rdi), %ymm0 ; AVX512-NEXT: retq ; +; X32-SSE2-LABEL: load_sext_4i32_to_4i64: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE2-NEXT: movdqa (%eax), %xmm0 +; X32-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X32-SSE2-NEXT: psrad $31, %xmm2 +; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X32-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X32-SSE2-NEXT: psrad $31, %xmm2 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: load_sext_4i32_to_4i64: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -4802,6 +5770,13 @@ define i32 @sext_2i8_to_i32(<16 x i8> %A) nounwind uwtable readnone ssp { ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: retq ; +; X32-SSE2-LABEL: sext_2i8_to_i32: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE2-NEXT: psraw $8, %xmm0 +; X32-SSE2-NEXT: movd %xmm0, %eax +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: sext_2i8_to_i32: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: pmovsxbw %xmm0, %xmm0 @@ -4875,6 +5850,19 @@ define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) { ; AVX512-NEXT: vpmovsxdq %xmm0, %ymm0 ; AVX512-NEXT: retq ; +; X32-SSE2-LABEL: sext_4i1_to_4i64: +; X32-SSE2: # %bb.0: +; X32-SSE2-NEXT: pslld $31, %xmm0 +; X32-SSE2-NEXT: psrad $31, %xmm0 +; X32-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X32-SSE2-NEXT: psrad $31, %xmm2 +; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X32-SSE2-NEXT: punpckldq 
{{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X32-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X32-SSE2-NEXT: psrad $31, %xmm2 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: sext_4i1_to_4i64: ; X32-SSE41: # %bb.0: ; X32-SSE41-NEXT: pslld $31, %xmm0 @@ -4949,6 +5937,23 @@ define <4 x i64> @sext_4i8_to_4i64(<4 x i8> %mask) { ; AVX512-NEXT: vpmovsxbq %xmm0, %ymm0 ; AVX512-NEXT: retq ; +; X32-SSE2-LABEL: sext_4i8_to_4i64: +; X32-SSE2: # %bb.0: +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; X32-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X32-SSE2-NEXT: psrad $31, %xmm1 +; X32-SSE2-NEXT: psrad $24, %xmm2 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; X32-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; X32-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X32-SSE2-NEXT: psrad $31, %xmm0 +; X32-SSE2-NEXT: psrad $24, %xmm1 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X32-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: sext_4i8_to_4i64: ; X32-SSE41: # %bb.0: ; X32-SSE41-NEXT: pmovsxbq %xmm0, %xmm2 @@ -5013,6 +6018,24 @@ define <32 x i8> @sext_32xi1_to_32xi8(<32 x i16> %c1, <32 x i16> %c2)nounwind { ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512BW-NEXT: retq ; +; X32-SSE2-LABEL: sext_32xi1_to_32xi8: +; X32-SSE2: # %bb.0: +; X32-SSE2-NEXT: pushl %ebp +; X32-SSE2-NEXT: movl %esp, %ebp +; X32-SSE2-NEXT: andl $-16, %esp +; X32-SSE2-NEXT: subl $16, %esp +; X32-SSE2-NEXT: movdqa 8(%ebp), %xmm3 +; X32-SSE2-NEXT: pcmpeqw 40(%ebp), %xmm1 +; X32-SSE2-NEXT: pcmpeqw 24(%ebp), %xmm0 +; X32-SSE2-NEXT: packsswb %xmm1, %xmm0 +; X32-SSE2-NEXT: pcmpeqw 72(%ebp), %xmm3 +; X32-SSE2-NEXT: 
pcmpeqw 56(%ebp), %xmm2 +; X32-SSE2-NEXT: packsswb %xmm3, %xmm2 +; X32-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X32-SSE2-NEXT: movl %ebp, %esp +; X32-SSE2-NEXT: popl %ebp +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: sext_32xi1_to_32xi8: ; X32-SSE41: # %bb.0: ; X32-SSE41-NEXT: pushl %ebp @@ -5072,6 +6095,17 @@ define <2 x i32> @sext_2i8_to_2i32(<2 x i8>* %addr) { ; AVX-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; AVX-NEXT: retq ; +; X32-SSE2-LABEL: sext_2i8_to_2i32: +; X32-SSE2: # %bb.0: +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE2-NEXT: movzwl (%eax), %eax +; X32-SSE2-NEXT: movd %eax, %xmm0 +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; X32-SSE2-NEXT: psrad $24, %xmm0 +; X32-SSE2-NEXT: paddd %xmm0, %xmm0 +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: sext_2i8_to_2i32: ; X32-SSE41: # %bb.0: ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax diff --git a/llvm/test/CodeGen/X86/vector-sext.ll b/llvm/test/CodeGen/X86/vector-sext.ll index d2fd68325af..096952c7c08 100644 --- a/llvm/test/CodeGen/X86/vector-sext.ll +++ b/llvm/test/CodeGen/X86/vector-sext.ll @@ -7,7 +7,8 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW ; -; Just one 32-bit run to make sure we do reasonable things there. +; Just two 32-bit runs to make sure we do reasonable things there. 
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X32-SSE2 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=X32-SSE41 define <8 x i16> @sext_16i8_to_8i16(<16 x i8> %A) nounwind uwtable readnone ssp { @@ -33,6 +34,12 @@ define <8 x i16> @sext_16i8_to_8i16(<16 x i8> %A) nounwind uwtable readnone ssp ; AVX-NEXT: vpmovsxbw %xmm0, %xmm0 ; AVX-NEXT: retq ; +; X32-SSE2-LABEL: sext_16i8_to_8i16: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE2-NEXT: psraw $8, %xmm0 +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: sext_16i8_to_8i16: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: pmovsxbw %xmm0, %xmm0 @@ -88,6 +95,15 @@ define <16 x i16> @sext_16i8_to_16i16(<16 x i8> %A) nounwind uwtable readnone ss ; AVX512-NEXT: vpmovsxbw %xmm0, %ymm0 ; AVX512-NEXT: retq ; +; X32-SSE2-LABEL: sext_16i8_to_16i16: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE2-NEXT: psraw $8, %xmm0 +; X32-SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; X32-SSE2-NEXT: psraw $8, %xmm1 +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: sext_16i8_to_16i16: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: pmovsxbw %xmm0, %xmm2 @@ -178,6 +194,21 @@ define <32 x i16> @sext_32i8_to_32i16(<32 x i8> %A) nounwind uwtable readnone ss ; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0 ; AVX512BW-NEXT: retq ; +; X32-SSE2-LABEL: sext_32i8_to_32i16: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: movdqa %xmm1, %xmm3 +; X32-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE2-NEXT: psraw $8, %xmm0 +; X32-SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; X32-SSE2-NEXT: psraw $8, %xmm1 +; X32-SSE2-NEXT: movdqa %xmm3, %xmm2 +; 
X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; X32-SSE2-NEXT: psraw $8, %xmm2 +; X32-SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; X32-SSE2-NEXT: psraw $8, %xmm3 +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: sext_32i8_to_32i16: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: pmovsxbw %xmm0, %xmm5 @@ -219,6 +250,13 @@ define <4 x i32> @sext_16i8_to_4i32(<16 x i8> %A) nounwind uwtable readnone ssp ; AVX-NEXT: vpmovsxbd %xmm0, %xmm0 ; AVX-NEXT: retq ; +; X32-SSE2-LABEL: sext_16i8_to_4i32: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; X32-SSE2-NEXT: psrad $24, %xmm0 +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: sext_16i8_to_4i32: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: pmovsxbd %xmm0, %xmm0 @@ -276,6 +314,16 @@ define <8 x i32> @sext_16i8_to_8i32(<16 x i8> %A) nounwind uwtable readnone ssp ; AVX512-NEXT: vpmovsxbd %xmm0, %ymm0 ; AVX512-NEXT: retq ; +; X32-SSE2-LABEL: sext_16i8_to_8i32: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; X32-SSE2-NEXT: psrad $24, %xmm2 +; X32-SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; X32-SSE2-NEXT: psrad $24, %xmm1 +; X32-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: sext_16i8_to_8i32: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: pmovsxbd %xmm0, %xmm2 @@ -361,6 +409,22 @@ define <16 x i32> @sext_16i8_to_16i32(<16 x i8> %A) nounwind uwtable readnone ss ; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0 ; AVX512-NEXT: retq ; +; X32-SSE2-LABEL: sext_16i8_to_16i32: +; X32-SSE2: # 
%bb.0: # %entry +; X32-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; X32-SSE2-NEXT: psrad $24, %xmm4 +; X32-SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; X32-SSE2-NEXT: psrad $24, %xmm1 +; X32-SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; X32-SSE2-NEXT: psrad $24, %xmm2 +; X32-SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; X32-SSE2-NEXT: psrad $24, %xmm3 +; X32-SSE2-NEXT: movdqa %xmm4, %xmm0 +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: sext_16i8_to_16i32: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: pmovsxbd %xmm0, %xmm4 @@ -408,6 +472,16 @@ define <2 x i64> @sext_16i8_to_2i64(<16 x i8> %A) nounwind uwtable readnone ssp ; AVX-NEXT: vpmovsxbq %xmm0, %xmm0 ; AVX-NEXT: retq ; +; X32-SSE2-LABEL: sext_16i8_to_2i64: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; X32-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X32-SSE2-NEXT: psrad $31, %xmm1 +; X32-SSE2-NEXT: psrad $24, %xmm0 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: sext_16i8_to_2i64: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: pmovsxbq %xmm0, %xmm0 @@ -479,6 +553,23 @@ define <4 x i64> @sext_16i8_to_4i64(<16 x i8> %A) nounwind uwtable readnone ssp ; AVX512-NEXT: vpmovsxbq %xmm0, %ymm0 ; AVX512-NEXT: retq ; +; X32-SSE2-LABEL: sext_16i8_to_4i64: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = 
xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; X32-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X32-SSE2-NEXT: psrad $31, %xmm1 +; X32-SSE2-NEXT: psrad $24, %xmm2 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; X32-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; X32-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X32-SSE2-NEXT: psrad $31, %xmm0 +; X32-SSE2-NEXT: psrad $24, %xmm1 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X32-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: sext_16i8_to_4i64: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: pmovsxbq %xmm0, %xmm2 @@ -593,6 +684,36 @@ define <8 x i64> @sext_16i8_to_8i64(<16 x i8> %A) nounwind uwtable readnone ssp ; AVX512-NEXT: vpmovsxbq %xmm0, %zmm0 ; AVX512-NEXT: retq ; +; X32-SSE2-LABEL: sext_16i8_to_8i64: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,2,3] +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; X32-SSE2-NEXT: movdqa %xmm4, %xmm1 +; X32-SSE2-NEXT: psrad $31, %xmm1 +; X32-SSE2-NEXT: psrad $24, %xmm4 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; X32-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; X32-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X32-SSE2-NEXT: psrad $31, %xmm0 +; X32-SSE2-NEXT: psrad $24, %xmm1 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = 
xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; X32-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X32-SSE2-NEXT: psrad $31, %xmm0 +; X32-SSE2-NEXT: psrad $24, %xmm2 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; X32-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3] +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; X32-SSE2-NEXT: movdqa %xmm3, %xmm0 +; X32-SSE2-NEXT: psrad $31, %xmm0 +; X32-SSE2-NEXT: psrad $24, %xmm3 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; X32-SSE2-NEXT: movdqa %xmm4, %xmm0 +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: sext_16i8_to_8i64: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: pmovsxbq %xmm0, %xmm4 @@ -634,6 +755,12 @@ define <4 x i32> @sext_8i16_to_4i32(<8 x i16> %A) nounwind uwtable readnone ssp ; AVX-NEXT: vpmovsxwd %xmm0, %xmm0 ; AVX-NEXT: retq ; +; X32-SSE2-LABEL: sext_8i16_to_4i32: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; X32-SSE2-NEXT: psrad $16, %xmm0 +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: sext_8i16_to_4i32: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: pmovsxwd %xmm0, %xmm0 @@ -689,6 +816,15 @@ define <8 x i32> @sext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp ; AVX512-NEXT: vpmovsxwd %xmm0, %ymm0 ; AVX512-NEXT: retq ; +; X32-SSE2-LABEL: sext_8i16_to_8i32: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; X32-SSE2-NEXT: psrad $16, %xmm2 +; X32-SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; X32-SSE2-NEXT: psrad $16, %xmm1 +; X32-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: sext_8i16_to_8i32: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: pmovsxwd %xmm0, %xmm2 @@ -769,6 +905,20 @@ define <16 x i32> @sext_16i16_to_16i32(<16 x i16> %A) 
nounwind uwtable readnone ; AVX512-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512-NEXT: retq ; +; X32-SSE2-LABEL: sext_16i16_to_16i32: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; X32-SSE2-NEXT: psrad $16, %xmm4 +; X32-SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; X32-SSE2-NEXT: psrad $16, %xmm5 +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; X32-SSE2-NEXT: psrad $16, %xmm2 +; X32-SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; X32-SSE2-NEXT: psrad $16, %xmm3 +; X32-SSE2-NEXT: movdqa %xmm4, %xmm0 +; X32-SSE2-NEXT: movdqa %xmm5, %xmm1 +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: sext_16i16_to_16i32: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: pmovsxwd %xmm0, %xmm5 @@ -814,6 +964,15 @@ define <2 x i64> @sext_8i16_to_2i64(<8 x i16> %A) nounwind uwtable readnone ssp ; AVX-NEXT: vpmovsxwq %xmm0, %xmm0 ; AVX-NEXT: retq ; +; X32-SSE2-LABEL: sext_8i16_to_2i64: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; X32-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X32-SSE2-NEXT: psrad $31, %xmm1 +; X32-SSE2-NEXT: psrad $16, %xmm0 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: sext_8i16_to_2i64: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: pmovsxwq %xmm0, %xmm0 @@ -881,6 +1040,21 @@ define <4 x i64> @sext_8i16_to_4i64(<8 x i16> %A) nounwind uwtable readnone ssp ; AVX512-NEXT: vpmovsxwq %xmm0, %ymm0 ; AVX512-NEXT: retq ; +; X32-SSE2-LABEL: sext_8i16_to_4i64: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; X32-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X32-SSE2-NEXT: psrad $31, %xmm1 +; X32-SSE2-NEXT: psrad $16, %xmm2 
+; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; X32-SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7] +; X32-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X32-SSE2-NEXT: psrad $31, %xmm0 +; X32-SSE2-NEXT: psrad $16, %xmm1 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X32-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: sext_8i16_to_4i64: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: pmovsxwq %xmm0, %xmm2 @@ -986,6 +1160,32 @@ define <8 x i64> @sext_8i16_to_8i64(<8 x i16> %A) nounwind uwtable readnone ssp ; AVX512-NEXT: vpmovsxwq %xmm0, %zmm0 ; AVX512-NEXT: retq ; +; X32-SSE2-LABEL: sext_8i16_to_8i64: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; X32-SSE2-NEXT: movdqa %xmm4, %xmm1 +; X32-SSE2-NEXT: psrad $31, %xmm1 +; X32-SSE2-NEXT: psrad $16, %xmm4 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; X32-SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; X32-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X32-SSE2-NEXT: psrad $31, %xmm1 +; X32-SSE2-NEXT: psrad $16, %xmm2 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; X32-SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7] +; X32-SSE2-NEXT: movdqa %xmm1, %xmm3 +; X32-SSE2-NEXT: psrad $31, %xmm3 +; X32-SSE2-NEXT: psrad $16, %xmm1 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; X32-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; X32-SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,2,2,3,4,5,6,7] +; X32-SSE2-NEXT: movdqa %xmm3, %xmm0 +; X32-SSE2-NEXT: psrad $31, %xmm0 +; X32-SSE2-NEXT: psrad $16, %xmm3 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; X32-SSE2-NEXT: movdqa %xmm4, %xmm0 +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: sext_8i16_to_8i64: ; X32-SSE41: # %bb.0: # %entry ; 
X32-SSE41-NEXT: pmovsxwq %xmm0, %xmm4 @@ -1027,6 +1227,13 @@ define <2 x i64> @sext_4i32_to_2i64(<4 x i32> %A) nounwind uwtable readnone ssp ; AVX-NEXT: vpmovsxdq %xmm0, %xmm0 ; AVX-NEXT: retq ; +; X32-SSE2-LABEL: sext_4i32_to_2i64: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X32-SSE2-NEXT: psrad $31, %xmm1 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: sext_4i32_to_2i64: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm0 @@ -1086,6 +1293,17 @@ define <4 x i64> @sext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp ; AVX512-NEXT: vpmovsxdq %xmm0, %ymm0 ; AVX512-NEXT: retq ; +; X32-SSE2-LABEL: sext_4i32_to_4i64: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X32-SSE2-NEXT: psrad $31, %xmm2 +; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X32-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X32-SSE2-NEXT: psrad $31, %xmm2 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: sext_4i32_to_4i64: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm2 @@ -1176,6 +1394,25 @@ define <8 x i64> @sext_8i32_to_8i64(<8 x i32> %A) nounwind uwtable readnone ssp ; AVX512-NEXT: vpmovsxdq %ymm0, %zmm0 ; AVX512-NEXT: retq ; +; X32-SSE2-LABEL: sext_8i32_to_8i64: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X32-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X32-SSE2-NEXT: psrad $31, %xmm3 +; X32-SSE2-NEXT: movdqa %xmm1, %xmm4 +; X32-SSE2-NEXT: psrad $31, %xmm4 +; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; X32-SSE2-NEXT: movdqa %xmm1, %xmm3 +; X32-SSE2-NEXT: psrad $31, %xmm3 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; X32-SSE2-NEXT: pshufd {{.*#+}} 
xmm3 = xmm2[2,3,0,1] +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; X32-SSE2-NEXT: movdqa %xmm3, %xmm4 +; X32-SSE2-NEXT: psrad $31, %xmm4 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: sext_8i32_to_8i64: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm5 @@ -1250,6 +1487,22 @@ define <2 x i64> @load_sext_2i1_to_2i64(<2 x i1> *%ptr) { ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; +; X32-SSE2-LABEL: load_sext_2i1_to_2i64: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE2-NEXT: movzbl (%eax), %eax +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: shll $30, %ecx +; X32-SSE2-NEXT: sarl $31, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm0 +; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] +; X32-SSE2-NEXT: shll $31, %eax +; X32-SSE2-NEXT: sarl $31, %eax +; X32-SSE2-NEXT: movd %eax, %xmm0 +; X32-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; X32-SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: load_sext_2i1_to_2i64: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -1305,6 +1558,19 @@ define <2 x i64> @load_sext_2i8_to_2i64(<2 x i8> *%ptr) { ; AVX-NEXT: vpmovsxbq (%rdi), %xmm0 ; AVX-NEXT: retq ; +; X32-SSE2-LABEL: load_sext_2i8_to_2i64: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE2-NEXT: movzwl (%eax), %eax +; X32-SSE2-NEXT: movd %eax, %xmm0 +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; X32-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X32-SSE2-NEXT: psrad $31, %xmm1 +; X32-SSE2-NEXT: psrad $24, %xmm0 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: load_sext_2i8_to_2i64: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: 
movl {{[0-9]+}}(%esp), %eax @@ -1441,6 +1707,30 @@ define <4 x i32> @load_sext_4i1_to_4i32(<4 x i1> *%ptr) { ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; +; X32-SSE2-LABEL: load_sext_4i1_to_4i32: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE2-NEXT: movl (%eax), %eax +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: shll $28, %ecx +; X32-SSE2-NEXT: sarl $31, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm0 +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: shll $29, %ecx +; X32-SSE2-NEXT: sarl $31, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm1 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: shll $30, %ecx +; X32-SSE2-NEXT: sarl $31, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm2 +; X32-SSE2-NEXT: shll $31, %eax +; X32-SSE2-NEXT: sarl $31, %eax +; X32-SSE2-NEXT: movd %eax, %xmm0 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X32-SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: load_sext_4i1_to_4i32: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -1494,6 +1784,15 @@ define <4 x i32> @load_sext_4i8_to_4i32(<4 x i8> *%ptr) { ; AVX-NEXT: vpmovsxbd (%rdi), %xmm0 ; AVX-NEXT: retq ; +; X32-SSE2-LABEL: load_sext_4i8_to_4i32: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; X32-SSE2-NEXT: psrad $24, %xmm0 +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: load_sext_4i8_to_4i32: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -1644,6 +1943,33 @@ define <4 x i64> @load_sext_4i1_to_4i64(<4 x i1> *%ptr) { ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512BW-NEXT: retq ; +; 
X32-SSE2-LABEL: load_sext_4i1_to_4i64: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE2-NEXT: movzbl (%eax), %eax +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: shrl $3, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm0 +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: shrl $2, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm1 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X32-SSE2-NEXT: movd %eax, %xmm2 +; X32-SSE2-NEXT: shrl %eax +; X32-SSE2-NEXT: movd %eax, %xmm0 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; X32-SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; X32-SSE2-NEXT: pand {{\.LCPI.*}}, %xmm2 +; X32-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,1,3] +; X32-SSE2-NEXT: psllq $63, %xmm0 +; X32-SSE2-NEXT: psrad $31, %xmm0 +; X32-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,1,3,3] +; X32-SSE2-NEXT: psllq $63, %xmm1 +; X32-SSE2-NEXT: psrad $31, %xmm1 +; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: load_sext_4i1_to_4i64: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -1727,6 +2053,33 @@ define <4 x i64> @load_sext_4i8_to_4i64(<4 x i8> *%ptr) { ; AVX512-NEXT: vpmovsxbq (%rdi), %ymm0 ; AVX512-NEXT: retq ; +; X32-SSE2-LABEL: load_sext_4i8_to_4i64: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE2-NEXT: movsbl 1(%eax), %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm1 +; X32-SSE2-NEXT: sarl $31, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm0 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X32-SSE2-NEXT: movsbl (%eax), %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm0 +; X32-SSE2-NEXT: sarl $31, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm2 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X32-SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X32-SSE2-NEXT: movsbl 
3(%eax), %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm2 +; X32-SSE2-NEXT: sarl $31, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm1 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; X32-SSE2-NEXT: movsbl 2(%eax), %eax +; X32-SSE2-NEXT: movd %eax, %xmm1 +; X32-SSE2-NEXT: sarl $31, %eax +; X32-SSE2-NEXT: movd %eax, %xmm3 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; X32-SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: load_sext_4i8_to_4i64: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -1784,6 +2137,22 @@ define <2 x i64> @load_sext_4i8_to_4i64_extract(<4 x i8> *%ptr) { ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; +; X32-SSE2-LABEL: load_sext_4i8_to_4i64_extract: +; X32-SSE2: # %bb.0: +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE2-NEXT: movsbl 3(%eax), %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm1 +; X32-SSE2-NEXT: sarl $31, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm0 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X32-SSE2-NEXT: movsbl 2(%eax), %eax +; X32-SSE2-NEXT: movd %eax, %xmm0 +; X32-SSE2-NEXT: sarl $31, %eax +; X32-SSE2-NEXT: movd %eax, %xmm2 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X32-SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: load_sext_4i8_to_4i64_extract: ; X32-SSE41: # %bb.0: ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -2004,6 +2373,49 @@ define <8 x i16> @load_sext_8i1_to_8i16(<8 x i1> *%ptr) { ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; +; X32-SSE2-LABEL: load_sext_8i1_to_8i16: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE2-NEXT: movsbl (%eax), %eax +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: shrl $7, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm0 +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: shll $25, %ecx +; X32-SSE2-NEXT: sarl $31, 
%ecx +; X32-SSE2-NEXT: movd %ecx, %xmm1 +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: shll $26, %ecx +; X32-SSE2-NEXT: sarl $31, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm0 +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: shll $27, %ecx +; X32-SSE2-NEXT: sarl $31, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm2 +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: shll $28, %ecx +; X32-SSE2-NEXT: sarl $31, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm0 +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: shll $29, %ecx +; X32-SSE2-NEXT: sarl $31, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm1 +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: shll $30, %ecx +; X32-SSE2-NEXT: sarl $31, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm3 +; X32-SSE2-NEXT: shll $31, %eax +; X32-SSE2-NEXT: sarl $31, %eax +; X32-SSE2-NEXT: movd %eax, %xmm0 +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X32-SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: load_sext_8i1_to_8i16: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -2070,6 +2482,14 @@ define <8 x i16> @load_sext_8i8_to_8i16(<8 x i8> *%ptr) { ; AVX-NEXT: vpmovsxbw (%rdi), %xmm0 ; AVX-NEXT: retq ; +; X32-SSE2-LABEL: load_sext_8i8_to_8i16: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; 
X32-SSE2-NEXT: psraw $8, %xmm0 +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: load_sext_8i8_to_8i16: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -2163,6 +2583,55 @@ define <8 x i64> @load_sext_8i8_to_8i64(<8 x i8> *%ptr) { ; AVX512-NEXT: vpmovsxbq (%rdi), %zmm0 ; AVX512-NEXT: retq ; +; X32-SSE2-LABEL: load_sext_8i8_to_8i64: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE2-NEXT: movsbl 1(%eax), %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm1 +; X32-SSE2-NEXT: sarl $31, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm0 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X32-SSE2-NEXT: movsbl (%eax), %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm0 +; X32-SSE2-NEXT: sarl $31, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm2 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X32-SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X32-SSE2-NEXT: movsbl 3(%eax), %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm2 +; X32-SSE2-NEXT: sarl $31, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm1 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; X32-SSE2-NEXT: movsbl 2(%eax), %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm1 +; X32-SSE2-NEXT: sarl $31, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm3 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; X32-SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; X32-SSE2-NEXT: movsbl 5(%eax), %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm3 +; X32-SSE2-NEXT: sarl $31, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm2 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; X32-SSE2-NEXT: movsbl 4(%eax), %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm2 +; X32-SSE2-NEXT: sarl $31, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm4 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; X32-SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; X32-SSE2-NEXT: movsbl 7(%eax), %ecx +; X32-SSE2-NEXT: movd %ecx, 
%xmm4 +; X32-SSE2-NEXT: sarl $31, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm3 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; X32-SSE2-NEXT: movsbl 6(%eax), %eax +; X32-SSE2-NEXT: movd %eax, %xmm3 +; X32-SSE2-NEXT: sarl $31, %eax +; X32-SSE2-NEXT: movd %eax, %xmm5 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; X32-SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: load_sext_8i8_to_8i64: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -2398,6 +2867,53 @@ define <8 x i32> @load_sext_8i1_to_8i32(<8 x i1> *%ptr) { ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512BW-NEXT: retq ; +; X32-SSE2-LABEL: load_sext_8i1_to_8i32: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE2-NEXT: movzbl (%eax), %eax +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: shrl $7, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm0 +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: shrl $6, %ecx +; X32-SSE2-NEXT: andl $1, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm2 +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: shrl $5, %ecx +; X32-SSE2-NEXT: andl $1, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm0 +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: shrl $4, %ecx +; X32-SSE2-NEXT: andl $1, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm1 +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: shrl $3, %ecx +; X32-SSE2-NEXT: andl $1, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm0 +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: shrl $2, %ecx +; X32-SSE2-NEXT: andl $1, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm2 +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = 
xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: andl $1, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm0 +; X32-SSE2-NEXT: shrl %eax +; X32-SSE2-NEXT: andl $1, %eax +; X32-SSE2-NEXT: movd %eax, %xmm3 +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; X32-SSE2-NEXT: pslld $31, %xmm0 +; X32-SSE2-NEXT: psrad $31, %xmm0 +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; X32-SSE2-NEXT: pslld $31, %xmm1 +; X32-SSE2-NEXT: psrad $31, %xmm1 +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: load_sext_8i1_to_8i32: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -2494,6 +3010,19 @@ define <8 x i32> @load_sext_8i8_to_8i32(<8 x i8> *%ptr) { ; AVX512-NEXT: vpmovsxbd (%rdi), %ymm0 ; AVX512-NEXT: retq ; +; X32-SSE2-LABEL: load_sext_8i8_to_8i32: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; X32-SSE2-NEXT: psrad $24, %xmm0 +; X32-SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] +; X32-SSE2-NEXT: psrad $24, %xmm1 +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: load_sext_8i8_to_8i32: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -2909,6 +3438,97 @@ define <16 x i8> @load_sext_16i1_to_16i8(<16 x i1> *%ptr) nounwind readnone { ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; +; X32-SSE2-LABEL: load_sext_16i1_to_16i8: +; 
X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: pushl %ebp +; X32-SSE2-NEXT: pushl %ebx +; X32-SSE2-NEXT: pushl %edi +; X32-SSE2-NEXT: pushl %esi +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE2-NEXT: movswl (%eax), %eax +; X32-SSE2-NEXT: movl %eax, %edx +; X32-SSE2-NEXT: movl %eax, %ebp +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: movl %eax, %esi +; X32-SSE2-NEXT: movl %eax, %edi +; X32-SSE2-NEXT: movl %eax, %ebx +; X32-SSE2-NEXT: shrl $15, %ebx +; X32-SSE2-NEXT: movd %ebx, %xmm2 +; X32-SSE2-NEXT: movl %eax, %ebx +; X32-SSE2-NEXT: shll $17, %edx +; X32-SSE2-NEXT: sarl $31, %edx +; X32-SSE2-NEXT: movd %edx, %xmm0 +; X32-SSE2-NEXT: movl %eax, %edx +; X32-SSE2-NEXT: shll $18, %ebp +; X32-SSE2-NEXT: sarl $31, %ebp +; X32-SSE2-NEXT: movd %ebp, %xmm1 +; X32-SSE2-NEXT: movl %eax, %ebp +; X32-SSE2-NEXT: shll $19, %ecx +; X32-SSE2-NEXT: sarl $31, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm3 +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: shll $20, %esi +; X32-SSE2-NEXT: sarl $31, %esi +; X32-SSE2-NEXT: movd %esi, %xmm4 +; X32-SSE2-NEXT: movl %eax, %esi +; X32-SSE2-NEXT: shll $21, %edi +; X32-SSE2-NEXT: sarl $31, %edi +; X32-SSE2-NEXT: movd %edi, %xmm6 +; X32-SSE2-NEXT: movl %eax, %edi +; X32-SSE2-NEXT: shll $22, %ebx +; X32-SSE2-NEXT: sarl $31, %ebx +; X32-SSE2-NEXT: movd %ebx, %xmm7 +; X32-SSE2-NEXT: movl %eax, %ebx +; X32-SSE2-NEXT: shll $23, %edx +; X32-SSE2-NEXT: sarl $31, %edx +; X32-SSE2-NEXT: movd %edx, %xmm5 +; X32-SSE2-NEXT: movl %eax, %edx +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; X32-SSE2-NEXT: shll $28, %ebp +; X32-SSE2-NEXT: sarl $31, %ebp +; X32-SSE2-NEXT: movd %ebp, %xmm2 +; X32-SSE2-NEXT: movl %eax, %ebp +; X32-SSE2-NEXT: movsbl %al, %eax +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; 
X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] +; X32-SSE2-NEXT: shll $29, %ecx +; X32-SSE2-NEXT: sarl $31, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm1 +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; X32-SSE2-NEXT: shll $30, %esi +; X32-SSE2-NEXT: sarl $31, %esi +; X32-SSE2-NEXT: movd %esi, %xmm4 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; X32-SSE2-NEXT: shll $31, %edi +; X32-SSE2-NEXT: sarl $31, %edi +; X32-SSE2-NEXT: movd %edi, %xmm0 +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; X32-SSE2-NEXT: shll $26, %ebx +; X32-SSE2-NEXT: sarl $31, %ebx +; X32-SSE2-NEXT: movd %ebx, %xmm2 +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; X32-SSE2-NEXT: shll $27, %edx +; X32-SSE2-NEXT: sarl $31, %edx +; X32-SSE2-NEXT: movd %edx, %xmm3 +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; X32-SSE2-NEXT: shll $25, %ebp +; X32-SSE2-NEXT: sarl $31, %ebp +; X32-SSE2-NEXT: movd %ebp, %xmm1 +; X32-SSE2-NEXT: shrl $7, %eax +; X32-SSE2-NEXT: movd %eax, %xmm2 +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = 
xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; X32-SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0] +; X32-SSE2-NEXT: popl %esi +; X32-SSE2-NEXT: popl %edi +; X32-SSE2-NEXT: popl %ebx +; X32-SSE2-NEXT: popl %ebp +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: load_sext_16i1_to_16i8: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -3434,6 +4054,93 @@ define <16 x i16> @load_sext_16i1_to_16i16(<16 x i1> *%ptr) { ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512BW-NEXT: retq ; +; X32-SSE2-LABEL: load_sext_16i1_to_16i16: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE2-NEXT: movzwl (%eax), %eax +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: shrl $15, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm0 +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: shrl $14, %ecx +; X32-SSE2-NEXT: andl $1, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm1 +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: shrl $13, %ecx +; X32-SSE2-NEXT: andl $1, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm0 +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: shrl $12, %ecx +; X32-SSE2-NEXT: andl $1, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm2 +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: shrl $11, %ecx +; 
X32-SSE2-NEXT: andl $1, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm0 +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: shrl $10, %ecx +; X32-SSE2-NEXT: andl $1, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm3 +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: shrl $9, %ecx +; X32-SSE2-NEXT: andl $1, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm0 +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: shrl $8, %ecx +; X32-SSE2-NEXT: andl $1, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm1 +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: shrl $7, %ecx +; X32-SSE2-NEXT: andl $1, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm0 +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: shrl $6, %ecx +; X32-SSE2-NEXT: andl $1, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm2 +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: shrl $5, %ecx +; X32-SSE2-NEXT: andl $1, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm0 +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: shrl $4, %ecx +; X32-SSE2-NEXT: andl $1, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm3 +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; X32-SSE2-NEXT: movl 
%eax, %ecx +; X32-SSE2-NEXT: shrl $3, %ecx +; X32-SSE2-NEXT: andl $1, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm0 +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: shrl $2, %ecx +; X32-SSE2-NEXT: andl $1, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm2 +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: andl $1, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm0 +; X32-SSE2-NEXT: shrl %eax +; X32-SSE2-NEXT: andl $1, %eax +; X32-SSE2-NEXT: movd %eax, %xmm4 +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE2-NEXT: psllw $15, %xmm0 +; X32-SSE2-NEXT: psraw $15, %xmm0 +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; X32-SSE2-NEXT: psllw $15, %xmm1 +; X32-SSE2-NEXT: psraw $15, %xmm1 +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: load_sext_16i1_to_16i16: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -4289,6 +4996,179 @@ define <32 x i8> @load_sext_32i1_to_32i8(<32 x i1> *%ptr) nounwind readnone { ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512BW-NEXT: retq ; +; X32-SSE2-LABEL: load_sext_32i1_to_32i8: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: pushl %ebp +; X32-SSE2-NEXT: pushl %ebx +; X32-SSE2-NEXT: pushl %edi +; X32-SSE2-NEXT: pushl %esi +; X32-SSE2-NEXT: subl $28, %esp +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE2-NEXT: movswl (%eax), 
%edx +; X32-SSE2-NEXT: movl %edx, %ebp +; X32-SSE2-NEXT: movl %edx, %esi +; X32-SSE2-NEXT: movl %edx, %edi +; X32-SSE2-NEXT: movl %edx, %ebx +; X32-SSE2-NEXT: movl %edx, %ecx +; X32-SSE2-NEXT: shrl $15, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm0 +; X32-SSE2-NEXT: movl %edx, %ecx +; X32-SSE2-NEXT: shll $17, %ebp +; X32-SSE2-NEXT: sarl $31, %ebp +; X32-SSE2-NEXT: movd %ebp, %xmm4 +; X32-SSE2-NEXT: movl %edx, %ebp +; X32-SSE2-NEXT: shll $18, %esi +; X32-SSE2-NEXT: sarl $31, %esi +; X32-SSE2-NEXT: movd %esi, %xmm1 +; X32-SSE2-NEXT: movl %edx, %esi +; X32-SSE2-NEXT: shll $19, %edi +; X32-SSE2-NEXT: sarl $31, %edi +; X32-SSE2-NEXT: movd %edi, %xmm2 +; X32-SSE2-NEXT: movl %edx, %edi +; X32-SSE2-NEXT: shll $20, %ebx +; X32-SSE2-NEXT: sarl $31, %ebx +; X32-SSE2-NEXT: movd %ebx, %xmm5 +; X32-SSE2-NEXT: movl %edx, %ebx +; X32-SSE2-NEXT: shll $21, %ecx +; X32-SSE2-NEXT: sarl $31, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm6 +; X32-SSE2-NEXT: movl %edx, %ecx +; X32-SSE2-NEXT: shll $22, %ebp +; X32-SSE2-NEXT: sarl $31, %ebp +; X32-SSE2-NEXT: movd %ebp, %xmm7 +; X32-SSE2-NEXT: movl %edx, %ebp +; X32-SSE2-NEXT: shll $23, %esi +; X32-SSE2-NEXT: sarl $31, %esi +; X32-SSE2-NEXT: movd %esi, %xmm3 +; X32-SSE2-NEXT: movl %edx, %esi +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; X32-SSE2-NEXT: shll $28, %edi +; X32-SSE2-NEXT: sarl $31, %edi +; X32-SSE2-NEXT: movd %edi, %xmm0 +; X32-SSE2-NEXT: movl %edx, %edi +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; X32-SSE2-NEXT: shll $29, %ebx +; X32-SSE2-NEXT: sarl $31, %ebx +; X32-SSE2-NEXT: movd %ebx, %xmm1 +; X32-SSE2-NEXT: movl %edx, %ebx +; X32-SSE2-NEXT: movsbl %dl, %edx +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; X32-SSE2-NEXT: 
punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; X32-SSE2-NEXT: shll $30, %ecx +; X32-SSE2-NEXT: sarl $31, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm2 +; X32-SSE2-NEXT: shll $31, %ebp +; X32-SSE2-NEXT: sarl $31, %ebp +; X32-SSE2-NEXT: movd %ebp, %xmm0 +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; X32-SSE2-NEXT: shll $26, %esi +; X32-SSE2-NEXT: sarl $31, %esi +; X32-SSE2-NEXT: movd %esi, %xmm7 +; X32-SSE2-NEXT: shll $27, %edi +; X32-SSE2-NEXT: sarl $31, %edi +; X32-SSE2-NEXT: movd %edi, %xmm2 +; X32-SSE2-NEXT: shll $25, %ebx +; X32-SSE2-NEXT: sarl $31, %ebx +; X32-SSE2-NEXT: movd %ebx, %xmm6 +; X32-SSE2-NEXT: shrl $7, %edx +; X32-SSE2-NEXT: movd %edx, %xmm5 +; X32-SSE2-NEXT: movswl 2(%eax), %eax +; X32-SSE2-NEXT: movl %eax, %edx +; X32-SSE2-NEXT: movl %eax, %ebp +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: movl %eax, %esi +; X32-SSE2-NEXT: movl %eax, %edi +; X32-SSE2-NEXT: movl %eax, %ebx +; X32-SSE2-NEXT: shrl $15, %ebx +; X32-SSE2-NEXT: movd %ebx, %xmm4 +; X32-SSE2-NEXT: movdqu %xmm4, (%esp) # 16-byte Spill +; X32-SSE2-NEXT: movl %eax, %ebx +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X32-SSE2-NEXT: shll $17, %edx +; X32-SSE2-NEXT: sarl $31, %edx 
+; X32-SSE2-NEXT: movd %edx, %xmm4 +; X32-SSE2-NEXT: movl %eax, %edx +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] +; X32-SSE2-NEXT: shll $18, %ebp +; X32-SSE2-NEXT: sarl $31, %ebp +; X32-SSE2-NEXT: movd %ebp, %xmm7 +; X32-SSE2-NEXT: movl %eax, %ebp +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; X32-SSE2-NEXT: shll $19, %ecx +; X32-SSE2-NEXT: sarl $31, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm5 +; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] +; X32-SSE2-NEXT: shll $20, %esi +; X32-SSE2-NEXT: sarl $31, %esi +; X32-SSE2-NEXT: movd %esi, %xmm6 +; X32-SSE2-NEXT: movl %eax, %esi +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X32-SSE2-NEXT: shll $21, %edi +; X32-SSE2-NEXT: sarl $31, %edi +; X32-SSE2-NEXT: movd %edi, %xmm1 +; X32-SSE2-NEXT: movl %eax, %edi +; X32-SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] +; X32-SSE2-NEXT: shll $22, %ebx +; X32-SSE2-NEXT: sarl $31, %ebx +; X32-SSE2-NEXT: movd %ebx, %xmm3 +; X32-SSE2-NEXT: movl %eax, %ebx +; X32-SSE2-NEXT: movdqu (%esp), %xmm2 # 16-byte Reload +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; X32-SSE2-NEXT: shll $23, %edx +; X32-SSE2-NEXT: sarl $31, %edx +; X32-SSE2-NEXT: movd %edx, %xmm2 +; X32-SSE2-NEXT: movl %eax, %edx +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] +; X32-SSE2-NEXT: shll $28, %ebp +; X32-SSE2-NEXT: sarl $31, %ebp +; X32-SSE2-NEXT: movd %ebp, %xmm7 +; X32-SSE2-NEXT: movl %eax, 
%ebp +; X32-SSE2-NEXT: movsbl %al, %eax +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; X32-SSE2-NEXT: shll $29, %ecx +; X32-SSE2-NEXT: sarl $31, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm3 +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; X32-SSE2-NEXT: shll $30, %esi +; X32-SSE2-NEXT: sarl $31, %esi +; X32-SSE2-NEXT: movd %esi, %xmm4 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; X32-SSE2-NEXT: shll $31, %edi +; X32-SSE2-NEXT: sarl $31, %edi +; X32-SSE2-NEXT: movd %edi, %xmm1 +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] +; X32-SSE2-NEXT: shll $26, %ebx +; X32-SSE2-NEXT: sarl $31, %ebx +; X32-SSE2-NEXT: movd %ebx, %xmm5 +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; X32-SSE2-NEXT: shll $27, %edx +; X32-SSE2-NEXT: sarl $31, %edx +; X32-SSE2-NEXT: movd %edx, %xmm3 +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] +; X32-SSE2-NEXT: shll $25, %ebp +; X32-SSE2-NEXT: sarl $31, %ebp +; X32-SSE2-NEXT: movd %ebp, %xmm4 +; X32-SSE2-NEXT: shrl $7, %eax +; X32-SSE2-NEXT: movd %eax, %xmm5 +; X32-SSE2-NEXT: 
punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; X32-SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; X32-SSE2-NEXT: addl $28, %esp +; X32-SSE2-NEXT: popl %esi +; X32-SSE2-NEXT: popl %edi +; X32-SSE2-NEXT: popl %ebx +; X32-SSE2-NEXT: popl %ebp +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: load_sext_32i1_to_32i8: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: pushl %esi @@ -4469,6 +5349,17 @@ define <16 x i16> @load_sext_16i8_to_16i16(<16 x i8> *%ptr) { ; AVX512-NEXT: vpmovsxbw (%rdi), %ymm0 ; AVX512-NEXT: retq ; +; X32-SSE2-LABEL: load_sext_16i8_to_16i16: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X32-SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE2-NEXT: psraw $8, %xmm0 +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE2-NEXT: psraw $8, %xmm1 +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: load_sext_16i8_to_16i16: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -4512,6 +5403,17 @@ define <2 x i64> @load_sext_2i16_to_2i64(<2 x i16> *%ptr) { ; AVX-NEXT: vpmovsxwq (%rdi), %xmm0 ; AVX-NEXT: retq ; +; X32-SSE2-LABEL: load_sext_2i16_to_2i64: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X32-SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] +; X32-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X32-SSE2-NEXT: psrad $31, %xmm1 +; X32-SSE2-NEXT: psrad $16, %xmm0 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; 
X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: load_sext_2i16_to_2i64: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -4548,6 +5450,14 @@ define <4 x i32> @load_sext_4i16_to_4i32(<4 x i16> *%ptr) { ; AVX-NEXT: vpmovsxwd (%rdi), %xmm0 ; AVX-NEXT: retq ; +; X32-SSE2-LABEL: load_sext_4i16_to_4i32: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; X32-SSE2-NEXT: psrad $16, %xmm0 +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: load_sext_4i16_to_4i32: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -4613,6 +5523,33 @@ define <4 x i64> @load_sext_4i16_to_4i64(<4 x i16> *%ptr) { ; AVX512-NEXT: vpmovsxwq (%rdi), %ymm0 ; AVX512-NEXT: retq ; +; X32-SSE2-LABEL: load_sext_4i16_to_4i64: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE2-NEXT: movswl 2(%eax), %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm1 +; X32-SSE2-NEXT: sarl $31, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm0 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X32-SSE2-NEXT: movswl (%eax), %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm0 +; X32-SSE2-NEXT: sarl $31, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm2 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X32-SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X32-SSE2-NEXT: movswl 6(%eax), %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm2 +; X32-SSE2-NEXT: sarl $31, %ecx +; X32-SSE2-NEXT: movd %ecx, %xmm1 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; X32-SSE2-NEXT: movswl 4(%eax), %eax +; X32-SSE2-NEXT: movd %eax, %xmm1 +; X32-SSE2-NEXT: sarl $31, %eax +; X32-SSE2-NEXT: movd %eax, %xmm3 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; X32-SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; X32-SSE2-NEXT: retl +; ; 
X32-SSE41-LABEL: load_sext_4i16_to_4i64: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -4669,6 +5606,17 @@ define <8 x i32> @load_sext_8i16_to_8i32(<8 x i16> *%ptr) { ; AVX512-NEXT: vpmovsxwd (%rdi), %ymm0 ; AVX512-NEXT: retq ; +; X32-SSE2-LABEL: load_sext_8i16_to_8i32: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X32-SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; X32-SSE2-NEXT: psrad $16, %xmm0 +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] +; X32-SSE2-NEXT: psrad $16, %xmm1 +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: load_sext_8i16_to_8i32: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -4708,6 +5656,15 @@ define <2 x i64> @load_sext_2i32_to_2i64(<2 x i32> *%ptr) { ; AVX-NEXT: vpmovsxdq (%rdi), %xmm0 ; AVX-NEXT: retq ; +; X32-SSE2-LABEL: load_sext_2i32_to_2i64: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X32-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X32-SSE2-NEXT: psrad $31, %xmm1 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: load_sext_2i32_to_2i64: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -4767,6 +5724,19 @@ define <4 x i64> @load_sext_4i32_to_4i64(<4 x i32> *%ptr) { ; AVX512-NEXT: vpmovsxdq (%rdi), %ymm0 ; AVX512-NEXT: retq ; +; X32-SSE2-LABEL: load_sext_4i32_to_4i64: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE2-NEXT: movdqa (%eax), %xmm0 +; X32-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X32-SSE2-NEXT: psrad $31, %xmm2 +; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X32-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X32-SSE2-NEXT: 
psrad $31, %xmm2 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: load_sext_4i32_to_4i64: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -4806,6 +5776,17 @@ define i32 @sext_2i8_to_i32(<16 x i8> %A) nounwind uwtable readnone ssp { ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: retq ; +; X32-SSE2-LABEL: sext_2i8_to_i32: +; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: pushl %eax +; X32-SSE2-NEXT: .cfi_def_cfa_offset 8 +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE2-NEXT: psraw $8, %xmm0 +; X32-SSE2-NEXT: movd %xmm0, %eax +; X32-SSE2-NEXT: popl %ecx +; X32-SSE2-NEXT: .cfi_def_cfa_offset 4 +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: sext_2i8_to_i32: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: pushl %eax @@ -4883,6 +5864,19 @@ define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) { ; AVX512-NEXT: vpmovsxdq %xmm0, %ymm0 ; AVX512-NEXT: retq ; +; X32-SSE2-LABEL: sext_4i1_to_4i64: +; X32-SSE2: # %bb.0: +; X32-SSE2-NEXT: pslld $31, %xmm0 +; X32-SSE2-NEXT: psrad $31, %xmm0 +; X32-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X32-SSE2-NEXT: psrad $31, %xmm2 +; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X32-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X32-SSE2-NEXT: psrad $31, %xmm2 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: sext_4i1_to_4i64: ; X32-SSE41: # %bb.0: ; X32-SSE41-NEXT: pslld $31, %xmm0 @@ -4957,6 +5951,19 @@ define <4 x i64> @sext_4i8_to_4i64(<4 x i8> %mask) { ; AVX512-NEXT: vpmovsxdq %xmm0, %ymm0 ; AVX512-NEXT: retq ; +; X32-SSE2-LABEL: sext_4i8_to_4i64: +; X32-SSE2: # %bb.0: +; X32-SSE2-NEXT: pslld $24, %xmm0 +; X32-SSE2-NEXT: psrad $24, %xmm0 +; X32-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X32-SSE2-NEXT: psrad $31, %xmm2 +; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; 
X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X32-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X32-SSE2-NEXT: psrad $31, %xmm2 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: sext_4i8_to_4i64: ; X32-SSE41: # %bb.0: ; X32-SSE41-NEXT: pslld $24, %xmm0 @@ -5023,6 +6030,24 @@ define <32 x i8> @sext_32xi1_to_32xi8(<32 x i16> %c1, <32 x i16> %c2)nounwind { ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512BW-NEXT: retq ; +; X32-SSE2-LABEL: sext_32xi1_to_32xi8: +; X32-SSE2: # %bb.0: +; X32-SSE2-NEXT: pushl %ebp +; X32-SSE2-NEXT: movl %esp, %ebp +; X32-SSE2-NEXT: andl $-16, %esp +; X32-SSE2-NEXT: subl $16, %esp +; X32-SSE2-NEXT: movdqa 8(%ebp), %xmm3 +; X32-SSE2-NEXT: pcmpeqw 40(%ebp), %xmm1 +; X32-SSE2-NEXT: pcmpeqw 24(%ebp), %xmm0 +; X32-SSE2-NEXT: packsswb %xmm1, %xmm0 +; X32-SSE2-NEXT: pcmpeqw 72(%ebp), %xmm3 +; X32-SSE2-NEXT: pcmpeqw 56(%ebp), %xmm2 +; X32-SSE2-NEXT: packsswb %xmm3, %xmm2 +; X32-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X32-SSE2-NEXT: movl %ebp, %esp +; X32-SSE2-NEXT: popl %ebp +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: sext_32xi1_to_32xi8: ; X32-SSE41: # %bb.0: ; X32-SSE41-NEXT: pushl %ebp @@ -5080,6 +6105,18 @@ define <2 x i32> @sext_2i8_to_2i32(<2 x i8>* %addr) { ; AVX-NEXT: vpaddq %xmm0, %xmm0, %xmm0 ; AVX-NEXT: retq ; +; X32-SSE2-LABEL: sext_2i8_to_2i32: +; X32-SSE2: # %bb.0: +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE2-NEXT: movzwl (%eax), %eax +; X32-SSE2-NEXT: movd %eax, %xmm0 +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; X32-SSE2-NEXT: psrad $24, %xmm0 +; X32-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; X32-SSE2-NEXT: paddq %xmm0, %xmm0 +; X32-SSE2-NEXT: retl +; ; X32-SSE41-LABEL: sext_2i8_to_2i32: ; X32-SSE41: # %bb.0: ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax |

