author    | Craig Topper <craig.topper@intel.com> | 2018-11-18 18:11:25 +0000
committer | Craig Topper <craig.topper@intel.com> | 2018-11-18 18:11:25 +0000
commit    | 11d50948e26261bc80a1c745422995d1f02aebb5 (patch)
tree      | d9829b4a3b66bca40cb98bcbbc12d772b600c3ea /llvm/test
parent    | bc8148f7b056f8cd1c6a2dfe59988b3e1ab4215d (diff)
[X86] Disable combineToExtendVectorInReg under -x86-experimental-vector-widening-legalization. Add custom type legalization for extends.
If we widen illegal types instead of promoting, we should be able to rely on the type legalizer to create the vector_inreg operations for us with some caveats.
This patch disables combineToExtendVectorInReg when we are using widening.
I've enabled custom legalization for v8i8->v8i64 extends under avx512f, since the type legalizer would want to create a vector_inreg with a v64i8 input type, which isn't legal without avx512bw. So we go to v16i8 with custom code, using the relaxation of rules we get from D54346.
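For reference, the IR shape this case covers is a direct v8i8 to v8i64 extend; the sketch below is illustrative only (the function name is made up and is not one of the tests in this diff):

; Illustrative sketch: a v8i8 -> v8i64 sign extend of the kind this custom
; legalization handles under avx512f when avx512bw is not available.
define <8 x i64> @sext_8i8_to_8i64_example(<8 x i8> %a) {
  %ext = sext <8 x i8> %a to <8 x i64>
  ret <8 x i64> %ext
}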
I've also enabled custom legalization of v8i64 and v16i32 extend operations with AVX. When the input type is 128 bits, the default splitting legalization would first extend 128->256, then split into two 128-bit pieces, extend each half to 256 bits, and concatenate the results. The custom legalization I've added instead uses a 128->256-bit vector_inreg extend that only reads the lower 64 bits for the low half of the split, then shuffles the high 64 bits down to the low 64 bits and does another vector_inreg extend.
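As a rough illustration of the second case, the 128-bit-input pattern looks like the sketch below; the function name is hypothetical and the comments only restate the lowering described above, they are not checked output:

; Illustrative sketch: a 128-bit source extended to v8i64.
; Per the description above, the low half of the split is produced by an
; extend_vector_inreg that reads only the low 64 bits of %a, and the high half
; by shuffling the upper 64 bits down to the low 64 bits and extending again.
define <8 x i64> @zext_8i16_to_8i64_example(<8 x i16> %a) {
  %ext = zext <8 x i16> %a to <8 x i64>
  ret <8 x i64> %ext
}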
llvm-svn: 347172
Diffstat (limited to 'llvm/test')
-rw-r--r-- | llvm/test/CodeGen/X86/vec_int_to_fp-widen.ll | 21
-rw-r--r-- | llvm/test/CodeGen/X86/vector-sext-widen.ll | 56
-rw-r--r-- | llvm/test/CodeGen/X86/vector-zext-widen.ll | 97
3 files changed, 80 insertions, 94 deletions
diff --git a/llvm/test/CodeGen/X86/vec_int_to_fp-widen.ll b/llvm/test/CodeGen/X86/vec_int_to_fp-widen.ll
index b87a15442db..13fe0f6220e 100644
--- a/llvm/test/CodeGen/X86/vec_int_to_fp-widen.ll
+++ b/llvm/test/CodeGen/X86/vec_int_to_fp-widen.ll
@@ -5435,13 +5435,14 @@ define <8 x float> @uitofp_load_8i8_to_8f32(<8 x i8> *%a) {
define void @aggregate_sitofp_8i16_to_8f32(%Arguments* nocapture readonly %a0) {
; SSE2-LABEL: aggregate_sitofp_8i16_to_8f32:
; SSE2: # %bb.0:
-; SSE2-NEXT: movq 24(%rdi), %rax
-; SSE2-NEXT: movdqu 8(%rdi), %xmm0
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: psrad $16, %xmm0
+; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
; SSE2-NEXT: psrad $16, %xmm1
+; SSE2-NEXT: movq 24(%rdi), %rax
; SSE2-NEXT: cvtdq2ps %xmm1, %xmm1
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
-; SSE2-NEXT: psrad $16, %xmm0
; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0
; SSE2-NEXT: movaps %xmm0, 16(%rax)
; SSE2-NEXT: movaps %xmm1, (%rax)
@@ -5450,14 +5451,12 @@ define void @aggregate_sitofp_8i16_to_8f32(%Arguments* nocapture readonly %a0) {
; SSE41-LABEL: aggregate_sitofp_8i16_to_8f32:
; SSE41: # %bb.0:
; SSE41-NEXT: movq 24(%rdi), %rax
-; SSE41-NEXT: movdqu 8(%rdi), %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE41-NEXT: pmovsxwd %xmm1, %xmm1
+; SSE41-NEXT: pmovsxwd 16(%rdi), %xmm0
+; SSE41-NEXT: pmovsxwd 8(%rdi), %xmm1
; SSE41-NEXT: cvtdq2ps %xmm1, %xmm1
-; SSE41-NEXT: pmovsxwd %xmm0, %xmm0
; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0
-; SSE41-NEXT: movaps %xmm0, (%rax)
-; SSE41-NEXT: movaps %xmm1, 16(%rax)
+; SSE41-NEXT: movaps %xmm0, 16(%rax)
+; SSE41-NEXT: movaps %xmm1, (%rax)
; SSE41-NEXT: retq
;
; AVX1-LABEL: aggregate_sitofp_8i16_to_8f32:
diff --git a/llvm/test/CodeGen/X86/vector-sext-widen.ll b/llvm/test/CodeGen/X86/vector-sext-widen.ll
index d0969c6607e..44d9ee5b2f9 100644
--- a/llvm/test/CodeGen/X86/vector-sext-widen.ll
+++ b/llvm/test/CodeGen/X86/vector-sext-widen.ll
@@ -495,26 +495,24 @@ entry:
define <8 x i64> @sext_16i8_to_8i64(<16 x i8> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: sext_16i8_to_8i64:
; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,2,3]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
; SSE2-NEXT: movdqa %xmm4, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: psrad $24, %xmm4
; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: psrad $24, %xmm1
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: psrad $24, %xmm2
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: psrad $31, %xmm3
+; SSE2-NEXT: psrad $24, %xmm1
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
; SSE2-NEXT: movdqa %xmm3, %xmm0
; SSE2-NEXT: psrad $31, %xmm0
@@ -525,26 +523,24 @@ define <8 x i64> @sext_16i8_to_8i64(<16 x i8> %A) nounwind uwtable readnone ssp
;
; SSSE3-LABEL: sext_16i8_to_8i64:
; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,2,3]
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
; SSSE3-NEXT: movdqa %xmm4, %xmm1
; SSSE3-NEXT: psrad $31, %xmm1
; SSSE3-NEXT: psrad $24, %xmm4
; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSSE3-NEXT: psrad $31, %xmm0
-; SSSE3-NEXT: psrad $24, %xmm1
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; SSSE3-NEXT: movdqa %xmm2, %xmm0
-; SSSE3-NEXT: psrad $31, %xmm0
+; SSSE3-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSSE3-NEXT: movdqa %xmm2, %xmm1
+; SSSE3-NEXT: psrad $31, %xmm1
; SSSE3-NEXT: psrad $24, %xmm2
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
+; SSSE3-NEXT: movdqa %xmm1, %xmm3
+; SSSE3-NEXT: psrad $31, %xmm3
+; SSSE3-NEXT: psrad $24, %xmm1
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
; SSSE3-NEXT: movdqa %xmm3, %xmm0
; SSSE3-NEXT: psrad $31, %xmm0
@@ -912,8 +908,8 @@ define <8 x i64> @sext_8i16_to_8i64(<8 x i16> %A) nounwind uwtable readnone ssp
; SSE2-NEXT: psrad $31, %xmm3
; SSE2-NEXT: psrad $16, %xmm1
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; SSE2-NEXT: movdqa %xmm3, %xmm0
; SSE2-NEXT: psrad $31, %xmm0
; SSE2-NEXT: psrad $16, %xmm3
@@ -938,8 +934,8 @@ define <8 x i64> @sext_8i16_to_8i64(<8 x i16> %A) nounwind uwtable readnone ssp
; SSSE3-NEXT: psrad $31, %xmm3
; SSSE3-NEXT: psrad $16, %xmm1
; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSSE3-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,2,2,3,4,5,6,7]
+; SSSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; SSSE3-NEXT: movdqa %xmm3, %xmm0
; SSSE3-NEXT: psrad $31, %xmm0
; SSSE3-NEXT: psrad $16, %xmm3
diff --git a/llvm/test/CodeGen/X86/vector-zext-widen.ll b/llvm/test/CodeGen/X86/vector-zext-widen.ll
index bc8896f0116..2eafc7cdaf0 100644
--- a/llvm/test/CodeGen/X86/vector-zext-widen.ll
+++ b/llvm/test/CodeGen/X86/vector-zext-widen.ll
@@ -123,15 +123,14 @@ define <32 x i16> @zext_32i8_to_32i16(<32 x i8> %A) {
;
; AVX1-LABEL: zext_32i8_to_32i16:
; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-NEXT: vmovaps %ymm2, %ymm0
; AVX1-NEXT: retq
;
@@ -398,16 +397,15 @@ entry:
define <8 x i64> @zext_16i8_to_8i64(<16 x i8> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: zext_16i8_to_8i64:
; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,2,3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; SSE2-NEXT: movdqa %xmm3, %xmm1
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; SSE2-NEXT: movdqa %xmm3, %xmm2
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
@@ -415,15 +413,13 @@ define <8 x i64> @zext_16i8_to_8i64(<16 x i8> %A) nounwind uwtable readnone ssp
;
; SSSE3-LABEL: zext_16i8_to_8i64:
; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,128,128,128,128,128,128,128,1,128,128,128,128,128,128,128]
-; SSSE3-NEXT: pshufb %xmm4, %xmm0
-; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2,128,128,128,128,128,128,128,3,128,128,128,128,128,128,128]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,2,3]
-; SSSE3-NEXT: pshufb %xmm5, %xmm1
+; SSSE3-NEXT: movdqa %xmm0, %xmm3
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT: movdqa %xmm3, %xmm1
+; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[2],zero,zero,zero,zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero
; SSSE3-NEXT: movdqa %xmm3, %xmm2
-; SSSE3-NEXT: pshufb %xmm4, %xmm2
-; SSSE3-NEXT: pshufb %xmm5, %xmm3
+; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[4],zero,zero,zero,zero,zero,zero,zero,xmm2[5],zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT: pshufb {{.*#+}} xmm3 = xmm3[6],zero,zero,zero,zero,zero,zero,zero,xmm3[7],zero,zero,zero,zero,zero,zero,zero
; SSSE3-NEXT: retq
;
; SSE41-LABEL: zext_16i8_to_8i64:
@@ -585,15 +581,14 @@ define <16 x i32> @zext_16i16_to_16i32(<16 x i16> %A) nounwind uwtable readnone
;
; AVX1-LABEL: zext_16i16_to_16i32:
; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-NEXT: vmovaps %ymm2, %ymm0
; AVX1-NEXT: retq
;
@@ -884,15 +879,14 @@ define <8 x i64> @zext_8i32_to_8i64(<8 x i32> %A) nounwind uwtable readnone ssp
;
; AVX1-LABEL: zext_8i32_to_8i64:
; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-NEXT: vmovaps %ymm2, %ymm0
; AVX1-NEXT: retq
;
@@ -1162,16 +1156,15 @@ entry:
define <8 x i64> @load_zext_8i8_to_8i64(<8 x i8> *%ptr) {
; SSE2-LABEL: load_zext_8i8_to_8i64:
; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; SSE2-NEXT: movq {{.*#+}} xmm3 = mem[0],zero
; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,2,3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; SSE2-NEXT: movdqa %xmm3, %xmm1
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; SSE2-NEXT: movdqa %xmm3, %xmm2
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
@@ -1179,16 +1172,14 @@ define <8 x i64> @load_zext_8i8_to_8i64(<8 x i8> *%ptr) {
;
; SSSE3-LABEL: load_zext_8i8_to_8i64:
; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,128,128,128,128,128,128,128,1,128,128,128,128,128,128,128]
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSSE3-NEXT: pshufb %xmm4, %xmm0
-; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2,128,128,128,128,128,128,128,3,128,128,128,128,128,128,128]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,2,3]
-; SSSE3-NEXT: pshufb %xmm5, %xmm1
+; SSSE3-NEXT: movq {{.*#+}} xmm3 = mem[0],zero
+; SSSE3-NEXT: movdqa %xmm3, %xmm0
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT: movdqa %xmm3, %xmm1
+; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[2],zero,zero,zero,zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero
; SSSE3-NEXT: movdqa %xmm3, %xmm2
-; SSSE3-NEXT: pshufb %xmm4, %xmm2
-; SSSE3-NEXT: pshufb %xmm5, %xmm3
+; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[4],zero,zero,zero,zero,zero,zero,zero,xmm2[5],zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT: pshufb {{.*#+}} xmm3 = xmm3[6],zero,zero,zero,zero,zero,zero,zero,xmm3[7],zero,zero,zero,zero,zero,zero,zero
; SSSE3-NEXT: retq
;
; SSE41-LABEL: load_zext_8i8_to_8i64:
@@ -2229,11 +2220,11 @@ define <32 x i32> @zext_32i8_to_32i32(<32 x i8> %x) {
; AVX2-LABEL: zext_32i8_to_32i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,1,2,3]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-NEXT: vmovdqa %ymm4, %ymm0
; AVX2-NEXT: retq