| author | Simon Pilgrim <llvm-dev@redking.me.uk> | 2018-02-16 14:57:25 +0000 |
|---|---|---|
| committer | Simon Pilgrim <llvm-dev@redking.me.uk> | 2018-02-16 14:57:25 +0000 |
| commit | 4e2f757dc100a2bf75b8dc9d6302ebd453b3cd2c | |
| tree | ff6b7eb3ef360a1bc57e49d9cb589a9e4ab4fc52 | |
| parent | 6cf41b028d5deaee192d8e9195af2b9a80dea3ed | |
| download | bcm5719-llvm-4e2f757dc100a2bf75b8dc9d6302ebd453b3cd2c.tar.gz, bcm5719-llvm-4e2f757dc100a2bf75b8dc9d6302ebd453b3cd2c.zip | |
[X86][SSE] Allow float domain crossing if we are merging 2 or more shuffles and the root started as a float domain shuffle
llvm-svn: 325349
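The one-line change below adds a clause to the `FloatDomain` heuristic in `combineX86ShuffleChain`. As a minimal standalone sketch of that heuristic, assuming simplified stand-ins for LLVM's `MVT` and `X86Subtarget` (the `VecType`/`Target` structs and `isFloatDomain` wrapper here are hypothetical, not the real API; only the boolean expression itself comes from the patch):

```cpp
#include <cassert>

// Hypothetical stand-in for LLVM's MVT.
struct VecType {
  bool FloatElts;      // floating-point element type?
  unsigned SizeInBits; // total vector width in bits
  bool isFloatingPoint() const { return FloatElts; }
  bool is256BitVector() const { return SizeInBits == 256; }
};

// Hypothetical stand-in for X86Subtarget.
struct Target {
  bool AVX2;
  bool hasAVX2() const { return AVX2; }
};

// Sketch of the FloatDomain computation in combineX86ShuffleChain.
// Depth counts how many shuffles have been merged into the chain.
static bool isFloatDomain(const VecType &VT1, const VecType &VT2,
                          const VecType &RootVT, unsigned Depth,
                          const Target &Subtarget) {
  return VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
         (RootVT.isFloatingPoint() && Depth >= 2) || // new clause
         (RootVT.is256BitVector() && !Subtarget.hasAVX2());
}

int main() {
  VecType V4I32{false, 128}, V4F32{true, 128};
  Target PreAVX2{false};
  // Integer inputs under a float-typed root: once two or more shuffles
  // are being merged (Depth >= 2), the chain now stays in the float
  // domain, so it can lower to SHUFPS/MOVAPS/UNPCKHPS.
  assert(isFloatDomain(V4I32, V4I32, V4F32, 2, PreAVX2));
  // A single shuffle (Depth 1) keeps the old behaviour: integer domain.
  assert(!isFloatDomain(V4I32, V4I32, V4F32, 1, PreAVX2));
  return 0;
}
```

The `Depth >= 2` guard means a lone shuffle never pays the domain-crossing penalty; it is accepted only when two or more shuffles collapse into fewer instructions, which is what trades the `psrlq`/`pshufd` plus `shufps` sequences in the test diffs below for a single float-domain `shufps`.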
| mode | path | lines changed |
|---|---|---|
| -rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.cpp | 1 |
| -rw-r--r-- | llvm/test/CodeGen/X86/clear_upper_vector_element_bits.ll | 2 |
| -rw-r--r-- | llvm/test/CodeGen/X86/combine-sra.ll | 4 |
| -rw-r--r-- | llvm/test/CodeGen/X86/oddshuffles.ll | 37 |
| -rw-r--r-- | llvm/test/CodeGen/X86/pmul.ll | 8 |
| -rw-r--r-- | llvm/test/CodeGen/X86/vector-trunc.ll | 18 |
6 files changed, 27 insertions, 43 deletions
```diff
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 6df80725002..d90b1397074 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -28747,6 +28747,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
   unsigned NumRootElts = RootVT.getVectorNumElements();
   unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
   bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
+                     (RootVT.isFloatingPoint() && Depth >= 2) ||
                      (RootVT.is256BitVector() && !Subtarget.hasAVX2());
 
   // Don't combine if we are a AVX512/EVEX target and the mask element size
diff --git a/llvm/test/CodeGen/X86/clear_upper_vector_element_bits.ll b/llvm/test/CodeGen/X86/clear_upper_vector_element_bits.ll
index 871fa31df30..961ec2be59e 100644
--- a/llvm/test/CodeGen/X86/clear_upper_vector_element_bits.ll
+++ b/llvm/test/CodeGen/X86/clear_upper_vector_element_bits.ll
@@ -686,7 +686,7 @@ define <2 x i64> @_clearupper2xi64b(<2 x i64>) nounwind {
 define <4 x i64> @_clearupper4xi64b(<4 x i64>) nounwind {
 ; SSE2-LABEL: _clearupper4xi64b:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movaps {{.*#+}} xmm2 = [4294967295,0,4294967295,0]
+; SSE2-NEXT:    movaps {{.*#+}} xmm2
 ; SSE2-NEXT:    andps %xmm2, %xmm0
 ; SSE2-NEXT:    andps %xmm2, %xmm1
 ; SSE2-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/combine-sra.ll b/llvm/test/CodeGen/X86/combine-sra.ll
index 82c39377d06..a8226e435ef 100644
--- a/llvm/test/CodeGen/X86/combine-sra.ll
+++ b/llvm/test/CodeGen/X86/combine-sra.ll
@@ -197,9 +197,7 @@ define <4 x i32> @combine_vec_ashr_trunc_and(<4 x i32> %x, <4 x i64> %y) {
 define <4 x i32> @combine_vec_ashr_trunc_lshr(<4 x i64> %x) {
 ; SSE-LABEL: combine_vec_ashr_trunc_lshr:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    psrlq $32, %xmm1
-; SSE-NEXT:    psrlq $32, %xmm0
-; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
 ; SSE-NEXT:    movaps %xmm0, %xmm2
 ; SSE-NEXT:    movaps %xmm0, %xmm1
 ; SSE-NEXT:    psrad $2, %xmm1
diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll
index 1fd4e0b0214..05cbf58d794 100644
--- a/llvm/test/CodeGen/X86/oddshuffles.ll
+++ b/llvm/test/CodeGen/X86/oddshuffles.ll
@@ -614,18 +614,17 @@ define void @v12i32(<8 x i32> %a, <8 x i32> %b, <12 x i32>* %p) nounwind {
 ; SSE2-NEXT:    movdqa %xmm0, %xmm3
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,1,2,2]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,1,0,1]
-; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[2,0],xmm3[3,0]
+; SSE2-NEXT:    movaps %xmm2, %xmm4
+; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,0],xmm3[3,0]
 ; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0,2]
-; SSE2-NEXT:    movdqa %xmm2, %xmm4
+; SSE2-NEXT:    movaps %xmm2, %xmm4
 ; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,0],xmm1[1,0]
 ; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[2,0],xmm1[2,2]
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-NEXT:    unpckhps {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[3,0]
 ; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[0,2]
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,3,2,2]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
-; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,2],xmm0[3,0]
 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
 ; SSE2-NEXT:    movaps %xmm0, 32(%rdi)
 ; SSE2-NEXT:    movaps %xmm4, 16(%rdi)
@@ -1562,39 +1561,37 @@ define void @interleave_24i32_in(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2,
 ; SSE2-NEXT:    movdqu 16(%rsi), %xmm2
 ; SSE2-NEXT:    movdqu (%rdx), %xmm6
 ; SSE2-NEXT:    movdqu 16(%rdx), %xmm1
-; SSE2-NEXT:    movdqu (%rcx), %xmm7
-; SSE2-NEXT:    movdqu 16(%rcx), %xmm4
+; SSE2-NEXT:    movups (%rcx), %xmm7
+; SSE2-NEXT:    movups 16(%rcx), %xmm4
 ; SSE2-NEXT:    movdqa %xmm5, %xmm0
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1]
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,2,2]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm7[0,1,0,1]
-; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[3,0]
+; SSE2-NEXT:    movaps %xmm7, %xmm3
+; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,0],xmm0[3,0]
 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0,2]
-; SSE2-NEXT:    movdqa %xmm7, %xmm3
+; SSE2-NEXT:    movaps %xmm7, %xmm3
 ; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,0],xmm6[1,0]
 ; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[2,0],xmm6[2,2]
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm5[2],xmm7[3],xmm5[3]
+; SSE2-NEXT:    unpckhps {{.*#+}} xmm7 = xmm7[2],xmm5[2],xmm7[3],xmm5[3]
 ; SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[2,0],xmm3[3,0]
 ; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,1],xmm5[0,2]
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[0,3,2,2]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[2,2,3,3]
-; SSE2-NEXT:    shufps {{.*#+}} xmm6 = xmm6[2,0],xmm5[3,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm6 = xmm6[3,2],xmm5[3,0]
 ; SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,1],xmm6[0,2]
 ; SSE2-NEXT:    movdqa %xmm2, %xmm6
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1]
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[0,1,2,2]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm4[0,1,0,1]
-; SSE2-NEXT:    shufps {{.*#+}} xmm7 = xmm7[2,0],xmm6[3,0]
+; SSE2-NEXT:    movaps %xmm4, %xmm7
+; SSE2-NEXT:    shufps {{.*#+}} xmm7 = xmm7[0,0],xmm6[3,0]
 ; SSE2-NEXT:    shufps {{.*#+}} xmm6 = xmm6[0,1],xmm7[0,2]
-; SSE2-NEXT:    movdqa %xmm4, %xmm7
+; SSE2-NEXT:    movaps %xmm4, %xmm7
 ; SSE2-NEXT:    shufps {{.*#+}} xmm7 = xmm7[1,0],xmm1[1,0]
 ; SSE2-NEXT:    shufps {{.*#+}} xmm7 = xmm7[2,0],xmm1[2,2]
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm2[2],xmm4[3],xmm2[3]
+; SSE2-NEXT:    unpckhps {{.*#+}} xmm4 = xmm4[2],xmm2[2],xmm4[3],xmm2[3]
 ; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,0],xmm7[3,0]
 ; SSE2-NEXT:    shufps {{.*#+}} xmm7 = xmm7[0,1],xmm2[0,2]
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[0,3,2,2]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
-; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,2],xmm2[3,0]
 ; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2]
 ; SSE2-NEXT:    movups %xmm2, 80(%rdi)
 ; SSE2-NEXT:    movups %xmm7, 64(%rdi)
diff --git a/llvm/test/CodeGen/X86/pmul.ll b/llvm/test/CodeGen/X86/pmul.ll
index 9890578101b..f6f466a0cbf 100644
--- a/llvm/test/CodeGen/X86/pmul.ll
+++ b/llvm/test/CodeGen/X86/pmul.ll
@@ -1261,11 +1261,9 @@ define <4 x i32> @mul_v4i64_zero_lower(<4 x i32> %val1, <4 x i64> %val2) {
 ; SSE2-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
 ; SSE2-NEXT:    psrlq $32, %xmm2
 ; SSE2-NEXT:    pmuludq %xmm0, %xmm2
-; SSE2-NEXT:    psllq $32, %xmm2
 ; SSE2-NEXT:    psrlq $32, %xmm1
 ; SSE2-NEXT:    pmuludq %xmm1, %xmm3
-; SSE2-NEXT:    psllq $32, %xmm3
-; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,3],xmm2[1,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,2],xmm2[0,2]
 ; SSE2-NEXT:    movaps %xmm3, %xmm0
 ; SSE2-NEXT:    retq
 ;
@@ -1276,11 +1274,9 @@ define <4 x i32> @mul_v4i64_zero_lower(<4 x i32> %val1, <4 x i64> %val2) {
 ; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
 ; SSE41-NEXT:    psrlq $32, %xmm1
 ; SSE41-NEXT:    pmuludq %xmm1, %xmm0
-; SSE41-NEXT:    psllq $32, %xmm0
 ; SSE41-NEXT:    psrlq $32, %xmm2
 ; SSE41-NEXT:    pmuludq %xmm3, %xmm2
-; SSE41-NEXT:    psllq $32, %xmm2
-; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]
+; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: mul_v4i64_zero_lower:
diff --git a/llvm/test/CodeGen/X86/vector-trunc.ll b/llvm/test/CodeGen/X86/vector-trunc.ll
index 6f6cc51fd5f..dca9ca452af 100644
--- a/llvm/test/CodeGen/X86/vector-trunc.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc.ll
@@ -101,25 +101,17 @@ entry:
 define <8 x i32> @trunc8i64_8i32_lshr(<8 x i64> %a) {
 ; SSE-LABEL: trunc8i64_8i32_lshr:
 ; SSE:       # %bb.0: # %entry
-; SSE-NEXT:    psrlq $32, %xmm3
-; SSE-NEXT:    psrlq $32, %xmm2
-; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
-; SSE-NEXT:    psrlq $32, %xmm1
-; SSE-NEXT:    psrlq $32, %xmm0
-; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
+; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3]
 ; SSE-NEXT:    movaps %xmm2, %xmm1
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: trunc8i64_8i32_lshr:
 ; AVX1:       # %bb.0: # %entry
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vpsrlq $32, %xmm2, %xmm2
-; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm0
-; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vpsrlq $32, %xmm2, %xmm2
-; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm1
-; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm3[1,3]
+; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
```

