| author | Simon Pilgrim <llvm-dev@redking.me.uk> | 2015-01-10 19:45:33 +0000 |
|---|---|---|
| committer | Simon Pilgrim <llvm-dev@redking.me.uk> | 2015-01-10 19:45:33 +0000 |
| commit | 94a4cc027ab111f6055c81605b3a47caae46cbe6 | |
| tree | 74d042f49c41355e385be63621f7e5df8d2e771a /llvm/test | |
| parent | 9be98b6bef38904c95c1508aee3e901620681388 | |
[X86][SSE] Improved (v)insertps shuffle matching
In the current code, we only attempt to match against insertps if we have exactly one element from the second input vector, irrespective of how much of the shuffle result is zeroable.
This patch instead checks whether there is a single non-zeroable element from either input that requires insertion. It also supports matching cases where only one of the inputs needs to be referenced.
We also split insertps shuffle matching off into a new lowerVectorShuffleAsInsertPS function.
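As a concrete illustration of the two new cases, consider the following hypothetical IR (these functions are illustrative and not taken from this patch's tests; the lowering described in the comments assumes SSE4.1, i.e. that insertps is available):

```llvm
; Case 1: a single non-zeroable element from the second input. Lane 2
; takes b[2], lanes 1 and 3 are zeroable, and lane 0 keeps a[0] in place,
; so inserting b[2] into lane 2 with a zero-mask over lanes 1 and 3
; covers the whole shuffle with one insertps.
define <4 x float> @single_insertion(<4 x float> %a, <4 x float> %b) {
  %z = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 2, i32 4>
  %s = shufflevector <4 x float> %z, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
  ret <4 x float> %s
}

; Case 2: only one input referenced. Every non-zero lane of the result is
; an in-place element of %a, so insertps only has to perform a no-op
; insert plus its zero-mask, e.g. xmm0 = xmm0[0],zero,xmm0[2],zero.
define <4 x float> @single_input(<4 x float> %a) {
  %s = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 2, i32 4>
  ret <4 x float> %s
}
```

Under the old criterion, patterns like the first could be missed once zeroable lanes entered the picture, and the second kind, which references only one input, was not considered for insertps at all.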
Differential Revision: http://reviews.llvm.org/D6879
llvm-svn: 225589
Diffstat (limited to 'llvm/test')
| -rw-r--r-- | llvm/test/CodeGen/X86/combine-or.ll | 10 |
| -rw-r--r-- | llvm/test/CodeGen/X86/masked_memop.ll | 10 |
| -rw-r--r-- | llvm/test/CodeGen/X86/vector-shuffle-combining.ll | 28 |
3 files changed, 29 insertions, 19 deletions
```diff
diff --git a/llvm/test/CodeGen/X86/combine-or.ll b/llvm/test/CodeGen/X86/combine-or.ll
index c70067f389a..280fcbc7a3a 100644
--- a/llvm/test/CodeGen/X86/combine-or.ll
+++ b/llvm/test/CodeGen/X86/combine-or.ll
@@ -240,12 +240,10 @@ define <4 x i32> @test19(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: test19:
 ; CHECK: # BB#0:
 ; CHECK-NEXT: xorps %xmm2, %xmm2
-; CHECK-NEXT: xorps %xmm3, %xmm3
-; CHECK-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0],xmm0[0,3]
-; CHECK-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2,1,3]
-; CHECK-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[0,0]
-; CHECK-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,2]
-; CHECK-NEXT: orps %xmm3, %xmm2
+; CHECK-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[0,3]
+; CHECK-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; CHECK-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],zero,xmm1[2,2]
+; CHECK-NEXT: orps %xmm1, %xmm2
 ; CHECK-NEXT: movaps %xmm2, %xmm0
 ; CHECK-NEXT: retq
   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 0, i32 4, i32 3>
diff --git a/llvm/test/CodeGen/X86/masked_memop.ll b/llvm/test/CodeGen/X86/masked_memop.ll
index cce2d909120..726d7125a63 100644
--- a/llvm/test/CodeGen/X86/masked_memop.ll
+++ b/llvm/test/CodeGen/X86/masked_memop.ll
@@ -71,7 +71,7 @@ define <16 x float> @test4(<16 x i32> %trigger, <16 x float>* %addr, <16 x float
 ; AVX2-LABEL: test5
 ; AVX2: vmaskmovpd
 ; AVX2: vblendvpd
-; AVX2: vmaskmovpd
+; AVX2: vmaskmovpd
 ; AVX2: vblendvpd
 define <8 x double> @test5(<8 x i32> %trigger, <8 x double>* %addr, <8 x double> %dst) {
   %mask = icmp eq <8 x i32> %trigger, zeroinitializer
@@ -150,7 +150,7 @@ define void @test13(<16 x i32> %trigger, <16 x float>* %addr, <16 x float> %val)
 }
 
 ; AVX2-LABEL: test14
-; AVX2: vshufps $-24
+; AVX2: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
 ; AVX2: vmaskmovps
 define void @test14(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %val) {
   %mask = icmp eq <2 x i32> %trigger, zeroinitializer
@@ -194,7 +194,7 @@ define <2 x float> @test18(<2 x i32> %trigger, <2 x float>* %addr) {
 }
 
-declare <16 x i32> @llvm.masked.load.v16i32(<16 x i32>*, i32, <16 x i1>, <16 x i32>)
+declare <16 x i32> @llvm.masked.load.v16i32(<16 x i32>*, i32, <16 x i1>, <16 x i32>)
 declare <4 x i32> @llvm.masked.load.v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)
 declare <2 x i32> @llvm.masked.load.v2i32(<2 x i32>*, i32, <2 x i1>, <2 x i32>)
 declare void @llvm.masked.store.v16i32(<16 x i32>, <16 x i32>*, i32, <16 x i1>)
@@ -202,8 +202,8 @@ declare void @llvm.masked.store.v8i32(<8 x i32>, <8 x i32>*, i32, <8 x i1>)
 declare void @llvm.masked.store.v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>)
 declare void @llvm.masked.store.v2f32(<2 x float>, <2 x float>*, i32, <2 x i1>)
 declare void @llvm.masked.store.v2i32(<2 x i32>, <2 x i32>*, i32, <2 x i1>)
-declare void @llvm.masked.store.v16f32(<16 x float>, <16 x float>*, i32, <16 x i1>)
+declare void @llvm.masked.store.v16f32(<16 x float>, <16 x float>*, i32, <16 x i1>)
-declare void @llvm.masked.store.v16f32p(<16 x float>*, <16 x float>**, i32, <16 x i1>)
+declare void @llvm.masked.store.v16f32p(<16 x float>*, <16 x float>**, i32, <16 x i1>)
 declare <16 x float> @llvm.masked.load.v16f32(<16 x float>*, i32, <16 x i1>, <16 x float>)
 declare <8 x float> @llvm.masked.load.v8f32(<8 x float>*, i32, <8 x i1>, <8 x float>)
 declare <4 x float> @llvm.masked.load.v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>)
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
index 897a69a5496..4e2bf87fdf6 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
@@ -553,18 +553,30 @@ define <4 x i32> @combine_bitwise_ops_test2c(<4 x i32> %a, <4 x i32> %b, <4 x i3
 }
 
 define <4 x i32> @combine_bitwise_ops_test3c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
-; SSE-LABEL: combine_bitwise_ops_test3c:
-; SSE: # BB#0:
-; SSE-NEXT: xorps %xmm1, %xmm0
-; SSE-NEXT: xorps %xmm1, %xmm1
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
-; SSE-NEXT: retq
+; SSE2-LABEL: combine_bitwise_ops_test3c:
+; SSE2: # BB#0:
+; SSE2-NEXT: xorps %xmm1, %xmm0
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: combine_bitwise_ops_test3c:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: xorps %xmm1, %xmm0
+; SSSE3-NEXT: xorps %xmm1, %xmm1
+; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: combine_bitwise_ops_test3c:
+; SSE41: # BB#0:
+; SSE41-NEXT: xorps %xmm1, %xmm0
+; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; SSE41-NEXT: retq
 ;
 ; AVX-LABEL: combine_bitwise_ops_test3c:
 ; AVX: # BB#0:
 ; AVX-NEXT: vxorps %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
 ; AVX-NEXT: retq
   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
   %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
```

