diff options
| author | Chandler Carruth <chandlerc@gmail.com> | 2015-02-16 12:28:18 +0000 |
|---|---|---|
| committer | Chandler Carruth <chandlerc@gmail.com> | 2015-02-16 12:28:18 +0000 |
| commit | 1e57e2deb84aa812e15a09bdced71682d814e538 (patch) | |
| tree | a19358c851f4d06b86ad756d341606f035a68ac4 /llvm/test/CodeGen/X86/vector-shuffle-combining.ll | |
| parent | 50dc783d75247022bdba75eb8408cdff2149cfae (diff) | |
| download | bcm5719-llvm-1e57e2deb84aa812e15a09bdced71682d814e538.tar.gz bcm5719-llvm-1e57e2deb84aa812e15a09bdced71682d814e538.zip | |
[x86] Add a generic unpack-targeted lowering technique. This can be used
to generically lower blends and is particularly nice because it is
available frome SSE2 onward. This removes a lot of the remaining domain
crossing blends in SSE2 code.
I'm hoping to replace some of the "interleaved" lowering hacks with
something closer to this which should be more principled. First, this
needs to learn how to detect and use other interleavings besides that of
the natural type provided. That will be a follow-up patch though.
llvm-svn: 229378
Diffstat (limited to 'llvm/test/CodeGen/X86/vector-shuffle-combining.ll')
| -rw-r--r-- | llvm/test/CodeGen/X86/vector-shuffle-combining.ll | 82 |
1 files changed, 44 insertions, 38 deletions
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll index 048b8b40649..64952f19d2d 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll @@ -275,16 +275,18 @@ define <4 x i32> @combine_bitwise_ops_test6(<4 x i32> %a, <4 x i32> %b, <4 x i32 define <4 x i32> @combine_bitwise_ops_test1b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { ; SSE2-LABEL: combine_bitwise_ops_test1b: ; SSE2: # BB#0: -; SSE2-NEXT: andps %xmm1, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: combine_bitwise_ops_test1b: ; SSSE3: # BB#0: -; SSSE3-NEXT: andps %xmm1, %xmm0 -; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3] -; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSSE3-NEXT: pand %xmm1, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: combine_bitwise_ops_test1b: @@ -313,16 +315,18 @@ define <4 x i32> @combine_bitwise_ops_test1b(<4 x i32> %a, <4 x i32> %b, <4 x i3 define <4 x i32> @combine_bitwise_ops_test2b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { ; SSE2-LABEL: combine_bitwise_ops_test2b: ; SSE2: # BB#0: -; SSE2-NEXT: orps %xmm1, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: combine_bitwise_ops_test2b: ; SSSE3: # BB#0: -; SSSE3-NEXT: orps %xmm1, %xmm0 -; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3] -; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSSE3-NEXT: por %xmm1, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: combine_bitwise_ops_test2b: @@ -390,18 +394,18 @@ define <4 x i32> @combine_bitwise_ops_test3b(<4 x i32> %a, <4 x i32> %b, <4 x i3 define <4 x i32> @combine_bitwise_ops_test4b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { ; SSE2-LABEL: combine_bitwise_ops_test4b: ; SSE2: # BB#0: -; SSE2-NEXT: andps %xmm1, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3] -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3] -; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: combine_bitwise_ops_test4b: ; SSSE3: # BB#0: -; SSSE3-NEXT: andps %xmm1, %xmm0 -; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3] -; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3] -; SSSE3-NEXT: movaps %xmm2, %xmm0 +; SSSE3-NEXT: pand %xmm1, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: combine_bitwise_ops_test4b: @@ -430,18 +434,18 @@ define <4 x i32> @combine_bitwise_ops_test4b(<4 x i32> %a, <4 x i32> %b, <4 x i3 define <4 x i32> @combine_bitwise_ops_test5b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { ; SSE2-LABEL: combine_bitwise_ops_test5b: ; SSE2: # BB#0: -; SSE2-NEXT: orps %xmm1, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3] -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3] -; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: combine_bitwise_ops_test5b: ; SSSE3: # BB#0: -; SSSE3-NEXT: orps %xmm1, %xmm0 -; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3] -; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3] -; SSSE3-NEXT: movaps %xmm2, %xmm0 +; SSSE3-NEXT: por %xmm1, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: combine_bitwise_ops_test5b: @@ -1012,14 +1016,16 @@ define <4 x i32> @combine_nested_undef_test15(<4 x i32> %A, <4 x i32> %B) { define <4 x i32> @combine_nested_undef_test16(<4 x i32> %A, <4 x i32> %B) { ; SSE2-LABEL: combine_nested_undef_test16: ; SSE2: # BB#0: -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,3] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: combine_nested_undef_test16: ; SSSE3: # BB#0: -; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,3] -; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: combine_nested_undef_test16: @@ -1171,16 +1177,16 @@ define <4 x i32> @combine_nested_undef_test20(<4 x i32> %A, <4 x i32> %B) { define <4 x i32> @combine_nested_undef_test21(<4 x i32> %A, <4 x i32> %B) { ; SSE2-LABEL: combine_nested_undef_test21: ; SSE2: # BB#0: -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,1] -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,1,3] -; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,1,1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: combine_nested_undef_test21: ; SSSE3: # BB#0: -; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,1] -; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,1,3] -; SSSE3-NEXT: movaps %xmm1, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,1,1] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: combine_nested_undef_test21: |

