diff options
| -rw-r--r-- | llvm/test/CodeGen/X86/combine-vec-shuffle-3.ll | 380 | ||||
| -rw-r--r-- | llvm/test/CodeGen/X86/vector-shuffle-combining.ll | 1001 |
2 files changed, 1001 insertions, 380 deletions
diff --git a/llvm/test/CodeGen/X86/combine-vec-shuffle-3.ll b/llvm/test/CodeGen/X86/combine-vec-shuffle-3.ll deleted file mode 100644 index bd2d34ca189..00000000000 --- a/llvm/test/CodeGen/X86/combine-vec-shuffle-3.ll +++ /dev/null @@ -1,380 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 | FileCheck %s - -define <4 x float> @test1(<4 x float> %a, <4 x float> %b) { - %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> - %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7> - ret <4 x float> %2 -} -; CHECK-LABEL: test1 -; Mask: [0,1,2,3] -; CHECK: movaps -; CHECK: ret - -define <4 x float> @test2(<4 x float> %a, <4 x float> %b) { - %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7> - %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3> - ret <4 x float> %2 -} -; CHECK-LABEL: test2 -; Mask: [0,5,6,7] -; CHECK: movss -; CHECK: ret - -define <4 x float> @test3(<4 x float> %a, <4 x float> %b) { - %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 1, i32 7> - %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1> - ret <4 x float> %2 -} -; CHECK-LABEL: test3 -; Mask: [0,1,4,5] -; CHECK: movlhps -; CHECK: ret - -define <4 x float> @test4(<4 x float> %a, <4 x float> %b) { - %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 3, i32 5, i32 5> - %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1> - ret <4 x float> %2 -} -; CHECK-LABEL: test4 -; Mask: [6,7,2,3] -; CHECK: movhlps -; CHECK-NEXT: ret - -define <4 x float> @test5(<4 x float> %a, <4 x float> %b) { - %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> - %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7> - ret <4 x float> %2 -} -; CHECK-LABEL: test5 
-; Mask: [4,1,6,7] -; CHECK: blendps $13 -; CHECK: ret - - -define <4 x i32> @test6(<4 x i32> %a, <4 x i32> %b) { - %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> - %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7> - ret <4 x i32> %2 -} -; CHECK-LABEL: test6 -; Mask: [4,5,6,7] -; CHECK: movaps -; CHECK: ret - -define <4 x i32> @test7(<4 x i32> %a, <4 x i32> %b) { - %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7> - %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3> - ret <4 x i32> %2 -} -; CHECK-LABEL: test7 -; Mask: [0,5,6,7] -; CHECK: movss -; CHECK: ret - -define <4 x i32> @test8(<4 x i32> %a, <4 x i32> %b) { - %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 1, i32 7> - %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1> - ret <4 x i32> %2 -} -; CHECK-LABEL: test8 -; Mask: [0,1,4,5] -; CHECK: movlhps -; CHECK: ret - -define <4 x i32> @test9(<4 x i32> %a, <4 x i32> %b) { - %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 3, i32 5, i32 5> - %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1> - ret <4 x i32> %2 -} -; CHECK-LABEL: test9 -; Mask: [6,7,2,3] -; CHECK: movhlps -; CHECK-NEXT: ret - -define <4 x i32> @test10(<4 x i32> %a, <4 x i32> %b) { - %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> - %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7> - ret <4 x i32> %2 -} -; CHECK-LABEL: test10 -; Mask: [4,1,6,7] -; CHECK: blendps -; CHECK: ret - -define <4 x float> @test11(<4 x float> %a, <4 x float> %b) { - %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> - %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 6, i32 3> - ret <4 x float> %2 -} -; CHECK-LABEL: test11 -; 
Mask: [0,1,2,3] -; CHECK-NOT: movaps -; CHECK-NOT: blendps -; CHECK: ret - -define <4 x float> @test12(<4 x float> %a, <4 x float> %b) { - %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7> - %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3> - ret <4 x float> %2 -} -; CHECK-LABEL: test12 -; Mask: [0,5,6,7] -; CHECK: movss -; CHECK: ret - -define <4 x float> @test13(<4 x float> %a, <4 x float> %b) { - %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5> - %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 2, i32 3> - ret <4 x float> %2 -} -; CHECK-LABEL: test13 -; Mask: [0,1,4,5] -; CHECK: movlhps -; CHECK: ret - -define <4 x float> @test14(<4 x float> %a, <4 x float> %b) { - %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 5, i32 5> - %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1> - ret <4 x float> %2 -} -; CHECK-LABEL: test14 -; Mask: [6,7,2,3] -; CHECK: movhlps -; CHECK: ret - -define <4 x float> @test15(<4 x float> %a, <4 x float> %b) { - %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 7> - %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 2, i32 3> - ret <4 x float> %2 -} -; CHECK-LABEL: test15 -; Mask: [4,1,6,7] -; CHECK: blendps $13 -; CHECK: ret - -define <4 x i32> @test16(<4 x i32> %a, <4 x i32> %b) { - %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> - %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 6, i32 3> - ret <4 x i32> %2 -} -; CHECK-LABEL: test16 -; Mask: [0,1,2,3] -; CHECK-NOT: movaps -; CHECK-NOT: blendps -; CHECK: ret - -define <4 x i32> @test17(<4 x i32> %a, <4 x i32> %b) { - %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7> - %2 = shufflevector <4 x i32> %1, <4 x 
i32> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3> - ret <4 x i32> %2 -} -; CHECK-LABEL: test17 -; Mask: [0,5,6,7] -; CHECK: movss -; CHECK: ret - -define <4 x i32> @test18(<4 x i32> %a, <4 x i32> %b) { - %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5> - %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 5, i32 2, i32 3> - ret <4 x i32> %2 -} -; CHECK-LABEL: test18 -; Mask: [0,1,4,5] -; CHECK: movlhps -; CHECK: ret - -define <4 x i32> @test19(<4 x i32> %a, <4 x i32> %b) { - %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 5, i32 5> - %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1> - ret <4 x i32> %2 -} -; CHECK-LABEL: test19 -; Mask: [6,7,2,3] -; CHECK: movhlps -; CHECK: ret - -define <4 x i32> @test20(<4 x i32> %a, <4 x i32> %b) { - %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 7> - %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 0, i32 5, i32 2, i32 3> - ret <4 x i32> %2 -} -; CHECK-LABEL: test20 -; Mask: [4,1,6,7] -; CHECK: blendps $13 -; CHECK: ret - -; Check some negative cases. 
-define <4 x float> @test1b(<4 x float> %a, <4 x float> %b) { - %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> - %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 0> - ret <4 x float> %2 -} -; CHECK-LABEL: test1b -; CHECK: shufps -; CHECK: shufps -; CHECK: ret - -define <4 x float> @test2b(<4 x float> %a, <4 x float> %b) { - %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> - %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 0, i32 5> - ret <4 x float> %2 -} -; CHECK-LABEL: test2b -; CHECK: shufps -; CHECK: pshufd -; CHECK: ret - -define <4 x float> @test3b(<4 x float> %a, <4 x float> %b) { - %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 6, i32 3> - %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 7, i32 2, i32 7> - ret <4 x float> %2 -} -; CHECK-LABEL: test3b -; CHECK: shufps -; CHECK: shufps -; CHECK: ret - -define <4 x float> @test4b(<4 x float> %a, <4 x float> %b) { - %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> - %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 5, i32 5, i32 2, i32 7> - ret <4 x float> %2 -} -; CHECK-LABEL: test4b -; CHECK: shufps -; CHECK: shufps -; CHECK: ret - - -; Verify that we correctly fold shuffles even when we use illegal vector types. 
-define <4 x i8> @test1c(<4 x i8>* %a, <4 x i8>* %b) { - %A = load <4 x i8>* %a - %B = load <4 x i8>* %b - %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7> - %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 1, i32 6, i32 3> - ret <4 x i8> %2 -} -; CHECK-LABEL: test1c -; Mask: [0,5,6,7] -; CHECK: movss -; CHECK-NEXT: ret - -define <4 x i8> @test2c(<4 x i8>* %a, <4 x i8>* %b) { - %A = load <4 x i8>* %a - %B = load <4 x i8>* %b - %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 0, i32 5, i32 1, i32 5> - %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 2, i32 4, i32 1> - ret <4 x i8> %2 -} -; CHECK-LABEL: test2c -; Mask: [0,1,4,5] -; CHECK: movlhps -; CHECK-NEXT: ret - -define <4 x i8> @test3c(<4 x i8>* %a, <4 x i8>* %b) { - %A = load <4 x i8>* %a - %B = load <4 x i8>* %b - %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 2, i32 3, i32 5, i32 5> - %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 6, i32 7, i32 0, i32 1> - ret <4 x i8> %2 -} -; CHECK-LABEL: test3c -; Mask: [6,7,2,3] -; CHECK: movhlps -; CHECK-NEXT: ret - -define <4 x i8> @test4c(<4 x i8>* %a, <4 x i8>* %b) { - %A = load <4 x i8>* %a - %B = load <4 x i8>* %b - %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 3> - %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 1, i32 2, i32 7> - ret <4 x i8> %2 -} -; CHECK-LABEL: test4c -; Mask: [4,1,6,7] -; CHECK: blendps $13 -; CHECK: ret - -; The following test cases are generated from this C++ code -; -;__m128 blend_01(__m128 a, __m128 b) -;{ -; __m128 s = a; -; s = _mm_blend_ps( s, b, 1<<0 ); -; s = _mm_blend_ps( s, b, 1<<1 ); -; return s; -;} -; -;__m128 blend_02(__m128 a, __m128 b) -;{ -; __m128 s = a; -; s = _mm_blend_ps( s, b, 1<<0 ); -; s = _mm_blend_ps( s, b, 1<<2 ); -; return s; -;} -; -;__m128 blend_123(__m128 a, __m128 b) -;{ -; __m128 s = a; -; s = _mm_blend_ps( s, b, 1<<1 ); -; s = _mm_blend_ps( s, b, 
1<<2 ); -; s = _mm_blend_ps( s, b, 1<<3 ); -; return s; -;} - -; Ideally, we should collapse the following shuffles into a single one. - -define <4 x float> @blend_01(<4 x float> %a, <4 x float> %b) { - %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 undef, i32 2, i32 3> - %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 3> - ret <4 x float> %shuffle6 -} -; CHECK-LABEL: blend_01 -; CHECK: movsd -; CHECK-NEXT: ret - -define <4 x float> @blend_02(<4 x float> %a, <4 x float> %b) { - %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 undef, i32 3> - %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3> - ret <4 x float> %shuffle6 -} -; CHECK-LABEL: blend_02 -; CHECK: blendps $5 -; CHECK-NEXT: ret - -define <4 x float> @blend_123(<4 x float> %a, <4 x float> %b) { - %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 undef, i32 undef> - %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 undef> - %shuffle12 = shufflevector <4 x float> %shuffle6, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7> - ret <4 x float> %shuffle12 -} -; CHECK-LABEL: blend_123 -; CHECK: movss -; CHECK: ret - -define <4 x i32> @test_movhl_1(<4 x i32> %a, <4 x i32> %b) { - %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 7, i32 5, i32 3> - %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 1, i32 0, i32 3> - ret <4 x i32> %2 -} -; CHECK-LABEL: test_movhl_1 -; CHECK: movhlps -; CHECK-NEXT: ret - -define <4 x i32> @test_movhl_2(<4 x i32> %a, <4 x i32> %b) { - %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 0, i32 3, i32 6> - %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 3, i32 7, i32 0, i32 2> - ret <4 x i32> %2 -} -; CHECK-LABEL: test_movhl_2 -; CHECK: movhlps -; CHECK-NEXT: 
ret - -define <4 x i32> @test_movhl_3(<4 x i32> %a, <4 x i32> %b) { - %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 7, i32 6, i32 3, i32 2> - %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 0, i32 3, i32 2> - ret <4 x i32> %2 -} -; CHECK-LABEL: test_movhl_3 -; CHECK: movhlps -; CHECK-NEXT: ret - diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll index d8e6cf2b8c1..dd0961769e7 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll @@ -1097,3 +1097,1004 @@ define <4 x i32> @combine_nested_undef_test28(<4 x i32> %A, <4 x i32> %B) { %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 3, i32 2> ret <4 x i32> %2 } + +define <4 x float> @combine_test1(<4 x float> %a, <4 x float> %b) { +; SSE2-LABEL: combine_test1: +; SSE2: # BB#0: +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[0,2] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,3,1] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_test1: +; SSSE3: # BB#0: +; SSSE3-NEXT: movaps %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_test1: +; SSE41: # BB#0: +; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: combine_test1: +; AVX: # BB#0: +; AVX-NEXT: vmovaps %xmm1, %xmm0 +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> + %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7> + ret <4 x float> %2 +} + +define <4 x float> @combine_test2(<4 x float> %a, <4 x float> %b) { +; SSE2-LABEL: combine_test2: +; SSE2: # BB#0: +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0] +; SSE2-NEXT: shufps {{.*#+}} 
xmm0 = xmm0[0,1],xmm1[0,2] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_test2: +; SSSE3: # BB#0: +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_test2: +; SSE41: # BB#0: +; SSE41-NEXT: movss %xmm0, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: combine_test2: +; AVX: # BB#0: +; AVX-NEXT: vmovss %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7> + %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3> + ret <4 x float> %2 +} + +define <4 x float> @combine_test3(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: combine_test3: +; SSE: # BB#0: +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_test3: +; AVX: # BB#0: +; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 1, i32 7> + %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1> + ret <4 x float> %2 +} + +define <4 x float> @combine_test4(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: combine_test4: +; SSE: # BB#0: +; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_test4: +; AVX: # BB#0: +; AVX-NEXT: vmovhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 3, i32 5, i32 5> + %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1> + ret <4 x float> %2 +} + +define <4 x float> @combine_test5(<4 x float> %a, <4 x float> %b) { +; SSE2-LABEL: combine_test5: +; SSE2: # BB#0: +; SSE2-NEXT: shufps {{.*#+}} xmm0 = 
xmm0[1,3],xmm1[0,2] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,3,1] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_test5: +; SSSE3: # BB#0: +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[0,2] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,3,1] +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[2,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_test5: +; SSE41: # BB#0: +; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; SSE41-NEXT: retq +; +; AVX-LABEL: combine_test5: +; AVX: # BB#0: +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> + %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7> + ret <4 x float> %2 +} + +define <4 x i32> @combine_test6(<4 x i32> %a, <4 x i32> %b) { +; SSE2-LABEL: combine_test6: +; SSE2: # BB#0: +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[0,2] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,3,1] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_test6: +; SSSE3: # BB#0: +; SSSE3-NEXT: movaps %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_test6: +; SSE41: # BB#0: +; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: combine_test6: +; AVX: # BB#0: +; AVX-NEXT: vmovaps %xmm1, %xmm0 +; AVX-NEXT: retq + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> + %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_test7(<4 x i32> %a, <4 x i32> %b) { +; SSE2-LABEL: combine_test7: +; SSE2: # BB#0: +; SSE2-NEXT: shufps {{.*#+}} xmm0 = 
xmm0[0,2],xmm1[1,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_test7: +; SSSE3: # BB#0: +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_test7: +; SSE41: # BB#0: +; SSE41-NEXT: movss %xmm0, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: combine_test7: +; AVX: # BB#0: +; AVX-NEXT: vmovss %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7> + %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_test8(<4 x i32> %a, <4 x i32> %b) { +; SSE-LABEL: combine_test8: +; SSE: # BB#0: +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_test8: +; AVX: # BB#0: +; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: retq + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 1, i32 7> + %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_test9(<4 x i32> %a, <4 x i32> %b) { +; SSE-LABEL: combine_test9: +; SSE: # BB#0: +; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_test9: +; AVX: # BB#0: +; AVX-NEXT: vmovhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX-NEXT: retq + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 3, i32 5, i32 5> + %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_test10(<4 x i32> %a, <4 x 
i32> %b) { +; SSE2-LABEL: combine_test10: +; SSE2: # BB#0: +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[0,2] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,3,1] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_test10: +; SSSE3: # BB#0: +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[0,2] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,3,1] +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[2,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_test10: +; SSE41: # BB#0: +; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; SSE41-NEXT: retq +; +; AVX1-LABEL: combine_test10: +; AVX1: # BB#0: +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_test10: +; AVX2: # BB#0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; AVX2-NEXT: retq + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> + %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7> + ret <4 x i32> %2 +} + +define <4 x float> @combine_test11(<4 x float> %a, <4 x float> %b) { +; ALL-LABEL: combine_test11: +; ALL: # BB#0: +; ALL-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> + %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 6, i32 3> + ret <4 x float> %2 +} + +define <4 x float> @combine_test12(<4 x float> %a, <4 x float> %b) { +; SSE2-LABEL: combine_test12: +; SSE2: # BB#0: +; SSE2-NEXT: movss %xmm0, %xmm1 +; SSE2-NEXT: movss %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_test12: +; SSSE3: # BB#0: +; SSSE3-NEXT: movss %xmm0, %xmm1 +; SSSE3-NEXT: movss %xmm0, %xmm1 +; SSSE3-NEXT: movaps %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_test12: 
+; SSE41: # BB#0: +; SSE41-NEXT: movss %xmm0, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: combine_test12: +; AVX: # BB#0: +; AVX-NEXT: vmovss %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7> + %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3> + ret <4 x float> %2 +} + +define <4 x float> @combine_test13(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: combine_test13: +; SSE: # BB#0: +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_test13: +; AVX: # BB#0: +; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5> + %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 2, i32 3> + ret <4 x float> %2 +} + +define <4 x float> @combine_test14(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: combine_test14: +; SSE: # BB#0: +; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: combine_test14: +; AVX: # BB#0: +; AVX-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 5, i32 5> + %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1> + ret <4 x float> %2 +} + +define <4 x float> @combine_test15(<4 x float> %a, <4 x float> %b) { +; SSE2-LABEL: combine_test15: +; SSE2: # BB#0: +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[0,0] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,3] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm2[0,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_test15: +; SSSE3: # BB#0: +; SSSE3-NEXT: movaps %xmm0, %xmm2 +; SSSE3-NEXT: 
shufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[0,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,3] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm2[0,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_test15: +; SSE41: # BB#0: +; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; SSE41-NEXT: retq +; +; AVX-LABEL: combine_test15: +; AVX: # BB#0: +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 7> + %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 2, i32 3> + ret <4 x float> %2 +} + +define <4 x i32> @combine_test16(<4 x i32> %a, <4 x i32> %b) { +; ALL-LABEL: combine_test16: +; ALL: # BB#0: +; ALL-NEXT: retq + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> + %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 6, i32 3> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_test17(<4 x i32> %a, <4 x i32> %b) { +; SSE2-LABEL: combine_test17: +; SSE2: # BB#0: +; SSE2-NEXT: movss %xmm0, %xmm1 +; SSE2-NEXT: movss %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_test17: +; SSSE3: # BB#0: +; SSSE3-NEXT: movss %xmm0, %xmm1 +; SSSE3-NEXT: movss %xmm0, %xmm1 +; SSSE3-NEXT: movaps %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_test17: +; SSE41: # BB#0: +; SSE41-NEXT: movss %xmm0, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: combine_test17: +; AVX: # BB#0: +; AVX-NEXT: vmovss %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7> + %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_test18(<4 x i32> %a, <4 x i32> %b) { +; SSE-LABEL: combine_test18: +; 
SSE: # BB#0: +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_test18: +; AVX: # BB#0: +; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: retq + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5> + %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 5, i32 2, i32 3> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_test19(<4 x i32> %a, <4 x i32> %b) { +; SSE-LABEL: combine_test19: +; SSE: # BB#0: +; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: combine_test19: +; AVX: # BB#0: +; AVX-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX-NEXT: retq + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 5, i32 5> + %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_test20(<4 x i32> %a, <4 x i32> %b) { +; SSE2-LABEL: combine_test20: +; SSE2: # BB#0: +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[0,0] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,3] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm2[0,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_test20: +; SSSE3: # BB#0: +; SSSE3-NEXT: movaps %xmm0, %xmm2 +; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[0,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,3] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm2[0,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_test20: +; SSE41: # BB#0: +; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; SSE41-NEXT: retq +; +; AVX1-LABEL: combine_test20: +; AVX1: # BB#0: +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_test20: +; AVX2: # BB#0: +; 
AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; AVX2-NEXT: retq + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 7> + %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 0, i32 5, i32 2, i32 3> + ret <4 x i32> %2 +} + + +; Check some negative cases. +; FIXME: Do any of these really make sense? Are they redundant with the above tests? + +define <4 x float> @combine_test1b(<4 x float> %a, <4 x float> %b) { +; SSE2-LABEL: combine_test1b: +; SSE2: # BB#0: +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[0,2] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,3,1] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,0] +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_test1b: +; SSSE3: # BB#0: +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[0,2] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,3,1] +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,0] +; SSSE3-NEXT: movaps %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_test1b: +; SSE41: # BB#0: +; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] +; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0] +; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,0] +; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: combine_test1b: +; AVX: # BB#0: +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[2,0],xmm0[2,0] +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> + %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 0> + ret <4 x float> %2 +} + +define <4 x float> @combine_test2b(<4 x float> %a, <4 x float> %b) { +; SSE2-LABEL: combine_test2b: +; SSE2: # 
BB#0: +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[0,2] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,3,1] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[1,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_test2b: +; SSSE3: # BB#0: +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[0,2] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,3,1] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[1,1] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_test2b: +; SSE41: # BB#0: +; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] +; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[1,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE41-NEXT: retq +; +; AVX-LABEL: combine_test2b: +; AVX: # BB#0: +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[1,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> + %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 0, i32 5> + ret <4 x float> %2 +} + +define <4 x float> @combine_test3b(<4 x float> %a, <4 x float> %b) { +; Composing the two shuffles below gives the single mask [0,7,6,7] (0-3 pick from %a, 4-7 from %b). +; SSE-LABEL: combine_test3b: +; SSE: # BB#0: +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_test3b: +; AVX: # BB#0: +; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm1[2,0],xmm0[3,0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm2[0,2] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[3,3] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 
6, i32 3> + %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 7, i32 2, i32 7> + ret <4 x float> %2 +} + +define <4 x float> @combine_test4b(<4 x float> %a, <4 x float> %b) { +; Composing the two shuffles below gives the single mask [5,5,6,7] (0-3 pick from %a, 4-7 from %b), i.e. every lane comes from %b. +; SSE2-LABEL: combine_test4b: +; SSE2: # BB#0: +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[0,2] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,3,1] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,0] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[0,2] +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_test4b: +; SSSE3: # BB#0: +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[0,2] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,3,1] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[0,2] +; SSSE3-NEXT: movaps %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_test4b: +; SSE41: # BB#0: +; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] +; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,0] +; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[0,2] +; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: combine_test4b: +; AVX: # BB#0: +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[1,1],xmm0[0,2] +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> + %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 5, i32 5, i32 2, i32 7> + ret <4 x float> %2 +} + + +; Verify that we correctly fold shuffles even when we use illegal vector types. 
+ +define <4 x i8> @combine_test1c(<4 x i8>* %a, <4 x i8>* %b) { +; Composing the two shuffles below gives the single mask [0,5,6,7] (0-3 pick from %A, 4-7 from %B). +; SSE2-LABEL: combine_test1c: +; SSE2: # BB#0: +; SSE2-NEXT: movl (%rdi), %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pextrw $1, %xmm0, %ecx +; SSE2-NEXT: pinsrw $0, %eax, %xmm0 +; SSE2-NEXT: movzbl %ah, %eax +; SSE2-NEXT: pinsrw $2, %eax, %xmm0 +; SSE2-NEXT: pinsrw $4, %ecx, %xmm0 +; SSE2-NEXT: shrl $8, %ecx +; SSE2-NEXT: pinsrw $6, %ecx, %xmm0 +; SSE2-NEXT: movl (%rsi), %eax +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: pextrw $1, %xmm1, %ecx +; SSE2-NEXT: pinsrw $0, %eax, %xmm1 +; SSE2-NEXT: movzbl %ah, %eax +; SSE2-NEXT: pinsrw $2, %eax, %xmm1 +; SSE2-NEXT: pinsrw $4, %ecx, %xmm1 +; SSE2-NEXT: shrl $8, %ecx +; SSE2-NEXT: pinsrw $6, %ecx, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_test1c: +; SSSE3: # BB#0: +; SSSE3-NEXT: movd (%rdi), %xmm0 +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,128,128,128,1,128,128,128,2,128,128,128,3,128,128,128] +; SSSE3-NEXT: pshufb %xmm1, %xmm0 +; SSSE3-NEXT: movd (%rsi), %xmm2 +; SSSE3-NEXT: pshufb %xmm1, %xmm2 +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[3,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0,2] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_test1c: +; SSE41: # BB#0: +; SSE41-NEXT: pmovzxbd (%rdi), %xmm1 +; SSE41-NEXT: pmovzxbd (%rsi), %xmm0 +; SSE41-NEXT: movss %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: combine_test1c: +; AVX: # BB#0: +; AVX-NEXT: vpmovzxbd (%rdi), %xmm0 +; AVX-NEXT: vpmovzxbd (%rsi), %xmm1 +; AVX-NEXT: vmovss %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq + %A = load <4 x i8>* %a + %B = load <4 x i8>* %b + %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 
7> + %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 1, i32 6, i32 3> + ret <4 x i8> %2 +} + +define <4 x i8> @combine_test2c(<4 x i8>* %a, <4 x i8>* %b) { +; Composing the two shuffles below gives the single mask [0,1,4,5] (0-3 pick from %A, 4-7 from %B). +; SSE2-LABEL: combine_test2c: +; SSE2: # BB#0: +; SSE2-NEXT: movl (%rdi), %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pextrw $1, %xmm0, %ecx +; SSE2-NEXT: pinsrw $0, %eax, %xmm0 +; SSE2-NEXT: movzbl %ah, %eax +; SSE2-NEXT: pinsrw $2, %eax, %xmm0 +; SSE2-NEXT: pinsrw $4, %ecx, %xmm0 +; SSE2-NEXT: shrl $8, %ecx +; SSE2-NEXT: pinsrw $6, %ecx, %xmm0 +; SSE2-NEXT: movl (%rsi), %eax +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: pextrw $1, %xmm1, %ecx +; SSE2-NEXT: pinsrw $0, %eax, %xmm1 +; SSE2-NEXT: movzbl %ah, %eax +; SSE2-NEXT: pinsrw $2, %eax, %xmm1 +; SSE2-NEXT: pinsrw $4, %ecx, %xmm1 +; SSE2-NEXT: shrl $8, %ecx +; SSE2-NEXT: pinsrw $6, %ecx, %xmm1 +; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_test2c: +; SSSE3: # BB#0: +; SSSE3-NEXT: movd (%rdi), %xmm0 +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,128,128,128,1,128,128,128,2,128,128,128,3,128,128,128] +; SSSE3-NEXT: pshufb %xmm1, %xmm0 +; SSSE3-NEXT: movd (%rsi), %xmm2 +; SSSE3-NEXT: pshufb %xmm1, %xmm2 +; SSSE3-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_test2c: +; SSE41: # BB#0: +; SSE41-NEXT: pmovzxbd (%rdi), %xmm0 +; SSE41-NEXT: pmovzxbd (%rsi), %xmm1 +; SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE41-NEXT: retq +; +; AVX-LABEL: combine_test2c: +; AVX: # BB#0: +; AVX-NEXT: vpmovzxbd (%rdi), %xmm0 +; AVX-NEXT: vpmovzxbd (%rsi), %xmm1 +; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: retq + %A = load <4 x i8>* %a + %B = load <4 x i8>* %b + %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 0, i32 5, i32 1, i32 5> + %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 2, i32 4, i32 1> + ret <4 x i8> %2 +} + +define <4 x i8> @combine_test3c(<4 x i8>* %a, <4 x i8>* %b) { +; Composing the two shuffles below gives the single mask [6,7,2,3] (0-3 pick from %A, 4-7 from %B). +; SSE2-LABEL: 
combine_test3c: +; SSE2: # BB#0: +; SSE2-NEXT: movl (%rdi), %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pextrw $1, %xmm0, %ecx +; SSE2-NEXT: pinsrw $0, %eax, %xmm0 +; SSE2-NEXT: movzbl %ah, %eax +; SSE2-NEXT: pinsrw $2, %eax, %xmm0 +; SSE2-NEXT: pinsrw $4, %ecx, %xmm0 +; SSE2-NEXT: shrl $8, %ecx +; SSE2-NEXT: pinsrw $6, %ecx, %xmm0 +; SSE2-NEXT: movl (%rsi), %eax +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: pextrw $1, %xmm1, %ecx +; SSE2-NEXT: pinsrw $0, %eax, %xmm1 +; SSE2-NEXT: movzbl %ah, %eax +; SSE2-NEXT: pinsrw $2, %eax, %xmm1 +; SSE2-NEXT: pinsrw $4, %ecx, %xmm1 +; SSE2-NEXT: shrl $8, %ecx +; SSE2-NEXT: pinsrw $6, %ecx, %xmm1 +; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_test3c: +; SSSE3: # BB#0: +; SSSE3-NEXT: movd (%rdi), %xmm0 +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,128,128,128,1,128,128,128,2,128,128,128,3,128,128,128] +; SSSE3-NEXT: pshufb %xmm1, %xmm0 +; SSSE3-NEXT: movd (%rsi), %xmm2 +; SSSE3-NEXT: pshufb %xmm1, %xmm2 +; SSSE3-NEXT: movhlps {{.*#+}} xmm0 = xmm2[1],xmm0[1] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_test3c: +; SSE41: # BB#0: +; SSE41-NEXT: pmovzxbd (%rdi), %xmm0 +; SSE41-NEXT: pmovzxbd (%rsi), %xmm1 +; SSE41-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; SSE41-NEXT: retq +; +; AVX-LABEL: combine_test3c: +; AVX: # BB#0: +; AVX-NEXT: vpmovzxbd (%rdi), %xmm0 +; AVX-NEXT: vpmovzxbd (%rsi), %xmm1 +; AVX-NEXT: vmovhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX-NEXT: retq + %A = load <4 x i8>* %a + %B = load <4 x i8>* %b + %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 2, i32 3, i32 5, i32 5> + %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 6, i32 7, i32 0, i32 1> + ret <4 x i8> %2 +} + +define <4 x i8> @combine_test4c(<4 x i8>* %a, <4 x i8>* %b) { +; Composing the two shuffles below gives the single mask [4,1,6,7] (0-3 pick from %A, 4-7 from %B). +; SSE2-LABEL: combine_test4c: +; SSE2: # BB#0: +; SSE2-NEXT: movl (%rdi), %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pextrw $1, %xmm0, %ecx +; SSE2-NEXT: pinsrw $0, %eax, %xmm0 +; SSE2-NEXT: movzbl %ah, 
%eax +; SSE2-NEXT: pinsrw $2, %eax, %xmm0 +; SSE2-NEXT: pinsrw $4, %ecx, %xmm0 +; SSE2-NEXT: shrl $8, %ecx +; SSE2-NEXT: pinsrw $6, %ecx, %xmm0 +; SSE2-NEXT: movl (%rsi), %eax +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: pextrw $1, %xmm1, %ecx +; SSE2-NEXT: pinsrw $0, %eax, %xmm1 +; SSE2-NEXT: movzbl %ah, %eax +; SSE2-NEXT: pinsrw $2, %eax, %xmm1 +; SSE2-NEXT: pinsrw $4, %ecx, %xmm1 +; SSE2-NEXT: shrl $8, %ecx +; SSE2-NEXT: pinsrw $6, %ecx, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[0,2] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,3,1] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_test4c: +; SSSE3: # BB#0: +; SSSE3-NEXT: movd (%rdi), %xmm0 +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,128,128,128,1,128,128,128,2,128,128,128,3,128,128,128] +; SSSE3-NEXT: pshufb %xmm1, %xmm0 +; SSSE3-NEXT: movd (%rsi), %xmm2 +; SSSE3-NEXT: pshufb %xmm1, %xmm2 +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[0,2] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,3,1] +; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm0[2,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_test4c: +; SSE41: # BB#0: +; SSE41-NEXT: pmovzxbd (%rdi), %xmm0 +; SSE41-NEXT: pmovzxbd (%rsi), %xmm1 +; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; SSE41-NEXT: retq +; +; AVX1-LABEL: combine_test4c: +; AVX1: # BB#0: +; AVX1-NEXT: vpmovzxbd (%rdi), %xmm0 +; AVX1-NEXT: vpmovzxbd (%rsi), %xmm1 +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_test4c: +; AVX2: # BB#0: +; AVX2-NEXT: vpmovzxbd (%rdi), %xmm0 +; AVX2-NEXT: vpmovzxbd (%rsi), %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; AVX2-NEXT: retq + %A = load <4 x i8>* %a + %B = load <4 x i8>* %b + %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 3> + 
%2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 1, i32 2, i32 7> + ret <4 x i8> %2 +} + + +; The following test cases are generated from this C++ code +; +;__m128 blend_01(__m128 a, __m128 b) +;{ +; __m128 s = a; +; s = _mm_blend_ps( s, b, 1<<0 ); +; s = _mm_blend_ps( s, b, 1<<1 ); +; return s; +;} +; +;__m128 blend_02(__m128 a, __m128 b) +;{ +; __m128 s = a; +; s = _mm_blend_ps( s, b, 1<<0 ); +; s = _mm_blend_ps( s, b, 1<<2 ); +; return s; +;} +; +;__m128 blend_123(__m128 a, __m128 b) +;{ +; __m128 s = a; +; s = _mm_blend_ps( s, b, 1<<1 ); +; s = _mm_blend_ps( s, b, 1<<2 ); +; s = _mm_blend_ps( s, b, 1<<3 ); +; return s; +;} + +; Ideally, we should collapse the following shuffles into a single one. + +define <4 x float> @combine_blend_01(<4 x float> %a, <4 x float> %b) { +; Composing the two shuffles below gives the single mask [4,5,2,3] (0-3 pick from %a, 4-7 from %b). +; SSE2-LABEL: combine_blend_01: +; SSE2: # BB#0: +; SSE2-NEXT: movss %xmm1, %xmm0 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3] +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_blend_01: +; SSSE3: # BB#0: +; SSSE3-NEXT: movss %xmm1, %xmm0 +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3] +; SSSE3-NEXT: movaps %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_blend_01: +; SSE41: # BB#0: +; SSE41-NEXT: movsd %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: combine_blend_01: +; AVX: # BB#0: +; AVX-NEXT: vmovsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq + %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 undef, i32 2, i32 3> + %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 3> + ret <4 x float> %shuffle6 +} + +define <4 x float> @combine_blend_02(<4 x float> %a, <4 x float> %b) { +; Composing the two shuffles below gives the single mask [4,1,6,3] (0-3 pick from %a, 4-7 from %b). +; SSE2-LABEL: combine_blend_02: +; SSE2: # BB#0: +; SSE2-NEXT: movss %xmm1, %xmm0 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0] +; SSE2-NEXT: 
shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_blend_02: +; SSSE3: # BB#0: +; SSSE3-NEXT: movss %xmm1, %xmm0 +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_blend_02: +; SSE41: # BB#0: +; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] +; SSE41-NEXT: retq +; +; AVX-LABEL: combine_blend_02: +; AVX: # BB#0: +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] +; AVX-NEXT: retq + %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 undef, i32 3> + %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3> + ret <4 x float> %shuffle6 +} + +define <4 x float> @combine_blend_123(<4 x float> %a, <4 x float> %b) { +; Composing the three shuffles below gives the single mask [0,5,6,7] (0-3 pick from %a, 4-7 from %b). +; SSE2-LABEL: combine_blend_123: +; SSE2: # BB#0: +; SSE2-NEXT: movaps %xmm1, %xmm2 +; SSE2-NEXT: movss %xmm0, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3] +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_blend_123: +; SSSE3: # BB#0: +; SSSE3-NEXT: movaps %xmm1, %xmm2 +; SSSE3-NEXT: movss %xmm0, %xmm2 +; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3] +; SSSE3-NEXT: movaps %xmm2, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_blend_123: +; SSE41: # BB#0: +; SSE41-NEXT: movss %xmm0, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: combine_blend_123: +; AVX: # BB#0: +; AVX-NEXT: vmovss %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq + %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 undef, i32 undef> + %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 undef> + %shuffle12 = shufflevector <4 x float> %shuffle6, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7> + ret <4 x float> %shuffle12 +} + +define <4 x i32> 
@combine_test_movhl_1(<4 x i32> %a, <4 x i32> %b) { +; Composing the two shuffles below gives the single mask [6,7,2,3] (0-3 pick from %a, 4-7 from %b). +; SSE-LABEL: combine_test_movhl_1: +; SSE: # BB#0: +; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_test_movhl_1: +; AVX: # BB#0: +; AVX-NEXT: vmovhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX-NEXT: retq + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 7, i32 5, i32 3> + %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 1, i32 0, i32 3> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_test_movhl_2(<4 x i32> %a, <4 x i32> %b) { +; Composing the two shuffles below gives the single mask [6,7,2,3] (0-3 pick from %a, 4-7 from %b). +; SSE-LABEL: combine_test_movhl_2: +; SSE: # BB#0: +; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_test_movhl_2: +; AVX: # BB#0: +; AVX-NEXT: vmovhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX-NEXT: retq + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 0, i32 3, i32 6> + %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 3, i32 7, i32 0, i32 2> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_test_movhl_3(<4 x i32> %a, <4 x i32> %b) { +; Composing the two shuffles below gives the single mask [6,7,2,3] (0-3 pick from %a, 4-7 from %b). +; SSE-LABEL: combine_test_movhl_3: +; SSE: # BB#0: +; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_test_movhl_3: +; AVX: # BB#0: +; AVX-NEXT: vmovhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX-NEXT: retq + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 7, i32 6, i32 3, i32 2> + %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 0, i32 3, i32 2> + ret <4 x i32> %2 +} |

