| author | Sanjay Patel <spatel@rotateright.com> | 2015-03-19 22:29:40 +0000 |
|---|---|---|
| committer | Sanjay Patel <spatel@rotateright.com> | 2015-03-19 22:29:40 +0000 |
| commit | d5c2d287f98c39968934d4a241a6c1292052e140 (patch) | |
| tree | ce319e077fa28b247f03ed4bf6ef5c570201afe0 /llvm/test/CodeGen | |
| parent | ab58a568ee6fdf7aefc20e1309f3a15d1152ee42 (diff) | |
[X86, AVX] use blends instead of insert128 with index 0
Another case of x86-specific shuffle strength reduction:
avoid generating insert*128 instructions with index 0 because
they are slower than their non-lane-changing blend equivalents.
Shuffle lowering already catches most of these cases, but
the zero vector case and some other paths such as in the
modified test in vector-shuffle-256-v32.ll were getting
through.
Differential Revision: http://reviews.llvm.org/D8366
llvm-svn: 232773
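
To make the change concrete before the diff, here is a minimal sketch in LLVM IR of the pattern being strength-reduced: widening a 128-bit vector into the low lane of a zeroed 256-bit vector. It mirrors the castA test updated below; the function name low_half_into_zero and its RUN/CHECK lines are illustrative assumptions, with the expected instructions taken from this commit's own AVX1 checks.

; Illustrative only -- mirrors the avx-cast.ll castA test in this commit.
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s
; After this change, the low-lane insert should lower to a blend rather
; than vinsertf128 $0:
; CHECK-LABEL: low_half_into_zero:
; CHECK: vxorps
; CHECK-NEXT: vblendps
define <8 x float> @low_half_into_zero(<4 x float> %v) nounwind {
entry:
  ; Elements 0-3 come from %v; elements 4-7 come from the zero vector.
  %widened = shufflevector <4 x float> %v, <4 x float> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4>
  ret <8 x float> %widened
}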
Diffstat (limited to 'llvm/test/CodeGen')
| -rw-r--r-- | llvm/test/CodeGen/X86/2012-04-26-sdglue.ll | 4 |
| -rw-r--r-- | llvm/test/CodeGen/X86/avx-cast.ll | 81 |
| -rw-r--r-- | llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll | 6 |
3 files changed, 70 insertions, 21 deletions
diff --git a/llvm/test/CodeGen/X86/2012-04-26-sdglue.ll b/llvm/test/CodeGen/X86/2012-04-26-sdglue.ll
index e0b09036395..4e3f1f4a6e4 100644
--- a/llvm/test/CodeGen/X86/2012-04-26-sdglue.ll
+++ b/llvm/test/CodeGen/X86/2012-04-26-sdglue.ll
@@ -5,10 +5,10 @@
 ; It's hard to test for the ISEL condition because CodeGen optimizes
 ; away the bugpointed code. Just ensure the basics are still there.
 ;CHECK-LABEL: func:
-;CHECK: vpxor
-;CHECK: vinserti128
+;CHECK: vxorps
 ;CHECK: vpshufd
 ;CHECK: vpbroadcastd
+;CHECK: vinserti128
 ;CHECK: vmulps
 ;CHECK: vmulps
 ;CHECK: ret
diff --git a/llvm/test/CodeGen/X86/avx-cast.ll b/llvm/test/CodeGen/X86/avx-cast.ll
index f1233b0251c..b4798f15945 100644
--- a/llvm/test/CodeGen/X86/avx-cast.ll
+++ b/llvm/test/CodeGen/X86/avx-cast.ll
@@ -1,51 +1,100 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
+
+; Prefer a blend instruction to a vinsert128 instruction because blends
+; are simpler (no lane changes) and therefore will have equal or better
+; performance.
 
-; CHECK-LABEL: castA:
-; CHECK: vxorps
-; CHECK-NEXT: vinsertf128 $0
 define <8 x float> @castA(<4 x float> %m) nounwind uwtable readnone ssp {
+; AVX1-LABEL: castA:
+; AVX1: vxorps %ymm1, %ymm1, %ymm1
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: castA:
+; AVX2: vxorps %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: retq
+
 entry:
   %shuffle.i = shufflevector <4 x float> %m, <4 x float> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4>
   ret <8 x float> %shuffle.i
 }
 
-; CHECK-LABEL: castB:
-; CHECK: vxorps
-; CHECK-NEXT: vinsertf128 $0
 define <4 x double> @castB(<2 x double> %m) nounwind uwtable readnone ssp {
+; AVX1-LABEL: castB:
+; AVX1: vxorpd %ymm1, %ymm1, %ymm1
+; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: castB:
+; AVX2: vxorpd %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
+; AVX2-NEXT: retq
+
 entry:
   %shuffle.i = shufflevector <2 x double> %m, <2 x double> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
   ret <4 x double> %shuffle.i
 }
 
-; CHECK-LABEL: castC:
-; CHECK: vxorps
-; CHECK-NEXT: vinsertf128 $0
+; AVX2 is needed for integer types.
+
 define <4 x i64> @castC(<2 x i64> %m) nounwind uwtable readnone ssp {
+; AVX1-LABEL: castC:
+; AVX1: vxorps %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: castC:
+; AVX2: vpxor %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: retq
+
 entry:
   %shuffle.i = shufflevector <2 x i64> %m, <2 x i64> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
   ret <4 x i64> %shuffle.i
 }
 
-; CHECK-LABEL: castD:
-; CHECK-NOT: vextractf128 $0
+; The next three tests don't need any shuffling. There may or may not be a
+; vzeroupper before the return, so just check for the absence of shuffles.
+
 define <4 x float> @castD(<8 x float> %m) nounwind uwtable readnone ssp {
+; AVX1-LABEL: castD:
+; AVX1-NOT: extract
+; AVX1-NOT: blend
+;
+; AVX2-LABEL: castD:
+; AVX2-NOT: extract
+; AVX2-NOT: blend
+
 entry:
   %shuffle.i = shufflevector <8 x float> %m, <8 x float> %m, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   ret <4 x float> %shuffle.i
 }
 
-; CHECK-LABEL: castE:
-; CHECK-NOT: vextractf128 $0
 define <2 x i64> @castE(<4 x i64> %m) nounwind uwtable readnone ssp {
+; AVX1-LABEL: castE:
+; AVX1-NOT: extract
+; AVX1-NOT: blend
+;
+; AVX2-LABEL: castE:
+; AVX2-NOT: extract
+; AVX2-NOT: blend
+
 entry:
   %shuffle.i = shufflevector <4 x i64> %m, <4 x i64> %m, <2 x i32> <i32 0, i32 1>
   ret <2 x i64> %shuffle.i
 }
 
-; CHECK-LABEL: castF:
-; CHECK-NOT: vextractf128 $0
 define <2 x double> @castF(<4 x double> %m) nounwind uwtable readnone ssp {
+; AVX1-LABEL: castF:
+; AVX1-NOT: extract
+; AVX1-NOT: blend
+;
+; AVX2-LABEL: castF:
+; AVX2-NOT: extract
+; AVX2-NOT: blend
+
 entry:
   %shuffle.i = shufflevector <4 x double> %m, <4 x double> %m, <2 x i32> <i32 0, i32 1>
   ret <2 x double> %shuffle.i
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
index ed3c66637ed..f9f4b96be3c 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -652,12 +652,12 @@ define <32 x i8> @shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
 ;
 ; AVX2-LABEL: shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
 ; AVX2: # BB#0:
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
 ; AVX2-NEXT: movl $15, %eax
 ; AVX2-NEXT: vmovd %eax, %xmm1
 ; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
-; AVX2-NEXT: vinserti128 $0, %xmm1, %ymm2, %ymm1
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3,0,1]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7]
+; AVX2-NEXT: vpblendd $15, %ymm1, %ymm2, %ymm1
 ; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: retq
   %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 31, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
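
A note on why the vpblendd replacement in the final hunk is equivalent (this is standard AVX2 semantics, not text from the commit): the vpblendd immediate is a per-doubleword selection mask, so $15 = 0b00001111 selects the four low dwords from one source (the register loaded via vmovd) and the four high dwords from the other (the zeroed register). That is exactly the result of vinserti128 $0 into a zero vector, with no cross-lane data movement. At the IR level, the same dword selection can be written as the sketch below; the function name is hypothetical.

; Hedged sketch of the selection performed by a low-lane dword blend:
; mask indices 0-3 pick from %a, indices 12-15 pick elements 4-7 of the
; zero vector (operand-two elements are numbered 8-15 in a shuffle of
; two <8 x i32> values).
define <8 x i32> @blend_low_dwords(<8 x i32> %a) {
entry:
  %r = shufflevector <8 x i32> %a, <8 x i32> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
  ret <8 x i32> %r
}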

