diff options
author | Sanjay Patel <spatel@rotateright.com> | 2015-03-24 22:39:29 +0000 |
---|---|---|
committer | Sanjay Patel <spatel@rotateright.com> | 2015-03-24 22:39:29 +0000 |
commit | e304bea010879945bf63eacb0b0e2f2d7974fbd8 (patch) | |
tree | 4151f7bc482fb77eedabdbaf68a8ae59cc188df7 | |
parent | 68d535c45f207b8749b6dc2383a05a6e39d13738 (diff) | |
download | bcm5719-llvm-e304bea010879945bf63eacb0b0e2f2d7974fbd8.tar.gz bcm5719-llvm-e304bea010879945bf63eacb0b0e2f2d7974fbd8.zip |
optimize the AVX2 (integer) version of vperm2 into a shuffle
...because this is what happens when an instruction
set puts its underwear on after its pants.
This is an extension of r232852, r233100, and r233110:
http://llvm.org/viewvc/llvm-project?view=revision&revision=232852
http://llvm.org/viewvc/llvm-project?view=revision&revision=233100
http://llvm.org/viewvc/llvm-project?view=revision&revision=233110
llvm-svn: 233127
-rw-r--r-- | llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp | 2 | ||||
-rw-r--r-- | llvm/test/Transforms/InstCombine/x86-vperm2.ll | 34 |
2 files changed, 34 insertions, 2 deletions
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index 8f7825a8664..21243c229c6 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -972,7 +972,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { case Intrinsic::x86_avx_vperm2f128_pd_256: case Intrinsic::x86_avx_vperm2f128_ps_256: case Intrinsic::x86_avx_vperm2f128_si_256: - // TODO: Add the AVX2 version of this instruction. + case Intrinsic::x86_avx2_vperm2i128: if (Value *V = SimplifyX86vperm2(*II, *Builder)) return ReplaceInstUsesWith(*II, V); break; diff --git a/llvm/test/Transforms/InstCombine/x86-vperm2.ll b/llvm/test/Transforms/InstCombine/x86-vperm2.ll index 864296dd189..5c60852dddb 100644 --- a/llvm/test/Transforms/InstCombine/x86-vperm2.ll +++ b/llvm/test/Transforms/InstCombine/x86-vperm2.ll @@ -12,7 +12,7 @@ define <4 x double> @perm2pd_non_const_imm(<4 x double> %a0, <4 x double> %a1, i } -; In the following 3 tests, both zero mask bits of the immediate are set. +; In the following 4 tests, both zero mask bits of the immediate are set. define <4 x double> @perm2pd_0x88(<4 x double> %a0, <4 x double> %a1) { %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 136) @@ -38,6 +38,14 @@ define <8 x i32> @perm2si_0x88(<8 x i32> %a0, <8 x i32> %a1) { ; CHECK-NEXT: ret <8 x i32> zeroinitializer } +define <4 x i64> @perm2i_0x88(<4 x i64> %a0, <4 x i64> %a1) { + %res = call <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64> %a0, <4 x i64> %a1, i8 136) + ret <4 x i64> %res + +; CHECK-LABEL: @perm2i_0x88 +; CHECK-NEXT: ret <4 x i64> zeroinitializer +} + ; The other control bits are ignored when zero mask bits of the immediate are set. @@ -207,6 +215,18 @@ define <8 x float> @perm2ps_0x31(<8 x float> %a0, <8 x float> %a1) { } +; Confirm that the AVX2 version works the same. 
+ +define <4 x i64> @perm2i_0x33(<4 x i64> %a0, <4 x i64> %a1) { + %res = call <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64> %a0, <4 x i64> %a1, i8 51) + ret <4 x i64> %res + +; CHECK-LABEL: @perm2i_0x33 +; CHECK-NEXT: %1 = shufflevector <4 x i64> %a1, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 3> +; CHECK-NEXT: ret <4 x i64> %1 +} + + ; Confirm that when a single zero mask bit is set, we replace a source vector with zeros. define <4 x double> @perm2pd_0x81(<4 x double> %a0, <4 x double> %a1) { @@ -245,7 +265,19 @@ define <4 x double> @perm2pd_0x08(<4 x double> %a0, <4 x double> %a1) { ; CHECK-NEXT: ret <4 x double> } +; Check one more with the AVX2 version. + +define <4 x i64> @perm2i_0x28(<4 x i64> %a0, <4 x i64> %a1) { + %res = call <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64> %a0, <4 x i64> %a1, i8 40) + ret <4 x i64> %res + +; CHECK-LABEL: @perm2i_0x28 +; CHECK-NEXT: shufflevector <4 x i64> <i64 0{{.*}}, <4 x i64> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 5> +; CHECK-NEXT: ret <4 x i64> +} + declare <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone declare <8 x float> @llvm.x86.avx.vperm2f128.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone declare <8 x i32> @llvm.x86.avx.vperm2f128.si.256(<8 x i32>, <8 x i32>, i8) nounwind readnone +declare <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64>, <4 x i64>, i8) nounwind readnone |