diff options
author | Sanjay Patel <spatel@rotateright.com> | 2015-04-16 17:52:13 +0000 |
---|---|---|
committer | Sanjay Patel <spatel@rotateright.com> | 2015-04-16 17:52:13 +0000 |
commit | c86867cd5f920ae4edbdc0c657025b3071d21f43 (patch) | |
tree | ab3490e95b75b3a7c4b4da80d59bc0a3579891ba /llvm/test/Transforms | |
parent | c2368aafbc31a33bc9c9757c3d00c44dd3d8ced3 (diff) | |
download | bcm5719-llvm-c86867cd5f920ae4edbdc0c657025b3071d21f43.tar.gz bcm5719-llvm-c86867cd5f920ae4edbdc0c657025b3071d21f43.zip |
[X86, SSE] instcombine common cases of insertps intrinsics into shuffles
This is very similar to D8486 / r232852 (vperm2). If we treat insertps intrinsics
as shufflevectors, we can optimize them better.
I've left all but the full zero case of the zero mask variants out of this patch.
I don't think those can be converted into a single shuffle in all cases, but I'd
be happy to be proven wrong as I was for vperm2f128.
Either way, we'd need to support whatever sequence we come up with for those cases
in the backend before converting them here.
Differential Revision: http://reviews.llvm.org/D8833
llvm-svn: 235124
Diffstat (limited to 'llvm/test/Transforms')
-rw-r--r-- | llvm/test/Transforms/InstCombine/x86-insertps.ll | 117 |
1 files changed, 117 insertions, 0 deletions
diff --git a/llvm/test/Transforms/InstCombine/x86-insertps.ll b/llvm/test/Transforms/InstCombine/x86-insertps.ll new file mode 100644 index 00000000000..487c7274ca1 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/x86-insertps.ll @@ -0,0 +1,117 @@ +; RUN: opt < %s -instcombine -S | FileCheck %s + +declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounwind readnone + +; This should never happen, but make sure we don't crash handling a non-constant immediate byte. + +define <4 x float> @insertps_non_const_imm(<4 x float> %v1, <4 x float> %v2, i8 %c) { + %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 %c) + ret <4 x float> %res + +; CHECK-LABEL: @insertps_non_const_imm +; CHECK-NEXT: call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 %c) +; CHECK-NEXT: ret <4 x float> +} + +; If all zero mask bits are set, return a zero regardless of the other control bits. + +define <4 x float> @insertps_0x0f(<4 x float> %v1, <4 x float> %v2) { + %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 15) + ret <4 x float> %res + +; CHECK-LABEL: @insertps_0x0f +; CHECK-NEXT: ret <4 x float> zeroinitializer +} +define <4 x float> @insertps_0xff(<4 x float> %v1, <4 x float> %v2) { + %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 255) + ret <4 x float> %res + +; CHECK-LABEL: @insertps_0xff +; CHECK-NEXT: ret <4 x float> zeroinitializer +} + +; If some zero mask bits are set, we do not change anything. + +define <4 x float> @insertps_0x03(<4 x float> %v1, <4 x float> %v2) { + %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 3) + ret <4 x float> %res + +; CHECK-LABEL: @insertps_0x03 +; CHECK-NEXT: call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 3) +; CHECK-NEXT: ret <4 x float> +} + +; If no zero mask bits are set, convert to a shuffle. + +define <4 x float> @insertps_0x00(<4 x float> %v1, <4 x float> %v2) { + %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 0) + ret <4 x float> %res + +; CHECK-LABEL: @insertps_0x00 +; CHECK-NEXT: shufflevector <4 x float> %v1, <4 x float> %v2, <4 x i32> <i32 4, i32 1, i32 2, i32 3> +; CHECK-NEXT: ret <4 x float> +} + +define <4 x float> @insertps_0x10(<4 x float> %v1, <4 x float> %v2) { + %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 16) + ret <4 x float> %res + +; CHECK-LABEL: @insertps_0x10 +; CHECK-NEXT: shufflevector <4 x float> %v1, <4 x float> %v2, <4 x i32> <i32 0, i32 4, i32 2, i32 3> +; CHECK-NEXT: ret <4 x float> +} + +define <4 x float> @insertps_0x20(<4 x float> %v1, <4 x float> %v2) { + %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 32) + ret <4 x float> %res + +; CHECK-LABEL: @insertps_0x20 +; CHECK-NEXT: shufflevector <4 x float> %v1, <4 x float> %v2, <4 x i32> <i32 0, i32 1, i32 4, i32 3> +; CHECK-NEXT: ret <4 x float> +} + +define <4 x float> @insertps_0x30(<4 x float> %v1, <4 x float> %v2) { + %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 48) + ret <4 x float> %res + +; CHECK-LABEL: @insertps_0x30 +; CHECK-NEXT: shufflevector <4 x float> %v1, <4 x float> %v2, <4 x i32> <i32 0, i32 1, i32 2, i32 4> +; CHECK-NEXT: ret <4 x float> +} + +define <4 x float> @insertps_0xc0(<4 x float> %v1, <4 x float> %v2) { + %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 192) + ret <4 x float> %res + +; CHECK-LABEL: @insertps_0xc0 +; CHECK-NEXT: shufflevector <4 x float> %v1, <4 x float> %v2, <4 x i32> <i32 7, i32 1, i32 2, i32 3> +; CHECK-NEXT: ret <4 x float> +} + +define <4 x float> @insertps_0xd0(<4 x float> %v1, <4 x float> %v2) { + %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 208) + ret <4 x float> %res + +; CHECK-LABEL: @insertps_0xd0 +; CHECK-NEXT: shufflevector <4 x float> %v1, <4 x float> %v2, <4 x i32> <i32 0, i32 7, i32 2, i32 3> +; CHECK-NEXT: ret <4 x float> +} + +define <4 x float> @insertps_0xe0(<4 x float> %v1, <4 x float> %v2) { + %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 224) + ret <4 x float> %res + +; CHECK-LABEL: @insertps_0xe0 +; CHECK-NEXT: shufflevector <4 x float> %v1, <4 x float> %v2, <4 x i32> <i32 0, i32 1, i32 7, i32 3> +; CHECK-NEXT: ret <4 x float> +} + +define <4 x float> @insertps_0xf0(<4 x float> %v1, <4 x float> %v2) { + %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 240) + ret <4 x float> %res + +; CHECK-LABEL: @insertps_0xf0 +; CHECK-NEXT: shufflevector <4 x float> %v1, <4 x float> %v2, <4 x i32> <i32 0, i32 1, i32 2, i32 7> +; CHECK-NEXT: ret <4 x float> +} + |