[X86, SSE] instcombine common cases of insertps intrinsics into shuffles

This is very similar to D8486 / r232852 (vperm2). If we treat insertps intrinsics as shufflevectors, we can optimize them better. I've left all but the full zero case of the zero mask variants out of this patch. I don't think those can be converted into a single shuffle in all cases, but I'd be happy to be proven wrong as I was for vperm2f128. Either way, we'd need to support whatever sequence we come up with for those cases in the backend before converting them here. Differential Revision: http://reviews.llvm.org/D8833 llvm-svn: 235124
author: Sanjay Patel <spatel@rotateright.com> 2015-04-16 17:52:13 +0000
committer: Sanjay Patel <spatel@rotateright.com> 2015-04-16 17:52:13 +0000
commit: c86867cd5f920ae4edbdc0c657025b3071d21f43 (patch)
tree: ab3490e95b75b3a7c4b4da80d59bc0a3579891ba /llvm/test/Transforms
parent: c2368aafbc31a33bc9c9757c3d00c44dd3d8ced3 (diff)
download: bcm5719-llvm-c86867cd5f920ae4edbdc0c657025b3071d21f43.tar.gz
bcm5719-llvm-c86867cd5f920ae4edbdc0c657025b3071d21f43.zip
1 files changed, 117 insertions, 0 deletions
diff --git a/llvm/test/Transforms/InstCombine/x86-insertps.ll b/llvm/test/Transforms/InstCombine/x86-insertps.ll
new file mode 100644
index 00000000000..487c7274ca1
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/x86-insertps.ll
@@ -0,0 +1,117 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounwind readnone
+
+; This should never happen, but make sure we don't crash handling a non-constant immediate byte.
+
+define <4 x float> @insertps_non_const_imm(<4 x float> %v1, <4 x float> %v2, i8 %c) {
+  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 %c)
+  ret <4 x float> %res
+
+; CHECK-LABEL: @insertps_non_const_imm
+; CHECK-NEXT:  call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 %c)
+; CHECK-NEXT:  ret <4 x float>
+}
+
+; If all zero mask bits are set, return a zero regardless of the other control bits.
+
+define <4 x float> @insertps_0x0f(<4 x float> %v1, <4 x float> %v2) {
+  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 15)
+  ret <4 x float> %res
+
+; CHECK-LABEL: @insertps_0x0f
+; CHECK-NEXT:  ret <4 x float> zeroinitializer
+}
+define <4 x float> @insertps_0xff(<4 x float> %v1, <4 x float> %v2) {
+  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 255)
+  ret <4 x float> %res
+
+; CHECK-LABEL: @insertps_0xff
+; CHECK-NEXT:  ret <4 x float> zeroinitializer
+}
+
+; If some zero mask bits are set, we do not change anything.
+
+define <4 x float> @insertps_0x03(<4 x float> %v1, <4 x float> %v2) {
+  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 3)
+  ret <4 x float> %res
+
+; CHECK-LABEL: @insertps_0x03
+; CHECK-NEXT:  call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 3)
+; CHECK-NEXT:  ret <4 x float>
+}
+
+; If no zero mask bits are set, convert to a shuffle.
+
+define <4 x float> @insertps_0x00(<4 x float> %v1, <4 x float> %v2) {
+  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 0)
+  ret <4 x float> %res
+
+; CHECK-LABEL: @insertps_0x00
+; CHECK-NEXT:  shufflevector <4 x float> %v1, <4 x float> %v2, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+; CHECK-NEXT:  ret <4 x float>
+}
+
+define <4 x float> @insertps_0x10(<4 x float> %v1, <4 x float> %v2) {
+  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 16)
+  ret <4 x float> %res
+
+; CHECK-LABEL: @insertps_0x10
+; CHECK-NEXT:  shufflevector <4 x float> %v1, <4 x float> %v2, <4 x i32> <i32 0, i32 4, i32 2, i32 3>
+; CHECK-NEXT:  ret <4 x float>
+}
+
+define <4 x float> @insertps_0x20(<4 x float> %v1, <4 x float> %v2) {
+  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 32)
+  ret <4 x float> %res
+
+; CHECK-LABEL: @insertps_0x20
+; CHECK-NEXT:  shufflevector <4 x float> %v1, <4 x float> %v2, <4 x i32> <i32 0, i32 1, i32 4, i32 3>
+; CHECK-NEXT:  ret <4 x float>
+}
+
+define <4 x float> @insertps_0x30(<4 x float> %v1, <4 x float> %v2) {
+  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 48)
+  ret <4 x float> %res
+
+; CHECK-LABEL: @insertps_0x30
+; CHECK-NEXT:  shufflevector <4 x float> %v1, <4 x float> %v2, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
+; CHECK-NEXT:  ret <4 x float>
+}
+
+define <4 x float> @insertps_0xc0(<4 x float> %v1, <4 x float> %v2) {
+  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 192)
+  ret <4 x float> %res
+
+; CHECK-LABEL: @insertps_0xc0
+; CHECK-NEXT:  shufflevector <4 x float> %v1, <4 x float> %v2, <4 x i32> <i32 7, i32 1, i32 2, i32 3>
+; CHECK-NEXT:  ret <4 x float>
+}
+
+define <4 x float> @insertps_0xd0(<4 x float> %v1, <4 x float> %v2) {
+  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 208)
+  ret <4 x float> %res
+
+; CHECK-LABEL: @insertps_0xd0
+; CHECK-NEXT:  shufflevector <4 x float> %v1, <4 x float> %v2, <4 x i32> <i32 0, i32 7, i32 2, i32 3>
+; CHECK-NEXT:  ret <4 x float>
+}
+
+define <4 x float> @insertps_0xe0(<4 x float> %v1, <4 x float> %v2) {
+  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 224)
+  ret <4 x float> %res
+
+; CHECK-LABEL: @insertps_0xe0
+; CHECK-NEXT:  shufflevector <4 x float> %v1, <4 x float> %v2, <4 x i32> <i32 0, i32 1, i32 7, i32 3>
+; CHECK-NEXT:  ret <4 x float>
+}
+
+define <4 x float> @insertps_0xf0(<4 x float> %v1, <4 x float> %v2) {
+  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 240)
+  ret <4 x float> %res
+
+; CHECK-LABEL: @insertps_0xf0
+; CHECK-NEXT:  shufflevector <4 x float> %v1, <4 x float> %v2, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; CHECK-NEXT:  ret <4 x float>
+}
+
author	Sanjay Patel <spatel@rotateright.com>	2015-04-16 17:52:13 +0000
committer	Sanjay Patel <spatel@rotateright.com>	2015-04-16 17:52:13 +0000
commit	c86867cd5f920ae4edbdc0c657025b3071d21f43 (patch)
tree	ab3490e95b75b3a7c4b4da80d59bc0a3579891ba /llvm/test/Transforms
parent	c2368aafbc31a33bc9c9757c3d00c44dd3d8ced3 (diff)
download	bcm5719-llvm-c86867cd5f920ae4edbdc0c657025b3071d21f43.tar.gz bcm5719-llvm-c86867cd5f920ae4edbdc0c657025b3071d21f43.zip