summaryrefslogtreecommitdiffstats
path: root/llvm/test
diff options
context:
space:
mode:
authorFilipe Cabecinhas <me@filcab.net>2014-05-08 00:25:16 +0000
committerFilipe Cabecinhas <me@filcab.net>2014-05-08 00:25:16 +0000
commit095d9d573a62ed5cde005b13e7130c25ae749f05 (patch)
tree7456deda73a03eaac0d05ad0e41358de9f049f2d /llvm/test
parent93914a9518797b9c123ee68a0be313d6dc4f6b93 (diff)
downloadbcm5719-llvm-095d9d573a62ed5cde005b13e7130c25ae749f05.tar.gz
bcm5719-llvm-095d9d573a62ed5cde005b13e7130c25ae749f05.zip
Lower certain build_vectors to insertps instructions
Summary: Vectors built with zeros and elements in the same order as another (source) vector are optimized to be built using a single insertps instruction. Also optimize when we move one element in a vector to a different place in that vector while zeroing out some of the other elements. Further optimizations are possible, described in TODO comments. I will be implementing at least some of them in the near future. Added some tests for different cases where this optimization triggers. Reviewers: nadav, delena, craig.topper Subscribers: llvm-commits Differential Revision: http://reviews.llvm.org/D3521 llvm-svn: 208271
Diffstat (limited to 'llvm/test')
-rw-r--r--llvm/test/CodeGen/X86/sse41.ll256
1 files changed, 256 insertions, 0 deletions
diff --git a/llvm/test/CodeGen/X86/sse41.ll b/llvm/test/CodeGen/X86/sse41.ll
index 8db97d9071a..db0d9c5c116 100644
--- a/llvm/test/CodeGen/X86/sse41.ll
+++ b/llvm/test/CodeGen/X86/sse41.ll
@@ -320,3 +320,259 @@ define <4 x i32> @insertps_from_load_ins_elt_undef_i32(<4 x i32> %a, i32* %b) {
%result = shufflevector <4 x i32> %a, <4 x i32> %2, <4 x i32> <i32 0, i32 1, i32 4, i32 3>
ret <4 x i32> %result
}
+
+;;;;;; Shuffles optimizable with a single insertps instruction
+define <4 x float> @shuf_XYZ0(<4 x float> %x, <4 x float> %a) {
+; CHECK-LABEL: shuf_XYZ0:
+; CHECK-NOT: pextrd
+; CHECK-NOT: punpckldq
+; CHECK: insertps $8
+; CHECK: ret
+ %vecext = extractelement <4 x float> %x, i32 0
+ %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
+ %vecext1 = extractelement <4 x float> %x, i32 1
+ %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
+ %vecext3 = extractelement <4 x float> %x, i32 2
+ %vecinit4 = insertelement <4 x float> %vecinit2, float %vecext3, i32 2
+ %vecinit5 = insertelement <4 x float> %vecinit4, float 0.0, i32 3
+ ret <4 x float> %vecinit5
+}
+
+define <4 x float> @shuf_XY00(<4 x float> %x, <4 x float> %a) {
+; CHECK-LABEL: shuf_XY00:
+; CHECK-NOT: pextrd
+; CHECK-NOT: punpckldq
+; CHECK: insertps $12
+; CHECK: ret
+ %vecext = extractelement <4 x float> %x, i32 0
+ %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
+ %vecext1 = extractelement <4 x float> %x, i32 1
+ %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
+ %vecinit3 = insertelement <4 x float> %vecinit2, float 0.0, i32 2
+ %vecinit4 = insertelement <4 x float> %vecinit3, float 0.0, i32 3
+ ret <4 x float> %vecinit4
+}
+
+define <4 x float> @shuf_XYY0(<4 x float> %x, <4 x float> %a) {
+; CHECK-LABEL: shuf_XYY0:
+; CHECK-NOT: pextrd
+; CHECK-NOT: punpckldq
+; CHECK: insertps $104
+; CHECK: ret
+ %vecext = extractelement <4 x float> %x, i32 0
+ %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
+ %vecext1 = extractelement <4 x float> %x, i32 1
+ %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
+ %vecinit4 = insertelement <4 x float> %vecinit2, float %vecext1, i32 2
+ %vecinit5 = insertelement <4 x float> %vecinit4, float 0.0, i32 3
+ ret <4 x float> %vecinit5
+}
+
+define <4 x float> @shuf_XYW0(<4 x float> %x, <4 x float> %a) {
+; CHECK-LABEL: shuf_XYW0:
+; CHECK: insertps $232
+; CHECK: ret
+ %vecext = extractelement <4 x float> %x, i32 0
+ %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
+ %vecext1 = extractelement <4 x float> %x, i32 1
+ %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
+ %vecext2 = extractelement <4 x float> %x, i32 3
+ %vecinit3 = insertelement <4 x float> %vecinit2, float %vecext2, i32 2
+ %vecinit4 = insertelement <4 x float> %vecinit3, float 0.0, i32 3
+ ret <4 x float> %vecinit4
+}
+
+define <4 x float> @shuf_W00W(<4 x float> %x, <4 x float> %a) {
+; CHECK-LABEL: shuf_W00W:
+; CHECK-NOT: pextrd
+; CHECK-NOT: punpckldq
+; CHECK: insertps $198
+; CHECK: ret
+ %vecext = extractelement <4 x float> %x, i32 3
+ %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
+ %vecinit2 = insertelement <4 x float> %vecinit, float 0.0, i32 1
+ %vecinit3 = insertelement <4 x float> %vecinit2, float 0.0, i32 2
+ %vecinit4 = insertelement <4 x float> %vecinit3, float %vecext, i32 3
+ ret <4 x float> %vecinit4
+}
+
+define <4 x float> @shuf_X00A(<4 x float> %x, <4 x float> %a) {
+; CHECK-LABEL: shuf_X00A:
+; CHECK-NOT: movaps
+; CHECK-NOT: shufps
+; CHECK: insertps $48
+; CHECK: ret
+ %vecext = extractelement <4 x float> %x, i32 0
+ %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
+ %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1
+ %vecinit2 = insertelement <4 x float> %vecinit1, float 0.0, i32 2
+ %vecinit4 = shufflevector <4 x float> %vecinit2, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
+ ret <4 x float> %vecinit4
+}
+
+define <4 x float> @shuf_X00X(<4 x float> %x, <4 x float> %a) {
+; CHECK-LABEL: shuf_X00X:
+; CHECK-NOT: movaps
+; CHECK-NOT: shufps
+; CHECK: insertps $48
+; CHECK: ret
+ %vecext = extractelement <4 x float> %x, i32 0
+ %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
+ %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1
+ %vecinit2 = insertelement <4 x float> %vecinit1, float 0.0, i32 2
+ %vecinit4 = shufflevector <4 x float> %vecinit2, <4 x float> %x, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
+ ret <4 x float> %vecinit4
+}
+
+define <4 x float> @shuf_X0YC(<4 x float> %x, <4 x float> %a) {
+; CHECK-LABEL: shuf_X0YC:
+; CHECK: shufps
+; CHECK-NOT: movhlps
+; CHECK-NOT: shufps
+; CHECK: insertps $176
+; CHECK: ret
+ %vecext = extractelement <4 x float> %x, i32 0
+ %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
+ %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1
+ %vecinit3 = shufflevector <4 x float> %vecinit1, <4 x float> %x, <4 x i32> <i32 0, i32 1, i32 5, i32 undef>
+ %vecinit5 = shufflevector <4 x float> %vecinit3, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 6>
+ ret <4 x float> %vecinit5
+}
+
+define <4 x i32> @i32_shuf_XYZ0(<4 x i32> %x, <4 x i32> %a) {
+; CHECK-LABEL: i32_shuf_XYZ0:
+; CHECK-NOT: pextrd
+; CHECK-NOT: punpckldq
+; CHECK: insertps $8
+; CHECK: ret
+ %vecext = extractelement <4 x i32> %x, i32 0
+ %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
+ %vecext1 = extractelement <4 x i32> %x, i32 1
+ %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
+ %vecext3 = extractelement <4 x i32> %x, i32 2
+ %vecinit4 = insertelement <4 x i32> %vecinit2, i32 %vecext3, i32 2
+ %vecinit5 = insertelement <4 x i32> %vecinit4, i32 0, i32 3
+ ret <4 x i32> %vecinit5
+}
+
+define <4 x i32> @i32_shuf_XY00(<4 x i32> %x, <4 x i32> %a) {
+; CHECK-LABEL: i32_shuf_XY00:
+; CHECK-NOT: pextrd
+; CHECK-NOT: punpckldq
+; CHECK: insertps $12
+; CHECK: ret
+ %vecext = extractelement <4 x i32> %x, i32 0
+ %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
+ %vecext1 = extractelement <4 x i32> %x, i32 1
+ %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
+ %vecinit3 = insertelement <4 x i32> %vecinit2, i32 0, i32 2
+ %vecinit4 = insertelement <4 x i32> %vecinit3, i32 0, i32 3
+ ret <4 x i32> %vecinit4
+}
+
+define <4 x i32> @i32_shuf_XYY0(<4 x i32> %x, <4 x i32> %a) {
+; CHECK-LABEL: i32_shuf_XYY0:
+; CHECK-NOT: pextrd
+; CHECK-NOT: punpckldq
+; CHECK: insertps $104
+; CHECK: ret
+ %vecext = extractelement <4 x i32> %x, i32 0
+ %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
+ %vecext1 = extractelement <4 x i32> %x, i32 1
+ %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
+ %vecinit4 = insertelement <4 x i32> %vecinit2, i32 %vecext1, i32 2
+ %vecinit5 = insertelement <4 x i32> %vecinit4, i32 0, i32 3
+ ret <4 x i32> %vecinit5
+}
+
+define <4 x i32> @i32_shuf_XYW0(<4 x i32> %x, <4 x i32> %a) {
+; CHECK-LABEL: i32_shuf_XYW0:
+; CHECK-NOT: pextrd
+; CHECK-NOT: punpckldq
+; CHECK: insertps $232
+; CHECK: ret
+ %vecext = extractelement <4 x i32> %x, i32 0
+ %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
+ %vecext1 = extractelement <4 x i32> %x, i32 1
+ %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
+ %vecext2 = extractelement <4 x i32> %x, i32 3
+ %vecinit3 = insertelement <4 x i32> %vecinit2, i32 %vecext2, i32 2
+ %vecinit4 = insertelement <4 x i32> %vecinit3, i32 0, i32 3
+ ret <4 x i32> %vecinit4
+}
+
+define <4 x i32> @i32_shuf_W00W(<4 x i32> %x, <4 x i32> %a) {
+; CHECK-LABEL: i32_shuf_W00W:
+; CHECK-NOT: pextrd
+; CHECK-NOT: punpckldq
+; CHECK: insertps $198
+; CHECK: ret
+ %vecext = extractelement <4 x i32> %x, i32 3
+ %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
+ %vecinit2 = insertelement <4 x i32> %vecinit, i32 0, i32 1
+ %vecinit3 = insertelement <4 x i32> %vecinit2, i32 0, i32 2
+ %vecinit4 = insertelement <4 x i32> %vecinit3, i32 %vecext, i32 3
+ ret <4 x i32> %vecinit4
+}
+
+define <4 x i32> @i32_shuf_X00A(<4 x i32> %x, <4 x i32> %a) {
+; CHECK-LABEL: i32_shuf_X00A:
+; CHECK-NOT: movaps
+; CHECK-NOT: shufps
+; CHECK: insertps $48
+; CHECK: ret
+ %vecext = extractelement <4 x i32> %x, i32 0
+ %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
+ %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
+ %vecinit2 = insertelement <4 x i32> %vecinit1, i32 0, i32 2
+ %vecinit4 = shufflevector <4 x i32> %vecinit2, <4 x i32> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
+ ret <4 x i32> %vecinit4
+}
+
+define <4 x i32> @i32_shuf_X00X(<4 x i32> %x, <4 x i32> %a) {
+; CHECK-LABEL: i32_shuf_X00X:
+; CHECK-NOT: movaps
+; CHECK-NOT: shufps
+; CHECK: insertps $48
+; CHECK: ret
+ %vecext = extractelement <4 x i32> %x, i32 0
+ %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
+ %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
+ %vecinit2 = insertelement <4 x i32> %vecinit1, i32 0, i32 2
+ %vecinit4 = shufflevector <4 x i32> %vecinit2, <4 x i32> %x, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
+ ret <4 x i32> %vecinit4
+}
+
+define <4 x i32> @i32_shuf_X0YC(<4 x i32> %x, <4 x i32> %a) {
+; CHECK-LABEL: i32_shuf_X0YC:
+; CHECK: shufps
+; CHECK-NOT: movhlps
+; CHECK-NOT: shufps
+; CHECK: insertps $176
+; CHECK: ret
+ %vecext = extractelement <4 x i32> %x, i32 0
+ %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
+ %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
+ %vecinit3 = shufflevector <4 x i32> %vecinit1, <4 x i32> %x, <4 x i32> <i32 0, i32 1, i32 5, i32 undef>
+ %vecinit5 = shufflevector <4 x i32> %vecinit3, <4 x i32> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 6>
+ ret <4 x i32> %vecinit5
+}
+
+;; Test for a bug in the first implementation of LowerBuildVectorv4x32
+define < 4 x float> @test_insertps_no_undef(<4 x float> %x) {
+; CHECK-LABEL: test_insertps_no_undef:
+; CHECK: movaps %xmm0, %xmm1
+; CHECK-NEXT: insertps $8, %xmm1, %xmm1
+; CHECK-NEXT: maxps %xmm1, %xmm0
+; CHECK-NEXT: ret
+ %vecext = extractelement <4 x float> %x, i32 0
+ %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
+ %vecext1 = extractelement <4 x float> %x, i32 1
+ %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
+ %vecext3 = extractelement <4 x float> %x, i32 2
+ %vecinit4 = insertelement <4 x float> %vecinit2, float %vecext3, i32 2
+ %vecinit5 = insertelement <4 x float> %vecinit4, float 0.0, i32 3
+ %mask = fcmp olt <4 x float> %vecinit5, %x
+ %res = select <4 x i1> %mask, <4 x float> %x, <4 x float>%vecinit5
+ ret <4 x float> %res
+}
OpenPOWER on IntegriCloud