From 1a80595a2b803949608e11d522d715a54c3a11bc Mon Sep 17 00:00:00 2001
From: Filipe Cabecinhas
Date: Thu, 24 Apr 2014 00:38:14 +0000
Subject: Optimize some special cases for SSE4a insertqi

Summary:
Since the upper 64 bits of the destination register are undefined when
performing this operation, an insertqi that writes all 64 lower bits is
just a copy of its second operand; we can substitute that operand and
let the optimizer figure out that only a copy is needed.

Also added range merging: if an instruction copies a range that can be
merged with a previously copied range, the two insertqi calls are
combined into one.

Added test cases for both optimizations.

Reviewers: grosbach, nadav

CC: llvm-commits

Differential Revision: http://reviews.llvm.org/D3357

llvm-svn: 207055
---
 .../Transforms/InstCombine/vec_demanded_elts.ll | 97 ++++++++++++++++++++++
 1 file changed, 97 insertions(+)

diff --git a/llvm/test/Transforms/InstCombine/vec_demanded_elts.ll b/llvm/test/Transforms/InstCombine/vec_demanded_elts.ll
index f75ea032735..70370196e15 100644
--- a/llvm/test/Transforms/InstCombine/vec_demanded_elts.ll
+++ b/llvm/test/Transforms/InstCombine/vec_demanded_elts.ll
@@ -209,6 +209,103 @@ define <4 x float> @test_select(float %f, float %g) {
   ret <4 x float> %ret
 }
 
+; We should optimize these two redundant insertqi calls into one
+; CHECK: define <2 x i64> @testInsertTwice(<2 x i64> %v, <2 x i64> %i)
+define <2 x i64> @testInsertTwice(<2 x i64> %v, <2 x i64> %i) {
+; CHECK: call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 32)
+; CHECK-NOT: insertqi
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 32)
+  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 32)
+  ret <2 x i64> %2
+}
+
+; The result of this insert is the second arg, since the top 64 bits of
+; the result are undefined, and we copy the bottom 64 bits from the
+; second arg
+; CHECK: define <2 x i64> @testInsert64Bits(<2 x i64> %v, <2 x i64> %i)
+define <2 x i64> @testInsert64Bits(<2 x i64> %v, <2 x i64> %i) {
+; CHECK: ret <2 x i64> %i
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 64, i8 0)
+  ret <2 x i64> %1
+}
+
+; Test the several types of ranges and orderings that exist for two insertqi calls
+; CHECK: define <2 x i64> @testInsertContainedRange(<2 x i64> %v, <2 x i64> %i)
+define <2 x i64> @testInsertContainedRange(<2 x i64> %v, <2 x i64> %i) {
+; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
+; CHECK: ret <2 x i64> %[[RES]]
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
+  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 16)
+  ret <2 x i64> %2
+}
+
+; CHECK: define <2 x i64> @testInsertContainedRange_2(<2 x i64> %v, <2 x i64> %i)
+define <2 x i64> @testInsertContainedRange_2(<2 x i64> %v, <2 x i64> %i) {
+; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
+; CHECK: ret <2 x i64> %[[RES]]
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 16)
+  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 0)
+  ret <2 x i64> %2
+}
+
+; CHECK: define <2 x i64> @testInsertOverlappingRange(<2 x i64> %v, <2 x i64> %i)
+define <2 x i64> @testInsertOverlappingRange(<2 x i64> %v, <2 x i64> %i) {
+; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0)
+; CHECK: ret <2 x i64> %[[RES]]
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
+  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 16)
+  ret <2 x i64> %2
+}
+
+; CHECK: define <2 x i64> @testInsertOverlappingRange_2(<2 x i64> %v, <2 x i64> %i)
+define <2 x i64> @testInsertOverlappingRange_2(<2 x i64> %v, <2 x i64> %i) {
+; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0)
+; CHECK: ret <2 x i64> %[[RES]]
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 16)
+  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 0)
+  ret <2 x i64> %2
+}
+
+; CHECK: define <2 x i64> @testInsertAdjacentRange(<2 x i64> %v, <2 x i64> %i)
+define <2 x i64> @testInsertAdjacentRange(<2 x i64> %v, <2 x i64> %i) {
+; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0)
+; CHECK: ret <2 x i64> %[[RES]]
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
+  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
+  ret <2 x i64> %2
+}
+
+; CHECK: define <2 x i64> @testInsertAdjacentRange_2(<2 x i64> %v, <2 x i64> %i)
+define <2 x i64> @testInsertAdjacentRange_2(<2 x i64> %v, <2 x i64> %i) {
+; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0)
+; CHECK: ret <2 x i64> %[[RES]]
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 32)
+  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 0)
+  ret <2 x i64> %2
+}
+
+; CHECK: define <2 x i64> @testInsertDisjointRange(<2 x i64> %v, <2 x i64> %i)
+define <2 x i64> @testInsertDisjointRange(<2 x i64> %v, <2 x i64> %i) {
+; CHECK: tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0)
+; CHECK: tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0)
+  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
+  ret <2 x i64> %2
+}
+
+; CHECK: define <2 x i64> @testInsertDisjointRange_2(<2 x i64> %v, <2 x i64> %i)
+define <2 x i64> @testInsertDisjointRange_2(<2 x i64> %v, <2 x i64> %i) {
+; CHECK: tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 32)
+; CHECK: tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 0)
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 32)
+  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 0)
+  ret <2 x i64> %2
+}
+
+
+; CHECK: declare <2 x i64> @llvm.x86.sse4a.insertqi
+declare <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64>, <2 x i64>, i8, i8) nounwind
+
 declare <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float>, <4 x i32>)
 
 define <4 x float> @test_vpermilvar_ps(<4 x float> %v) {
 ; CHECK-LABEL: @test_vpermilvar_ps(
-- 
cgit v1.2.3
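For readers unfamiliar with SSE4a INSERTQ, the 64-bit special case above can
be sanity-checked with a small scalar model of the instruction's low-quadword
behavior. This is a minimal sketch, not part of the patch: insertq_low is a
hypothetical helper assuming INSERTQ's documented semantics (the low `len`
bits of the source are written at bit `idx` of the destination's low
quadword, and bits 64-127 of the result are undefined), which is why a
(len 64, idx 0) insert reduces to a plain copy of the second operand.

// Scalar sketch of INSERTQ's low-quadword semantics (assumption: src[0, len)
// is written into dst[idx, idx+len); result bits 64..127 are undefined).
// insertq_low is a hypothetical helper for illustration, not an LLVM API.
#include <cassert>
#include <cstdint>

static uint64_t insertq_low(uint64_t dst, uint64_t src, unsigned len,
                            unsigned idx) {
  assert(len >= 1 && len <= 64 && idx + len <= 64 && "invalid bit field");
  // A full-width insert replaces the whole low quadword (also avoids the
  // undefined 64-bit shift below).
  if (len == 64)
    return src;
  uint64_t mask = ((uint64_t(1) << len) - 1) << idx;
  return (dst & ~mask) | ((src << idx) & mask);
}

int main() {
  // len == 64, idx == 0 mirrors the testInsert64Bits fold: every defined bit
  // of the result equals the second operand, so the call is just a copy.
  assert(insertq_low(0x1122334455667788u, 0xAABBCCDDEEFF0011u, 64, 0) ==
         0xAABBCCDDEEFF0011u);
  // A 16-bit insert at bit 32, as in the disjoint-range tests.
  assert(insertq_low(0x0000000000000000u, 0x000000000000FFFFu, 16, 32) ==
         0x0000FFFF00000000u);
  return 0;
}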