Fix SLPVectorizer commutativity reordering

The SLPVectorizer had a very crude way of trying to benefit from commutativity: it tried to optimize for splat/broadcast, or to have the same operator on the same side. This is beneficial to the cost model and allows more vectorization to occur. This patch improves the logic and makes the detection optimal (locally: we don't look at the full tree, only at the immediate children). Should fix https://llvm.org/bugs/show_bug.cgi?id=25247 Reviewers: mzolotukhin Differential Revision: http://reviews.llvm.org/D13996 From: Mehdi Amini <mehdi.amini@apple.com> llvm-svn: 252337
author: Mehdi Amini <mehdi.amini@apple.com> 2015-11-06 20:17:51 +0000
committer: Mehdi Amini <mehdi.amini@apple.com> 2015-11-06 20:17:51 +0000
commit: b0e3192a485f0db4c543aed262b125bb029ca62d (patch)
tree: e8c7ae1f600c16b6217113da9a3823a8bae5b1ee /llvm/test
parent: 7bcaec83bef146a4f3d5533f7eb584e795e8348d (diff)
download: bcm5719-llvm-b0e3192a485f0db4c543aed262b125bb029ca62d.tar.gz
bcm5719-llvm-b0e3192a485f0db4c543aed262b125bb029ca62d.zip
1 files changed, 78 insertions, 0 deletions
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll b/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll
new file mode 100644
index 00000000000..2798ccb15e4
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll
@@ -0,0 +1,78 @@
+; RUN: opt -slp-vectorizer < %s -S | FileCheck %s
+
+; Verify that the SLP vectorizer is able to figure out that commutativity
+; offers the possibility to splat/broadcast %c and thus make it profitable
+; to vectorize this case
+
+
+; ModuleID = 'bugpoint-reduced-simplified.bc'
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.11.0"
+
+@cle = external unnamed_addr global [32 x i8], align 16
+@cle32 = external unnamed_addr global [32 x i32], align 16
+
+
+; Check that we correctly detect a splat/broadcast by leveraging the
+; commutativity property of `xor`.
+
+; CHECK-LABEL:  @splat
+; CHECK:  store <16 x i8>
+define void @splat(i8 %a, i8 %b, i8 %c) {  ; 16 xor+store pairs filling @cle[0..15]; every xor has %c as one of its operands
+  %1 = xor i8 %c, %a  ; %c is the first operand here ...
+  store i8 %1, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 0), align 16  ; the only store in this function with an explicit alignment
+  %2 = xor i8 %a, %c  ; ... and the second operand here: same values, swapped positions
+  store i8 %2, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 1)
+  %3 = xor i8 %a, %c
+  store i8 %3, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 2)
+  %4 = xor i8 %a, %c
+  store i8 %4, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 3)
+  %5 = xor i8 %c, %a
+  store i8 %5, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 4)
+  %6 = xor i8 %c, %b  ; the non-%c operand is %b rather than %a for this element (and for %8)
+  store i8 %6, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 5)
+  %7 = xor i8 %c, %a
+  store i8 %7, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 6)
+  %8 = xor i8 %c, %b
+  store i8 %8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 7)
+  %9 = xor i8 %a, %c  ; from here to %16 every xor is written as (%a, %c)
+  store i8 %9, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 8)
+  %10 = xor i8 %a, %c
+  store i8 %10, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 9)
+  %11 = xor i8 %a, %c
+  store i8 %11, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 10)
+  %12 = xor i8 %a, %c
+  store i8 %12, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 11)
+  %13 = xor i8 %a, %c
+  store i8 %13, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 12)
+  %14 = xor i8 %a, %c
+  store i8 %14, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 13)
+  %15 = xor i8 %a, %c
+  store i8 %15, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 14)
+  %16 = xor i8 %a, %c
+  store i8 %16, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 15)
+  ret void
+}
+
+
+
+; Check that we correctly detect that we can have the same opcode on one side by
+; leveraging the commutativity property of `xor`.
+
+; CHECK-LABEL:  @same_opcode_on_one_side
+; CHECK:  store <4 x i32>
+define void @same_opcode_on_one_side(i32 %a, i32 %b, i32 %c) {  ; 4 xor+store pairs filling @cle32[0..3]; each xor takes one add result plus one argument
+  %add1 = add i32 %c, %a  ; all four adds combine %a and %c; only %add3 writes them as (%a, %c)
+  %add2 = add i32 %c, %a
+  %add3 = add i32 %a, %c
+  %add4 = add i32 %c, %a
+  %1 = xor i32 %add1, %a  ; the add result is the first xor operand only for this element
+  store i32 %1, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @cle32, i64 0, i64 0), align 16  ; the only store in this function with an explicit alignment
+  %2 = xor i32 %b, %add2  ; the add result is the second xor operand here and in %3, %4
+  store i32 %2, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @cle32, i64 0, i64 1)
+  %3 = xor i32 %c, %add3
+  store i32 %3, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @cle32, i64 0, i64 2)
+  %4 = xor i32 %a, %add4
+  store i32 %4, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @cle32, i64 0, i64 3)
+  ret void
+}
author	Mehdi Amini <mehdi.amini@apple.com>	2015-11-06 20:17:51 +0000
committer	Mehdi Amini <mehdi.amini@apple.com>	2015-11-06 20:17:51 +0000
commit	b0e3192a485f0db4c543aed262b125bb029ca62d (patch)
tree	e8c7ae1f600c16b6217113da9a3823a8bae5b1ee /llvm/test
parent	7bcaec83bef146a4f3d5533f7eb584e795e8348d (diff)
download	bcm5719-llvm-b0e3192a485f0db4c543aed262b125bb029ca62d.tar.gz bcm5719-llvm-b0e3192a485f0db4c543aed262b125bb029ca62d.zip