[Loop Vectorizer] Cost-based decision for vectorization form of memory instruction.

Making the cost model selecting between Interleave, GatherScatter or Scalar vectorization form of memory instruction. The right decision should be done for non-consecutive memory access instrcuctions that may have more than one vectorization solution. This patch includes the following changes: - Cost Model calculates the cost of Load/Store vector form and choose the better option between Widening, Interleave, GatherScactter and Scalarization. Cost Model keeps the widening decision. - Arrays of Uniform and Scalar values are moved from Legality to Cost Model. - Cost Model collects Uniforms and Scalars per VF. The collection is based on CM decision map of Loadis/Stores vectorization form. - Vectorization of memory instruction is performed according to the CM decision. Differential Revision: https://reviews.llvm.org/D27919 llvm-svn: 294503
author: Elena Demikhovsky <elena.demikhovsky@intel.com> 2017-02-08 19:25:23 +0000
committer: Elena Demikhovsky <elena.demikhovsky@intel.com> 2017-02-08 19:25:23 +0000
commit: 5267edd3e390c5f5df784b991d0b57185f0686e9 (patch)
tree: c465d24b1a46e756c97ea5395f7442a3e2be0439 /llvm/test/Transforms
parent: 7a6f3914515192c74ad114ae7a82cc871e5676a5 (diff)
download: bcm5719-llvm-5267edd3e390c5f5df784b991d0b57185f0686e9.tar.gz
bcm5719-llvm-5267edd3e390c5f5df784b991d0b57185f0686e9.zip
3 files changed, 81 insertions, 1 deletions
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll
new file mode 100644
index 00000000000..0ebb7a92eda
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll
@@ -0,0 +1,38 @@
+; REQUIRES: asserts
+; RUN: opt < %s -force-vector-width=2 -force-vector-interleave=1 -loop-vectorize -S --debug-only=loop-vectorize 2>&1 | FileCheck %s
+
+; This test shows extremely high interleaving cost that, probably, should be fixed.
+; Due to the high cost, interleaving is not beneficial and the cost model chooses to scalarize
+; the load instructions.
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+%pair = type { i8, i8 }
+
+; CHECK-LABEL: test
+; CHECK: Found an estimated cost of 20 for VF 2 For instruction:   {{.*}} load i8
+; CHECK: Found an estimated cost of 0 for VF 2 For instruction:   {{.*}} load i8
+; CHECK: vector.body
+; CHECK: load i8
+; CHECK: load i8
+; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
+
+define void @test(%pair* %p, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr %pair, %pair* %p, i64 %i, i32 0
+  %tmp1 = load i8, i8* %tmp0, align 1
+  %tmp2 = getelementptr %pair, %pair* %p, i64 %i, i32 1
+  %tmp3 = load i8, i8* %tmp2, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, %n
+  br i1 %cond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
diff --git a/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll b/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll
index 32bfcd2275a..820335276dc 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll
@@ -18,8 +18,9 @@ target triple = "x86_64-unknown-linux-gnu"
 ; CHECK-NOT: LV: Found uniform instruction: %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
 ; CHECK-NOT: LV: Found uniform instruction: %i.next = add nuw nsw i64 %i, 5
 ; CHECK:     vector.body:
+; CHECK:       %index = phi i64
 ; CHECK:       %vec.ind = phi <16 x i64>
-; CHECK:       %[[T0:.+]] = extractelement <16 x i64> %vec.ind, i32 0
+; CHECK:       %[[T0:.+]] = mul i64 %index, 5
 ; CHECK:       %[[T1:.+]] = getelementptr inbounds %data, %data* %d, i64 0, i32 3, i64 %[[T0]]
 ; CHECK:       %[[T2:.+]] = bitcast float* %[[T1]] to <80 x float>*
 ; CHECK:       load <80 x float>, <80 x float>* %[[T2]], align 4
diff --git a/llvm/test/Transforms/LoopVectorize/X86/gather-vs-interleave.ll b/llvm/test/Transforms/LoopVectorize/X86/gather-vs-interleave.ll
new file mode 100644
index 00000000000..76b6cae5c3b
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/gather-vs-interleave.ll
@@ -0,0 +1,41 @@
+; RUN: opt -loop-vectorize -S -mcpu=skylake-avx512  < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; This test checks that "gather" operation is choosen since it's cost is better
+; than interleaving pattern.
+;
+;unsigned long A[SIZE];
+;unsigned long B[SIZE];
+;
+;void foo() {
+;  for (int i=0; i<N; i+=8) {
+;    B[i] = A[i] + 5;
+;  }
+;}
+
+@A = global [10240 x i64] zeroinitializer, align 16
+@B = global [10240 x i64] zeroinitializer, align 16
+
+
+; CHECK_LABEL: strided_load_i64
+; CHECK: masked.gather
+define void @strided_load_i64() {
+  br label %1
+
+; <label>:1:                                      ; preds = %0, %1
+  %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+  %2 = getelementptr inbounds [10240 x i64], [10240 x i64]* @A, i64 0, i64 %indvars.iv
+  %3 = load i64, i64* %2, align 16
+  %4 = add i64 %3, 5
+  %5 = getelementptr inbounds [10240 x i64], [10240 x i64]* @B, i64 0, i64 %indvars.iv
+  store i64 %4, i64* %5, align 16
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 8
+  %6 = icmp slt i64 %indvars.iv.next, 1024
+  br i1 %6, label %1, label %7
+
+; <label>:7:                                      ; preds = %1
+  ret void
+}
+
author	Elena Demikhovsky <elena.demikhovsky@intel.com>	2017-02-08 19:25:23 +0000
committer	Elena Demikhovsky <elena.demikhovsky@intel.com>	2017-02-08 19:25:23 +0000
commit	5267edd3e390c5f5df784b991d0b57185f0686e9 (patch)
tree	c465d24b1a46e756c97ea5395f7442a3e2be0439 /llvm/test/Transforms
parent	7a6f3914515192c74ad114ae7a82cc871e5676a5 (diff)
download	bcm5719-llvm-5267edd3e390c5f5df784b991d0b57185f0686e9.tar.gz bcm5719-llvm-5267edd3e390c5f5df784b991d0b57185f0686e9.zip