diff options
author | Elena Demikhovsky <elena.demikhovsky@intel.com> | 2017-02-08 19:25:23 +0000 |
---|---|---|
committer | Elena Demikhovsky <elena.demikhovsky@intel.com> | 2017-02-08 19:25:23 +0000 |
commit | 5267edd3e390c5f5df784b991d0b57185f0686e9 (patch) | |
tree | c465d24b1a46e756c97ea5395f7442a3e2be0439 /llvm/test/Transforms | |
parent | 7a6f3914515192c74ad114ae7a82cc871e5676a5 (diff) | |
download | bcm5719-llvm-5267edd3e390c5f5df784b991d0b57185f0686e9.tar.gz bcm5719-llvm-5267edd3e390c5f5df784b991d0b57185f0686e9.zip |
[Loop Vectorizer] Cost-based decision for vectorization form of memory instruction.
Making the cost model selecting between Interleave, GatherScatter or Scalar vectorization form of memory instruction.
The right decision should be done for non-consecutive memory access instrcuctions that may have more than one vectorization solution.
This patch includes the following changes:
- Cost Model calculates the cost of Load/Store vector form and choose the better option between Widening, Interleave, GatherScactter and Scalarization. Cost Model keeps the widening decision.
- Arrays of Uniform and Scalar values are moved from Legality to Cost Model.
- Cost Model collects Uniforms and Scalars per VF. The collection is based on CM decision map of Loadis/Stores vectorization form.
- Vectorization of memory instruction is performed according to the CM decision.
Differential Revision: https://reviews.llvm.org/D27919
llvm-svn: 294503
Diffstat (limited to 'llvm/test/Transforms')
3 files changed, 81 insertions, 1 deletions
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll new file mode 100644 index 00000000000..0ebb7a92eda --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll @@ -0,0 +1,38 @@ +; REQUIRES: asserts +; RUN: opt < %s -force-vector-width=2 -force-vector-interleave=1 -loop-vectorize -S --debug-only=loop-vectorize 2>&1 | FileCheck %s + +; This test shows extremely high interleaving cost that, probably, should be fixed. +; Due to the high cost, interleaving is not beneficial and the cost model chooses to scalarize +; the load instructions. + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64--linux-gnu" + +%pair = type { i8, i8 } + +; CHECK-LABEL: test +; CHECK: Found an estimated cost of 20 for VF 2 For instruction: {{.*}} load i8 +; CHECK: Found an estimated cost of 0 for VF 2 For instruction: {{.*}} load i8 +; CHECK: vector.body +; CHECK: load i8 +; CHECK: load i8 +; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body + +define void @test(%pair* %p, i64 %n) { +entry: + br label %for.body + +for.body: + %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] + %tmp0 = getelementptr %pair, %pair* %p, i64 %i, i32 0 + %tmp1 = load i8, i8* %tmp0, align 1 + %tmp2 = getelementptr %pair, %pair* %p, i64 %i, i32 1 + %tmp3 = load i8, i8* %tmp2, align 1 + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp eq i64 %i.next, %n + br i1 %cond, label %for.end, label %for.body + +for.end: + ret void +} + diff --git a/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll b/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll index 32bfcd2275a..820335276dc 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll @@ -18,8 +18,9 @@ target triple = "x86_64-unknown-linux-gnu" ; CHECK-NOT: LV: Found uniform instruction: %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] ; CHECK-NOT: LV: Found uniform instruction: %i.next = add nuw nsw i64 %i, 5 ; CHECK: vector.body: +; CHECK: %index = phi i64 ; CHECK: %vec.ind = phi <16 x i64> -; CHECK: %[[T0:.+]] = extractelement <16 x i64> %vec.ind, i32 0 +; CHECK: %[[T0:.+]] = mul i64 %index, 5 ; CHECK: %[[T1:.+]] = getelementptr inbounds %data, %data* %d, i64 0, i32 3, i64 %[[T0]] ; CHECK: %[[T2:.+]] = bitcast float* %[[T1]] to <80 x float>* ; CHECK: load <80 x float>, <80 x float>* %[[T2]], align 4 diff --git a/llvm/test/Transforms/LoopVectorize/X86/gather-vs-interleave.ll b/llvm/test/Transforms/LoopVectorize/X86/gather-vs-interleave.ll new file mode 100644 index 00000000000..76b6cae5c3b --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/gather-vs-interleave.ll @@ -0,0 +1,41 @@ +; RUN: opt -loop-vectorize -S -mcpu=skylake-avx512 < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; This test checks that "gather" operation is choosen since it's cost is better +; than interleaving pattern. +; +;unsigned long A[SIZE]; +;unsigned long B[SIZE]; +; +;void foo() { +; for (int i=0; i<N; i+=8) { +; B[i] = A[i] + 5; +; } +;} + +@A = global [10240 x i64] zeroinitializer, align 16 +@B = global [10240 x i64] zeroinitializer, align 16 + + +; CHECK_LABEL: strided_load_i64 +; CHECK: masked.gather +define void @strided_load_i64() { + br label %1 + +; <label>:1: ; preds = %0, %1 + %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ] + %2 = getelementptr inbounds [10240 x i64], [10240 x i64]* @A, i64 0, i64 %indvars.iv + %3 = load i64, i64* %2, align 16 + %4 = add i64 %3, 5 + %5 = getelementptr inbounds [10240 x i64], [10240 x i64]* @B, i64 0, i64 %indvars.iv + store i64 %4, i64* %5, align 16 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 8 + %6 = icmp slt i64 %indvars.iv.next, 1024 + br i1 %6, label %1, label %7 + +; <label>:7: ; preds = %1 + ret void +} + |