diff options
| author | Anna Thomas <anna@azul.com> | 2017-04-13 18:59:25 +0000 |
|---|---|---|
| committer | Anna Thomas <anna@azul.com> | 2017-04-13 18:59:25 +0000 |
| commit | dcdb325fee27c1283855cbf47747049f3b7b46bc (patch) | |
| tree | 803f3cb419b0e2f2d078448f30f7c64b37a79387 /llvm/test/Transforms | |
| parent | 4765f17738a9c257fee80deb022cf6b82aa70d40 (diff) | |
| download | bcm5719-llvm-dcdb325fee27c1283855cbf47747049f3b7b46bc.tar.gz bcm5719-llvm-dcdb325fee27c1283855cbf47747049f3b7b46bc.zip | |
[LV] Fix the vector code generation for first order recurrence
Summary:
In first order recurrences where phi's are used outside the loop,
we should generate an additional vector.extract of the second last element from
the vectorized phi update.
This is because we require the phi itself (which is the value at the second last
iteration of the vector loop) and not the phi's update within the loop.
Also fix the code gen when we just unroll, but don't vectorize.
Fixes PR32396.
Reviewers: mssimpso, mkuper, anemet
Subscribers: llvm-commits, mzolotukhin
Differential Revision: https://reviews.llvm.org/D31979
llvm-svn: 300238
Diffstat (limited to 'llvm/test/Transforms')
| -rw-r--r-- | llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll | 26 | ||||
| -rw-r--r-- | llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll | 35 |
2 files changed, 49 insertions, 12 deletions
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll b/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll index e090ddf1d1a..d06e3fdba39 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll @@ -234,15 +234,27 @@ for.body: ; preds = %entry, %for.body br i1 %exitcond, label %for.cond.cleanup, label %for.body } -; CHECK-LABEL: @add_phifail2( -; CHECK-NOT: load <16 x i8>, <16 x i8>* -; CHECK-NOT: add nuw nsw <16 x i32> -; CHECK-NOT: store <16 x i8> ; Function Attrs: nounwind -; FIXME: Currently, if we vectorize this loop, we will generate incorrect code -; if %len evenly divides VF. Vectorized loop code gen returns a_phi = p[len -1], -; whereas it should be the previous value a_phi = p[len -2] +; When we vectorize this loop, we generate correct code +; even when %len exactly divides VF (since we extract from the second last index +; and pass this to the for.cond.cleanup block). Vectorized loop returns +; the correct value a_phi = p[len -2] define i8 @add_phifail2(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %len) #0 { +; CHECK-LABEL: @add_phifail2( +; CHECK: vector.body: +; CHECK: %wide.load = load <16 x i8>, <16 x i8>* +; CHECK: %[[L1:.+]] = zext <16 x i8> %wide.load to <16 x i32> +; CHECK: add nuw nsw <16 x i32> +; CHECK: store <16 x i8> +; CHECK: add i64 %index, 16 +; CHECK: icmp eq i64 %index.next, %n.vec +; CHECK: middle.block: +; CHECK: %vector.recur.extract = extractelement <16 x i32> %[[L1]], i32 15 +; CHECK: %vector.recur.extract.for.phi = extractelement <16 x i32> %[[L1]], i32 14 +; CHECK: for.cond.cleanup: +; CHECK: %a_phi.lcssa = phi i32 [ %scalar.recur, %for.body ], [ %vector.recur.extract.for.phi, %middle.block ] +; CHECK: %ret = trunc i32 %a_phi.lcssa to i8 +; CHECK: ret i8 %ret entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll index 9155820216b..3d1c78038e3 100644 --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll @@ -1,6 +1,7 @@ ; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -dce -instcombine -S | FileCheck %s ; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=2 -dce -instcombine -S | FileCheck %s --check-prefix=UNROLL ; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=2 -S | FileCheck %s --check-prefix=UNROLL-NO-IC +; RUN: opt < %s -loop-vectorize -force-vector-width=1 -force-vector-interleave=2 -S | FileCheck %s --check-prefix=UNROLL-NO-VF target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" @@ -350,11 +351,35 @@ for.end: ret void } -; FIXME: we can vectorize this first order recurrence, by generating two -; extracts - one for the phi `val.phi` and other for the phi update `addx`. -; val.phi at end of loop is 94 + x. -; CHECK-LABEL: extract_second_last_iteration -; CHECK-NOT: vector.body +; We vectorize this first order recurrence, by generating two +; extracts for the phi `val.phi` - one at the last index and +; another at the second last index. We need these 2 extracts because +; the first order recurrence phi is used outside the loop, so we require the phi +; itself and not its update (addx). +; UNROLL-NO-IC-LABEL: extract_second_last_iteration +; UNROLL-NO-IC: vector.body +; UNROLL-NO-IC: %step.add = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4> +; UNROLL-NO-IC: %[[L1:.+]] = add <4 x i32> %vec.ind, %broadcast.splat +; UNROLL-NO-IC: %[[L2:.+]] = add <4 x i32> %step.add, %broadcast.splat +; UNROLL-NO-IC: %index.next = add i32 %index, 8 +; UNROLL-NO-IC: icmp eq i32 %index.next, 96 +; UNROLL-NO-IC: middle.block +; UNROLL-NO-IC: icmp eq i32 96, 96 +; UNROLL-NO-IC: %vector.recur.extract = extractelement <4 x i32> %[[L2]], i32 3 +; UNROLL-NO-IC: %vector.recur.extract.for.phi = extractelement <4 x i32> %[[L2]], i32 2 +; UNROLL-NO-IC: for.end +; UNROLL-NO-IC: %val.phi.lcssa = phi i32 [ %scalar.recur, %for.body ], [ %vector.recur.extract.for.phi, %middle.block ] +; Check the case when unrolled but not vectorized. +; UNROLL-NO-VF-LABEL: extract_second_last_iteration +; UNROLL-NO-VF: vector.body: +; UNROLL-NO-VF: %induction = add i32 %index, 0 +; UNROLL-NO-VF: %induction1 = add i32 %index, 1 +; UNROLL-NO-VF: %[[L1:.+]] = add i32 %induction, %x +; UNROLL-NO-VF: %[[L2:.+]] = add i32 %induction1, %x +; UNROLL-NO-VF: %index.next = add i32 %index, 2 +; UNROLL-NO-VF: icmp eq i32 %index.next, 96 +; UNROLL-NO-VF: for.end: +; UNROLL-NO-VF: %val.phi.lcssa = phi i32 [ %scalar.recur, %for.body ], [ %[[L1]], %middle.block ] define i32 @extract_second_last_iteration(i32* %cval, i32 %x) { entry: br label %for.body |

