summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--llvm/lib/Transforms/Utils/LoopUtils.cpp15
-rw-r--r--llvm/lib/Transforms/Vectorize/LoopVectorize.cpp34
-rw-r--r--llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll26
-rw-r--r--llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll35
4 files changed, 74 insertions, 36 deletions
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
index 444bc16e0a1..175d013a011 100644
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -553,22 +553,13 @@ bool RecurrenceDescriptor::isFirstOrderRecurrence(PHINode *Phi, Loop *TheLoop,
if (!Previous || !TheLoop->contains(Previous) || isa<PHINode>(Previous))
return false;
+ // Ensure every user of the phi node is dominated by the previous value.
+ // The dominance requirement ensures the loop vectorizer will not need to
+ // vectorize the initial value prior to the first iteration of the loop.
for (User *U : Phi->users())
if (auto *I = dyn_cast<Instruction>(U)) {
- // Ensure every user of the phi node is dominated by the previous value.
- // The dominance requirement ensures the loop vectorizer will not need to
- // vectorize the initial value prior to the first iteration of the loop.
if (!DT->dominates(Previous, I))
return false;
- // When the phi node has users outside the loop, the current logic for
- // fixFirstOrderRecurrences may generate incorrect code. Specifically, we
- // extract the last element from the vectorized phi, which would be the
- // update to the phi before exiting the loop. However, what we want is the
- // previous phi value before the update (i.e. the second last update
- // before end of the vectorized loop).
- // See added test cases in first-order-recurrence.ll
- if (!TheLoop->contains(I))
- return false;
}
return true;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index ab62a5d9516..e080fee4ae4 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4078,24 +4078,34 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
// Extract the last vector element in the middle block. This will be the
- // initial value for the recurrence when jumping to the scalar loop.
- // FIXME: Note that the last vector element need not always be the correct one:
- // consider a loop where we have phi uses outside the loop - we need the
- // second last iteration value and not the last one). For now, we avoid
- // considering such cases as firstOrderRecurrences (see
- // isFirstOrderRecurrence).
- auto *Extract = Incoming;
+ // initial value for the recurrence when jumping to the scalar loop.
+ auto *ExtractForScalar = Incoming;
if (VF > 1) {
Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
- Extract = Builder.CreateExtractElement(Extract, Builder.getInt32(VF - 1),
- "vector.recur.extract");
- }
+ ExtractForScalar = Builder.CreateExtractElement(
+ ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract");
+ }
+ // Extract the second last element in the middle block if the
+ // Phi is used outside the loop. We need to extract the phi itself
+ // and not the last element (the phi update in the current iteration). This
+ // will be the value when jumping to the exit block from the LoopMiddleBlock,
+ // when the scalar loop is not run at all.
+ Value *ExtractForPhiUsedOutsideLoop = nullptr;
+ if (VF > 1)
+ ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
+ Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi");
+ // When loop is unrolled without vectorizing, initialize
+ // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value of
+ // `Incoming`. This is analogous to the vectorized case above: extracting the
+ // second last element when VF > 1.
+ else if (UF > 1)
+ ExtractForPhiUsedOutsideLoop = PreviousParts[UF - 2];
// Fix the initial value of the original recurrence in the scalar loop.
Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
for (auto *BB : predecessors(LoopScalarPreHeader)) {
- auto *Incoming = BB == LoopMiddleBlock ? Extract : ScalarInit;
+ auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
Start->addIncoming(Incoming, BB);
}
@@ -4112,7 +4122,7 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
if (!LCSSAPhi)
break;
if (LCSSAPhi->getIncomingValue(0) == Phi) {
- LCSSAPhi->addIncoming(Extract, LoopMiddleBlock);
+ LCSSAPhi->addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
break;
}
}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll b/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
index e090ddf1d1a..d06e3fdba39 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
@@ -234,15 +234,27 @@ for.body: ; preds = %entry, %for.body
br i1 %exitcond, label %for.cond.cleanup, label %for.body
}
-; CHECK-LABEL: @add_phifail2(
-; CHECK-NOT: load <16 x i8>, <16 x i8>*
-; CHECK-NOT: add nuw nsw <16 x i32>
-; CHECK-NOT: store <16 x i8>
; Function Attrs: nounwind
-; FIXME: Currently, if we vectorize this loop, we will generate incorrect code
-; if %len evenly divides VF. Vectorized loop code gen returns a_phi = p[len -1],
-; whereas it should be the previous value a_phi = p[len -2]
+; When we vectorize this loop, we generate correct code
+; even when %len exactly divides VF (since we extract from the second last index
+; and pass this to the for.cond.cleanup block). Vectorized loop returns
+; the correct value a_phi = p[len -2]
define i8 @add_phifail2(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %len) #0 {
+; CHECK-LABEL: @add_phifail2(
+; CHECK: vector.body:
+; CHECK: %wide.load = load <16 x i8>, <16 x i8>*
+; CHECK: %[[L1:.+]] = zext <16 x i8> %wide.load to <16 x i32>
+; CHECK: add nuw nsw <16 x i32>
+; CHECK: store <16 x i8>
+; CHECK: add i64 %index, 16
+; CHECK: icmp eq i64 %index.next, %n.vec
+; CHECK: middle.block:
+; CHECK: %vector.recur.extract = extractelement <16 x i32> %[[L1]], i32 15
+; CHECK: %vector.recur.extract.for.phi = extractelement <16 x i32> %[[L1]], i32 14
+; CHECK: for.cond.cleanup:
+; CHECK: %a_phi.lcssa = phi i32 [ %scalar.recur, %for.body ], [ %vector.recur.extract.for.phi, %middle.block ]
+; CHECK: %ret = trunc i32 %a_phi.lcssa to i8
+; CHECK: ret i8 %ret
entry:
br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
index 9155820216b..3d1c78038e3 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
@@ -1,6 +1,7 @@
; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -dce -instcombine -S | FileCheck %s
; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=2 -dce -instcombine -S | FileCheck %s --check-prefix=UNROLL
; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=2 -S | FileCheck %s --check-prefix=UNROLL-NO-IC
+; RUN: opt < %s -loop-vectorize -force-vector-width=1 -force-vector-interleave=2 -S | FileCheck %s --check-prefix=UNROLL-NO-VF
target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
@@ -350,11 +351,35 @@ for.end:
ret void
}
-; FIXME: we can vectorize this first order recurrence, by generating two
-; extracts - one for the phi `val.phi` and other for the phi update `addx`.
-; val.phi at end of loop is 94 + x.
-; CHECK-LABEL: extract_second_last_iteration
-; CHECK-NOT: vector.body
+; We vectorize this first order recurrence, by generating two
+; extracts for the phi `val.phi` - one at the last index and
+; another at the second last index. We need these 2 extracts because
+; the first order recurrence phi is used outside the loop, so we require the phi
+; itself and not its update (addx).
+; UNROLL-NO-IC-LABEL: extract_second_last_iteration
+; UNROLL-NO-IC: vector.body
+; UNROLL-NO-IC: %step.add = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>
+; UNROLL-NO-IC: %[[L1:.+]] = add <4 x i32> %vec.ind, %broadcast.splat
+; UNROLL-NO-IC: %[[L2:.+]] = add <4 x i32> %step.add, %broadcast.splat
+; UNROLL-NO-IC: %index.next = add i32 %index, 8
+; UNROLL-NO-IC: icmp eq i32 %index.next, 96
+; UNROLL-NO-IC: middle.block
+; UNROLL-NO-IC: icmp eq i32 96, 96
+; UNROLL-NO-IC: %vector.recur.extract = extractelement <4 x i32> %[[L2]], i32 3
+; UNROLL-NO-IC: %vector.recur.extract.for.phi = extractelement <4 x i32> %[[L2]], i32 2
+; UNROLL-NO-IC: for.end
+; UNROLL-NO-IC: %val.phi.lcssa = phi i32 [ %scalar.recur, %for.body ], [ %vector.recur.extract.for.phi, %middle.block ]
+; Check the case when unrolled but not vectorized.
+; UNROLL-NO-VF-LABEL: extract_second_last_iteration
+; UNROLL-NO-VF: vector.body:
+; UNROLL-NO-VF: %induction = add i32 %index, 0
+; UNROLL-NO-VF: %induction1 = add i32 %index, 1
+; UNROLL-NO-VF: %[[L1:.+]] = add i32 %induction, %x
+; UNROLL-NO-VF: %[[L2:.+]] = add i32 %induction1, %x
+; UNROLL-NO-VF: %index.next = add i32 %index, 2
+; UNROLL-NO-VF: icmp eq i32 %index.next, 96
+; UNROLL-NO-VF: for.end:
+; UNROLL-NO-VF: %val.phi.lcssa = phi i32 [ %scalar.recur, %for.body ], [ %[[L1]], %middle.block ]
define i32 @extract_second_last_iteration(i32* %cval, i32 %x) {
entry:
br label %for.body
OpenPOWER on IntegriCloud