-rw-r--r--   llvm/lib/Transforms/Vectorize/LoopVectorize.cpp                           64
-rw-r--r--   llvm/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll   164
2 files changed, 194 insertions, 34 deletions
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 13589418706..2a2c6409d67 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -947,6 +947,11 @@ private:
return Factor >= 2 && Factor <= MaxInterleaveGroupFactor;
}
+ /// \brief Returns true if \p BB is a predicated block.
+ bool isPredicated(BasicBlock *BB) const {
+ return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
+ }
+
/// \brief Returns true if LoopAccessInfo can be used for dependence queries.
bool areDependencesValid() const {
return LAI && LAI->getDepChecker().getDependences();
@@ -4925,53 +4930,38 @@ bool LoopVectorizationLegality::blockCanBePredicated(
void InterleavedAccessInfo::collectConstStridedAccesses(
MapVector<Instruction *, StrideDescriptor> &StrideAccesses,
const ValueToValueMap &Strides) {
- // Holds load/store instructions in program order.
- SmallVector<Instruction *, 16> AccessList;
+
+ auto &DL = TheLoop->getHeader()->getModule()->getDataLayout();
// Since it's desired that the load/store instructions be maintained in
// "program order" for the interleaved access analysis, we have to visit the
// blocks in the loop in reverse postorder (i.e., in a topological order).
// Such an ordering will ensure that any load/store that may be executed
- // before a second load/store will precede the second load/store in the
- // AccessList.
+ // before a second load/store will precede the second load/store in
+ // StrideAccesses.
LoopBlocksDFS DFS(TheLoop);
DFS.perform(LI);
- for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
- bool IsPred = LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
-
+ for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO()))
for (auto &I : *BB) {
- if (!isa<LoadInst>(&I) && !isa<StoreInst>(&I))
+ auto *LI = dyn_cast<LoadInst>(&I);
+ auto *SI = dyn_cast<StoreInst>(&I);
+ if (!LI && !SI)
continue;
- // FIXME: Currently we can't handle mixed accesses and predicated accesses
- if (IsPred)
- return;
-
- AccessList.push_back(&I);
- }
- }
-
- if (AccessList.empty())
- return;
-
- auto &DL = TheLoop->getHeader()->getModule()->getDataLayout();
- for (auto I : AccessList) {
- auto *LI = dyn_cast<LoadInst>(I);
- auto *SI = dyn_cast<StoreInst>(I);
- Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand();
- int64_t Stride = getPtrStride(PSE, Ptr, TheLoop, Strides);
+ Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand();
+ int64_t Stride = getPtrStride(PSE, Ptr, TheLoop, Strides);
- const SCEV *Scev = replaceSymbolicStrideSCEV(PSE, Strides, Ptr);
- PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
- uint64_t Size = DL.getTypeAllocSize(PtrTy->getElementType());
+ const SCEV *Scev = replaceSymbolicStrideSCEV(PSE, Strides, Ptr);
+ PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
+ uint64_t Size = DL.getTypeAllocSize(PtrTy->getElementType());
- // An alignment of 0 means target ABI alignment.
- unsigned Align = LI ? LI->getAlignment() : SI->getAlignment();
- if (!Align)
- Align = DL.getABITypeAlignment(PtrTy->getElementType());
+ // An alignment of 0 means target ABI alignment.
+ unsigned Align = LI ? LI->getAlignment() : SI->getAlignment();
+ if (!Align)
+ Align = DL.getABITypeAlignment(PtrTy->getElementType());
- StrideAccesses[I] = StrideDescriptor(Stride, Scev, Size, Align);
- }
+ StrideAccesses[&I] = StrideDescriptor(Stride, Scev, Size, Align);
+ }
}
// Analyze interleaved accesses and collect them into interleaved load and
@@ -5125,6 +5115,12 @@ void InterleavedAccessInfo::analyzeInterleaving(
if (DistanceToA % static_cast<int64_t>(DesA.Size))
continue;
+ // If either A or B is in a predicated block, we prevent adding them to a
+ // group. We may be able to relax this limitation in the future once we
+ // handle more complicated blocks.
+ if (isPredicated(A->getParent()) || isPredicated(B->getParent()))
+ continue;
+
// The index of B is the index of A plus the related index to A.
int IndexB =
Group->getIndex(A) + DistanceToA / static_cast<int64_t>(DesA.Size);
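
The net effect of the LoopVectorize.cpp changes is that a predicated block no longer aborts interleaved-access analysis for the whole loop: collectConstStridedAccesses now records every strided load and store in program order, and analyzeInterleaving simply declines to place an access from a predicated block into a group. As a self-contained illustration of why such grouping would be unsafe (a hypothetical example, not part of the patch; the struct and function names are invented), consider two stores to adjacent struct members where only one is guarded:

// Grouping the two stores below into one interleaved store would require
// either sinking the store to p[i].a into the guarded block or hoisting the
// store to p[i].b out of it, and neither is legal in general. The new
// isPredicated() check keeps the guarded access out of the group instead.
struct Pair {
  long a, b;
};

void conditional_member_store(Pair *p, long x, long n) {
  for (long i = 0; i < n; ++i) {
    p[i].a = x;       // executes on every iteration
    if (p[i].b == x)
      p[i].b = 0;     // executes only when the guard holds (predicated block)
  }
}

Before this change, the guarded store would have made collectConstStridedAccesses return early, so the unconditional accesses in the loop also lost any chance of being interleaved.
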
diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll
new file mode 100644
index 00000000000..9ee6e6d529a
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll
@@ -0,0 +1,164 @@
+; RUN: opt -S -loop-vectorize -instcombine -force-vector-width=2 -force-vector-interleave=1 -enable-interleaved-mem-accesses -vectorize-num-stores-pred=1 -enable-cond-stores-vec < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+%pair = type { i64, i64 }
+
+; Ensure that we vectorize the interleaved load group even though the loop
+; contains a conditional store. The store group contains gaps and is not
+; vectorized.
+;
+; CHECK-LABEL: @interleaved_with_cond_store_0(
+;
+; CHECK: min.iters.checked
+; CHECK: %n.mod.vf = and i64 %[[N:.+]], 1
+; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
+; CHECK: %[[R:.+]] = select i1 %[[IsZero]], i64 2, i64 %n.mod.vf
+; CHECK: %n.vec = sub i64 %[[N]], %[[R]]
+;
+; CHECK: vector.body:
+; CHECK: %wide.vec = load <4 x i64>, <4 x i64>* %{{.*}}
+; CHECK: %strided.vec = shufflevector <4 x i64> %wide.vec, <4 x i64> undef, <2 x i32> <i32 0, i32 2>
+;
+; CHECK: pred.store.if
+; CHECK: %[[X1:.+]] = extractelement <4 x i64> %wide.vec, i32 0
+; CHECK: store i64 %[[X1]], {{.*}}
+;
+; CHECK: pred.store.if
+; CHECK: %[[X2:.+]] = extractelement <4 x i64> %wide.vec, i32 2
+; CHECK: store i64 %[[X2]], {{.*}}
+
+define void @interleaved_with_cond_store_0(%pair *%p, i64 %x, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ %i.next, %if.merge ], [ 0, %entry ]
+ %p.1 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 1
+ %0 = load i64, i64* %p.1, align 8
+ %1 = icmp eq i64 %0, %x
+ br i1 %1, label %if.then, label %if.merge
+
+if.then:
+ store i64 %0, i64* %p.1, align 8
+ br label %if.merge
+
+if.merge:
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ ret void
+}
+
+; Ensure that we don't form a single interleaved group for the two loads. The
+; conditional store prevents the second load from being hoisted. The two load
+; groups are separately vectorized. The store group contains gaps and is not
+; vectorized.
+;
+; CHECK-LABEL: @interleaved_with_cond_store_1(
+;
+; CHECK: min.iters.checked
+; CHECK: %n.mod.vf = and i64 %[[N:.+]], 1
+; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
+; CHECK: %[[R:.+]] = select i1 %[[IsZero]], i64 2, i64 %n.mod.vf
+; CHECK: %n.vec = sub i64 %[[N]], %[[R]]
+;
+; CHECK: vector.body:
+; CHECK: %[[L1:.+]] = load <4 x i64>, <4 x i64>* %{{.*}}
+; CHECK: %strided.vec = shufflevector <4 x i64> %[[L1]], <4 x i64> undef, <2 x i32> <i32 0, i32 2>
+;
+; CHECK: pred.store.if
+; CHECK: %[[X1:.+]] = extractelement <4 x i64> %wide.vec, i32 0
+; CHECK: store i64 %[[X1]], {{.*}}
+;
+; CHECK: pred.store.if
+; CHECK: %[[X2:.+]] = extractelement <4 x i64> %wide.vec, i32 2
+; CHECK: store i64 %[[X2]], {{.*}}
+;
+; CHECK: pred.store.continue
+; CHECK: %[[L2:.+]] = load <4 x i64>, <4 x i64>* {{.*}}
+; CHECK: %[[X3:.+]] = extractelement <4 x i64> %[[L2]], i32 0
+; CHECK: store i64 %[[X3]], {{.*}}
+; CHECK: %[[X4:.+]] = extractelement <4 x i64> %[[L2]], i32 2
+; CHECK: store i64 %[[X4]], {{.*}}
+
+define void @interleaved_with_cond_store_1(%pair *%p, i64 %x, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ %i.next, %if.merge ], [ 0, %entry ]
+ %p.0 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 0
+ %p.1 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 1
+ %0 = load i64, i64* %p.1, align 8
+ %1 = icmp eq i64 %0, %x
+ br i1 %1, label %if.then, label %if.merge
+
+if.then:
+ store i64 %0, i64* %p.0, align 8
+ br label %if.merge
+
+if.merge:
+ %2 = load i64, i64* %p.0, align 8
+ store i64 %2, i64 *%p.1, align 8
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ ret void
+}
+
+; Ensure that we don't create a single interleaved group for the two stores.
+; The second store is conditional and we can't sink the first store inside the
+; predicated block. The load group is vectorized, and the store groups contain
+; gaps and are not vectorized.
+;
+; CHECK-LABEL: @interleaved_with_cond_store_2(
+;
+; CHECK: min.iters.checked
+; CHECK: %n.mod.vf = and i64 %[[N:.+]], 1
+; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
+; CHECK: %[[R:.+]] = select i1 %[[IsZero]], i64 2, i64 %n.mod.vf
+; CHECK: %n.vec = sub i64 %[[N]], %[[R]]
+;
+; CHECK: vector.body:
+; CHECK: %[[L1:.+]] = load <4 x i64>, <4 x i64>* %{{.*}}
+; CHECK: %strided.vec = shufflevector <4 x i64> %[[L1]], <4 x i64> undef, <2 x i32> <i32 0, i32 2>
+; CHECK: store i64 %x, {{.*}}
+; CHECK: store i64 %x, {{.*}}
+;
+; CHECK: pred.store.if
+; CHECK: %[[X1:.+]] = extractelement <4 x i64> %wide.vec, i32 0
+; CHECK: store i64 %[[X1]], {{.*}}
+;
+; CHECK: pred.store.if
+; CHECK: %[[X2:.+]] = extractelement <4 x i64> %wide.vec, i32 2
+; CHECK: store i64 %[[X2]], {{.*}}
+
+define void @interleaved_with_cond_store_2(%pair *%p, i64 %x, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ %i.next, %if.merge ], [ 0, %entry ]
+ %p.0 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 0
+ %p.1 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 1
+ %0 = load i64, i64* %p.1, align 8
+ store i64 %x, i64* %p.0, align 8
+ %1 = icmp eq i64 %0, %x
+ br i1 %1, label %if.then, label %if.merge
+
+if.then:
+ store i64 %0, i64* %p.1, align 8
+ br label %if.merge
+
+if.merge:
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ ret void
+}
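
For reference, the loop in @interleaved_with_cond_store_0 corresponds roughly to the following source (a reconstruction for illustration only; the struct and function names are invented, and the IR loop is bottom-tested, so it always runs at least one iteration, which the plain for loop below glosses over):

struct Pair {
  long a, b;
};

void cond_store_loop(Pair *p, long x, long n) {
  for (long i = 0; i < n; ++i) {
    long t = p[i].b;  // unconditional strided load: still widened as part of
                      // an interleaved load group after this change
    if (t == x)
      p[i].b = t;     // conditional store: scalarized and predicated, and its
                      // gapped store group is not vectorized
  }
}

The other two tests vary this shape by adding a second load after the predicated block and by adding an unconditional store before it, checking that accesses separated by a predicated block are not merged into a single interleave group.
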