[LV] Allow interleaved accesses in loops with predicated blocks

This patch allows the formation of interleaved access groups in loops containing predicated blocks. However, accesses that are themselves located in predicated blocks are still prevented from being added to a group.

Differential Revision: https://reviews.llvm.org/D19694
llvm-svn: 275471
author: Matthew Simpson <mssimpso@codeaurora.org> 2016-07-14 20:59:47 +0000
committer: Matthew Simpson <mssimpso@codeaurora.org> 2016-07-14 20:59:47 +0000
commit: 65ca32b83ce9b69e590196d2feb331b4dcfabfef (patch)
tree: b723e775f0ed8e5b5c5efb20de336a4e5de9a7d6
parent: bbbb3ce7874f4128b57552537855206a26cd88b6 (diff)
download: bcm5719-llvm-65ca32b83ce9b69e590196d2feb331b4dcfabfef.tar.gz
bcm5719-llvm-65ca32b83ce9b69e590196d2feb331b4dcfabfef.zip
2 files changed, 194 insertions, 34 deletions
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 13589418706..2a2c6409d67 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -947,6 +947,11 @@ private:
     return Factor >= 2 && Factor <= MaxInterleaveGroupFactor;
   }
 
+  /// \brief Returns true if \p BB is a predicated block.
+  bool isPredicated(BasicBlock *BB) const {
+    return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
+  }
+
   /// \brief Returns true if LoopAccessInfo can be used for dependence queries.
   bool areDependencesValid() const {
     return LAI && LAI->getDepChecker().getDependences();
@@ -4925,53 +4930,38 @@ bool LoopVectorizationLegality::blockCanBePredicated(
 void InterleavedAccessInfo::collectConstStridedAccesses(
     MapVector<Instruction *, StrideDescriptor> &StrideAccesses,
     const ValueToValueMap &Strides) {
-  // Holds load/store instructions in program order.
-  SmallVector<Instruction *, 16> AccessList;
+
+  auto &DL = TheLoop->getHeader()->getModule()->getDataLayout();
 
   // Since it's desired that the load/store instructions be maintained in
   // "program order" for the interleaved access analysis, we have to visit the
   // blocks in the loop in reverse postorder (i.e., in a topological order).
   // Such an ordering will ensure that any load/store that may be executed
-  // before a second load/store will precede the second load/store in the
-  // AccessList.
+  // before a second load/store will precede the second load/store in
+  // StrideAccesses.
   LoopBlocksDFS DFS(TheLoop);
   DFS.perform(LI);
-  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
-    bool IsPred = LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
-
+  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO()))
     for (auto &I : *BB) {
-      if (!isa<LoadInst>(&I) && !isa<StoreInst>(&I))
+      auto *LI = dyn_cast<LoadInst>(&I);
+      auto *SI = dyn_cast<StoreInst>(&I);
+      if (!LI && !SI)
         continue;
-      // FIXME: Currently we can't handle mixed accesses and predicated accesses
-      if (IsPred)
-        return;
-
-      AccessList.push_back(&I);
-    }
-  }
-
-  if (AccessList.empty())
-    return;
-
-  auto &DL = TheLoop->getHeader()->getModule()->getDataLayout();
-  for (auto I : AccessList) {
-    auto *LI = dyn_cast<LoadInst>(I);
-    auto *SI = dyn_cast<StoreInst>(I);
 
-    Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand();
-    int64_t Stride = getPtrStride(PSE, Ptr, TheLoop, Strides);
+      Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand();
+      int64_t Stride = getPtrStride(PSE, Ptr, TheLoop, Strides);
 
-    const SCEV *Scev = replaceSymbolicStrideSCEV(PSE, Strides, Ptr);
-    PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
-    uint64_t Size = DL.getTypeAllocSize(PtrTy->getElementType());
+      const SCEV *Scev = replaceSymbolicStrideSCEV(PSE, Strides, Ptr);
+      PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
+      uint64_t Size = DL.getTypeAllocSize(PtrTy->getElementType());
 
-    // An alignment of 0 means target ABI alignment.
-    unsigned Align = LI ? LI->getAlignment() : SI->getAlignment();
-    if (!Align)
-      Align = DL.getABITypeAlignment(PtrTy->getElementType());
+      // An alignment of 0 means target ABI alignment.
+      unsigned Align = LI ? LI->getAlignment() : SI->getAlignment();
+      if (!Align)
+        Align = DL.getABITypeAlignment(PtrTy->getElementType());
 
-    StrideAccesses[I] = StrideDescriptor(Stride, Scev, Size, Align);
-  }
+      StrideAccesses[&I] = StrideDescriptor(Stride, Scev, Size, Align);
+    }
 }
 
 // Analyze interleaved accesses and collect them into interleaved load and
@@ -5125,6 +5115,12 @@ void InterleavedAccessInfo::analyzeInterleaving(
       if (DistanceToA % static_cast<int64_t>(DesA.Size))
         continue;
 
+      // If either A or B is in a predicated block, we prevent adding them to a
+      // group. We may be able to relax this limitation in the future once we
+      // handle more complicated blocks.
+      if (isPredicated(A->getParent()) || isPredicated(B->getParent()))
+        continue;
+
       // The index of B is the index of A plus the related index to A.
       int IndexB =
           Group->getIndex(A) + DistanceToA / static_cast<int64_t>(DesA.Size);
diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll
new file mode 100644
index 00000000000..9ee6e6d529a
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll
@@ -0,0 +1,164 @@
+; RUN: opt -S -loop-vectorize -instcombine -force-vector-width=2 -force-vector-interleave=1 -enable-interleaved-mem-accesses -vectorize-num-stores-pred=1 -enable-cond-stores-vec < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+%pair = type { i64, i64 }
+
+; Ensure that we vectorize the interleaved load group even though the loop
+; contains a conditional store. The store group contains gaps and is not
+; vectorized.
+;
+; CHECK-LABEL: @interleaved_with_cond_store_0(
+;
+; CHECK: min.iters.checked
+; CHECK:   %n.mod.vf = and i64 %[[N:.+]], 1
+; CHECK:   %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
+; CHECK:   %[[R:.+]] = select i1 %[[IsZero]], i64 2, i64 %n.mod.vf
+; CHECK:   %n.vec = sub i64 %[[N]], %[[R]]
+;
+; CHECK: vector.body:
+; CHECK:   %wide.vec = load <4 x i64>, <4 x i64>* %{{.*}}
+; CHECK:   %strided.vec = shufflevector <4 x i64> %wide.vec, <4 x i64> undef, <2 x i32> <i32 0, i32 2>
+;
+; CHECK: pred.store.if
+; CHECK:   %[[X1:.+]] = extractelement <4 x i64> %wide.vec, i32 0
+; CHECK:   store i64 %[[X1]], {{.*}}
+;
+; CHECK: pred.store.if
+; CHECK:   %[[X2:.+]] = extractelement <4 x i64> %wide.vec, i32 2
+; CHECK:   store i64 %[[X2]], {{.*}}
+
+define void @interleaved_with_cond_store_0(%pair *%p, i64 %x, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i  = phi i64 [ %i.next, %if.merge ], [ 0, %entry ]
+  %p.1 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 1
+  %0 = load i64, i64* %p.1, align 8
+  %1 = icmp eq i64 %0, %x
+  br i1 %1, label %if.then, label %if.merge
+
+if.then:
+  store i64 %0, i64* %p.1, align 8
+  br label %if.merge
+
+if.merge:
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+; Ensure that we don't form a single interleaved group for the two loads. The
+; conditional store prevents the second load from being hoisted. The two load
+; groups are separately vectorized. The store group contains gaps and is not
+; vectorized.
+;
+; CHECK-LABEL: @interleaved_with_cond_store_1(
+;
+; CHECK: min.iters.checked
+; CHECK:   %n.mod.vf = and i64 %[[N:.+]], 1
+; CHECK:   %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
+; CHECK:   %[[R:.+]] = select i1 %[[IsZero]], i64 2, i64 %n.mod.vf
+; CHECK:   %n.vec = sub i64 %[[N]], %[[R]]
+;
+; CHECK: vector.body:
+; CHECK:   %[[L1:.+]] = load <4 x i64>, <4 x i64>* %{{.*}}
+; CHECK:   %strided.vec = shufflevector <4 x i64> %[[L1]], <4 x i64> undef, <2 x i32> <i32 0, i32 2>
+;
+; CHECK: pred.store.if
+; CHECK:   %[[X1:.+]] = extractelement <4 x i64> %wide.vec, i32 0
+; CHECK:   store i64 %[[X1]], {{.*}}
+;
+; CHECK: pred.store.if
+; CHECK:   %[[X2:.+]] = extractelement <4 x i64> %wide.vec, i32 2
+; CHECK:   store i64 %[[X2]], {{.*}}
+;
+; CHECK: pred.store.continue
+; CHECK:   %[[L2:.+]] = load <4 x i64>, <4 x i64>* {{.*}}
+; CHECK:   %[[X3:.+]] = extractelement <4 x i64> %[[L2]], i32 0
+; CHECK:   store i64 %[[X3]], {{.*}}
+; CHECK:   %[[X4:.+]] = extractelement <4 x i64> %[[L2]], i32 2
+; CHECK:   store i64 %[[X4]], {{.*}}
+
+define void @interleaved_with_cond_store_1(%pair *%p, i64 %x, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i  = phi i64 [ %i.next, %if.merge ], [ 0, %entry ]
+  %p.0 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 0
+  %p.1 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 1
+  %0 = load i64, i64* %p.1, align 8
+  %1 = icmp eq i64 %0, %x
+  br i1 %1, label %if.then, label %if.merge
+
+if.then:
+  store i64 %0, i64* %p.0, align 8
+  br label %if.merge
+
+if.merge:
+  %2 = load i64, i64* %p.0, align 8
+  store i64 %2, i64 *%p.1, align 8
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+; Ensure that we don't create a single interleaved group for the two stores.
+; The second store is conditional and we can't sink the first store inside the
+; predicated block. The load group is vectorized, and the store groups contain
+; gaps and are not vectorized.
+;
+; CHECK-LABEL: @interleaved_with_cond_store_2(
+;
+; CHECK: min.iters.checked
+; CHECK:   %n.mod.vf = and i64 %[[N:.+]], 1
+; CHECK:   %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
+; CHECK:   %[[R:.+]] = select i1 %[[IsZero]], i64 2, i64 %n.mod.vf
+; CHECK:   %n.vec = sub i64 %[[N]], %[[R]]
+;
+; CHECK: vector.body:
+; CHECK:   %[[L1:.+]] = load <4 x i64>, <4 x i64>* %{{.*}}
+; CHECK:   %strided.vec = shufflevector <4 x i64> %[[L1]], <4 x i64> undef, <2 x i32> <i32 0, i32 2>
+; CHECK:   store i64 %x, {{.*}}
+; CHECK:   store i64 %x, {{.*}}
+;
+; CHECK: pred.store.if
+; CHECK:   %[[X1:.+]] = extractelement <4 x i64> %wide.vec, i32 0
+; CHECK:   store i64 %[[X1]], {{.*}}
+;
+; CHECK: pred.store.if
+; CHECK:   %[[X2:.+]] = extractelement <4 x i64> %wide.vec, i32 2
+; CHECK:   store i64 %[[X2]], {{.*}}
+
+define void @interleaved_with_cond_store_2(%pair *%p, i64 %x, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i  = phi i64 [ %i.next, %if.merge ], [ 0, %entry ]
+  %p.0 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 0
+  %p.1 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 1
+  %0 = load i64, i64* %p.1, align 8
+  store i64 %x, i64* %p.0, align 8
+  %1 = icmp eq i64 %0, %x
+  br i1 %1, label %if.then, label %if.merge
+
+if.then:
+  store i64 %0, i64* %p.1, align 8
+  br label %if.merge
+
+if.merge:
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
author	Matthew Simpson <mssimpso@codeaurora.org>	2016-07-14 20:59:47 +0000
committer	Matthew Simpson <mssimpso@codeaurora.org>	2016-07-14 20:59:47 +0000
commit	65ca32b83ce9b69e590196d2feb331b4dcfabfef (patch)
tree	b723e775f0ed8e5b5c5efb20de336a4e5de9a7d6
parent	bbbb3ce7874f4128b57552537855206a26cd88b6 (diff)
download	bcm5719-llvm-65ca32b83ce9b69e590196d2feb331b4dcfabfef.tar.gz bcm5719-llvm-65ca32b83ce9b69e590196d2feb331b4dcfabfef.zip