diff options
-rw-r--r-- | llvm/include/llvm/Analysis/LoopAccessAnalysis.h | 3 | ||||
-rw-r--r-- | llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp | 18 | ||||
-rw-r--r-- | llvm/test/Transforms/LoopLoadElim/non-consecutive.ll | 43 |
3 files changed, 58 insertions, 6 deletions
diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h index 13aa0f53ca5..bcf4e5d52f7 100644 --- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h +++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h @@ -663,7 +663,8 @@ const SCEV *replaceSymbolicStrideSCEV(PredicatedScalarEvolution &PSE, /// The \p Assume parameter indicates if we are allowed to make additional /// run-time assumptions. int isStridedPtr(PredicatedScalarEvolution &PSE, Value *Ptr, const Loop *Lp, - const ValueToValueMap &StridesMap, bool Assume = false); + const ValueToValueMap &StridesMap = ValueToValueMap(), + bool Assume = false); /// \brief Returns true if the memory operations \p A and \p B are consecutive. /// This is a simple API that does not depend on the analysis pass. diff --git a/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp b/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp index c6cec12133a..e4419113d45 100644 --- a/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp +++ b/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp @@ -61,7 +61,8 @@ struct StoreToLoadForwardingCandidate { /// \brief Return true if the dependence from the store to the load has a /// distance of one. E.g. A[i+1] = A[i] - bool isDependenceDistanceOfOne(PredicatedScalarEvolution &PSE) const { + bool isDependenceDistanceOfOne(PredicatedScalarEvolution &PSE, + Loop *L) const { Value *LoadPtr = Load->getPointerOperand(); Value *StorePtr = Store->getPointerOperand(); Type *LoadPtrType = LoadPtr->getType(); @@ -72,6 +73,13 @@ struct StoreToLoadForwardingCandidate { LoadType == StorePtr->getType()->getPointerElementType() && "Should be a known dependence"); + // Currently we only support accesses with unit stride. FIXME: we should be + // able to handle non-unit strides as well as long as the stride is equal to + // the dependence distance. 
+ if (isStridedPtr(PSE, LoadPtr, L) != 1 || + isStridedPtr(PSE, StorePtr, L) != 1) + return false; + auto &DL = Load->getParent()->getModule()->getDataLayout(); unsigned TypeByteSize = DL.getTypeAllocSize(const_cast<Type *>(LoadType)); @@ -83,7 +91,7 @@ struct StoreToLoadForwardingCandidate { auto *Dist = cast<SCEVConstant>( PSE.getSE()->getMinusSCEV(StorePtrSCEV, LoadPtrSCEV)); const APInt &Val = Dist->getAPInt(); - return Val.abs() == TypeByteSize; + return Val == TypeByteSize; } Value *getLoadPtr() const { return Load->getPointerOperand(); } @@ -223,8 +231,8 @@ public: // so deciding which one forwards is easy. The later one forwards as // long as they both have a dependence distance of one to the load. if (Cand.Store->getParent() == OtherCand->Store->getParent() && - Cand.isDependenceDistanceOfOne(PSE) && - OtherCand->isDependenceDistanceOfOne(PSE)) { + Cand.isDependenceDistanceOfOne(PSE, L) && + OtherCand->isDependenceDistanceOfOne(PSE, L)) { // They are in the same block, the later one will forward to the load. if (getInstrIndex(OtherCand->Store) < getInstrIndex(Cand.Store)) OtherCand = &Cand; @@ -441,7 +449,7 @@ public: // Check whether the SCEV difference is the same as the induction step, // thus we load the value in the next iteration. - if (!Cand.isDependenceDistanceOfOne(PSE)) + if (!Cand.isDependenceDistanceOfOne(PSE, L)) continue; ++NumForwarding; diff --git a/llvm/test/Transforms/LoopLoadElim/non-consecutive.ll b/llvm/test/Transforms/LoopLoadElim/non-consecutive.ll new file mode 100644 index 00000000000..43751a8ff60 --- /dev/null +++ b/llvm/test/Transforms/LoopLoadElim/non-consecutive.ll @@ -0,0 +1,43 @@ +; RUN: opt -loop-load-elim -S < %s | FileCheck %s + +; The accesses to A are independent here but LAA reports it as a loop-carried +; forward dependence. Check that we don't perform st->ld forwarding between +; them. 
+; +; for (unsigned i = 0; i < 100; i++) { +; A[i][1] = B[i] + 2; +; C[i] = A[i][0] * 2; +; } + +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" + +define void @f([2 x i32]* noalias %A, i32* noalias %B, i32* noalias %C, i64 %N) { + +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + + %A1idx = getelementptr inbounds [2 x i32], [2 x i32]* %A, i64 %indvars.iv, i32 1 + %Bidx = getelementptr inbounds i32, i32* %B, i64 %indvars.iv + %Cidx = getelementptr inbounds i32, i32* %C, i64 %indvars.iv + %A0idx = getelementptr inbounds [2 x i32], [2 x i32]* %A, i64 %indvars.iv, i32 0 + + %b = load i32, i32* %Bidx, align 4 + %a_p1 = add i32 %b, 2 + store i32 %a_p1, i32* %A1idx, align 4 + +; CHECK: %a = load i32, i32* %A0idx, align 4 + %a = load i32, i32* %A0idx, align 4 +; CHECK: %c = mul i32 %a, 2 + %c = mul i32 %a, 2 + store i32 %c, i32* %Cidx, align 4 + + %exitcond = icmp eq i64 %indvars.iv.next, %N + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} |