6 files changed, 154 insertions, 50 deletions
diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
index dba0b362fd8..79110a874d6 100644
--- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -513,8 +513,7 @@ class LoopAccessInfo {
 public:
   LoopAccessInfo(Loop *L, ScalarEvolution *SE, const DataLayout &DL,
                  const TargetLibraryInfo *TLI, AliasAnalysis *AA,
-                 DominatorTree *DT, LoopInfo *LI,
-                 bool SpeculateSymbolicStrides);
+                 DominatorTree *DT, LoopInfo *LI);
 
   /// Return true we can analyze the memory accesses in the loop and there are
   /// no memory dependence cycles.
@@ -585,11 +584,6 @@ public:
   /// \brief Print the information about the memory accesses in the loop.
   void print(raw_ostream &OS, unsigned Depth = 0) const;
 
-  /// \brief Used to ensure that if the analysis was run with speculating the
-  /// value of symbolic strides, the client queries it with the same assumption.
-  /// Only used in DEBUG build but we don't want NDEBUG-dependent ABI.
-  bool SpeculateSymbolicStrides;
-
   /// \brief Checks existence of store to invariant address inside loop.
   /// If the loop has any store to invariant address, then it returns true,
   /// else returns false.
@@ -715,11 +709,8 @@ public:
 
   /// \brief Query the result of the loop access information for the loop \p L.
   ///
-  /// \p SpeculateSymbolicStrides enables symbolic value speculation.  The
-  /// corresponding run-time checks are collected in LAI::PSE.
-  ///
   /// If there is no cached result available run the analysis.
-  const LoopAccessInfo &getInfo(Loop *L, bool SpeculateSymbolicStrides = false);
+  const LoopAccessInfo &getInfo(Loop *L);
 
   void releaseMemory() override {
     // Invalidate the cache when the pass is freed.
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 69e7ce49cb6..44208408b0f 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -65,6 +65,21 @@ static cl::opt<unsigned>
                             "loop-access analysis (default = 100)"),
                    cl::init(100));
 
+/// This enables loop versioning on the strides of memory accesses that have a
+/// symbolic stride, in code like the following.
+///   for (i = 0; i < N; ++i)
+///     A[i * Stride1] += B[i * Stride2] ...
+///
+/// Will be roughly translated to
+///    if (Stride1 == 1 && Stride2 == 1) {
+///      for (i = 0; i < N; i+=4)
+///       A[i:i+3] += ...
+///    } else
+///      ...
+static cl::opt<bool> EnableMemAccessVersioning(
+    "enable-mem-access-versioning", cl::init(true), cl::Hidden,
+    cl::desc("Enable symbolic stride memory access versioning"));
+
 /// \brief Enable store-to-load forwarding conflict detection. This option can
 /// be disabled for correctness testing.
 static cl::opt<bool> EnableForwardingConflictDetection(
@@ -1540,7 +1555,7 @@ void LoopAccessInfo::analyzeLoop() {
         NumLoads++;
         Loads.push_back(Ld);
         DepChecker.addAccess(Ld);
-        if (SpeculateSymbolicStrides)
+        if (EnableMemAccessVersioning)
           collectStridedAccess(Ld);
         continue;
       }
@@ -1564,7 +1579,7 @@ void LoopAccessInfo::analyzeLoop() {
         NumStores++;
         Stores.push_back(St);
         DepChecker.addAccess(St);
-        if (SpeculateSymbolicStrides)
+        if (EnableMemAccessVersioning)
           collectStridedAccess(St);
       }
     } // Next instr.
@@ -1904,11 +1919,9 @@ void LoopAccessInfo::collectStridedAccess(Value *MemAccess) {
 LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE,
                                const DataLayout &DL,
                                const TargetLibraryInfo *TLI, AliasAnalysis *AA,
-                               DominatorTree *DT, LoopInfo *LI,
-                               bool SpeculateSymbolicStrides)
-    : SpeculateSymbolicStrides(SpeculateSymbolicStrides), PSE(*SE, *L),
-      PtrRtChecking(SE), DepChecker(PSE, L), TheLoop(L), DL(DL), TLI(TLI),
-      AA(AA), DT(DT), LI(LI), NumLoads(0), NumStores(0),
+                               DominatorTree *DT, LoopInfo *LI)
+    : PSE(*SE, *L), PtrRtChecking(SE), DepChecker(PSE, L), TheLoop(L), DL(DL),
+      TLI(TLI), AA(AA), DT(DT), LI(LI), NumLoads(0), NumStores(0),
       MaxSafeDepDistBytes(-1U), CanVecMem(false),
       StoreToLoopInvariantAddress(false) {
   if (canAnalyzeLoop())
@@ -1955,19 +1968,12 @@ void LoopAccessInfo::print(raw_ostream &OS, unsigned Depth) const {
   PSE.print(OS, Depth);
 }
 
-const LoopAccessInfo &
-LoopAccessAnalysis::getInfo(Loop *L, bool SpeculateSymbolicStrides) {
+const LoopAccessInfo &LoopAccessAnalysis::getInfo(Loop *L) {
   auto &LAI = LoopAccessInfoMap[L];
 
-#ifndef NDEBUG
-  assert((!LAI || LAI->SpeculateSymbolicStrides == SpeculateSymbolicStrides) &&
-         "Symbolic strides changed for loop");
-#endif
-
   if (!LAI) {
     const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
-    LAI = llvm::make_unique<LoopAccessInfo>(L, SE, DL, TLI, AA, DT, LI,
-                                            SpeculateSymbolicStrides);
+    LAI = llvm::make_unique<LoopAccessInfo>(L, SE, DL, TLI, AA, DT, LI);
   }
   return *LAI.get();
 }
diff --git a/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp b/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp
index 3ac01a75892..b88ca7e717d 100644
--- a/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp
@@ -385,7 +385,7 @@ bool LoopVersioningLICM::legalLoopInstructions() {
         return false;
     }
   // Get LoopAccessInfo from current loop.
-  LAI = &LAA->getInfo(CurLoop, true);
+  LAI = &LAA->getInfo(CurLoop);
   // Check LoopAccessInfo for need of runtime check.
   if (LAI->getRuntimePointerChecking()->getChecks().empty()) {
     DEBUG(dbgs() << "    LAA: Runtime check not found !!\n");
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 0c4605ea4b3..954b5938cf5 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -130,21 +130,6 @@ static cl::opt<bool> MaximizeBandwidth(
     cl::desc("Maximize bandwidth when selecting vectorization factor which "
              "will be determined by the smallest type in loop."));
 
-/// This enables versioning on the strides of symbolically striding memory
-/// accesses in code like the following.
-///   for (i = 0; i < N; ++i)
-///     A[i * Stride1] += B[i * Stride2] ...
-///
-/// Will be roughly translated to
-///    if (Stride1 == 1 && Stride2 == 1) {
-///      for (i = 0; i < N; i+=4)
-///       A[i:i+3] += ...
-///    } else
-///      ...
-static cl::opt<bool> EnableMemAccessVersioning(
-    "enable-mem-access-versioning", cl::init(true), cl::Hidden,
-    cl::desc("Enable symbolic stride memory access versioning"));
-
 static cl::opt<bool> EnableInterleavedMemAccesses(
     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
@@ -4970,7 +4955,7 @@ void LoopVectorizationLegality::collectLoopUniforms() {
 }
 
 bool LoopVectorizationLegality::canVectorizeMemory() {
-  LAI = &LAA->getInfo(TheLoop, EnableMemAccessVersioning);
+  LAI = &LAA->getInfo(TheLoop);
   auto &OptionalReport = LAI->getReport();
   if (OptionalReport)
     emitAnalysis(VectorizationReport(*OptionalReport));
diff --git a/llvm/test/Transforms/LoopDistribute/symbolic-stride.ll b/llvm/test/Transforms/LoopDistribute/symbolic-stride.ll
new file mode 100644
index 00000000000..73d3d19c5dd
--- /dev/null
+++ b/llvm/test/Transforms/LoopDistribute/symbolic-stride.ll
@@ -0,0 +1,65 @@
+; RUN: opt -basicaa -loop-distribute -S < %s | \
+; RUN:     FileCheck %s --check-prefix=ALL --check-prefix=STRIDE_SPEC
+
+; RUN: opt -basicaa -loop-distribute -S -enable-mem-access-versioning=0 < %s | \
+; RUN:     FileCheck %s --check-prefix=ALL --check-prefix=NO_STRIDE_SPEC
+
+; If we don't speculate that the stride is 1, we can't distribute along the
+; marked line because there could be a backward dependence:
+;
+;   for (i = 0; i < n; i++) {
+;     A[i + 1] = A[i] * B[i];
+;     =======================
+;     C[i] = D[i] * A[stride * i];
+;   }
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.10.0"
+
+; ALL-LABEL: @f(
+define void @f(i32* noalias %a,
+               i32* noalias %b,
+               i32* noalias %c,
+               i32* noalias %d,
+               i64 %stride) {
+entry:
+  br label %for.body
+
+; STRIDE_SPEC: %ident.check = icmp ne i64 %stride, 1
+
+; STRIDE_SPEC: for.body.ldist1:
+; NO_STRIDE_SPEC-NOT: for.body.ldist1:
+
+for.body:                                         ; preds = %for.body, %entry
+  %ind = phi i64 [ 0, %entry ], [ %add, %for.body ]
+
+  %arrayidxA = getelementptr inbounds i32, i32* %a, i64 %ind
+  %loadA = load i32, i32* %arrayidxA, align 4
+
+  %arrayidxB = getelementptr inbounds i32, i32* %b, i64 %ind
+  %loadB = load i32, i32* %arrayidxB, align 4
+
+  %mulA = mul i32 %loadB, %loadA
+
+  %add = add nuw nsw i64 %ind, 1
+  %arrayidxA_plus_4 = getelementptr inbounds i32, i32* %a, i64 %add
+  store i32 %mulA, i32* %arrayidxA_plus_4, align 4
+
+  %arrayidxD = getelementptr inbounds i32, i32* %d, i64 %ind
+  %loadD = load i32, i32* %arrayidxD, align 4
+
+  %mul = mul i64 %ind, %stride
+  %arrayidxStridedA = getelementptr inbounds i32, i32* %a, i64 %mul
+  %loadStridedA = load i32, i32* %arrayidxStridedA, align 4
+
+  %mulC = mul i32 %loadD, %loadStridedA
+
+  %arrayidxC = getelementptr inbounds i32, i32* %c, i64 %ind
+  store i32 %mulC, i32* %arrayidxC, align 4
+
+  %exitcond = icmp eq i64 %add, 20
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
diff --git a/llvm/test/Transforms/LoopLoadElim/symbolic-stride.ll b/llvm/test/Transforms/LoopLoadElim/symbolic-stride.ll
index 3e33b3fc173..7a2d1b6c7e3 100644
--- a/llvm/test/Transforms/LoopLoadElim/symbolic-stride.ll
+++ b/llvm/test/Transforms/LoopLoadElim/symbolic-stride.ll
@@ -1,28 +1,44 @@
-; RUN: opt -loop-load-elim -S < %s | FileCheck %s
+; RUN: opt -loop-load-elim -S < %s | \
+; RUN:     FileCheck %s -check-prefix=ALL -check-prefix=ONE_STRIDE_SPEC \
+; RUN:                  -check-prefix=TWO_STRIDE_SPEC
 
-; Forwarding in the presence of symbolic strides is currently not supported:
+; RUN: opt -loop-load-elim -S -enable-mem-access-versioning=0 < %s | \
+; RUN:     FileCheck %s -check-prefix=ALL -check-prefix=NO_ONE_STRIDE_SPEC \
+; RUN:                  -check-prefix=NO_TWO_STRIDE_SPEC
+
+; RUN: opt -loop-load-elim -S -loop-load-elimination-scev-check-threshold=1 < %s | \
+; RUN:     FileCheck %s -check-prefix=ALL -check-prefix=ONE_STRIDE_SPEC \
+; RUN:                  -check-prefix=NO_TWO_STRIDE_SPEC
+
+; Forwarding in the presence of symbolic strides:
 ;
 ;   for (unsigned i = 0; i < 100; i++)
 ;     A[i + 1] = A[Stride * i] + B[i];
 
 target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 
-; CHECK-LABEL: @f(
+; ALL-LABEL: @f(
 define void @f(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i64 %N,
                i64 %stride) {
+
+; ONE_STRIDE_SPEC: %ident.check = icmp ne i64 %stride, 1
+
 entry:
-; CHECK-NOT: %load_initial = load i32, i32* %A
+; NO_ONE_STRIDE_SPEC-NOT: %load_initial = load i32, i32* %A
+; ONE_STRIDE_SPEC: %load_initial = load i32, i32* %A
   br label %for.body
 
 for.body:                                         ; preds = %for.body, %entry
-; CHECK-NOT: %store_forwarded = phi i32 [ %load_initial, {{.*}} ], [ %add, %for.body ]
+; NO_ONE_STRIDE_SPEC-NOT: %store_forwarded = phi i32 [ %load_initial, {{.*}} ], [ %add, %for.body ]
+; ONE_STRIDE_SPEC: %store_forwarded = phi i32 [ %load_initial, {{.*}} ], [ %add, %for.body ]
   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
   %mul = mul i64 %indvars.iv, %stride
   %arrayidx = getelementptr inbounds i32, i32* %A, i64 %mul
   %load = load i32, i32* %arrayidx, align 4
   %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
   %load_1 = load i32, i32* %arrayidx2, align 4
-; CHECK-NOT: %add = add i32 %load_1, %store_forwarded
+; NO_ONE_STRIDE_SPEC-NOT: %add = add i32 %load_1, %store_forwarded
+; ONE_STRIDE_SPEC: %add = add i32 %load_1, %store_forwarded
   %add = add i32 %load_1, %load
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   %arrayidx_next = getelementptr inbounds i32, i32* %A, i64 %indvars.iv.next
@@ -33,3 +49,44 @@ for.body:                                         ; preds = %for.body, %entry
 for.end:                                          ; preds = %for.body
   ret void
 }
+
+; With two symbolic strides:
+;
+;   for (unsigned i = 0; i < N; i++)
+;     A[Stride2 * (i + 1)] = A[Stride1 * i] + B[i];
+
+; ALL-LABEL: @two_strides(
+define void @two_strides(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i64 %N,
+                         i64 %stride.1, i64 %stride.2) {
+
+; TWO_STRIDE_SPEC: %ident.check = icmp ne i64 %stride.2, 1
+; TWO_STRIDE_SPEC: %ident.check1 = icmp ne i64 %stride.1, 1
+; NO_TWO_STRIDE_SPEC-NOT: %ident.check{{.*}} = icmp ne i64 %stride{{.*}}, 1
+
+entry:
+; NO_TWO_STRIDE_SPEC-NOT: %load_initial = load i32, i32* %A
+; TWO_STRIDE_SPEC: %load_initial = load i32, i32* %A
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+; NO_TWO_STRIDE_SPEC-NOT: %store_forwarded = phi i32 [ %load_initial, {{.*}} ], [ %add, %for.body ]
+; TWO_STRIDE_SPEC: %store_forwarded = phi i32 [ %load_initial, {{.*}} ], [ %add, %for.body ]
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %mul = mul i64 %indvars.iv, %stride.1
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %mul
+  %load = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
+  %load_1 = load i32, i32* %arrayidx2, align 4
+; NO_TWO_STRIDE_SPEC-NOT: %add = add i32 %load_1, %store_forwarded
+; TWO_STRIDE_SPEC: %add = add i32 %load_1, %store_forwarded
+  %add = add i32 %load_1, %load
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %mul.2 = mul i64 %indvars.iv.next, %stride.2
+  %arrayidx_next = getelementptr inbounds i32, i32* %A, i64 %mul.2
+  store i32 %add, i32* %arrayidx_next, align 4
+  %exitcond = icmp eq i64 %indvars.iv.next, %N
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}