4 files changed, 506 insertions, 184 deletions
diff --git a/llvm/lib/Transforms/Scalar/LoopPredication.cpp b/llvm/lib/Transforms/Scalar/LoopPredication.cpp
index 9b12ba18044..84577dd182a 100644
--- a/llvm/lib/Transforms/Scalar/LoopPredication.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopPredication.cpp
@@ -34,6 +34,120 @@
 //   else
 //     deoptimize
 //
+// It's tempting to rely on SCEV here, but it has proven to be problematic.
+// Generally the facts SCEV provides about the increment step of add
+// recurrences are true if the backedge of the loop is taken, which implicitly
+// assumes that the guard doesn't fail. Using these facts to optimize the
+// guard results in a circular logic where the guard is optimized under the
+// assumption that it never fails.
+//
+// For example, in the loop below the induction variable will be marked as nuw
+// basing on the guard. Basing on nuw the guard predicate will be considered
+// monotonic. Given a monotonic condition it's tempting to replace the induction
+// variable in the condition with its value on the last iteration. But this
+// transformation is not correct, e.g. e = 4, b = 5 breaks the loop.
+//
+//   for (int i = b; i != e; i++)
+//     guard(i u< len)
+//
+// One of the ways to reason about this problem is to use an inductive proof
+// approach. Given the loop:
+//
+//   if (B(Start)) {
+//     do {
+//       I = PHI(Start, I.INC)
+//       I.INC = I + Step
+//       guard(G(I));
+//     } while (B(I.INC));
+//   }
+//
+// where B(x) and G(x) are predicates that map integers to booleans, we want a
+// loop invariant expression M such the following program has the same semantics
+// as the above:
+//
+//   if (B(Start)) {
+//     do {
+//       I = PHI(Start, I.INC)
+//       I.INC = I + Step
+//       guard(G(Start) && M);
+//     } while (B(I.INC));
+//   }
+//
+// One solution for M is M = forall X . (G(X) && B(X + Step)) => G(X + Step) 
+// 
+// Informal proof that the transformation above is correct:
+//
+//   By the definition of guards we can rewrite the guard condition to:
+//     G(I) && G(Start) && M
+//
+//   Let's prove that for each iteration of the loop:
+//     G(Start) && M => G(I)
+//   And the condition above can be simplified to G(Start) && M.
+// 
+//   Induction base.
+//     G(Start) && M => G(Start)
+//
+//   Induction step. Assuming G(Start) && M => G(I) on the subsequent 
+//   iteration:
+//
+//     B(I + Step) is true because it's the backedge condition.
+//     G(I) is true because the backedge is guarded by this condition.
+//
+//   So M = forall X . (G(X) && B(X + Step)) => G(X + Step) implies
+//   G(I + Step).
+//
+// Note that we can use anything stronger than M, i.e. any condition which
+// implies M.
+//
+// For now the transformation is limited to the following case:
+//   * The loop has a single latch with either ult or slt icmp condition.
+//   * The step of the IV used in the latch condition is 1.
+//   * The IV of the latch condition is the same as the post increment IV of the
+//   guard condition.
+//   * The guard condition is ult.
+//
+// In this case the latch is of the from:
+//   ++i u< latchLimit or ++i s< latchLimit
+// and the guard is of the form:
+//   i u< guardLimit
+//
+// For the unsigned latch comparison case M is:
+//   forall X . X u< guardLimit && (X + 1) u< latchLimit =>
+//      (X + 1) u< guardLimit
+//
+// This is true if latchLimit u<= guardLimit since then
+//   (X + 1) u< latchLimit u<= guardLimit == (X + 1) u< guardLimit.
+//
+// So the widened condition is:
+//   i.start u< guardLimit && latchLimit u<= guardLimit
+//
+// For the signed latch comparison case M is:
+//   forall X . X u< guardLimit && (X + 1) s< latchLimit =>
+//      (X + 1) u< guardLimit
+//
+// The only way the antecedent can be true and the consequent can be false is 
+// if
+//   X == guardLimit - 1
+// (and guardLimit is non-zero, but we won't use this latter fact).
+// If X == guardLimit - 1 then the second half of the antecedent is
+//   guardLimit s< latchLimit
+// and its negation is
+//   latchLimit s<= guardLimit.
+//
+// In other words, if latchLimit s<= guardLimit then:
+// (the ranges below are written in ConstantRange notation, where [A, B) is the
+// set for (I = A; I != B; I++ /*maywrap*/) yield(I);)
+//
+//    forall X . X u< guardLimit && (X + 1) s< latchLimit =>  (X + 1) u< guardLimit
+// == forall X . X u< guardLimit && (X + 1) s< guardLimit =>  (X + 1) u< guardLimit
+// == forall X . X in [0, guardLimit) && (X + 1) in [INT_MIN, guardLimit) =>  (X + 1) in [0, guardLimit)
+// == forall X . X in [0, guardLimit) && X in [INT_MAX, guardLimit-1) =>  X in [-1, guardLimit-1)
+// == forall X . X in [0, guardLimit-1) => X in [-1, guardLimit-1)
+// == true
+//
+// So the widened condition is:
+//   i.start u< guardLimit && latchLimit s<= guardLimit
+//
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/Scalar/LoopPredication.h"
@@ -75,8 +189,16 @@ class LoopPredication {
   Loop *L;
   const DataLayout *DL;
   BasicBlock *Preheader;
+  LoopICmp LatchCheck;
 
-  Optional<LoopICmp> parseLoopICmp(ICmpInst *ICI);
+  Optional<LoopICmp> parseLoopICmp(ICmpInst *ICI) {
+    return parseLoopICmp(ICI->getPredicate(), ICI->getOperand(0),
+                         ICI->getOperand(1));
+  }
+  Optional<LoopICmp> parseLoopICmp(ICmpInst::Predicate Pred, Value *LHS,
+                                   Value *RHS);
+
+  Optional<LoopICmp> parseLoopLatchICmp();
 
   Value *expandCheck(SCEVExpander &Expander, IRBuilder<> &Builder,
                      ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS,
@@ -135,11 +257,8 @@ PreservedAnalyses LoopPredicationPass::run(Loop &L, LoopAnalysisManager &AM,
 }
 
 Optional<LoopPredication::LoopICmp>
-LoopPredication::parseLoopICmp(ICmpInst *ICI) {
-  ICmpInst::Predicate Pred = ICI->getPredicate();
-
-  Value *LHS = ICI->getOperand(0);
-  Value *RHS = ICI->getOperand(1);
+LoopPredication::parseLoopICmp(ICmpInst::Predicate Pred, Value *LHS,
+                               Value *RHS) {
   const SCEV *LHSS = SE->getSCEV(LHS);
   if (isa<SCEVCouldNotCompute>(LHSS))
     return None;
@@ -165,6 +284,8 @@ Value *LoopPredication::expandCheck(SCEVExpander &Expander,
                                     IRBuilder<> &Builder,
                                     ICmpInst::Predicate Pred, const SCEV *LHS,
                                     const SCEV *RHS, Instruction *InsertAt) {
+  // TODO: we can check isLoopEntryGuardedByCond before emitting the check
+ 
   Type *Ty = LHS->getType();
   assert(Ty == RHS->getType() && "expandCheck operands have different types?");
   Value *LHSV = Expander.expandCodeFor(LHS, Ty, InsertAt);
@@ -181,51 +302,54 @@ Optional<Value *> LoopPredication::widenICmpRangeCheck(ICmpInst *ICI,
   DEBUG(dbgs() << "Analyzing ICmpInst condition:\n");
   DEBUG(ICI->dump());
 
+  // parseLoopStructure guarantees that the latch condition is:
+  //   ++i u< latchLimit or ++i s< latchLimit
+  // We are looking for the range checks of the form:
+  //   i u< guardLimit
   auto RangeCheck = parseLoopICmp(ICI);
   if (!RangeCheck) {
     DEBUG(dbgs() << "Failed to parse the loop latch condition!\n");
     return None;
   }
-
-  ICmpInst::Predicate Pred = RangeCheck->Pred;
-  const SCEVAddRecExpr *IndexAR = RangeCheck->IV;
-  const SCEV *RHSS = RangeCheck->Limit;
-
-  auto CanExpand = [this](const SCEV *S) {
-    return SE->isLoopInvariant(S, L) && isSafeToExpand(S, *SE);
-  };
-  if (!CanExpand(RHSS))
+  if (RangeCheck->Pred != ICmpInst::ICMP_ULT) {
+    DEBUG(dbgs() << "Unsupported range check predicate(" << RangeCheck->Pred
+                 << ")!\n");
     return None;
-
-  DEBUG(dbgs() << "IndexAR: ");
-  DEBUG(IndexAR->dump());
-
-  bool IsIncreasing = false;
-  if (!SE->isMonotonicPredicate(IndexAR, Pred, IsIncreasing))
-    return None;
-
-  // If the predicate is increasing the condition can change from false to true
-  // as the loop progresses, in this case take the value on the first iteration
-  // for the widened check. Otherwise the condition can change from true to
-  // false as the loop progresses, so take the value on the last iteration.
-  const SCEV *NewLHSS = IsIncreasing
-                            ? IndexAR->getStart()
-                            : SE->getSCEVAtScope(IndexAR, L->getParentLoop());
-  if (NewLHSS == IndexAR) {
-    DEBUG(dbgs() << "Can't compute NewLHSS!\n");
+  }
+  auto *RangeCheckIV = RangeCheck->IV;
+  auto *PostIncRangeCheckIV = RangeCheckIV->getPostIncExpr(*SE);
+  if (LatchCheck.IV != PostIncRangeCheckIV) {
+    DEBUG(dbgs() << "Post increment range check IV (" << *PostIncRangeCheckIV
+                 << ") is not the same as latch IV (" << *LatchCheck.IV
+                 << ")!\n");
     return None;
   }
+  assert(RangeCheckIV->getStepRecurrence(*SE)->isOne() && "must be one");
+  const SCEV *Start = RangeCheckIV->getStart();
 
-  DEBUG(dbgs() << "NewLHSS: ");
-  DEBUG(NewLHSS->dump());
+  // Generate the widened condition. See the file header comment for reasoning.
+  // If the latch condition is unsigned:
+  //   i.start u< guardLimit && latchLimit u<= guardLimit
+  // If the latch condition is signed:
+  //   i.start u< guardLimit && latchLimit s<= guardLimit
 
-  if (!CanExpand(NewLHSS))
-    return None;
+  auto LimitCheckPred = ICmpInst::isSigned(LatchCheck.Pred)
+                                           ? ICmpInst::ICMP_SLE
+                                           : ICmpInst::ICMP_ULE;
 
-  DEBUG(dbgs() << "NewLHSS is loop invariant and safe to expand. Expand!\n");
+  auto CanExpand = [this](const SCEV *S) {
+    return SE->isLoopInvariant(S, L) && isSafeToExpand(S, *SE);
+  };
+  if (!CanExpand(Start) || !CanExpand(LatchCheck.Limit) ||
+      !CanExpand(RangeCheck->Limit))
+    return None;
 
   Instruction *InsertAt = Preheader->getTerminator();
-  return expandCheck(Expander, Builder, Pred, NewLHSS, RHSS, InsertAt);
+  auto *FirstIterationCheck = expandCheck(Expander, Builder, RangeCheck->Pred,
+                                          Start, RangeCheck->Limit, InsertAt);
+  auto *LimitCheck = expandCheck(Expander, Builder, LimitCheckPred,
+                                 LatchCheck.Limit, RangeCheck->Limit, InsertAt);
+  return Builder.CreateAnd(FirstIterationCheck, LimitCheck);
 }
 
 bool LoopPredication::widenGuardConditions(IntrinsicInst *Guard,
@@ -288,6 +412,59 @@ bool LoopPredication::widenGuardConditions(IntrinsicInst *Guard,
   return true;
 }
 
+Optional<LoopPredication::LoopICmp> LoopPredication::parseLoopLatchICmp() {
+  using namespace PatternMatch;
+
+  BasicBlock *LoopLatch = L->getLoopLatch();
+  if (!LoopLatch) {
+    DEBUG(dbgs() << "The loop doesn't have a single latch!\n");
+    return None;
+  }
+
+  ICmpInst::Predicate Pred;
+  Value *LHS, *RHS;
+  BasicBlock *TrueDest, *FalseDest;
+
+  if (!match(LoopLatch->getTerminator(),
+             m_Br(m_ICmp(Pred, m_Value(LHS), m_Value(RHS)), TrueDest,
+                  FalseDest))) {
+    DEBUG(dbgs() << "Failed to match the latch terminator!\n");
+    return None;
+  }
+  assert((TrueDest == L->getHeader() || FalseDest == L->getHeader()) &&
+         "One of the latch's destinations must be the header");
+  if (TrueDest != L->getHeader())
+    Pred = ICmpInst::getInversePredicate(Pred);
+
+  auto Result = parseLoopICmp(Pred, LHS, RHS);
+  if (!Result) {
+    DEBUG(dbgs() << "Failed to parse the loop latch condition!\n");
+    return None;
+  }
+
+  if (Result->Pred != ICmpInst::ICMP_ULT &&
+      Result->Pred != ICmpInst::ICMP_SLT) {
+    DEBUG(dbgs() << "Unsupported loop latch predicate(" << Result->Pred
+                 << ")!\n");
+    return None;
+  }
+
+  // Check affine first, so if it's not we don't try to compute the step
+  // recurrence.
+  if (!Result->IV->isAffine()) {
+    DEBUG(dbgs() << "The induction variable is not affine!\n");
+    return None;
+  }
+
+  auto *Step = Result->IV->getStepRecurrence(*SE);
+  if (!Step->isOne()) {
+    DEBUG(dbgs() << "Unsupported loop stride(" << *Step << ")!\n");
+    return None;
+  }
+
+  return Result;
+}
+
 bool LoopPredication::runOnLoop(Loop *Loop) {
   L = Loop;
 
@@ -308,6 +485,11 @@ bool LoopPredication::runOnLoop(Loop *Loop) {
   if (!Preheader)
     return false;
 
+  auto LatchCheckOpt = parseLoopLatchICmp();
+  if (!LatchCheckOpt)
+    return false;
+  LatchCheck = *LatchCheckOpt;
+
   // Collect all the guards into a vector and process later, so as not
   // to invalidate the instruction iterator.
   SmallVector<IntrinsicInst *, 4> Guards;
diff --git a/llvm/test/Transforms/LoopPredication/basic.ll b/llvm/test/Transforms/LoopPredication/basic.ll
index 6ce07819cb0..a4b4e742a10 100644
--- a/llvm/test/Transforms/LoopPredication/basic.ll
+++ b/llvm/test/Transforms/LoopPredication/basic.ll
@@ -11,8 +11,9 @@ entry:
 
 loop.preheader:
 ; CHECK: loop.preheader:
-; CHECK: [[max_index:[^ ]+]] = add i32 %n, -1
-; CHECK-NEXT: [[wide_cond:[^ ]+]] = icmp ult i32 [[max_index]], %length
+; CHECK: [[first_iteration_check:[^ ]+]] = icmp ult i32 0, %length
+; CHECK-NEXT: [[limit_check:[^ ]+]] = icmp ule i32 %n, %length
+; CHECK-NEXT: [[wide_cond:[^ ]+]] = and i1 [[first_iteration_check]], [[limit_check]]
 ; CHECK-NEXT: br label %loop
   br label %loop
 
@@ -46,8 +47,9 @@ entry:
 
 loop.preheader:
 ; CHECK: loop.preheader:
-; CHECK: [[max_index:[^ ]+]] = add i32 %n, -1
-; CHECK-NEXT: [[wide_cond:[^ ]+]] = icmp ult i32 [[max_index]], %length
+; CHECK: [[first_iteration_check:[^ ]+]] = icmp ult i32 0, %length
+; CHECK-NEXT: [[limit_check:[^ ]+]] = icmp ule i32 %n, %length
+; CHECK-NEXT: [[wide_cond:[^ ]+]] = and i1 [[first_iteration_check]], [[limit_check]]
 ; CHECK-NEXT: br label %loop
   br label %loop
 
@@ -73,44 +75,35 @@ exit:
   ret i32 %result
 }
 
-
-define i32 @two_range_checks(i32* %array.1, i32 %length.1,
-                             i32* %array.2, i32 %length.2, i32 %n) {
-; CHECK-LABEL: @two_range_checks
+define i32 @signed_loop_0_to_n_ult_check(i32* %array, i32 %length, i32 %n) {
+; CHECK-LABEL: @signed_loop_0_to_n_ult_check
 entry:
-  %tmp5 = icmp eq i32 %n, 0
+  %tmp5 = icmp sle i32 %n, 0
   br i1 %tmp5, label %exit, label %loop.preheader
 
 loop.preheader:
 ; CHECK: loop.preheader:
-; CHECK: [[max_index:[^ ]+]] = add i32 %n, -1
-; CHECK-NEXT: [[wide_cond_1:[^ ]+]] = icmp ult i32 [[max_index]], %length.{{1|2}}
-; CHECK-NEXT: [[wide_cond_2:[^ ]+]] = icmp ult i32 [[max_index]], %length.{{1|2}}
+; CHECK: [[first_iteration_check:[^ ]+]] = icmp ult i32 0, %length
+; CHECK-NEXT: [[limit_check:[^ ]+]] = icmp sle i32 %n, %length
+; CHECK-NEXT: [[wide_cond:[^ ]+]] = and i1 [[first_iteration_check]], [[limit_check]]
 ; CHECK-NEXT: br label %loop
   br label %loop
 
 loop:
 ; CHECK: loop:
-; CHECK: [[wide_cond:[^ ]+]] = and i1 [[wide_cond_1]], [[wide_cond_2]]
 ; CHECK: call void (i1, ...) @llvm.experimental.guard(i1 [[wide_cond]], i32 9) [ "deopt"() ]
   %loop.acc = phi i32 [ %loop.acc.next, %loop ], [ 0, %loop.preheader ]
   %i = phi i32 [ %i.next, %loop ], [ 0, %loop.preheader ]
-  %within.bounds.1 = icmp ult i32 %i, %length.1
-  %within.bounds.2 = icmp ult i32 %i, %length.2
-  %within.bounds = and i1 %within.bounds.1, %within.bounds.2
+  %within.bounds = icmp ult i32 %i, %length
   call void (i1, ...) @llvm.experimental.guard(i1 %within.bounds, i32 9) [ "deopt"() ]
 
   %i.i64 = zext i32 %i to i64
-  %array.1.i.ptr = getelementptr inbounds i32, i32* %array.1, i64 %i.i64
-  %array.1.i = load i32, i32* %array.1.i.ptr, align 4
-  %loop.acc.1 = add i32 %loop.acc, %array.1.i
-
-  %array.2.i.ptr = getelementptr inbounds i32, i32* %array.2, i64 %i.i64
-  %array.2.i = load i32, i32* %array.2.i.ptr, align 4
-  %loop.acc.next = add i32 %loop.acc.1, %array.2.i
+  %array.i.ptr = getelementptr inbounds i32, i32* %array, i64 %i.i64
+  %array.i = load i32, i32* %array.i.ptr, align 4
+  %loop.acc.next = add i32 %loop.acc, %array.i
 
   %i.next = add nuw i32 %i, 1
-  %continue = icmp ult i32 %i.next, %n
+  %continue = icmp slt i32 %i.next, %n
   br i1 %continue, label %loop, label %exit
 
 exit:
@@ -118,52 +111,33 @@ exit:
   ret i32 %result
 }
 
-define i32 @three_range_checks(i32* %array.1, i32 %length.1,
-                               i32* %array.2, i32 %length.2,
-                               i32* %array.3, i32 %length.3, i32 %n) {
-; CHECK-LABEL: @three_range_checks
+define i32 @unsupported_latch_pred_loop_0_to_n(i32* %array, i32 %length, i32 %n) {
+; CHECK-LABEL: @unsupported_latch_pred_loop_0_to_n
 entry:
-  %tmp5 = icmp eq i32 %n, 0
+  %tmp5 = icmp sle i32 %n, 0
   br i1 %tmp5, label %exit, label %loop.preheader
 
 loop.preheader:
 ; CHECK: loop.preheader:
-; CHECK: [[max_index:[^ ]+]] = add i32 %n, -1
-; CHECK-NEXT: [[wide_cond_1:[^ ]+]] = icmp ult i32 [[max_index]], %length.{{1|2|3}}
-; CHECK-NEXT: [[wide_cond_2:[^ ]+]] = icmp ult i32 [[max_index]], %length.{{1|2|3}}
-; CHECK-NEXT: [[wide_cond_3:[^ ]+]] = icmp ult i32 [[max_index]], %length.{{1|2|3}}
 ; CHECK-NEXT: br label %loop
   br label %loop
 
 loop:
 ; CHECK: loop:
-; CHECK: [[wide_cond_and:[^ ]+]] = and i1 [[wide_cond_1]], [[wide_cond_2]]
-; CHECK-NEXT: [[wide_cond:[^ ]+]] = and i1 [[wide_cond_and]], [[wide_cond_3]]
-; CHECK: call void (i1, ...) @llvm.experimental.guard(i1 [[wide_cond]], i32 9) [ "deopt"() ]
+; CHECK: %within.bounds = icmp ult i32 %i, %length
+; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 %within.bounds, i32 9) [ "deopt"() ]
   %loop.acc = phi i32 [ %loop.acc.next, %loop ], [ 0, %loop.preheader ]
   %i = phi i32 [ %i.next, %loop ], [ 0, %loop.preheader ]
-  %within.bounds.1 = icmp ult i32 %i, %length.1
-  %within.bounds.2 = icmp ult i32 %i, %length.2
-  %within.bounds.3 = icmp ult i32 %i, %length.3
-  %within.bounds.1.and.2 = and i1 %within.bounds.1, %within.bounds.2
-  %within.bounds = and i1 %within.bounds.1.and.2, %within.bounds.3
+  %within.bounds = icmp ult i32 %i, %length
   call void (i1, ...) @llvm.experimental.guard(i1 %within.bounds, i32 9) [ "deopt"() ]
 
   %i.i64 = zext i32 %i to i64
-  %array.1.i.ptr = getelementptr inbounds i32, i32* %array.1, i64 %i.i64
-  %array.1.i = load i32, i32* %array.1.i.ptr, align 4
-  %loop.acc.1 = add i32 %loop.acc, %array.1.i
-
-  %array.2.i.ptr = getelementptr inbounds i32, i32* %array.2, i64 %i.i64
-  %array.2.i = load i32, i32* %array.2.i.ptr, align 4
-  %loop.acc.2 = add i32 %loop.acc.1, %array.2.i
-
-  %array.3.i.ptr = getelementptr inbounds i32, i32* %array.3, i64 %i.i64
-  %array.3.i = load i32, i32* %array.3.i.ptr, align 4
-  %loop.acc.next = add i32 %loop.acc.2, %array.3.i
+  %array.i.ptr = getelementptr inbounds i32, i32* %array, i64 %i.i64
+  %array.i = load i32, i32* %array.i.ptr, align 4
+  %loop.acc.next = add i32 %loop.acc, %array.i
 
-  %i.next = add nuw i32 %i, 1
-  %continue = icmp ult i32 %i.next, %n
+  %i.next = add nsw i32 %i, 1
+  %continue = icmp ne i32 %i.next, %n
   br i1 %continue, label %loop, label %exit
 
 exit:
@@ -171,56 +145,33 @@ exit:
   ret i32 %result
 }
 
-define i32 @three_guards(i32* %array.1, i32 %length.1,
-                         i32* %array.2, i32 %length.2,
-                         i32* %array.3, i32 %length.3, i32 %n) {
-; CHECK-LABEL: @three_guards
+define i32 @signed_loop_0_to_n_unsupported_iv_step(i32* %array, i32 %length, i32 %n) {
+; CHECK-LABEL: @signed_loop_0_to_n_unsupported_iv_step
 entry:
-  %tmp5 = icmp eq i32 %n, 0
+  %tmp5 = icmp sle i32 %n, 0
   br i1 %tmp5, label %exit, label %loop.preheader
 
 loop.preheader:
 ; CHECK: loop.preheader:
-; CHECK: [[max_index:[^ ]+]] = add i32 %n, -1
-; CHECK-NEXT: [[wide_cond_1:[^ ]+]] = icmp ult i32 [[max_index]], %length.1
-; CHECK-NEXT: [[wide_cond_2:[^ ]+]] = icmp ult i32 [[max_index]], %length.2
-; CHECK-NEXT: [[wide_cond_3:[^ ]+]] = icmp ult i32 [[max_index]], %length.3
 ; CHECK-NEXT: br label %loop
   br label %loop
 
 loop:
 ; CHECK: loop:
-; CHECK: call void (i1, ...) @llvm.experimental.guard(i1 [[wide_cond_1]], i32 9) [ "deopt"() ]
-; CHECK: call void (i1, ...) @llvm.experimental.guard(i1 [[wide_cond_2]], i32 9) [ "deopt"() ]
-; CHECK: call void (i1, ...) @llvm.experimental.guard(i1 [[wide_cond_3]], i32 9) [ "deopt"() ]
-
+; CHECK: %within.bounds = icmp ult i32 %i, %length
+; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 %within.bounds, i32 9) [ "deopt"() ]
   %loop.acc = phi i32 [ %loop.acc.next, %loop ], [ 0, %loop.preheader ]
   %i = phi i32 [ %i.next, %loop ], [ 0, %loop.preheader ]
-
-  %within.bounds.1 = icmp ult i32 %i, %length.1
-  call void (i1, ...) @llvm.experimental.guard(i1 %within.bounds.1, i32 9) [ "deopt"() ]
+  %within.bounds = icmp ult i32 %i, %length
+  call void (i1, ...) @llvm.experimental.guard(i1 %within.bounds, i32 9) [ "deopt"() ]
 
   %i.i64 = zext i32 %i to i64
-  %array.1.i.ptr = getelementptr inbounds i32, i32* %array.1, i64 %i.i64
-  %array.1.i = load i32, i32* %array.1.i.ptr, align 4
-  %loop.acc.1 = add i32 %loop.acc, %array.1.i
-
-  %within.bounds.2 = icmp ult i32 %i, %length.2
-  call void (i1, ...) @llvm.experimental.guard(i1 %within.bounds.2, i32 9) [ "deopt"() ]
-
-  %array.2.i.ptr = getelementptr inbounds i32, i32* %array.2, i64 %i.i64
-  %array.2.i = load i32, i32* %array.2.i.ptr, align 4
-  %loop.acc.2 = add i32 %loop.acc.1, %array.2.i
-
-  %within.bounds.3 = icmp ult i32 %i, %length.3
-  call void (i1, ...) @llvm.experimental.guard(i1 %within.bounds.3, i32 9) [ "deopt"() ]
-
-  %array.3.i.ptr = getelementptr inbounds i32, i32* %array.3, i64 %i.i64
-  %array.3.i = load i32, i32* %array.3.i.ptr, align 4
-  %loop.acc.next = add i32 %loop.acc.2, %array.3.i
+  %array.i.ptr = getelementptr inbounds i32, i32* %array, i64 %i.i64
+  %array.i = load i32, i32* %array.i.ptr, align 4
+  %loop.acc.next = add i32 %loop.acc, %array.i
 
-  %i.next = add nuw i32 %i, 1
-  %continue = icmp ult i32 %i.next, %n
+  %i.next = add nsw i32 %i, 2
+  %continue = icmp slt i32 %i.next, %n
   br i1 %continue, label %loop, label %exit
 
 exit:
@@ -228,15 +179,17 @@ exit:
   ret i32 %result
 }
 
-define i32 @signed_loop_start_to_n_sge_0_check(i32* %array, i32 %length, i32 %start, i32 %n) {
-; CHECK-LABEL: @signed_loop_start_to_n_sge_0_check
+define i32 @signed_loop_0_to_n_equal_iv_range_check(i32* %array, i32 %length, i32 %n) {
+; CHECK-LABEL: @signed_loop_0_to_n_equal_iv_range_check
 entry:
-  %tmp5 = icmp eq i32 %n, 0
+  %tmp5 = icmp sle i32 %n, 0
   br i1 %tmp5, label %exit, label %loop.preheader
 
 loop.preheader:
 ; CHECK: loop.preheader:
-; CHECK-NEXT: [[wide_cond:[^ ]+]] = icmp sge i32 %start, 0
+; CHECK: [[first_iteration_check:[^ ]+]] = icmp ult i32 0, %length
+; CHECK-NEXT: [[limit_check:[^ ]+]] = icmp sle i32 %n, %length
+; CHECK-NEXT: [[wide_cond:[^ ]+]] = and i1 [[first_iteration_check]], [[limit_check]]
 ; CHECK-NEXT: br label %loop
   br label %loop
 
@@ -244,8 +197,10 @@ loop:
 ; CHECK: loop:
 ; CHECK: call void (i1, ...) @llvm.experimental.guard(i1 [[wide_cond]], i32 9) [ "deopt"() ]
   %loop.acc = phi i32 [ %loop.acc.next, %loop ], [ 0, %loop.preheader ]
-  %i = phi i32 [ %i.next, %loop ], [ %start, %loop.preheader ]
-  %within.bounds = icmp sge i32 %i, 0
+  %i = phi i32 [ %i.next, %loop ], [ 0, %loop.preheader ]
+  %j = phi i32 [ %j.next, %loop ], [ 0, %loop.preheader ]
+
+  %within.bounds = icmp ult i32 %j, %length
   call void (i1, ...) @llvm.experimental.guard(i1 %within.bounds, i32 9) [ "deopt"() ]
 
   %i.i64 = zext i32 %i to i64
@@ -253,6 +208,7 @@ loop:
   %array.i = load i32, i32* %array.i.ptr, align 4
   %loop.acc.next = add i32 %loop.acc, %array.i
 
+  %j.next = add nsw i32 %j, 1
   %i.next = add nsw i32 %i, 1
   %continue = icmp slt i32 %i.next, %n
   br i1 %continue, label %loop, label %exit
@@ -262,28 +218,26 @@ exit:
   ret i32 %result
 }
 
-define i32 @signed_loop_start_to_n_upper_slt_length_check(i32* %array, i32 %length, i32 %start, i32 %n) {
-; CHECK-LABEL: @signed_loop_start_to_n_upper_slt_length_check
+define i32 @signed_loop_0_to_n_unrelated_iv_range_check(i32* %array, i32 %start, i32 %length, i32 %n) {
+; CHECK-LABEL: @signed_loop_0_to_n_unrelated_iv_range_check
 entry:
   %tmp5 = icmp sle i32 %n, 0
   br i1 %tmp5, label %exit, label %loop.preheader
 
 loop.preheader:
 ; CHECK: loop.preheader:
-; CHECK: [[start_1:[^ ]+]] = add i32 %start, 1
-; CHECK-NEXT: [[n_sgt_start_1:[^ ]+]] = icmp sgt i32 %n, [[start_1]]
-; CHECK-NEXT: [[smax:[^ ]+]] = select i1 [[n_sgt_start_1]], i32 %n, i32 [[start_1]]
-; CHECK-NEXT: [[max_index:[^ ]+]] = add i32 [[smax]], -1
-; CHECK-NEXT: [[wide_cond:[^ ]+]] = icmp slt i32 [[max_index]], %length
 ; CHECK-NEXT: br label %loop
   br label %loop
 
 loop:
 ; CHECK: loop:
-; CHECK: call void (i1, ...) @llvm.experimental.guard(i1 [[wide_cond]], i32 9) [ "deopt"() ]
+; CHECK: %within.bounds = icmp ult i32 %j, %length
+; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 %within.bounds, i32 9) [ "deopt"() ]
   %loop.acc = phi i32 [ %loop.acc.next, %loop ], [ 0, %loop.preheader ]
-  %i = phi i32 [ %i.next, %loop ], [ %start, %loop.preheader ]
-  %within.bounds = icmp slt i32 %i, %length
+  %i = phi i32 [ %i.next, %loop ], [ 0, %loop.preheader ]
+  %j = phi i32 [ %j.next, %loop ], [ %start, %loop.preheader ]
+
+  %within.bounds = icmp ult i32 %j, %length
   call void (i1, ...) @llvm.experimental.guard(i1 %within.bounds, i32 9) [ "deopt"() ]
 
   %i.i64 = zext i32 %i to i64
@@ -291,6 +245,7 @@ loop:
   %array.i = load i32, i32* %array.i.ptr, align 4
   %loop.acc.next = add i32 %loop.acc, %array.i
 
+  %j.next = add nsw i32 %j, 1
   %i.next = add nsw i32 %i, 1
   %continue = icmp slt i32 %i.next, %n
   br i1 %continue, label %loop, label %exit
@@ -300,41 +255,166 @@ exit:
   ret i32 %result
 }
 
-define i32 @signed_loop_start_to_n_both_checks(i32* %array, i32 %length, i32 %start, i32 %n) {
-; CHECK-LABEL: @signed_loop_start_to_n_both_checks
+define i32 @two_range_checks(i32* %array.1, i32 %length.1,
+                             i32* %array.2, i32 %length.2, i32 %n) {
+; CHECK-LABEL: @two_range_checks
 entry:
-  %tmp5 = icmp sle i32 %n, 0
+  %tmp5 = icmp eq i32 %n, 0
   br i1 %tmp5, label %exit, label %loop.preheader
 
 loop.preheader:
 ; CHECK: loop.preheader:
-; CHECK: [[lower_check:[^ ]+]] = icmp sge i32 %start, 0
-; CHECK-NEXT: [[start_1:[^ ]+]] = add i32 %start, 1
-; CHECK-NEXT: [[n_sgt_start_1:[^ ]+]] = icmp sgt i32 %n, [[start_1]]
-; CHECK-NEXT: [[smax:[^ ]+]] = select i1 [[n_sgt_start_1]], i32 %n, i32 [[start_1]]
-; CHECK-NEXT: [[max_index:[^ ]+]] = add i32 [[smax]], -1
-; CHECK-NEXT: [[upper_check:[^ ]+]] = icmp slt i32 [[max_index]], %length
+; CHECK: [[first_iteration_check_1:[^ ]+]] = icmp ult i32 0, %length.{{1|2}}
+; CHECK-NEXT: [[limit_check_1:[^ ]+]] = icmp ule i32 %n, %length.{{1|2}}
+; CHECK-NEXT: [[wide_cond_1:[^ ]+]] = and i1 [[first_iteration_check_1]], [[limit_check_1]]
+; CHECK-NEXT: [[first_iteration_check_2:[^ ]+]] = icmp ult i32 0, %length.{{1|2}}
+; CHECK-NEXT: [[limit_check_2:[^ ]+]] = icmp ule i32 %n, %length.{{1|2}}
+; CHECK-NEXT: [[wide_cond_2:[^ ]+]] = and i1 [[first_iteration_check_2]], [[limit_check_2]]
 ; CHECK-NEXT: br label %loop
   br label %loop
 
 loop:
 ; CHECK: loop:
-; CHECK: [[wide_cond:[^ ]+]] = and i1 [[lower_check]], [[upper_check]]
-; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 [[wide_cond]], i32 9) [ "deopt"() ]
+; CHECK: [[wide_cond:[^ ]+]] = and i1 [[wide_cond_1]], [[wide_cond_2]]
+; CHECK: call void (i1, ...) @llvm.experimental.guard(i1 [[wide_cond]], i32 9) [ "deopt"() ]
   %loop.acc = phi i32 [ %loop.acc.next, %loop ], [ 0, %loop.preheader ]
-  %i = phi i32 [ %i.next, %loop ], [ %start, %loop.preheader ]
-  %within.bounds.1 = icmp slt i32 %i, %length
-  %within.bounds.2 = icmp sge i32 %i, 0
+  %i = phi i32 [ %i.next, %loop ], [ 0, %loop.preheader ]
+  %within.bounds.1 = icmp ult i32 %i, %length.1
+  %within.bounds.2 = icmp ult i32 %i, %length.2
   %within.bounds = and i1 %within.bounds.1, %within.bounds.2
   call void (i1, ...) @llvm.experimental.guard(i1 %within.bounds, i32 9) [ "deopt"() ]
 
   %i.i64 = zext i32 %i to i64
-  %array.i.ptr = getelementptr inbounds i32, i32* %array, i64 %i.i64
-  %array.i = load i32, i32* %array.i.ptr, align 4
-  %loop.acc.next = add i32 %loop.acc, %array.i
+  %array.1.i.ptr = getelementptr inbounds i32, i32* %array.1, i64 %i.i64
+  %array.1.i = load i32, i32* %array.1.i.ptr, align 4
+  %loop.acc.1 = add i32 %loop.acc, %array.1.i
 
-  %i.next = add nsw i32 %i, 1
-  %continue = icmp slt i32 %i.next, %n
+  %array.2.i.ptr = getelementptr inbounds i32, i32* %array.2, i64 %i.i64
+  %array.2.i = load i32, i32* %array.2.i.ptr, align 4
+  %loop.acc.next = add i32 %loop.acc.1, %array.2.i
+
+  %i.next = add nuw i32 %i, 1
+  %continue = icmp ult i32 %i.next, %n
+  br i1 %continue, label %loop, label %exit
+
+exit:
+  %result = phi i32 [ 0, %entry ], [ %loop.acc.next, %loop ]
+  ret i32 %result
+}
+
+define i32 @three_range_checks(i32* %array.1, i32 %length.1,
+                               i32* %array.2, i32 %length.2,
+                               i32* %array.3, i32 %length.3, i32 %n) {
+; CHECK-LABEL: @three_range_checks
+entry:
+  %tmp5 = icmp eq i32 %n, 0
+  br i1 %tmp5, label %exit, label %loop.preheader
+
+loop.preheader:
+; CHECK: loop.preheader:
+; CHECK: [[first_iteration_check_1:[^ ]+]] = icmp ult i32 0, %length.{{1|2|3}}
+; CHECK-NEXT: [[limit_check_1:[^ ]+]] = icmp ule i32 %n, %length.{{1|2|3}}
+; CHECK-NEXT: [[wide_cond_1:[^ ]+]] = and i1 [[first_iteration_check_1]], [[limit_check_1]]
+; CHECK-NEXT: [[first_iteration_check_2:[^ ]+]] = icmp ult i32 0, %length.{{1|2|3}}
+; CHECK-NEXT: [[limit_check_2:[^ ]+]] = icmp ule i32 %n, %length.{{1|2|3}}
+; CHECK-NEXT: [[wide_cond_2:[^ ]+]] = and i1 [[first_iteration_check_2]], [[limit_check_2]]
+; CHECK-NEXT: [[first_iteration_check_3:[^ ]+]] = icmp ult i32 0, %length.{{1|2|3}}
+; CHECK-NEXT: [[limit_check_3:[^ ]+]] = icmp ule i32 %n, %length.{{1|2|3}}
+; CHECK-NEXT: [[wide_cond_3:[^ ]+]] = and i1 [[first_iteration_check_3]], [[limit_check_3]]
+; CHECK-NEXT: br label %loop
+  br label %loop
+
+loop:
+; CHECK: loop:
+; CHECK: [[wide_cond_and:[^ ]+]] = and i1 [[wide_cond_1]], [[wide_cond_2]]
+; CHECK-NEXT: [[wide_cond:[^ ]+]] = and i1 [[wide_cond_and]], [[wide_cond_3]]
+; CHECK: call void (i1, ...) @llvm.experimental.guard(i1 [[wide_cond]], i32 9) [ "deopt"() ]
+  %loop.acc = phi i32 [ %loop.acc.next, %loop ], [ 0, %loop.preheader ]
+  %i = phi i32 [ %i.next, %loop ], [ 0, %loop.preheader ]
+  %within.bounds.1 = icmp ult i32 %i, %length.1
+  %within.bounds.2 = icmp ult i32 %i, %length.2
+  %within.bounds.3 = icmp ult i32 %i, %length.3
+  %within.bounds.1.and.2 = and i1 %within.bounds.1, %within.bounds.2
+  %within.bounds = and i1 %within.bounds.1.and.2, %within.bounds.3
+  call void (i1, ...) @llvm.experimental.guard(i1 %within.bounds, i32 9) [ "deopt"() ]
+
+  %i.i64 = zext i32 %i to i64
+  %array.1.i.ptr = getelementptr inbounds i32, i32* %array.1, i64 %i.i64
+  %array.1.i = load i32, i32* %array.1.i.ptr, align 4
+  %loop.acc.1 = add i32 %loop.acc, %array.1.i
+
+  %array.2.i.ptr = getelementptr inbounds i32, i32* %array.2, i64 %i.i64
+  %array.2.i = load i32, i32* %array.2.i.ptr, align 4
+  %loop.acc.2 = add i32 %loop.acc.1, %array.2.i
+
+  %array.3.i.ptr = getelementptr inbounds i32, i32* %array.3, i64 %i.i64
+  %array.3.i = load i32, i32* %array.3.i.ptr, align 4
+  %loop.acc.next = add i32 %loop.acc.2, %array.3.i
+
+  %i.next = add nuw i32 %i, 1
+  %continue = icmp ult i32 %i.next, %n
+  br i1 %continue, label %loop, label %exit
+
+exit:
+  %result = phi i32 [ 0, %entry ], [ %loop.acc.next, %loop ]
+  ret i32 %result
+}
+
+define i32 @three_guards(i32* %array.1, i32 %length.1,
+                         i32* %array.2, i32 %length.2,
+                         i32* %array.3, i32 %length.3, i32 %n) {
+; CHECK-LABEL: @three_guards
+entry:
+  %tmp5 = icmp eq i32 %n, 0
+  br i1 %tmp5, label %exit, label %loop.preheader
+
+loop.preheader:
+; CHECK: loop.preheader:
+; CHECK: [[first_iteration_check_1:[^ ]+]] = icmp ult i32 0, %length.{{1|2|3}}
+; CHECK-NEXT: [[limit_check_1:[^ ]+]] = icmp ule i32 %n, %length.{{1|2|3}}
+; CHECK-NEXT: [[wide_cond_1:[^ ]+]] = and i1 [[first_iteration_check_1]], [[limit_check_1]]
+; CHECK-NEXT: [[first_iteration_check_2:[^ ]+]] = icmp ult i32 0, %length.{{1|2|3}}
+; CHECK-NEXT: [[limit_check_2:[^ ]+]] = icmp ule i32 %n, %length.{{1|2|3}}
+; CHECK-NEXT: [[wide_cond_2:[^ ]+]] = and i1 [[first_iteration_check_2]], [[limit_check_2]]
+; CHECK-NEXT: [[first_iteration_check_3:[^ ]+]] = icmp ult i32 0, %length.{{1|2|3}}
+; CHECK-NEXT: [[limit_check_3:[^ ]+]] = icmp ule i32 %n, %length.{{1|2|3}}
+; CHECK-NEXT: [[wide_cond_3:[^ ]+]] = and i1 [[first_iteration_check_3]], [[limit_check_3]]
+; CHECK-NEXT: br label %loop
+  br label %loop
+
+loop:
+; CHECK: loop:
+; CHECK: call void (i1, ...) @llvm.experimental.guard(i1 [[wide_cond_1]], i32 9) [ "deopt"() ]
+; CHECK: call void (i1, ...) @llvm.experimental.guard(i1 [[wide_cond_2]], i32 9) [ "deopt"() ]
+; CHECK: call void (i1, ...) @llvm.experimental.guard(i1 [[wide_cond_3]], i32 9) [ "deopt"() ]
+
+  %loop.acc = phi i32 [ %loop.acc.next, %loop ], [ 0, %loop.preheader ]
+  %i = phi i32 [ %i.next, %loop ], [ 0, %loop.preheader ]
+
+  %within.bounds.1 = icmp ult i32 %i, %length.1
+  call void (i1, ...) @llvm.experimental.guard(i1 %within.bounds.1, i32 9) [ "deopt"() ]
+
+  %i.i64 = zext i32 %i to i64
+  %array.1.i.ptr = getelementptr inbounds i32, i32* %array.1, i64 %i.i64
+  %array.1.i = load i32, i32* %array.1.i.ptr, align 4
+  %loop.acc.1 = add i32 %loop.acc, %array.1.i
+
+  %within.bounds.2 = icmp ult i32 %i, %length.2
+  call void (i1, ...) @llvm.experimental.guard(i1 %within.bounds.2, i32 9) [ "deopt"() ]
+
+  %array.2.i.ptr = getelementptr inbounds i32, i32* %array.2, i64 %i.i64
+  %array.2.i = load i32, i32* %array.2.i.ptr, align 4
+  %loop.acc.2 = add i32 %loop.acc.1, %array.2.i
+
+  %within.bounds.3 = icmp ult i32 %i, %length.3
+  call void (i1, ...) @llvm.experimental.guard(i1 %within.bounds.3, i32 9) [ "deopt"() ]
+
+  %array.3.i.ptr = getelementptr inbounds i32, i32* %array.3, i64 %i.i64
+  %array.3.i = load i32, i32* %array.3.i.ptr, align 4
+  %loop.acc.next = add i32 %loop.acc.2, %array.3.i
+
+  %i.next = add nuw i32 %i, 1
+  %continue = icmp ult i32 %i.next, %n
   br i1 %continue, label %loop, label %exit
 
 exit:
@@ -350,8 +430,9 @@ entry:
 
 loop.preheader:
 ; CHECK: loop.preheader:
-; CHECK: [[max_index:[^ ]+]] = add i32 %n, -1
-; CHECK-NEXT: [[wide_cond:[^ ]+]] = icmp ult i32 [[max_index]], %length
+; CHECK: [[first_iteration_check:[^ ]+]] = icmp ult i32 0, %length
+; CHECK-NEXT: [[limit_check:[^ ]+]] = icmp ule i32 %n, %length
+; CHECK-NEXT: [[wide_cond:[^ ]+]] = and i1 [[first_iteration_check]], [[limit_check]]
 ; CHECK-NEXT: br label %loop
   br label %loop
 
@@ -439,12 +520,12 @@ loop.preheader:
 loop:
 ; CHECK: loop:
 ; CHECK: %bound = add i32 %i, %x
-; CHECK-NEXT: %within.bounds = icmp slt i32 %i, %bound
+; CHECK-NEXT: %within.bounds = icmp ult i32 %i, %bound
 ; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 %within.bounds, i32 9) [ "deopt"() ]
   %loop.acc = phi i32 [ %loop.acc.next, %loop ], [ 0, %loop.preheader ]
   %i = phi i32 [ %i.next, %loop ], [ %start, %loop.preheader ]
   %bound = add i32 %i, %x
-  %within.bounds = icmp slt i32 %i, %bound
+  %within.bounds = icmp ult i32 %i, %bound
   call void (i1, ...) @llvm.experimental.guard(i1 %within.bounds, i32 9) [ "deopt"() ]
 
   %i.i64 = zext i32 %i to i64
@@ -503,9 +584,10 @@ entry:
 
 loop.preheader:
 ; CHECK: loop.preheader:
-; CHECK: [[max_index:[^ ]+]] = add i32 %n, -1
-; CHECK-NEXT: [[length:[^ ]+]] = zext i16 %length.i16 to i32
-; CHECK-NEXT: [[wide_cond:[^ ]+]] = icmp ult i32 [[max_index]], [[length]]
+; CHECK: [[length:[^ ]+]] = zext i16 %length.i16 to i32
+; CHECK-NEXT: [[first_iteration_check:[^ ]+]] = icmp ult i32 0, [[length]]
+; CHECK-NEXT: [[limit_check:[^ ]+]] = icmp ule i32 %n, [[length]]
+; CHECK-NEXT: [[wide_cond:[^ ]+]] = and i1 [[first_iteration_check]], [[limit_check]]
 ; CHECK-NEXT: br label %loop
   br label %loop
 
diff --git a/llvm/test/Transforms/LoopPredication/nested.ll b/llvm/test/Transforms/LoopPredication/nested.ll
index 6b40cde3e57..796839feec8 100644
--- a/llvm/test/Transforms/LoopPredication/nested.ll
+++ b/llvm/test/Transforms/LoopPredication/nested.ll
@@ -10,8 +10,6 @@ entry:
   br i1 %tmp5, label %exit, label %outer.loop.preheader
 
 outer.loop.preheader:
-; CHECK: outer.loop.preheader:
-; CHECK: [[iteration_count:[^ ]+]] = add i32 %l, -1
   br label %outer.loop
 
 outer.loop:
@@ -22,7 +20,10 @@ outer.loop:
   
 inner.loop.preheader:
 ; CHECK: inner.loop.preheader:
-; CHECK: [[wide_cond:[^ ]+]] = icmp slt i32 [[iteration_count]], %length
+; CHECK: [[first_iteration_check:[^ ]+]] = icmp ult i32 0, %length
+; CHECK-NEXT: [[limit_check:[^ ]+]] = icmp sle i32 %l, %length
+; CHECK-NEXT: [[wide_cond:[^ ]+]] = and i1 [[first_iteration_check]], [[limit_check]]
+; CHECK-NEXT: br label %inner.loop
   br label %inner.loop
 
 inner.loop:
@@ -31,7 +32,7 @@ inner.loop:
   %inner.loop.acc = phi i32 [ %inner.loop.acc.next, %inner.loop ], [ %outer.loop.acc, %inner.loop.preheader ]
   %j = phi i32 [ %j.next, %inner.loop ], [ 0, %inner.loop.preheader ]
 
-  %within.bounds = icmp slt i32 %j, %length
+  %within.bounds = icmp ult i32 %j, %length
   call void (i1, ...) @llvm.experimental.guard(i1 %within.bounds, i32 9) [ "deopt"() ]
   
   %j.i64 = zext i32 %j to i64
@@ -62,8 +63,10 @@ entry:
 
 outer.loop.preheader:
 ; CHECK: outer.loop.preheader:
-; CHECK: [[iteration_count:[^ ]+]] = add i32 %n, -1
-; CHECK: [[wide_cond:[^ ]+]] = icmp slt i32 [[iteration_count]], %length
+; CHECK: [[first_iteration_check:[^ ]+]] = icmp ult i32 0, %length
+; CHECK-NEXT: [[limit_check:[^ ]+]] = icmp sle i32 %n, %length
+; CHECK-NEXT: [[wide_cond:[^ ]+]] = and i1 [[first_iteration_check]], [[limit_check]]
+; CHECK-NEXT: br label %outer.loop
   br label %outer.loop
 
 outer.loop:
@@ -82,7 +85,7 @@ inner.loop:
   %inner.loop.acc = phi i32 [ %inner.loop.acc.next, %inner.loop ], [ %outer.loop.acc, %inner.loop.preheader ]
   %j = phi i32 [ %j.next, %inner.loop ], [ 0, %inner.loop.preheader ]
 
-  %within.bounds = icmp slt i32 %i, %length
+  %within.bounds = icmp ult i32 %i, %length
   call void (i1, ...) @llvm.experimental.guard(i1 %within.bounds, i32 9) [ "deopt"() ]
   
   %i.i64 = zext i32 %i to i64
@@ -112,14 +115,15 @@ entry:
   br i1 %tmp5, label %exit, label %outer.loop.preheader
 
 outer.loop.preheader:
+; CHECK: outer.loop.preheader:
+; CHECK-NEXT: [[first_iteration_check_outer:[^ ]+]] = icmp ult i32 0, %length
+; CHECK-NEXT: [[limit_check_outer:[^ ]+]] = icmp sle i32 %n, %length
+; CHECK-NEXT: [[wide_cond_outer:[^ ]+]] = and i1 [[first_iteration_check_outer]], [[limit_check_outer]]
+; CHECK-NEXT: br label %outer.loop
   br label %outer.loop
 
 outer.loop:
 ; CHECK: outer.loop:
-; CHECK: [[i_1:[^ ]+]] = add i32 %i, 1
-; CHECK-NEXT: [[l_sgt_i_1:[^ ]+]] = icmp sgt i32 %l, [[i_1]]
-; CHECK-NEXT: [[smax:[^ ]+]] = select i1 [[l_sgt_i_1]], i32 %l, i32 [[i_1]]
-; CHECK-NEXT: [[max_j:[^ ]+]] = add i32 [[smax]], -1
   %outer.loop.acc = phi i32 [ %outer.loop.acc.next, %outer.loop.inc ], [ 0, %outer.loop.preheader ]
   %i = phi i32 [ %i.next, %outer.loop.inc ], [ 0, %outer.loop.preheader ]
   %tmp6 = icmp sle i32 %l, 0
@@ -127,16 +131,69 @@ outer.loop:
   
 inner.loop.preheader:
 ; CHECK: inner.loop.preheader:
-; CHECK: [[wide_cond:[^ ]+]] = icmp slt i32 [[max_j]], %length
+; CHECK: [[limit_check_inner:[^ ]+]] = icmp sle i32 %l, %length
+; CHECK: br label %inner.loop
   br label %inner.loop
 
 inner.loop:
 ; CHECK: inner.loop:
+; CHECK: [[wide_cond:[^ ]+]] = and i1 [[limit_check_inner]], [[wide_cond_outer]]
 ; CHECK: call void (i1, ...) @llvm.experimental.guard(i1 [[wide_cond]], i32 9) [ "deopt"() ]
   %inner.loop.acc = phi i32 [ %inner.loop.acc.next, %inner.loop ], [ %outer.loop.acc, %inner.loop.preheader ]
   %j = phi i32 [ %j.next, %inner.loop ], [ %i, %inner.loop.preheader ]
 
-  %within.bounds = icmp slt i32 %j, %length
+  %within.bounds = icmp ult i32 %j, %length
+  call void (i1, ...) @llvm.experimental.guard(i1 %within.bounds, i32 9) [ "deopt"() ]
+  
+  %j.i64 = zext i32 %j to i64
+  %array.j.ptr = getelementptr inbounds i32, i32* %array, i64 %j.i64
+  %array.j = load i32, i32* %array.j.ptr, align 4
+  %inner.loop.acc.next = add i32 %inner.loop.acc, %array.j
+
+  %j.next = add nsw i32 %j, 1
+  %inner.continue = icmp slt i32 %j.next, %l
+  br i1 %inner.continue, label %inner.loop, label %outer.loop.inc
+
+outer.loop.inc:
+  %outer.loop.acc.next = phi i32 [ %inner.loop.acc.next, %inner.loop ], [ %outer.loop.acc, %outer.loop ]
+  %i.next = add nsw i32 %i, 1
+  %outer.continue = icmp slt i32 %i.next, %n
+  br i1 %outer.continue, label %outer.loop, label %exit
+
+exit:
+  %result = phi i32 [ 0, %entry ], [ %outer.loop.acc.next, %outer.loop.inc ]
+  ret i32 %result
+}
+
+define i32 @cant_expand_guard_check_start(i32* %array, i32 %length, i32 %n, i32 %l, i32 %maybezero) {
+; CHECK-LABEL: @cant_expand_guard_check_start
+entry:
+  %tmp5 = icmp sle i32 %n, 0
+  br i1 %tmp5, label %exit, label %outer.loop.preheader
+
+outer.loop.preheader:
+  br label %outer.loop
+
+outer.loop:
+  %outer.loop.acc = phi i32 [ %outer.loop.acc.next, %outer.loop.inc ], [ 0, %outer.loop.preheader ]
+  %i = phi i32 [ %i.next, %outer.loop.inc ], [ 0, %outer.loop.preheader ]
+  %tmp6 = icmp sle i32 %l, 0
+  %div = udiv i32 %i, %maybezero
+  br i1 %tmp6, label %outer.loop.inc, label %inner.loop.preheader
+  
+inner.loop.preheader:
+; CHECK: inner.loop.preheader:
+; CHECK: br label %inner.loop
+  br label %inner.loop
+
+inner.loop:
+; CHECK: inner.loop:
+; CHECK: %within.bounds = icmp ult i32 %j, %length
+; CHECK: call void (i1, ...) @llvm.experimental.guard(i1 %within.bounds, i32 9) [ "deopt"() ]
+  %inner.loop.acc = phi i32 [ %inner.loop.acc.next, %inner.loop ], [ %outer.loop.acc, %inner.loop.preheader ]
+  %j = phi i32 [ %j.next, %inner.loop ], [ %div, %inner.loop.preheader ]
+
+  %within.bounds = icmp ult i32 %j, %length
   call void (i1, ...) @llvm.experimental.guard(i1 %within.bounds, i32 9) [ "deopt"() ]
   
   %j.i64 = zext i32 %j to i64
diff --git a/llvm/test/Transforms/LoopPredication/visited.ll b/llvm/test/Transforms/LoopPredication/visited.ll
index e9aae77f8e6..01feaeabd16 100644
--- a/llvm/test/Transforms/LoopPredication/visited.ll
+++ b/llvm/test/Transforms/LoopPredication/visited.ll
@@ -11,8 +11,9 @@ entry:
 
 loop.preheader:
 ; CHECK: loop.preheader:
-; CHECK: [[iteration_count:[^ ]+]] = add i32 %n, -1
-; CHECK-NEXT: [[wide_cond:[^ ]+]] = icmp ult i32 [[iteration_count]], %length
+; CHECK: [[first_iteration_check:[^ ]+]] = icmp ult i32 0, %length
+; CHECK-NEXT: [[limit_check:[^ ]+]] = icmp ule i32 %n, %length
+; CHECK-NEXT: [[wide_cond:[^ ]+]] = and i1 [[first_iteration_check]], [[limit_check]]
 ; CHECK-NEXT: br label %loop
   br label %loop