4 files changed, 259 insertions, 93 deletions
diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
index b10ec265891..97894726637 100644
--- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -543,10 +543,6 @@ struct LoopInterchange : public FunctionPass {
     printDepMatrix(DependencyMatrix);
 #endif
 
-    // Since we currently do not handle LCSSA PHI's any failure in loop
-    // condition will now branch to LoopNestExit.
-    // TODO: This should be removed once we handle LCSSA PHI nodes.
-
     // Get the Outermost loop exit.
     BasicBlock *LoopNestExit = OuterMostLoop->getExitBlock();
     if (!LoopNestExit) {
@@ -554,12 +550,6 @@ struct LoopInterchange : public FunctionPass {
       return false;
     }
 
-    if (isa<PHINode>(LoopNestExit->begin())) {
-      DEBUG(dbgs() << "PHI Nodes in loop nest exit is not handled for now "
-                      "since on failure all loops branch to loop nest exit.\n");
-      return false;
-    }
-
     unsigned SelecLoopId = selectLoopForInterchange(LoopList);
     // Move the selected loop outwards to the best possible position.
     for (unsigned i = SelecLoopId; i > 0; i--) {
@@ -862,19 +852,6 @@ bool LoopInterchangeLegality::currentLimitations() {
   }
 
   // TODO: We only handle LCSSA PHI's corresponding to reduction for now.
-  BasicBlock *OuterExit = OuterLoop->getExitBlock();
-  if (!containsSafePHI(OuterExit, true)) {
-    DEBUG(dbgs() << "Can only handle LCSSA PHIs in outer loops currently.\n");
-    ORE->emit([&]() {
-      return OptimizationRemarkMissed(DEBUG_TYPE, "NoLCSSAPHIOuter",
-                                      OuterLoop->getStartLoc(),
-                                      OuterLoop->getHeader())
-             << "Only outer loops with LCSSA PHIs can be interchange "
-                "currently.";
-    });
-    return true;
-  }
-
   BasicBlock *InnerExit = InnerLoop->getExitBlock();
   if (!containsSafePHI(InnerExit, false)) {
     DEBUG(dbgs() << "Can only handle LCSSA PHIs in inner loops currently.\n");
@@ -962,6 +939,43 @@ bool LoopInterchangeLegality::currentLimitations() {
   return false;
 }
 
+// We currently support LCSSA PHI nodes in the outer loop exit, if their
+// incoming values do not come from the outer loop latch or if the
+// outer loop latch has a single predecessor. In that case, the value will
+// be available if both the inner and outer loop conditions are true, which
+// will still be true after interchanging. If we have multiple predecessor,
+// that may not be the case, e.g. because the outer loop latch may be executed
+// if the inner loop is not executed.
+static bool areLoopExitPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) {
+  BasicBlock *LoopNestExit = OuterLoop->getUniqueExitBlock();
+  for (PHINode &PHI : LoopNestExit->phis()) {
+    //  FIXME: We currently are not able to detect floating point reductions
+    //         and have to use floating point PHIs as a proxy to prevent
+    //         interchanging in the presence of floating point reductions.
+    if (PHI.getType()->isFloatingPointTy())
+      return false;
+    for (unsigned i = 0; i < PHI.getNumIncomingValues(); i++) {
+     Instruction *IncomingI = dyn_cast<Instruction>(PHI.getIncomingValue(i));
+     if (!IncomingI || IncomingI->getParent() != OuterLoop->getLoopLatch())
+       continue;
+
+     // The incoming value is defined in the outer loop latch. Currently we
+     // only support that in case the outer loop latch has a single predecessor.
+     // This guarantees that the outer loop latch is executed if and only if
+     // the inner loop is executed (because tightlyNested() guarantees that the
+     // outer loop header only branches to the inner loop or the outer loop
+     // latch).
+     // FIXME: We could weaken this logic and allow multiple predecessors,
+     //        if the values are produced outside the loop latch. We would need
+     //        additional logic to update the PHI nodes in the exit block as
+     //        well.
+     if (OuterLoop->getLoopLatch()->getUniquePredecessor() == nullptr)
+       return false;
+    }
+  }
+  return true;
+}
+
 bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,
                                                   unsigned OuterLoopId,
                                                   CharMatrix &DepMatrix) {
@@ -1038,6 +1052,17 @@ bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,
     return false;
   }
 
+  if (!areLoopExitPHIsSupported(OuterLoop, InnerLoop)) {
+    DEBUG(dbgs() << "Found unsupported PHI nodes in outer loop exit.\n");
+    ORE->emit([&]() {
+      return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedExitPHI",
+                                      OuterLoop->getStartLoc(),
+                                      OuterLoop->getHeader())
+             << "Found unsupported PHI node in loop exit.";
+    });
+    return false;
+  }
+
   return true;
 }
 
diff --git a/llvm/test/Transforms/LoopInterchange/current-limitations-lcssa.ll b/llvm/test/Transforms/LoopInterchange/current-limitations-lcssa.ll
deleted file mode 100644
index b46cbb1f153..00000000000
--- a/llvm/test/Transforms/LoopInterchange/current-limitations-lcssa.ll
+++ /dev/null
@@ -1,70 +0,0 @@
-; REQUIRES: asserts
-; RUN: opt < %s -basicaa -loop-interchange -S -debug 2>&1 | FileCheck %s
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-@A = common global [100 x [100 x i32]] zeroinitializer
-@C = common global [100 x [100 x i32]] zeroinitializer
-
-;; FIXME:
-;; Test for interchange when we have an lcssa phi. This should ideally be interchanged but it is currently not supported.
-;;     for(gi=1;gi<N;gi++)
-;;       for(gj=1;gj<M;gj++)
-;;         A[gj][gi] = A[gj - 1][gi] + C[gj][gi];
-
-; CHECK: PHI Nodes in loop nest exit is not handled for now since on failure all loops branch to loop nest exit.
-
-@gi = common global i32 0
-@gj = common global i32 0
-
-define void @interchange_07(i32 %N, i32 %M){
-entry:
-  store i32 1, i32* @gi
-  %cmp21 = icmp sgt i32 %N, 1
-  br i1 %cmp21, label %for.cond1.preheader.lr.ph, label %for.end16
-
-for.cond1.preheader.lr.ph:
-  %cmp218 = icmp sgt i32 %M, 1
-  %gi.promoted = load i32, i32* @gi
-  %0 = add i32 %M, -1
-  %1 = sext i32 %gi.promoted to i64
-  %2 = sext i32 %N to i64
-  %3 = add i32 %gi.promoted, 1
-  %4 = icmp slt i32 %3, %N
-  %smax = select i1 %4, i32 %N, i32 %3
-  br label %for.cond1.preheader
-
-for.cond1.preheader:
-  %indvars.iv25 = phi i64 [ %1, %for.cond1.preheader.lr.ph ], [ %indvars.iv.next26, %for.inc14 ]
-  br i1 %cmp218, label %for.body3, label %for.inc14
-
-for.body3:
-  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body3 ], [ 1, %for.cond1.preheader ]
-  %5 = add nsw i64 %indvars.iv, -1
-  %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %5, i64 %indvars.iv25
-  %6 = load i32, i32* %arrayidx5
-  %arrayidx9 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @C, i64 0, i64 %indvars.iv, i64 %indvars.iv25
-  %7 = load i32, i32* %arrayidx9
-  %add = add nsw i32 %7, %6
-  %arrayidx13 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv, i64 %indvars.iv25
-  store i32 %add, i32* %arrayidx13
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, %0
-  br i1 %exitcond, label %for.inc14, label %for.body3
-
-for.inc14:
-  %inc.lcssa23 = phi i32 [ 1, %for.cond1.preheader ], [ %M, %for.body3 ]
-  %indvars.iv.next26 = add nsw i64 %indvars.iv25, 1
-  %cmp = icmp slt i64 %indvars.iv.next26, %2
-  br i1 %cmp, label %for.cond1.preheader, label %for.cond.for.end16_crit_edge
-
-for.cond.for.end16_crit_edge:
-  store i32 %inc.lcssa23, i32* @gj
-  store i32 %smax, i32* @gi
-  br label %for.end16
-
-for.end16:
-  ret void
-}
diff --git a/llvm/test/Transforms/LoopInterchange/lcssa.ll b/llvm/test/Transforms/LoopInterchange/lcssa.ll
new file mode 100644
index 00000000000..b44c2404464
--- /dev/null
+++ b/llvm/test/Transforms/LoopInterchange/lcssa.ll
@@ -0,0 +1,171 @@
+; RUN: opt < %s -basicaa -loop-interchange -pass-remarks-missed='loop-interchange' -pass-remarks-output=%t
+; RUN: cat %t |  FileCheck --check-prefix REMARK %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@A = common global [100 x [100 x i32]] zeroinitializer
+@C = common global [100 x [100 x i32]] zeroinitializer
+@X = common global i32 0
+@Y = common global i64 0
+@F = common global float 0.0
+
+; We cannot interchange this loop at the moment, because iv.outer.next is
+; produced in the outer loop latch and used in the loop exit block. If the inner
+; loop body is not executed, the outer loop latch won't be executed either
+; after interchanging.
+; REMARK: UnsupportedExitPHI
+; REMARK-NEXT: lcssa_01
+
+define void @lcssa_01(){
+entry:
+  %cmp21 = icmp sgt i64 100, 1
+  br i1 %cmp21, label %outer.ph, label %for.end16
+
+outer.ph:
+  %cmp218 = icmp sgt i64 100, 1
+  br label %outer.header
+
+outer.header:
+  %iv.outer= phi i64 [ 1, %outer.ph ], [ %iv.outer.next, %outer.inc ]
+  br i1 %cmp218, label %for.body3, label %outer.inc
+
+for.body3:
+  %iv.inner = phi i64 [ %iv.inner.next, %for.body3 ], [ 1, %outer.header ]
+  %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %iv.inner, i64 %iv.outer
+  %vA = load i32, i32* %arrayidx5
+  %arrayidx9 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @C, i64 0, i64 %iv.inner, i64 %iv.outer
+  %vC = load i32, i32* %arrayidx9
+  %add = add nsw i32 %vA, %vC
+  store i32 %add, i32* %arrayidx5
+  %iv.inner.next = add nuw nsw i64 %iv.inner, 1
+  %exitcond = icmp eq i64 %iv.inner.next, 100
+  br i1 %exitcond, label %outer.inc, label %for.body3
+
+outer.inc:
+  %iv.outer.next = add nsw i64 %iv.outer, 1
+  %cmp = icmp eq i64 %iv.outer.next, 100
+  br i1 %cmp, label %outer.header, label %for.exit
+
+for.exit:
+  store i64 %iv.outer.next, i64 * @Y
+  br label %for.end16
+
+for.end16:
+  ret void
+}
+
+; REMARK: UnsupportedExitPHI
+; REMARK-NEXT: lcssa_02
+define void @lcssa_02(){
+entry:
+  %cmp21 = icmp sgt i64 100, 1
+  br i1 %cmp21, label %outer.ph, label %for.end16
+
+outer.ph:
+  %cmp218 = icmp sgt i64 100, 1
+  br label %outer.header
+
+outer.header:
+  %iv.outer= phi i64 [ 1, %outer.ph ], [ %iv.outer.next, %outer.inc ]
+  br i1 %cmp218, label %for.body3, label %outer.inc
+
+for.body3:
+  %iv.inner = phi i64 [ %iv.inner.next, %for.body3 ], [ 1, %outer.header ]
+  %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %iv.inner, i64 %iv.outer
+  %vA = load i32, i32* %arrayidx5
+  %arrayidx9 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @C, i64 0, i64 %iv.inner, i64 %iv.outer
+  %vC = load i32, i32* %arrayidx9
+  %add = add nsw i32 %vA, %vC
+  store i32 %add, i32* %arrayidx5
+  %iv.inner.next = add nuw nsw i64 %iv.inner, 1
+  %exitcond = icmp eq i64 %iv.inner.next, 100
+  br i1 %exitcond, label %outer.inc, label %for.body3
+
+outer.inc:
+  %iv.inner.end = phi i64 [ 0, %outer.header ], [ %iv.inner.next, %for.body3 ]
+  %iv.outer.next = add nsw i64 %iv.outer, 1
+  %cmp = icmp eq i64 %iv.outer.next, 100
+  br i1 %cmp, label %outer.header, label %for.exit
+
+for.exit:
+  store i64 %iv.inner.end, i64 * @Y
+  br label %for.end16
+
+for.end16:
+  ret void
+}
+
+
+; REMARK: Interchanged
+; REMARK-NEXT: lcssa_03
+define void @lcssa_03(){
+entry:
+  br label %outer.header
+
+outer.header:
+  %iv.outer= phi i64 [ 1, %entry ], [ %iv.outer.next, %outer.inc ]
+  br label %for.body3
+
+for.body3:
+  %iv.inner = phi i64 [ %iv.inner.next, %for.body3 ], [ 1, %outer.header ]
+  %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %iv.inner, i64 %iv.outer
+  %vA = load i32, i32* %arrayidx5
+  %arrayidx9 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @C, i64 0, i64 %iv.inner, i64 %iv.outer
+  %vC = load i32, i32* %arrayidx9
+  %add = add nsw i32 %vA, %vC
+  store i32 %add, i32* %arrayidx5
+  %iv.inner.next = add nuw nsw i64 %iv.inner, 1
+  %exitcond = icmp eq i64 %iv.inner.next, 100
+  br i1 %exitcond, label %outer.inc, label %for.body3
+
+outer.inc:
+  %iv.outer.next = add nsw i64 %iv.outer, 1
+  %cmp = icmp eq i64 %iv.outer.next, 100
+  br i1 %cmp, label %outer.header, label %for.exit
+
+for.exit:
+  store i64 %iv.inner, i64 * @Y
+  br label %for.end16
+
+for.end16:
+  ret void
+}
+
+; FIXME: We currently do not support LCSSA phi nodes involving floating point
+;        types, as we fail to detect floating point reductions for now.
+; REMARK: UnsupportedPHIOuter
+; REMARK-NEXT: lcssa_04
+define void @lcssa_04(){
+entry:
+  br label %outer.header
+
+outer.header:
+  %iv.outer= phi i64 [ 1, %entry ], [ %iv.outer.next, %outer.inc ]
+  %float.outer= phi float [ 1.0, %entry ], [ 2.0, %outer.inc ]
+  br label %for.body3
+
+for.body3:
+  %iv.inner = phi i64 [ %iv.inner.next, %for.body3 ], [ 1, %outer.header ]
+  %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %iv.inner, i64 %iv.outer
+  %vA = load i32, i32* %arrayidx5
+  %arrayidx9 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @C, i64 0, i64 %iv.inner, i64 %iv.outer
+  %vC = load i32, i32* %arrayidx9
+  %add = add nsw i32 %vA, %vC
+  store i32 %add, i32* %arrayidx5
+  %iv.inner.next = add nuw nsw i64 %iv.inner, 1
+  %exitcond = icmp eq i64 %iv.inner.next, 100
+  br i1 %exitcond, label %outer.inc, label %for.body3
+
+outer.inc:
+  %iv.outer.next = add nsw i64 %iv.outer, 1
+  %cmp = icmp eq i64 %iv.outer.next, 100
+  br i1 %cmp, label %outer.header, label %for.exit
+
+for.exit:
+  store float %float.outer, float* @F
+  br label %for.end16
+
+for.end16:
+  ret void
+}
diff --git a/llvm/test/Transforms/LoopInterchange/reductions.ll b/llvm/test/Transforms/LoopInterchange/reductions.ll
index ca37f7b271b..da92276feb4 100644
--- a/llvm/test/Transforms/LoopInterchange/reductions.ll
+++ b/llvm/test/Transforms/LoopInterchange/reductions.ll
@@ -220,3 +220,43 @@ for.inc13:                                        ; preds = %for.cond4.for.inc10
 for.end15:                                        ; preds = %for.inc13, %entry
   ret void
 }
+
+;;  for( int i=1;i<N;i++)
+;;    for( int j=1;j<N;j++)
+;;      X+=A[j][i];
+;;  Y = X
+; CHECK: Loops interchanged.
+define void @reduction_05(i32 %N) {
+entry:
+  %cmp16 = icmp sgt i32 %N, 1
+  br i1 %cmp16, label %for.body7.lr.ph, label %for.end8
+
+for.body7.lr.ph:                                  ; preds = %entry, %for.cond1.for.inc6_crit_edge
+  %indvars.iv18 = phi i64 [ %indvars.iv.next19, %for.cond1.for.inc6_crit_edge ], [ 1, %entry ]
+  %X.promoted = load i32, i32* @X
+  br label %for.body7
+
+for.body7:                                        ; preds = %for.body7, %for.body7.lr.ph
+  %indvars.iv = phi i64 [ 1, %for.body7.lr.ph ], [ %indvars.iv.next, %for.body7 ]
+  %add15 = phi i32 [ %X.promoted, %for.body7.lr.ph ], [ %add, %for.body7 ]
+  %arrayidx5 = getelementptr inbounds [500 x [500 x i32]], [500 x [500 x i32]]* @A, i64 0, i64 %indvars.iv, i64 %indvars.iv18
+  %0 = load i32, i32* %arrayidx5
+  %add = add nsw i32 %add15, %0
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %N
+  br i1 %exitcond, label %for.cond1.for.inc6_crit_edge, label %for.body7
+
+for.cond1.for.inc6_crit_edge:                     ; preds = %for.body7
+  store i32 %add, i32* @X
+  %indvars.iv.next19 = add nuw nsw i64 %indvars.iv18, 1
+  %lftr.wideiv20 = trunc i64 %indvars.iv.next19 to i32
+  %exitcond21 = icmp eq i32 %lftr.wideiv20, %N
+  br i1 %exitcond21, label %for.end8, label %for.body7.lr.ph
+
+for.end8:                                         ; preds = %for.cond1.for.inc6_crit_edge, %entry
+  %add.res = phi i32 [ %add, %for.cond1.for.inc6_crit_edge], [ 0, %entry ]
+  store i32 %add.res, i32* @Y
+
+  ret void
+}