diff options
-rw-r--r-- | llvm/lib/Transforms/Scalar/LoopInterchange.cpp | 71 | ||||
-rw-r--r-- | llvm/test/Transforms/LoopInterchange/current-limitations-lcssa.ll | 70 | ||||
-rw-r--r-- | llvm/test/Transforms/LoopInterchange/lcssa.ll | 171 | ||||
-rw-r--r-- | llvm/test/Transforms/LoopInterchange/reductions.ll | 40 |
4 files changed, 259 insertions, 93 deletions
diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp index b10ec265891..97894726637 100644 --- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp +++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp @@ -543,10 +543,6 @@ struct LoopInterchange : public FunctionPass { printDepMatrix(DependencyMatrix); #endif - // Since we currently do not handle LCSSA PHI's any failure in loop - // condition will now branch to LoopNestExit. - // TODO: This should be removed once we handle LCSSA PHI nodes. - // Get the Outermost loop exit. BasicBlock *LoopNestExit = OuterMostLoop->getExitBlock(); if (!LoopNestExit) { @@ -554,12 +550,6 @@ struct LoopInterchange : public FunctionPass { return false; } - if (isa<PHINode>(LoopNestExit->begin())) { - DEBUG(dbgs() << "PHI Nodes in loop nest exit is not handled for now " - "since on failure all loops branch to loop nest exit.\n"); - return false; - } - unsigned SelecLoopId = selectLoopForInterchange(LoopList); // Move the selected loop outwards to the best possible position. for (unsigned i = SelecLoopId; i > 0; i--) { @@ -862,19 +852,6 @@ bool LoopInterchangeLegality::currentLimitations() { } // TODO: We only handle LCSSA PHI's corresponding to reduction for now. - BasicBlock *OuterExit = OuterLoop->getExitBlock(); - if (!containsSafePHI(OuterExit, true)) { - DEBUG(dbgs() << "Can only handle LCSSA PHIs in outer loops currently.\n"); - ORE->emit([&]() { - return OptimizationRemarkMissed(DEBUG_TYPE, "NoLCSSAPHIOuter", - OuterLoop->getStartLoc(), - OuterLoop->getHeader()) - << "Only outer loops with LCSSA PHIs can be interchange " - "currently."; - }); - return true; - } - BasicBlock *InnerExit = InnerLoop->getExitBlock(); if (!containsSafePHI(InnerExit, false)) { DEBUG(dbgs() << "Can only handle LCSSA PHIs in inner loops currently.\n"); @@ -962,6 +939,43 @@ bool LoopInterchangeLegality::currentLimitations() { return false; } +// We currently support LCSSA PHI nodes in the outer loop exit, if their +// incoming values do not come from the outer loop latch or if the +// outer loop latch has a single predecessor. In that case, the value will +// be available if both the inner and outer loop conditions are true, which +// will still be true after interchanging. If we have multiple predecessor, +// that may not be the case, e.g. because the outer loop latch may be executed +// if the inner loop is not executed. +static bool areLoopExitPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) { + BasicBlock *LoopNestExit = OuterLoop->getUniqueExitBlock(); + for (PHINode &PHI : LoopNestExit->phis()) { + // FIXME: We currently are not able to detect floating point reductions + // and have to use floating point PHIs as a proxy to prevent + // interchanging in the presence of floating point reductions. + if (PHI.getType()->isFloatingPointTy()) + return false; + for (unsigned i = 0; i < PHI.getNumIncomingValues(); i++) { + Instruction *IncomingI = dyn_cast<Instruction>(PHI.getIncomingValue(i)); + if (!IncomingI || IncomingI->getParent() != OuterLoop->getLoopLatch()) + continue; + + // The incoming value is defined in the outer loop latch. Currently we + // only support that in case the outer loop latch has a single predecessor. + // This guarantees that the outer loop latch is executed if and only if + // the inner loop is executed (because tightlyNested() guarantees that the + // outer loop header only branches to the inner loop or the outer loop + // latch). + // FIXME: We could weaken this logic and allow multiple predecessors, + // if the values are produced outside the loop latch. We would need + // additional logic to update the PHI nodes in the exit block as + // well. + if (OuterLoop->getLoopLatch()->getUniquePredecessor() == nullptr) + return false; + } + } + return true; +} + bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix) { @@ -1038,6 +1052,17 @@ bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId, return false; } + if (!areLoopExitPHIsSupported(OuterLoop, InnerLoop)) { + DEBUG(dbgs() << "Found unsupported PHI nodes in outer loop exit.\n"); + ORE->emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedExitPHI", + OuterLoop->getStartLoc(), + OuterLoop->getHeader()) + << "Found unsupported PHI node in loop exit."; + }); + return false; + } + return true; } diff --git a/llvm/test/Transforms/LoopInterchange/current-limitations-lcssa.ll b/llvm/test/Transforms/LoopInterchange/current-limitations-lcssa.ll deleted file mode 100644 index b46cbb1f153..00000000000 --- a/llvm/test/Transforms/LoopInterchange/current-limitations-lcssa.ll +++ /dev/null @@ -1,70 +0,0 @@ -; REQUIRES: asserts -; RUN: opt < %s -basicaa -loop-interchange -S -debug 2>&1 | FileCheck %s - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -@A = common global [100 x [100 x i32]] zeroinitializer -@C = common global [100 x [100 x i32]] zeroinitializer - -;; FIXME: -;; Test for interchange when we have an lcssa phi. This should ideally be interchanged but it is currently not supported. -;; for(gi=1;gi<N;gi++) -;; for(gj=1;gj<M;gj++) -;; A[gj][gi] = A[gj - 1][gi] + C[gj][gi]; - -; CHECK: PHI Nodes in loop nest exit is not handled for now since on failure all loops branch to loop nest exit. - -@gi = common global i32 0 -@gj = common global i32 0 - -define void @interchange_07(i32 %N, i32 %M){ -entry: - store i32 1, i32* @gi - %cmp21 = icmp sgt i32 %N, 1 - br i1 %cmp21, label %for.cond1.preheader.lr.ph, label %for.end16 - -for.cond1.preheader.lr.ph: - %cmp218 = icmp sgt i32 %M, 1 - %gi.promoted = load i32, i32* @gi - %0 = add i32 %M, -1 - %1 = sext i32 %gi.promoted to i64 - %2 = sext i32 %N to i64 - %3 = add i32 %gi.promoted, 1 - %4 = icmp slt i32 %3, %N - %smax = select i1 %4, i32 %N, i32 %3 - br label %for.cond1.preheader - -for.cond1.preheader: - %indvars.iv25 = phi i64 [ %1, %for.cond1.preheader.lr.ph ], [ %indvars.iv.next26, %for.inc14 ] - br i1 %cmp218, label %for.body3, label %for.inc14 - -for.body3: - %indvars.iv = phi i64 [ %indvars.iv.next, %for.body3 ], [ 1, %for.cond1.preheader ] - %5 = add nsw i64 %indvars.iv, -1 - %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %5, i64 %indvars.iv25 - %6 = load i32, i32* %arrayidx5 - %arrayidx9 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @C, i64 0, i64 %indvars.iv, i64 %indvars.iv25 - %7 = load i32, i32* %arrayidx9 - %add = add nsw i32 %7, %6 - %arrayidx13 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv, i64 %indvars.iv25 - store i32 %add, i32* %arrayidx13 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %lftr.wideiv = trunc i64 %indvars.iv to i32 - %exitcond = icmp eq i32 %lftr.wideiv, %0 - br i1 %exitcond, label %for.inc14, label %for.body3 - -for.inc14: - %inc.lcssa23 = phi i32 [ 1, %for.cond1.preheader ], [ %M, %for.body3 ] - %indvars.iv.next26 = add nsw i64 %indvars.iv25, 1 - %cmp = icmp slt i64 %indvars.iv.next26, %2 - br i1 %cmp, label %for.cond1.preheader, label %for.cond.for.end16_crit_edge - -for.cond.for.end16_crit_edge: - store i32 %inc.lcssa23, i32* @gj - store i32 %smax, i32* @gi - br label %for.end16 - -for.end16: - ret void -} diff --git a/llvm/test/Transforms/LoopInterchange/lcssa.ll b/llvm/test/Transforms/LoopInterchange/lcssa.ll new file mode 100644 index 00000000000..b44c2404464 --- /dev/null +++ b/llvm/test/Transforms/LoopInterchange/lcssa.ll @@ -0,0 +1,171 @@ +; RUN: opt < %s -basicaa -loop-interchange -pass-remarks-missed='loop-interchange' -pass-remarks-output=%t +; RUN: cat %t | FileCheck --check-prefix REMARK %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@A = common global [100 x [100 x i32]] zeroinitializer +@C = common global [100 x [100 x i32]] zeroinitializer +@X = common global i32 0 +@Y = common global i64 0 +@F = common global float 0.0 + +; We cannot interchange this loop at the moment, because iv.outer.next is +; produced in the outer loop latch and used in the loop exit block. If the inner +; loop body is not executed, the outer loop latch won't be executed either +; after interchanging. +; REMARK: UnsupportedExitPHI +; REMARK-NEXT: lcssa_01 + +define void @lcssa_01(){ +entry: + %cmp21 = icmp sgt i64 100, 1 + br i1 %cmp21, label %outer.ph, label %for.end16 + +outer.ph: + %cmp218 = icmp sgt i64 100, 1 + br label %outer.header + +outer.header: + %iv.outer= phi i64 [ 1, %outer.ph ], [ %iv.outer.next, %outer.inc ] + br i1 %cmp218, label %for.body3, label %outer.inc + +for.body3: + %iv.inner = phi i64 [ %iv.inner.next, %for.body3 ], [ 1, %outer.header ] + %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %iv.inner, i64 %iv.outer + %vA = load i32, i32* %arrayidx5 + %arrayidx9 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @C, i64 0, i64 %iv.inner, i64 %iv.outer + %vC = load i32, i32* %arrayidx9 + %add = add nsw i32 %vA, %vC + store i32 %add, i32* %arrayidx5 + %iv.inner.next = add nuw nsw i64 %iv.inner, 1 + %exitcond = icmp eq i64 %iv.inner.next, 100 + br i1 %exitcond, label %outer.inc, label %for.body3 + +outer.inc: + %iv.outer.next = add nsw i64 %iv.outer, 1 + %cmp = icmp eq i64 %iv.outer.next, 100 + br i1 %cmp, label %outer.header, label %for.exit + +for.exit: + store i64 %iv.outer.next, i64 * @Y + br label %for.end16 + +for.end16: + ret void +} + +; REMARK: UnsupportedExitPHI +; REMARK-NEXT: lcssa_02 +define void @lcssa_02(){ +entry: + %cmp21 = icmp sgt i64 100, 1 + br i1 %cmp21, label %outer.ph, label %for.end16 + +outer.ph: + %cmp218 = icmp sgt i64 100, 1 + br label %outer.header + +outer.header: + %iv.outer= phi i64 [ 1, %outer.ph ], [ %iv.outer.next, %outer.inc ] + br i1 %cmp218, label %for.body3, label %outer.inc + +for.body3: + %iv.inner = phi i64 [ %iv.inner.next, %for.body3 ], [ 1, %outer.header ] + %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %iv.inner, i64 %iv.outer + %vA = load i32, i32* %arrayidx5 + %arrayidx9 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @C, i64 0, i64 %iv.inner, i64 %iv.outer + %vC = load i32, i32* %arrayidx9 + %add = add nsw i32 %vA, %vC + store i32 %add, i32* %arrayidx5 + %iv.inner.next = add nuw nsw i64 %iv.inner, 1 + %exitcond = icmp eq i64 %iv.inner.next, 100 + br i1 %exitcond, label %outer.inc, label %for.body3 + +outer.inc: + %iv.inner.end = phi i64 [ 0, %outer.header ], [ %iv.inner.next, %for.body3 ] + %iv.outer.next = add nsw i64 %iv.outer, 1 + %cmp = icmp eq i64 %iv.outer.next, 100 + br i1 %cmp, label %outer.header, label %for.exit + +for.exit: + store i64 %iv.inner.end, i64 * @Y + br label %for.end16 + +for.end16: + ret void +} + + +; REMARK: Interchanged +; REMARK-NEXT: lcssa_03 +define void @lcssa_03(){ +entry: + br label %outer.header + +outer.header: + %iv.outer= phi i64 [ 1, %entry ], [ %iv.outer.next, %outer.inc ] + br label %for.body3 + +for.body3: + %iv.inner = phi i64 [ %iv.inner.next, %for.body3 ], [ 1, %outer.header ] + %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %iv.inner, i64 %iv.outer + %vA = load i32, i32* %arrayidx5 + %arrayidx9 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @C, i64 0, i64 %iv.inner, i64 %iv.outer + %vC = load i32, i32* %arrayidx9 + %add = add nsw i32 %vA, %vC + store i32 %add, i32* %arrayidx5 + %iv.inner.next = add nuw nsw i64 %iv.inner, 1 + %exitcond = icmp eq i64 %iv.inner.next, 100 + br i1 %exitcond, label %outer.inc, label %for.body3 + +outer.inc: + %iv.outer.next = add nsw i64 %iv.outer, 1 + %cmp = icmp eq i64 %iv.outer.next, 100 + br i1 %cmp, label %outer.header, label %for.exit + +for.exit: + store i64 %iv.inner, i64 * @Y + br label %for.end16 + +for.end16: + ret void +} + +; FIXME: We currently do not support LCSSA phi nodes involving floating point +; types, as we fail to detect floating point reductions for now. +; REMARK: UnsupportedPHIOuter +; REMARK-NEXT: lcssa_04 +define void @lcssa_04(){ +entry: + br label %outer.header + +outer.header: + %iv.outer= phi i64 [ 1, %entry ], [ %iv.outer.next, %outer.inc ] + %float.outer= phi float [ 1.0, %entry ], [ 2.0, %outer.inc ] + br label %for.body3 + +for.body3: + %iv.inner = phi i64 [ %iv.inner.next, %for.body3 ], [ 1, %outer.header ] + %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %iv.inner, i64 %iv.outer + %vA = load i32, i32* %arrayidx5 + %arrayidx9 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @C, i64 0, i64 %iv.inner, i64 %iv.outer + %vC = load i32, i32* %arrayidx9 + %add = add nsw i32 %vA, %vC + store i32 %add, i32* %arrayidx5 + %iv.inner.next = add nuw nsw i64 %iv.inner, 1 + %exitcond = icmp eq i64 %iv.inner.next, 100 + br i1 %exitcond, label %outer.inc, label %for.body3 + +outer.inc: + %iv.outer.next = add nsw i64 %iv.outer, 1 + %cmp = icmp eq i64 %iv.outer.next, 100 + br i1 %cmp, label %outer.header, label %for.exit + +for.exit: + store float %float.outer, float* @F + br label %for.end16 + +for.end16: + ret void +} diff --git a/llvm/test/Transforms/LoopInterchange/reductions.ll b/llvm/test/Transforms/LoopInterchange/reductions.ll index ca37f7b271b..da92276feb4 100644 --- a/llvm/test/Transforms/LoopInterchange/reductions.ll +++ b/llvm/test/Transforms/LoopInterchange/reductions.ll @@ -220,3 +220,43 @@ for.inc13: ; preds = %for.cond4.for.inc10 for.end15: ; preds = %for.inc13, %entry ret void } + +;; for( int i=1;i<N;i++) +;; for( int j=1;j<N;j++) +;; X+=A[j][i]; +;; Y = X +; CHECK: Loops interchanged. +define void @reduction_05(i32 %N) { +entry: + %cmp16 = icmp sgt i32 %N, 1 + br i1 %cmp16, label %for.body7.lr.ph, label %for.end8 + +for.body7.lr.ph: ; preds = %entry, %for.cond1.for.inc6_crit_edge + %indvars.iv18 = phi i64 [ %indvars.iv.next19, %for.cond1.for.inc6_crit_edge ], [ 1, %entry ] + %X.promoted = load i32, i32* @X + br label %for.body7 + +for.body7: ; preds = %for.body7, %for.body7.lr.ph + %indvars.iv = phi i64 [ 1, %for.body7.lr.ph ], [ %indvars.iv.next, %for.body7 ] + %add15 = phi i32 [ %X.promoted, %for.body7.lr.ph ], [ %add, %for.body7 ] + %arrayidx5 = getelementptr inbounds [500 x [500 x i32]], [500 x [500 x i32]]* @A, i64 0, i64 %indvars.iv, i64 %indvars.iv18 + %0 = load i32, i32* %arrayidx5 + %add = add nsw i32 %add15, %0 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %N + br i1 %exitcond, label %for.cond1.for.inc6_crit_edge, label %for.body7 + +for.cond1.for.inc6_crit_edge: ; preds = %for.body7 + store i32 %add, i32* @X + %indvars.iv.next19 = add nuw nsw i64 %indvars.iv18, 1 + %lftr.wideiv20 = trunc i64 %indvars.iv.next19 to i32 + %exitcond21 = icmp eq i32 %lftr.wideiv20, %N + br i1 %exitcond21, label %for.end8, label %for.body7.lr.ph + +for.end8: ; preds = %for.cond1.for.inc6_crit_edge, %entry + %add.res = phi i32 [ %add, %for.cond1.for.inc6_crit_edge], [ 0, %entry ] + store i32 %add.res, i32* @Y + + ret void +} |