diff options
| -rw-r--r-- | llvm/lib/Transforms/Scalar/LoopInterchange.cpp | 71 | ||||
| -rw-r--r-- | llvm/test/Transforms/LoopInterchange/current-limitations-lcssa.ll | 70 | ||||
| -rw-r--r-- | llvm/test/Transforms/LoopInterchange/lcssa.ll | 171 | ||||
| -rw-r--r-- | llvm/test/Transforms/LoopInterchange/reductions.ll | 40 | 
4 files changed, 259 insertions, 93 deletions
diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp index b10ec265891..97894726637 100644 --- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp +++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp @@ -543,10 +543,6 @@ struct LoopInterchange : public FunctionPass {      printDepMatrix(DependencyMatrix);  #endif -    // Since we currently do not handle LCSSA PHI's any failure in loop -    // condition will now branch to LoopNestExit. -    // TODO: This should be removed once we handle LCSSA PHI nodes. -      // Get the Outermost loop exit.      BasicBlock *LoopNestExit = OuterMostLoop->getExitBlock();      if (!LoopNestExit) { @@ -554,12 +550,6 @@ struct LoopInterchange : public FunctionPass {        return false;      } -    if (isa<PHINode>(LoopNestExit->begin())) { -      DEBUG(dbgs() << "PHI Nodes in loop nest exit is not handled for now " -                      "since on failure all loops branch to loop nest exit.\n"); -      return false; -    } -      unsigned SelecLoopId = selectLoopForInterchange(LoopList);      // Move the selected loop outwards to the best possible position.      for (unsigned i = SelecLoopId; i > 0; i--) { @@ -862,19 +852,6 @@ bool LoopInterchangeLegality::currentLimitations() {    }    // TODO: We only handle LCSSA PHI's corresponding to reduction for now. -  BasicBlock *OuterExit = OuterLoop->getExitBlock(); -  if (!containsSafePHI(OuterExit, true)) { -    DEBUG(dbgs() << "Can only handle LCSSA PHIs in outer loops currently.\n"); -    ORE->emit([&]() { -      return OptimizationRemarkMissed(DEBUG_TYPE, "NoLCSSAPHIOuter", -                                      OuterLoop->getStartLoc(), -                                      OuterLoop->getHeader()) -             << "Only outer loops with LCSSA PHIs can be interchange " -                "currently."; -    }); -    return true; -  } -    BasicBlock *InnerExit = InnerLoop->getExitBlock();    if (!containsSafePHI(InnerExit, false)) {      DEBUG(dbgs() << "Can only handle LCSSA PHIs in inner loops currently.\n"); @@ -962,6 +939,43 @@ bool LoopInterchangeLegality::currentLimitations() {    return false;  } +// We currently support LCSSA PHI nodes in the outer loop exit, if their +// incoming values do not come from the outer loop latch or if the +// outer loop latch has a single predecessor. In that case, the value will +// be available if both the inner and outer loop conditions are true, which +// will still be true after interchanging. If we have multiple predecessor, +// that may not be the case, e.g. because the outer loop latch may be executed +// if the inner loop is not executed. +static bool areLoopExitPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) { +  BasicBlock *LoopNestExit = OuterLoop->getUniqueExitBlock(); +  for (PHINode &PHI : LoopNestExit->phis()) { +    //  FIXME: We currently are not able to detect floating point reductions +    //         and have to use floating point PHIs as a proxy to prevent +    //         interchanging in the presence of floating point reductions. +    if (PHI.getType()->isFloatingPointTy()) +      return false; +    for (unsigned i = 0; i < PHI.getNumIncomingValues(); i++) { +     Instruction *IncomingI = dyn_cast<Instruction>(PHI.getIncomingValue(i)); +     if (!IncomingI || IncomingI->getParent() != OuterLoop->getLoopLatch()) +       continue; + +     // The incoming value is defined in the outer loop latch. Currently we +     // only support that in case the outer loop latch has a single predecessor. +     // This guarantees that the outer loop latch is executed if and only if +     // the inner loop is executed (because tightlyNested() guarantees that the +     // outer loop header only branches to the inner loop or the outer loop +     // latch). +     // FIXME: We could weaken this logic and allow multiple predecessors, +     //        if the values are produced outside the loop latch. We would need +     //        additional logic to update the PHI nodes in the exit block as +     //        well. +     if (OuterLoop->getLoopLatch()->getUniquePredecessor() == nullptr) +       return false; +    } +  } +  return true; +} +  bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,                                                    unsigned OuterLoopId,                                                    CharMatrix &DepMatrix) { @@ -1038,6 +1052,17 @@ bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,      return false;    } +  if (!areLoopExitPHIsSupported(OuterLoop, InnerLoop)) { +    DEBUG(dbgs() << "Found unsupported PHI nodes in outer loop exit.\n"); +    ORE->emit([&]() { +      return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedExitPHI", +                                      OuterLoop->getStartLoc(), +                                      OuterLoop->getHeader()) +             << "Found unsupported PHI node in loop exit."; +    }); +    return false; +  } +    return true;  } diff --git a/llvm/test/Transforms/LoopInterchange/current-limitations-lcssa.ll b/llvm/test/Transforms/LoopInterchange/current-limitations-lcssa.ll deleted file mode 100644 index b46cbb1f153..00000000000 --- a/llvm/test/Transforms/LoopInterchange/current-limitations-lcssa.ll +++ /dev/null @@ -1,70 +0,0 @@ -; REQUIRES: asserts -; RUN: opt < %s -basicaa -loop-interchange -S -debug 2>&1 | FileCheck %s - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -@A = common global [100 x [100 x i32]] zeroinitializer -@C = common global [100 x [100 x i32]] zeroinitializer - -;; FIXME: -;; Test for interchange when we have an lcssa phi. This should ideally be interchanged but it is currently not supported. -;;     for(gi=1;gi<N;gi++) -;;       for(gj=1;gj<M;gj++) -;;         A[gj][gi] = A[gj - 1][gi] + C[gj][gi]; - -; CHECK: PHI Nodes in loop nest exit is not handled for now since on failure all loops branch to loop nest exit. - -@gi = common global i32 0 -@gj = common global i32 0 - -define void @interchange_07(i32 %N, i32 %M){ -entry: -  store i32 1, i32* @gi -  %cmp21 = icmp sgt i32 %N, 1 -  br i1 %cmp21, label %for.cond1.preheader.lr.ph, label %for.end16 - -for.cond1.preheader.lr.ph: -  %cmp218 = icmp sgt i32 %M, 1 -  %gi.promoted = load i32, i32* @gi -  %0 = add i32 %M, -1 -  %1 = sext i32 %gi.promoted to i64 -  %2 = sext i32 %N to i64 -  %3 = add i32 %gi.promoted, 1 -  %4 = icmp slt i32 %3, %N -  %smax = select i1 %4, i32 %N, i32 %3 -  br label %for.cond1.preheader - -for.cond1.preheader: -  %indvars.iv25 = phi i64 [ %1, %for.cond1.preheader.lr.ph ], [ %indvars.iv.next26, %for.inc14 ] -  br i1 %cmp218, label %for.body3, label %for.inc14 - -for.body3: -  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body3 ], [ 1, %for.cond1.preheader ] -  %5 = add nsw i64 %indvars.iv, -1 -  %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %5, i64 %indvars.iv25 -  %6 = load i32, i32* %arrayidx5 -  %arrayidx9 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @C, i64 0, i64 %indvars.iv, i64 %indvars.iv25 -  %7 = load i32, i32* %arrayidx9 -  %add = add nsw i32 %7, %6 -  %arrayidx13 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv, i64 %indvars.iv25 -  store i32 %add, i32* %arrayidx13 -  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 -  %lftr.wideiv = trunc i64 %indvars.iv to i32 -  %exitcond = icmp eq i32 %lftr.wideiv, %0 -  br i1 %exitcond, label %for.inc14, label %for.body3 - -for.inc14: -  %inc.lcssa23 = phi i32 [ 1, %for.cond1.preheader ], [ %M, %for.body3 ] -  %indvars.iv.next26 = add nsw i64 %indvars.iv25, 1 -  %cmp = icmp slt i64 %indvars.iv.next26, %2 -  br i1 %cmp, label %for.cond1.preheader, label %for.cond.for.end16_crit_edge - -for.cond.for.end16_crit_edge: -  store i32 %inc.lcssa23, i32* @gj -  store i32 %smax, i32* @gi -  br label %for.end16 - -for.end16: -  ret void -} diff --git a/llvm/test/Transforms/LoopInterchange/lcssa.ll b/llvm/test/Transforms/LoopInterchange/lcssa.ll new file mode 100644 index 00000000000..b44c2404464 --- /dev/null +++ b/llvm/test/Transforms/LoopInterchange/lcssa.ll @@ -0,0 +1,171 @@ +; RUN: opt < %s -basicaa -loop-interchange -pass-remarks-missed='loop-interchange' -pass-remarks-output=%t +; RUN: cat %t |  FileCheck --check-prefix REMARK %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@A = common global [100 x [100 x i32]] zeroinitializer +@C = common global [100 x [100 x i32]] zeroinitializer +@X = common global i32 0 +@Y = common global i64 0 +@F = common global float 0.0 + +; We cannot interchange this loop at the moment, because iv.outer.next is +; produced in the outer loop latch and used in the loop exit block. If the inner +; loop body is not executed, the outer loop latch won't be executed either +; after interchanging. +; REMARK: UnsupportedExitPHI +; REMARK-NEXT: lcssa_01 + +define void @lcssa_01(){ +entry: +  %cmp21 = icmp sgt i64 100, 1 +  br i1 %cmp21, label %outer.ph, label %for.end16 + +outer.ph: +  %cmp218 = icmp sgt i64 100, 1 +  br label %outer.header + +outer.header: +  %iv.outer= phi i64 [ 1, %outer.ph ], [ %iv.outer.next, %outer.inc ] +  br i1 %cmp218, label %for.body3, label %outer.inc + +for.body3: +  %iv.inner = phi i64 [ %iv.inner.next, %for.body3 ], [ 1, %outer.header ] +  %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %iv.inner, i64 %iv.outer +  %vA = load i32, i32* %arrayidx5 +  %arrayidx9 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @C, i64 0, i64 %iv.inner, i64 %iv.outer +  %vC = load i32, i32* %arrayidx9 +  %add = add nsw i32 %vA, %vC +  store i32 %add, i32* %arrayidx5 +  %iv.inner.next = add nuw nsw i64 %iv.inner, 1 +  %exitcond = icmp eq i64 %iv.inner.next, 100 +  br i1 %exitcond, label %outer.inc, label %for.body3 + +outer.inc: +  %iv.outer.next = add nsw i64 %iv.outer, 1 +  %cmp = icmp eq i64 %iv.outer.next, 100 +  br i1 %cmp, label %outer.header, label %for.exit + +for.exit: +  store i64 %iv.outer.next, i64 * @Y +  br label %for.end16 + +for.end16: +  ret void +} + +; REMARK: UnsupportedExitPHI +; REMARK-NEXT: lcssa_02 +define void @lcssa_02(){ +entry: +  %cmp21 = icmp sgt i64 100, 1 +  br i1 %cmp21, label %outer.ph, label %for.end16 + +outer.ph: +  %cmp218 = icmp sgt i64 100, 1 +  br label %outer.header + +outer.header: +  %iv.outer= phi i64 [ 1, %outer.ph ], [ %iv.outer.next, %outer.inc ] +  br i1 %cmp218, label %for.body3, label %outer.inc + +for.body3: +  %iv.inner = phi i64 [ %iv.inner.next, %for.body3 ], [ 1, %outer.header ] +  %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %iv.inner, i64 %iv.outer +  %vA = load i32, i32* %arrayidx5 +  %arrayidx9 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @C, i64 0, i64 %iv.inner, i64 %iv.outer +  %vC = load i32, i32* %arrayidx9 +  %add = add nsw i32 %vA, %vC +  store i32 %add, i32* %arrayidx5 +  %iv.inner.next = add nuw nsw i64 %iv.inner, 1 +  %exitcond = icmp eq i64 %iv.inner.next, 100 +  br i1 %exitcond, label %outer.inc, label %for.body3 + +outer.inc: +  %iv.inner.end = phi i64 [ 0, %outer.header ], [ %iv.inner.next, %for.body3 ] +  %iv.outer.next = add nsw i64 %iv.outer, 1 +  %cmp = icmp eq i64 %iv.outer.next, 100 +  br i1 %cmp, label %outer.header, label %for.exit + +for.exit: +  store i64 %iv.inner.end, i64 * @Y +  br label %for.end16 + +for.end16: +  ret void +} + + +; REMARK: Interchanged +; REMARK-NEXT: lcssa_03 +define void @lcssa_03(){ +entry: +  br label %outer.header + +outer.header: +  %iv.outer= phi i64 [ 1, %entry ], [ %iv.outer.next, %outer.inc ] +  br label %for.body3 + +for.body3: +  %iv.inner = phi i64 [ %iv.inner.next, %for.body3 ], [ 1, %outer.header ] +  %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %iv.inner, i64 %iv.outer +  %vA = load i32, i32* %arrayidx5 +  %arrayidx9 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @C, i64 0, i64 %iv.inner, i64 %iv.outer +  %vC = load i32, i32* %arrayidx9 +  %add = add nsw i32 %vA, %vC +  store i32 %add, i32* %arrayidx5 +  %iv.inner.next = add nuw nsw i64 %iv.inner, 1 +  %exitcond = icmp eq i64 %iv.inner.next, 100 +  br i1 %exitcond, label %outer.inc, label %for.body3 + +outer.inc: +  %iv.outer.next = add nsw i64 %iv.outer, 1 +  %cmp = icmp eq i64 %iv.outer.next, 100 +  br i1 %cmp, label %outer.header, label %for.exit + +for.exit: +  store i64 %iv.inner, i64 * @Y +  br label %for.end16 + +for.end16: +  ret void +} + +; FIXME: We currently do not support LCSSA phi nodes involving floating point +;        types, as we fail to detect floating point reductions for now. +; REMARK: UnsupportedPHIOuter +; REMARK-NEXT: lcssa_04 +define void @lcssa_04(){ +entry: +  br label %outer.header + +outer.header: +  %iv.outer= phi i64 [ 1, %entry ], [ %iv.outer.next, %outer.inc ] +  %float.outer= phi float [ 1.0, %entry ], [ 2.0, %outer.inc ] +  br label %for.body3 + +for.body3: +  %iv.inner = phi i64 [ %iv.inner.next, %for.body3 ], [ 1, %outer.header ] +  %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %iv.inner, i64 %iv.outer +  %vA = load i32, i32* %arrayidx5 +  %arrayidx9 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @C, i64 0, i64 %iv.inner, i64 %iv.outer +  %vC = load i32, i32* %arrayidx9 +  %add = add nsw i32 %vA, %vC +  store i32 %add, i32* %arrayidx5 +  %iv.inner.next = add nuw nsw i64 %iv.inner, 1 +  %exitcond = icmp eq i64 %iv.inner.next, 100 +  br i1 %exitcond, label %outer.inc, label %for.body3 + +outer.inc: +  %iv.outer.next = add nsw i64 %iv.outer, 1 +  %cmp = icmp eq i64 %iv.outer.next, 100 +  br i1 %cmp, label %outer.header, label %for.exit + +for.exit: +  store float %float.outer, float* @F +  br label %for.end16 + +for.end16: +  ret void +} diff --git a/llvm/test/Transforms/LoopInterchange/reductions.ll b/llvm/test/Transforms/LoopInterchange/reductions.ll index ca37f7b271b..da92276feb4 100644 --- a/llvm/test/Transforms/LoopInterchange/reductions.ll +++ b/llvm/test/Transforms/LoopInterchange/reductions.ll @@ -220,3 +220,43 @@ for.inc13:                                        ; preds = %for.cond4.for.inc10  for.end15:                                        ; preds = %for.inc13, %entry    ret void  } + +;;  for( int i=1;i<N;i++) +;;    for( int j=1;j<N;j++) +;;      X+=A[j][i]; +;;  Y = X +; CHECK: Loops interchanged. +define void @reduction_05(i32 %N) { +entry: +  %cmp16 = icmp sgt i32 %N, 1 +  br i1 %cmp16, label %for.body7.lr.ph, label %for.end8 + +for.body7.lr.ph:                                  ; preds = %entry, %for.cond1.for.inc6_crit_edge +  %indvars.iv18 = phi i64 [ %indvars.iv.next19, %for.cond1.for.inc6_crit_edge ], [ 1, %entry ] +  %X.promoted = load i32, i32* @X +  br label %for.body7 + +for.body7:                                        ; preds = %for.body7, %for.body7.lr.ph +  %indvars.iv = phi i64 [ 1, %for.body7.lr.ph ], [ %indvars.iv.next, %for.body7 ] +  %add15 = phi i32 [ %X.promoted, %for.body7.lr.ph ], [ %add, %for.body7 ] +  %arrayidx5 = getelementptr inbounds [500 x [500 x i32]], [500 x [500 x i32]]* @A, i64 0, i64 %indvars.iv, i64 %indvars.iv18 +  %0 = load i32, i32* %arrayidx5 +  %add = add nsw i32 %add15, %0 +  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 +  %lftr.wideiv = trunc i64 %indvars.iv.next to i32 +  %exitcond = icmp eq i32 %lftr.wideiv, %N +  br i1 %exitcond, label %for.cond1.for.inc6_crit_edge, label %for.body7 + +for.cond1.for.inc6_crit_edge:                     ; preds = %for.body7 +  store i32 %add, i32* @X +  %indvars.iv.next19 = add nuw nsw i64 %indvars.iv18, 1 +  %lftr.wideiv20 = trunc i64 %indvars.iv.next19 to i32 +  %exitcond21 = icmp eq i32 %lftr.wideiv20, %N +  br i1 %exitcond21, label %for.end8, label %for.body7.lr.ph + +for.end8:                                         ; preds = %for.cond1.for.inc6_crit_edge, %entry +  %add.res = phi i32 [ %add, %for.cond1.for.inc6_crit_edge], [ 0, %entry ] +  store i32 %add.res, i32* @Y + +  ret void +}  | 

