2 files changed, 196 insertions, 69 deletions
diff --git a/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
index 22d7212007f..6226237c6d4 100644
--- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp
+++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
@@ -102,13 +102,13 @@ public:
   }
 
   /// \brief Iterator over blocks within the chain.
-  typedef SmallVectorImpl<MachineBasicBlock *>::const_iterator iterator;
+  typedef SmallVectorImpl<MachineBasicBlock *>::iterator iterator;
 
   /// \brief Beginning of blocks within the chain.
-  iterator begin() const { return Blocks.begin(); }
+  iterator begin() { return Blocks.begin(); }
 
   /// \brief End of blocks within the chain.
-  iterator end() const { return Blocks.end(); }
+  iterator end() { return Blocks.end(); }
 
   /// \brief Merge a block chain into this one.
   ///
@@ -211,12 +211,11 @@ class MachineBlockPlacement : public MachineFunctionPass {
   void buildChain(MachineBasicBlock *BB, BlockChain &Chain,
                   SmallVectorImpl<MachineBasicBlock *> &BlockWorkList,
                   const BlockFilterSet *BlockFilter = 0);
-  MachineBasicBlock *findBestLoopTop(MachineFunction &F,
-                                     MachineLoop &L,
-                                     const BlockFilterSet &LoopBlockSet);
+  MachineBasicBlock *findBestLoopExit(MachineFunction &F,
+                                      MachineLoop &L,
+                                      const BlockFilterSet &LoopBlockSet);
   void buildLoopChains(MachineFunction &F, MachineLoop &L);
   void buildCFGChains(MachineFunction &F);
-  void AlignLoops(MachineFunction &F);
 
 public:
   static char ID; // Pass identification, replacement for typeid
@@ -544,9 +543,9 @@ void MachineBlockPlacement::buildChain(
 /// block to layout at the top of the loop. Typically this is done to maximize
 /// fallthrough opportunities.
 MachineBasicBlock *
-MachineBlockPlacement::findBestLoopTop(MachineFunction &F,
-                                       MachineLoop &L,
-                                       const BlockFilterSet &LoopBlockSet) {
+MachineBlockPlacement::findBestLoopExit(MachineFunction &F,
+                                        MachineLoop &L,
+                                        const BlockFilterSet &LoopBlockSet) {
   // We don't want to layout the loop linearly in all cases. If the loop header
   // is just a normal basic block in the loop, we want to look for what block
   // within the loop is the best one to layout at the top. However, if the loop
@@ -557,11 +556,11 @@ MachineBlockPlacement::findBestLoopTop(MachineFunction &F,
   // header and only rotate if safe.
   BlockChain &HeaderChain = *BlockToChain[L.getHeader()];
   if (!LoopBlockSet.count(*HeaderChain.begin()))
-    return L.getHeader();
+    return 0;
 
   BlockFrequency BestExitEdgeFreq;
+  unsigned BestExitLoopDepth = 0;
   MachineBasicBlock *ExitingBB = 0;
-  MachineBasicBlock *LoopingBB = 0;
   // If there are exits to outer loops, loop rotation can severely limit
   // fallthrough opportunites unless it selects such an exit. Keep a set of
   // blocks where rotating to exit with that block will reach an outer loop.
@@ -584,15 +583,10 @@ MachineBlockPlacement::findBestLoopTop(MachineFunction &F,
     // successor isn't found.
     MachineBasicBlock *OldExitingBB = ExitingBB;
     BlockFrequency OldBestExitEdgeFreq = BestExitEdgeFreq;
-    // We also compute and store the best looping successor for use in layout.
-    MachineBasicBlock *BestLoopSucc = 0;
+    bool HasLoopingSucc = false;
     // FIXME: Due to the performance of the probability and weight routines in
-    // the MBPI analysis, we use the internal weights. This is only valid
-    // because it is purely a ranking function, we don't care about anything
-    // but the relative values.
-    uint32_t BestLoopSuccWeight = 0;
-    // FIXME: We also manually compute the probabilities to avoid quadratic
-    // behavior.
+    // the MBPI analysis, we use the internal weights and manually compute the
+    // probabilities to avoid quadratic behavior.
     uint32_t WeightScale = 0;
     uint32_t SumWeight = MBPI->getSumForBlock(*I, WeightScale);
     for (MachineBasicBlock::succ_iterator SI = (*I)->succ_begin(),
@@ -604,10 +598,8 @@ MachineBlockPlacement::findBestLoopTop(MachineFunction &F,
         continue;
       BlockChain &SuccChain = *BlockToChain[*SI];
       // Don't split chains, either this chain or the successor's chain.
-      if (&Chain == &SuccChain || *SI != *SuccChain.begin()) {
-        DEBUG(dbgs() << "    " << (LoopBlockSet.count(*SI) ? "looping: "
-                                                           : "exiting: ")
-                     << getBlockName(*I) << " -> "
+      if (&Chain == &SuccChain) {
+        DEBUG(dbgs() << "    exiting: " << getBlockName(*I) << " -> "
                      << getBlockName(*SI) << " (chain conflict)\n");
         continue;
       }
@@ -616,60 +608,55 @@ MachineBlockPlacement::findBestLoopTop(MachineFunction &F,
       if (LoopBlockSet.count(*SI)) {
         DEBUG(dbgs() << "    looping: " << getBlockName(*I) << " -> "
                      << getBlockName(*SI) << " (" << SuccWeight << ")\n");
-        if (BestLoopSucc && BestLoopSuccWeight >= SuccWeight)
-          continue;
-
-        BestLoopSucc = *SI;
-        BestLoopSuccWeight = SuccWeight;
+        HasLoopingSucc = true;
         continue;
       }
 
+      unsigned SuccLoopDepth = 0;
+      if (MachineLoop *ExitLoop = MLI->getLoopFor(*SI)) {
+        SuccLoopDepth = ExitLoop->getLoopDepth();
+        if (ExitLoop->contains(&L))
+          BlocksExitingToOuterLoop.insert(*I);
+      }
+
       BranchProbability SuccProb(SuccWeight / WeightScale, SumWeight);
       BlockFrequency ExitEdgeFreq = MBFI->getBlockFreq(*I) * SuccProb;
       DEBUG(dbgs() << "    exiting: " << getBlockName(*I) << " -> "
-                   << getBlockName(*SI) << " (" << ExitEdgeFreq << ")\n");
+                   << getBlockName(*SI) << " [L:" << SuccLoopDepth
+                   << "] (" << ExitEdgeFreq << ")\n");
       // Note that we slightly bias this toward an existing layout successor to
       // retain incoming order in the absence of better information.
       // FIXME: Should we bias this more strongly? It's pretty weak.
-      if (!ExitingBB || ExitEdgeFreq > BestExitEdgeFreq ||
+      if (!ExitingBB || BestExitLoopDepth < SuccLoopDepth ||
+          ExitEdgeFreq > BestExitEdgeFreq ||
           ((*I)->isLayoutSuccessor(*SI) &&
            !(ExitEdgeFreq < BestExitEdgeFreq))) {
         BestExitEdgeFreq = ExitEdgeFreq;
         ExitingBB = *I;
       }
-
-      if (MachineLoop *ExitLoop = MLI->getLoopFor(*SI))
-        if (ExitLoop->contains(&L))
-          BlocksExitingToOuterLoop.insert(*I);
     }
 
     // Restore the old exiting state, no viable looping successor was found.
-    if (!BestLoopSucc) {
+    if (!HasLoopingSucc) {
       ExitingBB = OldExitingBB;
       BestExitEdgeFreq = OldBestExitEdgeFreq;
       continue;
     }
-
-    // If this was best exiting block thus far, also record the looping block.
-    if (ExitingBB == *I)
-      LoopingBB = BestLoopSucc;
   }
-  // Without a candidate exitting block or with only a single block in the
+  // Without a candidate exiting block or with only a single block in the
   // loop, just use the loop header to layout the loop.
   if (!ExitingBB || L.getNumBlocks() == 1)
-    return L.getHeader();
+    return 0;
 
   // Also, if we have exit blocks which lead to outer loops but didn't select
   // one of them as the exiting block we are rotating toward, disable loop
   // rotation altogether.
   if (!BlocksExitingToOuterLoop.empty() &&
       !BlocksExitingToOuterLoop.count(ExitingBB))
-    return L.getHeader();
+    return 0;
 
-  assert(LoopingBB && "All successors of a loop block are exit blocks!");
   DEBUG(dbgs() << "  Best exiting block: " << getBlockName(ExitingBB) << "\n");
-  DEBUG(dbgs() << "  Best top block: " << getBlockName(LoopingBB) << "\n");
-  return LoopingBB;
+  return ExitingBB;
 }
 
 /// \brief Forms basic block chains from the natural loop structures.
@@ -688,8 +675,8 @@ void MachineBlockPlacement::buildLoopChains(MachineFunction &F,
   SmallVector<MachineBasicBlock *, 16> BlockWorkList;
   BlockFilterSet LoopBlockSet(L.block_begin(), L.block_end());
 
-  MachineBasicBlock *LayoutTop = findBestLoopTop(F, L, LoopBlockSet);
-  BlockChain &LoopChain = *BlockToChain[LayoutTop];
+  MachineBasicBlock *ExitingBB = findBestLoopExit(F, L, LoopBlockSet);
+  BlockChain &LoopChain = *BlockToChain[L.getHeader()];
 
   // FIXME: This is a really lame way of walking the chains in the loop: we
   // walk the blocks, and use a set to prevent visiting a particular chain
@@ -721,7 +708,18 @@ void MachineBlockPlacement::buildLoopChains(MachineFunction &F,
       BlockWorkList.push_back(*Chain.begin());
   }
 
-  buildChain(LayoutTop, LoopChain, BlockWorkList, &LoopBlockSet);
+  buildChain(L.getHeader(), LoopChain, BlockWorkList, &LoopBlockSet);
+
+  // Once we have built a chain, try to rotate it to line up the hot exit block
+  // with fallthrough out of the loop (if we have a valid exit block for that).
+  if (ExitingBB) {
+    BlockChain::iterator ExitIt = std::find(LoopChain.begin(), LoopChain.end(),
+                                            ExitingBB);
+
+    if (ExitIt != LoopChain.end()) {
+      std::rotate(LoopChain.begin(), llvm::next(ExitIt), LoopChain.end());
+    }
+  }
 
   DEBUG({
     // Crash at the end so we get all of the debugging output first.
@@ -733,7 +731,8 @@ void MachineBlockPlacement::buildLoopChains(MachineFunction &F,
              << "  Chain header: " << getBlockName(*LoopChain.begin()) << "\n";
     }
     for (BlockChain::iterator BCI = LoopChain.begin(), BCE = LoopChain.end();
-         BCI != BCE; ++BCI)
+         BCI != BCE; ++BCI) {
+      dbgs() << "          ... " << getBlockName(*BCI) << "\n";
       if (!LoopBlockSet.erase(*BCI)) {
         // We don't mark the loop as bad here because there are real situations
         // where this can occur. For example, with an unanalyzable fallthrough
@@ -743,6 +742,7 @@ void MachineBlockPlacement::buildLoopChains(MachineFunction &F,
                << "  Chain header: " << getBlockName(*LoopChain.begin()) << "\n"
                << "  Bad block:    " << getBlockName(*BCI) << "\n";
       }
+    }
 
     if (!LoopBlockSet.empty()) {
       BadLoop = true;
@@ -882,28 +882,33 @@ void MachineBlockPlacement::buildCFGChains(MachineFunction &F) {
   MachineBasicBlock *TBB = 0, *FBB = 0; // For AnalyzeBranch.
   if (!TII->AnalyzeBranch(F.back(), TBB, FBB, Cond))
     F.back().updateTerminator();
-}
-
-/// \brief Recursive helper to align a loop and any nested loops.
-static void AlignLoop(MachineFunction &F, MachineLoop *L, unsigned Align) {
-  // Recurse through nested loops.
-  for (MachineLoop::iterator I = L->begin(), E = L->end(); I != E; ++I)
-    AlignLoop(F, *I, Align);
 
-  L->getTopBlock()->setAlignment(Align);
-}
-
-/// \brief Align loop headers to target preferred alignments.
-void MachineBlockPlacement::AlignLoops(MachineFunction &F) {
+  // Walk through the backedges of the function now that we have fully laid out
+  // the basic blocks and align the destination of each backedge. We don't rely
+  // on the loop info here so that we can align backedges in unnatural CFGs and
+  // backedges that were introduced purely because of the loop rotations done
+  // during this layout pass.
+  // FIXME: This isn't quite right, we shouldn't align backedges that result
+  // from blocks being sunken below the exit block for the function.
   if (F.getFunction()->hasFnAttr(Attribute::OptimizeForSize))
     return;
-
   unsigned Align = TLI->getPrefLoopAlignment();
   if (!Align)
     return;  // Don't care about loop alignment.
 
-  for (MachineLoopInfo::iterator I = MLI->begin(), E = MLI->end(); I != E; ++I)
-    AlignLoop(F, *I, Align);
+  SmallPtrSet<MachineBasicBlock *, 16> PreviousBlocks;
+  for (BlockChain::iterator BI = FunctionChain.begin(),
+                            BE = FunctionChain.end();
+       BI != BE; ++BI) {
+    PreviousBlocks.insert(*BI);
+    // Set alignment on the destination of all the back edges in the new
+    // ordering.
+    for (MachineBasicBlock::succ_iterator SI = (*BI)->succ_begin(),
+                                          SE = (*BI)->succ_end();
+         SI != SE; ++SI)
+      if (PreviousBlocks.count(*SI))
+        (*SI)->setAlignment(Align);
+  }
 }
 
 bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &F) {
@@ -919,7 +924,6 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &F) {
   assert(BlockToChain.empty());
 
   buildCFGChains(F);
-  AlignLoops(F);
 
   BlockToChain.clear();
   ChainAllocator.DestroyAll();
diff --git a/llvm/test/CodeGen/X86/block-placement.ll b/llvm/test/CodeGen/X86/block-placement.ll
index 167d522d47d..f3c97276051 100644
--- a/llvm/test/CodeGen/X86/block-placement.ll
+++ b/llvm/test/CodeGen/X86/block-placement.ll
@@ -76,11 +76,11 @@ define i32 @test_loop_cold_blocks(i32 %i, i32* %a) {
 ; Check that we sink cold loop blocks after the hot loop body.
 ; CHECK: test_loop_cold_blocks:
 ; CHECK: %entry
+; CHECK: %unlikely1
+; CHECK: %unlikely2
 ; CHECK: %body1
 ; CHECK: %body2
 ; CHECK: %body3
-; CHECK: %unlikely1
-; CHECK: %unlikely2
 ; CHECK: %exit
 
 entry:
@@ -348,7 +348,6 @@ define void @unnatural_cfg2() {
 ; CHECK: %entry
 ; CHECK: %loop.body1
 ; CHECK: %loop.body2
-; CHECK: %loop.header
 ; CHECK: %loop.body3
 ; CHECK: %loop.inner1.begin
 ; The end block is folded with %loop.body3...
@@ -356,6 +355,7 @@ define void @unnatural_cfg2() {
 ; CHECK: %loop.body4
 ; CHECK: %loop.inner2.begin
 ; The loop.inner2.end block is folded
+; CHECK: %loop.header
 ; CHECK: %bail
 
 entry:
@@ -928,3 +928,126 @@ entry:
 exit:
   ret void
 }
+
+define void @benchmark_heapsort(i32 %n, double* nocapture %ra) {
+; This test case comes from the heapsort benchmark, and exemplifies several
+; important aspects to block placement in the presence of loops:
+; 1) Loop rotation needs to *ensure* that the desired exiting edge can be
+;    a fallthrough.
+; 2) The exiting edge from the loop which is rotated to be laid out at the
+;    bottom of the loop needs to be exiting into the nearest enclosing loop (to
+;    which there is an exit). Otherwise, we force that enclosing loop into
+;    strange layouts that are siginificantly less efficient, often times maing
+;    it discontiguous.
+;
+; CHECK: @benchmark_heapsort
+; CHECK: %entry
+; First rotated loop top.
+; CHECK: .align
+; CHECK: %if.end10
+; Second rotated loop top
+; CHECK: .align
+; CHECK: %while.cond.outer
+; Third rotated loop top
+; CHECK: .align
+; CHECK: %while.cond
+; CHECK: %while.body
+; CHECK: %land.lhs.true
+; CHECK: %if.then19
+; CHECK: %if.then19
+; CHECK: %if.then24
+; CHECK: %while.end
+; CHECK: %for.cond
+; CHECK: %if.then
+; CHECK: %if.else
+; CHECK: %if.then8
+; CHECK: ret
+
+entry:
+  %shr = ashr i32 %n, 1
+  %add = add nsw i32 %shr, 1
+  %arrayidx3 = getelementptr inbounds double* %ra, i64 1
+  br label %for.cond
+
+for.cond:
+  %ir.0 = phi i32 [ %n, %entry ], [ %ir.1, %while.end ]
+  %l.0 = phi i32 [ %add, %entry ], [ %l.1, %while.end ]
+  %cmp = icmp sgt i32 %l.0, 1
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+  %dec = add nsw i32 %l.0, -1
+  %idxprom = sext i32 %dec to i64
+  %arrayidx = getelementptr inbounds double* %ra, i64 %idxprom
+  %0 = load double* %arrayidx, align 8
+  br label %if.end10
+
+if.else:
+  %idxprom1 = sext i32 %ir.0 to i64
+  %arrayidx2 = getelementptr inbounds double* %ra, i64 %idxprom1
+  %1 = load double* %arrayidx2, align 8
+  %2 = load double* %arrayidx3, align 8
+  store double %2, double* %arrayidx2, align 8
+  %dec6 = add nsw i32 %ir.0, -1
+  %cmp7 = icmp eq i32 %dec6, 1
+  br i1 %cmp7, label %if.then8, label %if.end10
+
+if.then8:
+  store double %1, double* %arrayidx3, align 8
+  ret void
+
+if.end10:
+  %ir.1 = phi i32 [ %ir.0, %if.then ], [ %dec6, %if.else ]
+  %l.1 = phi i32 [ %dec, %if.then ], [ %l.0, %if.else ]
+  %rra.0 = phi double [ %0, %if.then ], [ %1, %if.else ]
+  %add31 = add nsw i32 %ir.1, 1
+  br label %while.cond.outer
+
+while.cond.outer:
+  %j.0.ph.in = phi i32 [ %l.1, %if.end10 ], [ %j.1, %if.then24 ]
+  %j.0.ph = shl i32 %j.0.ph.in, 1
+  br label %while.cond
+
+while.cond:
+  %j.0 = phi i32 [ %add31, %if.end20 ], [ %j.0.ph, %while.cond.outer ]
+  %cmp11 = icmp sgt i32 %j.0, %ir.1
+  br i1 %cmp11, label %while.end, label %while.body
+
+while.body:
+  %cmp12 = icmp slt i32 %j.0, %ir.1
+  br i1 %cmp12, label %land.lhs.true, label %if.end20
+
+land.lhs.true:
+  %idxprom13 = sext i32 %j.0 to i64
+  %arrayidx14 = getelementptr inbounds double* %ra, i64 %idxprom13
+  %3 = load double* %arrayidx14, align 8
+  %add15 = add nsw i32 %j.0, 1
+  %idxprom16 = sext i32 %add15 to i64
+  %arrayidx17 = getelementptr inbounds double* %ra, i64 %idxprom16
+  %4 = load double* %arrayidx17, align 8
+  %cmp18 = fcmp olt double %3, %4
+  br i1 %cmp18, label %if.then19, label %if.end20
+
+if.then19:
+  br label %if.end20
+
+if.end20:
+  %j.1 = phi i32 [ %add15, %if.then19 ], [ %j.0, %land.lhs.true ], [ %j.0, %while.body ]
+  %idxprom21 = sext i32 %j.1 to i64
+  %arrayidx22 = getelementptr inbounds double* %ra, i64 %idxprom21
+  %5 = load double* %arrayidx22, align 8
+  %cmp23 = fcmp olt double %rra.0, %5
+  br i1 %cmp23, label %if.then24, label %while.cond
+
+if.then24:
+  %idxprom27 = sext i32 %j.0.ph.in to i64
+  %arrayidx28 = getelementptr inbounds double* %ra, i64 %idxprom27
+  store double %5, double* %arrayidx28, align 8
+  br label %while.cond.outer
+
+while.end:
+  %idxprom33 = sext i32 %j.0.ph.in to i64
+  %arrayidx34 = getelementptr inbounds double* %ra, i64 %idxprom33
+  store double %rra.0, double* %arrayidx34, align 8
+  br label %for.cond
+}