diff options
| -rw-r--r-- | llvm/lib/CodeGen/MachineBlockPlacement.cpp | 136 | ||||
| -rw-r--r-- | llvm/test/CodeGen/X86/block-placement.ll | 129 | 
2 files changed, 196 insertions, 69 deletions
diff --git a/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/llvm/lib/CodeGen/MachineBlockPlacement.cpp index 22d7212007f..6226237c6d4 100644 --- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp +++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp @@ -102,13 +102,13 @@ public:    }    /// \brief Iterator over blocks within the chain. -  typedef SmallVectorImpl<MachineBasicBlock *>::const_iterator iterator; +  typedef SmallVectorImpl<MachineBasicBlock *>::iterator iterator;    /// \brief Beginning of blocks within the chain. -  iterator begin() const { return Blocks.begin(); } +  iterator begin() { return Blocks.begin(); }    /// \brief End of blocks within the chain. -  iterator end() const { return Blocks.end(); } +  iterator end() { return Blocks.end(); }    /// \brief Merge a block chain into this one.    /// @@ -211,12 +211,11 @@ class MachineBlockPlacement : public MachineFunctionPass {    void buildChain(MachineBasicBlock *BB, BlockChain &Chain,                    SmallVectorImpl<MachineBasicBlock *> &BlockWorkList,                    const BlockFilterSet *BlockFilter = 0); -  MachineBasicBlock *findBestLoopTop(MachineFunction &F, -                                     MachineLoop &L, -                                     const BlockFilterSet &LoopBlockSet); +  MachineBasicBlock *findBestLoopExit(MachineFunction &F, +                                      MachineLoop &L, +                                      const BlockFilterSet &LoopBlockSet);    void buildLoopChains(MachineFunction &F, MachineLoop &L);    void buildCFGChains(MachineFunction &F); -  void AlignLoops(MachineFunction &F);  public:    static char ID; // Pass identification, replacement for typeid @@ -544,9 +543,9 @@ void MachineBlockPlacement::buildChain(  /// block to layout at the top of the loop. Typically this is done to maximize  /// fallthrough opportunities.  MachineBasicBlock * -MachineBlockPlacement::findBestLoopTop(MachineFunction &F, -                                       MachineLoop &L, -                                       const BlockFilterSet &LoopBlockSet) { +MachineBlockPlacement::findBestLoopExit(MachineFunction &F, +                                        MachineLoop &L, +                                        const BlockFilterSet &LoopBlockSet) {    // We don't want to layout the loop linearly in all cases. If the loop header    // is just a normal basic block in the loop, we want to look for what block    // within the loop is the best one to layout at the top. However, if the loop @@ -557,11 +556,11 @@ MachineBlockPlacement::findBestLoopTop(MachineFunction &F,    // header and only rotate if safe.    BlockChain &HeaderChain = *BlockToChain[L.getHeader()];    if (!LoopBlockSet.count(*HeaderChain.begin())) -    return L.getHeader(); +    return 0;    BlockFrequency BestExitEdgeFreq; +  unsigned BestExitLoopDepth = 0;    MachineBasicBlock *ExitingBB = 0; -  MachineBasicBlock *LoopingBB = 0;    // If there are exits to outer loops, loop rotation can severely limit    // fallthrough opportunites unless it selects such an exit. Keep a set of    // blocks where rotating to exit with that block will reach an outer loop. @@ -584,15 +583,10 @@ MachineBlockPlacement::findBestLoopTop(MachineFunction &F,      // successor isn't found.      MachineBasicBlock *OldExitingBB = ExitingBB;      BlockFrequency OldBestExitEdgeFreq = BestExitEdgeFreq; -    // We also compute and store the best looping successor for use in layout. -    MachineBasicBlock *BestLoopSucc = 0; +    bool HasLoopingSucc = false;      // FIXME: Due to the performance of the probability and weight routines in -    // the MBPI analysis, we use the internal weights. This is only valid -    // because it is purely a ranking function, we don't care about anything -    // but the relative values. -    uint32_t BestLoopSuccWeight = 0; -    // FIXME: We also manually compute the probabilities to avoid quadratic -    // behavior. +    // the MBPI analysis, we use the internal weights and manually compute the +    // probabilities to avoid quadratic behavior.      uint32_t WeightScale = 0;      uint32_t SumWeight = MBPI->getSumForBlock(*I, WeightScale);      for (MachineBasicBlock::succ_iterator SI = (*I)->succ_begin(), @@ -604,10 +598,8 @@ MachineBlockPlacement::findBestLoopTop(MachineFunction &F,          continue;        BlockChain &SuccChain = *BlockToChain[*SI];        // Don't split chains, either this chain or the successor's chain. -      if (&Chain == &SuccChain || *SI != *SuccChain.begin()) { -        DEBUG(dbgs() << "    " << (LoopBlockSet.count(*SI) ? "looping: " -                                                           : "exiting: ") -                     << getBlockName(*I) << " -> " +      if (&Chain == &SuccChain) { +        DEBUG(dbgs() << "    exiting: " << getBlockName(*I) << " -> "                       << getBlockName(*SI) << " (chain conflict)\n");          continue;        } @@ -616,60 +608,55 @@ MachineBlockPlacement::findBestLoopTop(MachineFunction &F,        if (LoopBlockSet.count(*SI)) {          DEBUG(dbgs() << "    looping: " << getBlockName(*I) << " -> "                       << getBlockName(*SI) << " (" << SuccWeight << ")\n"); -        if (BestLoopSucc && BestLoopSuccWeight >= SuccWeight) -          continue; - -        BestLoopSucc = *SI; -        BestLoopSuccWeight = SuccWeight; +        HasLoopingSucc = true;          continue;        } +      unsigned SuccLoopDepth = 0; +      if (MachineLoop *ExitLoop = MLI->getLoopFor(*SI)) { +        SuccLoopDepth = ExitLoop->getLoopDepth(); +        if (ExitLoop->contains(&L)) +          BlocksExitingToOuterLoop.insert(*I); +      } +        BranchProbability SuccProb(SuccWeight / WeightScale, SumWeight);        BlockFrequency ExitEdgeFreq = MBFI->getBlockFreq(*I) * SuccProb;        DEBUG(dbgs() << "    exiting: " << getBlockName(*I) << " -> " -                   << getBlockName(*SI) << " (" << ExitEdgeFreq << ")\n"); +                   << getBlockName(*SI) << " [L:" << SuccLoopDepth +                   << "] (" << ExitEdgeFreq << ")\n");        // Note that we slightly bias this toward an existing layout successor to        // retain incoming order in the absence of better information.        // FIXME: Should we bias this more strongly? It's pretty weak. -      if (!ExitingBB || ExitEdgeFreq > BestExitEdgeFreq || +      if (!ExitingBB || BestExitLoopDepth < SuccLoopDepth || +          ExitEdgeFreq > BestExitEdgeFreq ||            ((*I)->isLayoutSuccessor(*SI) &&             !(ExitEdgeFreq < BestExitEdgeFreq))) {          BestExitEdgeFreq = ExitEdgeFreq;          ExitingBB = *I;        } - -      if (MachineLoop *ExitLoop = MLI->getLoopFor(*SI)) -        if (ExitLoop->contains(&L)) -          BlocksExitingToOuterLoop.insert(*I);      }      // Restore the old exiting state, no viable looping successor was found. -    if (!BestLoopSucc) { +    if (!HasLoopingSucc) {        ExitingBB = OldExitingBB;        BestExitEdgeFreq = OldBestExitEdgeFreq;        continue;      } - -    // If this was best exiting block thus far, also record the looping block. -    if (ExitingBB == *I) -      LoopingBB = BestLoopSucc;    } -  // Without a candidate exitting block or with only a single block in the +  // Without a candidate exiting block or with only a single block in the    // loop, just use the loop header to layout the loop.    if (!ExitingBB || L.getNumBlocks() == 1) -    return L.getHeader(); +    return 0;    // Also, if we have exit blocks which lead to outer loops but didn't select    // one of them as the exiting block we are rotating toward, disable loop    // rotation altogether.    if (!BlocksExitingToOuterLoop.empty() &&        !BlocksExitingToOuterLoop.count(ExitingBB)) -    return L.getHeader(); +    return 0; -  assert(LoopingBB && "All successors of a loop block are exit blocks!");    DEBUG(dbgs() << "  Best exiting block: " << getBlockName(ExitingBB) << "\n"); -  DEBUG(dbgs() << "  Best top block: " << getBlockName(LoopingBB) << "\n"); -  return LoopingBB; +  return ExitingBB;  }  /// \brief Forms basic block chains from the natural loop structures. @@ -688,8 +675,8 @@ void MachineBlockPlacement::buildLoopChains(MachineFunction &F,    SmallVector<MachineBasicBlock *, 16> BlockWorkList;    BlockFilterSet LoopBlockSet(L.block_begin(), L.block_end()); -  MachineBasicBlock *LayoutTop = findBestLoopTop(F, L, LoopBlockSet); -  BlockChain &LoopChain = *BlockToChain[LayoutTop]; +  MachineBasicBlock *ExitingBB = findBestLoopExit(F, L, LoopBlockSet); +  BlockChain &LoopChain = *BlockToChain[L.getHeader()];    // FIXME: This is a really lame way of walking the chains in the loop: we    // walk the blocks, and use a set to prevent visiting a particular chain @@ -721,7 +708,18 @@ void MachineBlockPlacement::buildLoopChains(MachineFunction &F,        BlockWorkList.push_back(*Chain.begin());    } -  buildChain(LayoutTop, LoopChain, BlockWorkList, &LoopBlockSet); +  buildChain(L.getHeader(), LoopChain, BlockWorkList, &LoopBlockSet); + +  // Once we have built a chain, try to rotate it to line up the hot exit block +  // with fallthrough out of the loop (if we have a valid exit block for that). +  if (ExitingBB) { +    BlockChain::iterator ExitIt = std::find(LoopChain.begin(), LoopChain.end(), +                                            ExitingBB); + +    if (ExitIt != LoopChain.end()) { +      std::rotate(LoopChain.begin(), llvm::next(ExitIt), LoopChain.end()); +    } +  }    DEBUG({      // Crash at the end so we get all of the debugging output first. @@ -733,7 +731,8 @@ void MachineBlockPlacement::buildLoopChains(MachineFunction &F,               << "  Chain header: " << getBlockName(*LoopChain.begin()) << "\n";      }      for (BlockChain::iterator BCI = LoopChain.begin(), BCE = LoopChain.end(); -         BCI != BCE; ++BCI) +         BCI != BCE; ++BCI) { +      dbgs() << "          ... " << getBlockName(*BCI) << "\n";        if (!LoopBlockSet.erase(*BCI)) {          // We don't mark the loop as bad here because there are real situations          // where this can occur. For example, with an unanalyzable fallthrough @@ -743,6 +742,7 @@ void MachineBlockPlacement::buildLoopChains(MachineFunction &F,                 << "  Chain header: " << getBlockName(*LoopChain.begin()) << "\n"                 << "  Bad block:    " << getBlockName(*BCI) << "\n";        } +    }      if (!LoopBlockSet.empty()) {        BadLoop = true; @@ -882,28 +882,33 @@ void MachineBlockPlacement::buildCFGChains(MachineFunction &F) {    MachineBasicBlock *TBB = 0, *FBB = 0; // For AnalyzeBranch.    if (!TII->AnalyzeBranch(F.back(), TBB, FBB, Cond))      F.back().updateTerminator(); -} - -/// \brief Recursive helper to align a loop and any nested loops. -static void AlignLoop(MachineFunction &F, MachineLoop *L, unsigned Align) { -  // Recurse through nested loops. -  for (MachineLoop::iterator I = L->begin(), E = L->end(); I != E; ++I) -    AlignLoop(F, *I, Align); -  L->getTopBlock()->setAlignment(Align); -} - -/// \brief Align loop headers to target preferred alignments. -void MachineBlockPlacement::AlignLoops(MachineFunction &F) { +  // Walk through the backedges of the function now that we have fully laid out +  // the basic blocks and align the destination of each backedge. We don't rely +  // on the loop info here so that we can align backedges in unnatural CFGs and +  // backedges that were introduced purely because of the loop rotations done +  // during this layout pass. +  // FIXME: This isn't quite right, we shouldn't align backedges that result +  // from blocks being sunken below the exit block for the function.    if (F.getFunction()->hasFnAttr(Attribute::OptimizeForSize))      return; -    unsigned Align = TLI->getPrefLoopAlignment();    if (!Align)      return;  // Don't care about loop alignment. -  for (MachineLoopInfo::iterator I = MLI->begin(), E = MLI->end(); I != E; ++I) -    AlignLoop(F, *I, Align); +  SmallPtrSet<MachineBasicBlock *, 16> PreviousBlocks; +  for (BlockChain::iterator BI = FunctionChain.begin(), +                            BE = FunctionChain.end(); +       BI != BE; ++BI) { +    PreviousBlocks.insert(*BI); +    // Set alignment on the destination of all the back edges in the new +    // ordering. +    for (MachineBasicBlock::succ_iterator SI = (*BI)->succ_begin(), +                                          SE = (*BI)->succ_end(); +         SI != SE; ++SI) +      if (PreviousBlocks.count(*SI)) +        (*SI)->setAlignment(Align); +  }  }  bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &F) { @@ -919,7 +924,6 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &F) {    assert(BlockToChain.empty());    buildCFGChains(F); -  AlignLoops(F);    BlockToChain.clear();    ChainAllocator.DestroyAll(); diff --git a/llvm/test/CodeGen/X86/block-placement.ll b/llvm/test/CodeGen/X86/block-placement.ll index 167d522d47d..f3c97276051 100644 --- a/llvm/test/CodeGen/X86/block-placement.ll +++ b/llvm/test/CodeGen/X86/block-placement.ll @@ -76,11 +76,11 @@ define i32 @test_loop_cold_blocks(i32 %i, i32* %a) {  ; Check that we sink cold loop blocks after the hot loop body.  ; CHECK: test_loop_cold_blocks:  ; CHECK: %entry +; CHECK: %unlikely1 +; CHECK: %unlikely2  ; CHECK: %body1  ; CHECK: %body2  ; CHECK: %body3 -; CHECK: %unlikely1 -; CHECK: %unlikely2  ; CHECK: %exit  entry: @@ -348,7 +348,6 @@ define void @unnatural_cfg2() {  ; CHECK: %entry  ; CHECK: %loop.body1  ; CHECK: %loop.body2 -; CHECK: %loop.header  ; CHECK: %loop.body3  ; CHECK: %loop.inner1.begin  ; The end block is folded with %loop.body3... @@ -356,6 +355,7 @@ define void @unnatural_cfg2() {  ; CHECK: %loop.body4  ; CHECK: %loop.inner2.begin  ; The loop.inner2.end block is folded +; CHECK: %loop.header  ; CHECK: %bail  entry: @@ -928,3 +928,126 @@ entry:  exit:    ret void  } + +define void @benchmark_heapsort(i32 %n, double* nocapture %ra) { +; This test case comes from the heapsort benchmark, and exemplifies several +; important aspects to block placement in the presence of loops: +; 1) Loop rotation needs to *ensure* that the desired exiting edge can be +;    a fallthrough. +; 2) The exiting edge from the loop which is rotated to be laid out at the +;    bottom of the loop needs to be exiting into the nearest enclosing loop (to +;    which there is an exit). Otherwise, we force that enclosing loop into +;    strange layouts that are siginificantly less efficient, often times maing +;    it discontiguous. +; +; CHECK: @benchmark_heapsort +; CHECK: %entry +; First rotated loop top. +; CHECK: .align +; CHECK: %if.end10 +; Second rotated loop top +; CHECK: .align +; CHECK: %while.cond.outer +; Third rotated loop top +; CHECK: .align +; CHECK: %while.cond +; CHECK: %while.body +; CHECK: %land.lhs.true +; CHECK: %if.then19 +; CHECK: %if.then19 +; CHECK: %if.then24 +; CHECK: %while.end +; CHECK: %for.cond +; CHECK: %if.then +; CHECK: %if.else +; CHECK: %if.then8 +; CHECK: ret + +entry: +  %shr = ashr i32 %n, 1 +  %add = add nsw i32 %shr, 1 +  %arrayidx3 = getelementptr inbounds double* %ra, i64 1 +  br label %for.cond + +for.cond: +  %ir.0 = phi i32 [ %n, %entry ], [ %ir.1, %while.end ] +  %l.0 = phi i32 [ %add, %entry ], [ %l.1, %while.end ] +  %cmp = icmp sgt i32 %l.0, 1 +  br i1 %cmp, label %if.then, label %if.else + +if.then: +  %dec = add nsw i32 %l.0, -1 +  %idxprom = sext i32 %dec to i64 +  %arrayidx = getelementptr inbounds double* %ra, i64 %idxprom +  %0 = load double* %arrayidx, align 8 +  br label %if.end10 + +if.else: +  %idxprom1 = sext i32 %ir.0 to i64 +  %arrayidx2 = getelementptr inbounds double* %ra, i64 %idxprom1 +  %1 = load double* %arrayidx2, align 8 +  %2 = load double* %arrayidx3, align 8 +  store double %2, double* %arrayidx2, align 8 +  %dec6 = add nsw i32 %ir.0, -1 +  %cmp7 = icmp eq i32 %dec6, 1 +  br i1 %cmp7, label %if.then8, label %if.end10 + +if.then8: +  store double %1, double* %arrayidx3, align 8 +  ret void + +if.end10: +  %ir.1 = phi i32 [ %ir.0, %if.then ], [ %dec6, %if.else ] +  %l.1 = phi i32 [ %dec, %if.then ], [ %l.0, %if.else ] +  %rra.0 = phi double [ %0, %if.then ], [ %1, %if.else ] +  %add31 = add nsw i32 %ir.1, 1 +  br label %while.cond.outer + +while.cond.outer: +  %j.0.ph.in = phi i32 [ %l.1, %if.end10 ], [ %j.1, %if.then24 ] +  %j.0.ph = shl i32 %j.0.ph.in, 1 +  br label %while.cond + +while.cond: +  %j.0 = phi i32 [ %add31, %if.end20 ], [ %j.0.ph, %while.cond.outer ] +  %cmp11 = icmp sgt i32 %j.0, %ir.1 +  br i1 %cmp11, label %while.end, label %while.body + +while.body: +  %cmp12 = icmp slt i32 %j.0, %ir.1 +  br i1 %cmp12, label %land.lhs.true, label %if.end20 + +land.lhs.true: +  %idxprom13 = sext i32 %j.0 to i64 +  %arrayidx14 = getelementptr inbounds double* %ra, i64 %idxprom13 +  %3 = load double* %arrayidx14, align 8 +  %add15 = add nsw i32 %j.0, 1 +  %idxprom16 = sext i32 %add15 to i64 +  %arrayidx17 = getelementptr inbounds double* %ra, i64 %idxprom16 +  %4 = load double* %arrayidx17, align 8 +  %cmp18 = fcmp olt double %3, %4 +  br i1 %cmp18, label %if.then19, label %if.end20 + +if.then19: +  br label %if.end20 + +if.end20: +  %j.1 = phi i32 [ %add15, %if.then19 ], [ %j.0, %land.lhs.true ], [ %j.0, %while.body ] +  %idxprom21 = sext i32 %j.1 to i64 +  %arrayidx22 = getelementptr inbounds double* %ra, i64 %idxprom21 +  %5 = load double* %arrayidx22, align 8 +  %cmp23 = fcmp olt double %rra.0, %5 +  br i1 %cmp23, label %if.then24, label %while.cond + +if.then24: +  %idxprom27 = sext i32 %j.0.ph.in to i64 +  %arrayidx28 = getelementptr inbounds double* %ra, i64 %idxprom27 +  store double %5, double* %arrayidx28, align 8 +  br label %while.cond.outer + +while.end: +  %idxprom33 = sext i32 %j.0.ph.in to i64 +  %arrayidx34 = getelementptr inbounds double* %ra, i64 %idxprom33 +  store double %rra.0, double* %arrayidx34, align 8 +  br label %for.cond +}  | 

