diff options
| -rw-r--r-- | llvm/lib/CodeGen/MachineBlockPlacement.cpp | 134 | ||||
| -rw-r--r-- | llvm/test/CodeGen/PowerPC/tail-dup-layout.ll | 81 | ||||
| -rw-r--r-- | llvm/test/CodeGen/X86/cmovcmov.ll | 19 | 
3 files changed, 228 insertions, 6 deletions
diff --git a/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/llvm/lib/CodeGen/MachineBlockPlacement.cpp index 74a64631cb9..cba1767281a 100644 --- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp +++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp @@ -50,6 +50,7 @@  #include "llvm/Target/TargetLowering.h"  #include "llvm/Target/TargetSubtargetInfo.h"  #include <algorithm> +#include <forward_list>  #include <functional>  #include <utility>  using namespace llvm; @@ -143,6 +144,14 @@ static cl::opt<unsigned> TailDupPlacementPenalty(      cl::init(2),      cl::Hidden); +// Heuristic for triangle chains. +static cl::opt<unsigned> TriangleChainCount( +    "triangle-chain-count", +    cl::desc("Number of triangle-shaped-CFG's that need to be in a row for the " +             "triangle tail duplication heuristic to kick in. 0 to disable."), +    cl::init(3), +    cl::Hidden); +  extern cl::opt<unsigned> StaticLikelyProb;  extern cl::opt<unsigned> ProfileLikelyProb; @@ -456,6 +465,9 @@ class MachineBlockPlacement : public MachineFunctionPass {    bool canTailDuplicateUnplacedPreds(        const MachineBasicBlock *BB, MachineBasicBlock *Succ,        const BlockChain &Chain, const BlockFilterSet *BlockFilter); +  /// Find chains of triangles to tail-duplicate where a global analysis works, +  /// but a local analysis would not find them. +  void precomputeTriangleChains();  public:    static char ID; // Pass identification, replacement for typeid @@ -1041,6 +1053,127 @@ bool MachineBlockPlacement::canTailDuplicateUnplacedPreds(    return true;  } +/// Find chains of triangles where we believe it would be profitable to +/// tail-duplicate them all, but a local analysis would not find them. +/// There are 3 ways this can be profitable: +/// 1) The post-dominators marked 50% are actually taken 55% (This shrinks with +///    longer chains) +/// 2) The chains are statically correlated. Branch probabilities have a very +///    U-shaped distribution. +///    [http://nrs.harvard.edu/urn-3:HUL.InstRepos:24015805] +///    If the branches in a chain are likely to be from the same side of the +///    distribution as their predecessor, but are independent at runtime, this +///    transformation is profitable. (Because the cost of being wrong is a small +///    fixed cost, unlike the standard triangle layout where the cost of being +///    wrong scales with the # of triangles.) +/// 3) The chains are dynamically correlated. If the probability that a previous +///    branch was taken positively influences whether the next branch will be +///    taken +/// We believe that 2 and 3 are common enough to justify the small margin in 1. +void MachineBlockPlacement::precomputeTriangleChains() { +  struct TriangleChain { +    unsigned Count; +    std::forward_list<MachineBasicBlock*> Edges; +    TriangleChain(MachineBasicBlock* src, MachineBasicBlock *dst) { +      Edges.push_front(src); +      Edges.push_front(dst); +      Count = 1; +    } + +    void append(MachineBasicBlock *dst) { +      assert(!Edges.empty() && Edges.front()->isSuccessor(dst) && +             "Attempting to append a block that is not a successor."); +      Edges.push_front(dst); +      ++Count; +    } + +    MachineBasicBlock *getKey() { +      return Edges.front(); +    } +  }; + +  if (TriangleChainCount == 0) +    return; + +  DEBUG(dbgs() << "Pre-computing triangle chains.\n"); +  // Map from last block to the chain that contains it. This allows us to extend +  // chains as we find new triangles. +  DenseMap<const MachineBasicBlock *, TriangleChain> TriangleChainMap; +  for (MachineBasicBlock &BB : *F) { +    // If BB doesn't have 2 successors, it doesn't start a triangle. +    if (BB.succ_size() != 2) +      continue; +    MachineBasicBlock *PDom = nullptr; +    for (MachineBasicBlock *Succ : BB.successors()) { +      if (!MPDT->dominates(Succ, &BB)) +        continue; +      PDom = Succ; +      break; +    } +    // If BB doesn't have a post-dominating successor, it doesn't form a +    // triangle. +    if (PDom == nullptr) +      continue; +    // If PDom has a hint that it is low probability, skip this triangle. +    if (MBPI->getEdgeProbability(&BB, PDom) < BranchProbability(50, 100)) +      continue; +    // If PDom isn't eligible for duplication, this isn't the kind of triangle +    // we're looking for. +    if (!shouldTailDuplicate(PDom)) +      continue; +    bool CanTailDuplicate = true; +    // If PDom can't tail-duplicate into it's non-BB predecessors, then this +    // isn't the kind of triangle we're looking for. +    for (MachineBasicBlock* Pred : PDom->predecessors()) { +      if (Pred == &BB) +        continue; +      if (!TailDup.canTailDuplicate(PDom, Pred)) { +        CanTailDuplicate = false; +        break; +      } +    } +    // If we can't tail-duplicate PDom to its predecessors, then skip this +    // triangle. +    if (!CanTailDuplicate) +      continue; + +    // Now we have an interesting triangle. Insert it if it's not part of an +    // existing chain +    // Note: This cannot be replaced with a call insert() or emplace() because +    // the find key is BB, but the insert/emplace key is PDom. +    auto Found = TriangleChainMap.find(&BB); +    // If it is, remove the chain from the map, grow it, and put it back in the +    // map with the end as the new key. +    if (Found != TriangleChainMap.end()) { +      TriangleChain Chain = std::move(Found->second); +      TriangleChainMap.erase(Found); +      Chain.append(PDom); +      TriangleChainMap.insert(std::make_pair(Chain.getKey(), std::move(Chain))); +    } else { +      auto InsertResult = TriangleChainMap.try_emplace(PDom, &BB, PDom); +      assert (InsertResult.second && "Block seen twice."); +      (void) InsertResult; +    } +  } + +  for (auto &ChainPair : TriangleChainMap) { +    TriangleChain &Chain = ChainPair.second; +    // Benchmarking has shown that due to branch correlation duplicating 2 or +    // more triangles is profitable, despite the calculations assuming +    // independence. +    if (Chain.Count < TriangleChainCount) +      continue; +    MachineBasicBlock *dst = Chain.Edges.front(); +    Chain.Edges.pop_front(); +    for (MachineBasicBlock *src : Chain.Edges) { +      DEBUG(dbgs() << "Marking edge: " << getBlockName(src) << "->" << +            getBlockName(dst) << " as pre-computed based on triangles.\n"); +      ComputedEdges[src] = { dst, true }; +      dst = src; +    } +  } +} +  // When profile is not present, return the StaticLikelyProb.  // When profile is available, we need to handle the triangle-shape CFG.  static BranchProbability getLayoutSuccessorProbThreshold( @@ -2504,6 +2637,7 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {      if (MF.getFunction()->optForSize())        TailDupSize = 1;      TailDup.initMF(MF, MBPI, /* LayoutMode */ true, TailDupSize); +    precomputeTriangleChains();    }    assert(BlockToChain.empty()); diff --git a/llvm/test/CodeGen/PowerPC/tail-dup-layout.ll b/llvm/test/CodeGen/PowerPC/tail-dup-layout.ll index 95c23ade513..e0979d89473 100644 --- a/llvm/test/CodeGen/PowerPC/tail-dup-layout.ll +++ b/llvm/test/CodeGen/PowerPC/tail-dup-layout.ll @@ -95,6 +95,87 @@ exit:  }  ; Intended layout: +; The chain-of-triangles based duplicating produces the layout +; test1 +; test2 +; test3 +; test4 +; optional1 +; optional2 +; optional3 +; optional4 +; exit +; even for 50/50 branches. +; Tail duplication puts test n+1 at the end of optional n +; so optional1 includes a copy of test2 at the end, and branches +; to test3 (at the top) or falls through to optional 2. +; The CHECK statements check for the whole string of tests +; and then check that the correct test has been duplicated into the end of +; the optional blocks and that the optional blocks are in the correct order. +;CHECK-LABEL: straight_test_50: +; test1 may have been merged with entry +;CHECK: mr [[TAGREG:[0-9]+]], 3 +;CHECK: andi. {{[0-9]+}}, [[TAGREG]], 1 +;CHECK-NEXT: bc 12, 1, .[[OPT1LABEL:[_0-9A-Za-z]+]] +;CHECK-NEXT: # %test2 +;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30 +;CHECK-NEXT: bne 0, .[[OPT2LABEL:[_0-9A-Za-z]+]] +;CHECK-NEXT: .[[TEST3LABEL:[_0-9A-Za-z]+]]: # %test3 +;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 29, 29 +;CHECK-NEXT: bne 0, .[[OPT3LABEL:[_0-9A-Za-z]+]] +;CHECK-NEXT: .[[TEST4LABEL:[_0-9A-Za-z]+]]: # %test4 +;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 28, 28 +;CHECK-NEXT: bne 0, .[[OPT4LABEL:[_0-9A-Za-z]+]] +;CHECK-NEXT: .[[EXITLABEL:[_0-9A-Za-z]+]]: # %exit +;CHECK: blr +;CHECK-NEXT: .[[OPT1LABEL]]: +;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30 +;CHECK-NEXT: beq 0, .[[TEST3LABEL]] +;CHECK-NEXT: .[[OPT2LABEL]]: +;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 29, 29 +;CHECK-NEXT: beq 0, .[[TEST4LABEL]] +;CHECK-NEXT: .[[OPT3LABEL]]: +;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 28, 28 +;CHECK-NEXT: beq 0, .[[EXITLABEL]] +;CHECK-NEXT: .[[OPT4LABEL]]: +;CHECK: b .[[EXITLABEL]] + +define void @straight_test_50(i32 %tag) { +entry: +  br label %test1 +test1: +  %tagbit1 = and i32 %tag, 1 +  %tagbit1eq0 = icmp eq i32 %tagbit1, 0 +  br i1 %tagbit1eq0, label %test2, label %optional1, !prof !2 +optional1: +  call void @a() +  br label %test2 +test2: +  %tagbit2 = and i32 %tag, 2 +  %tagbit2eq0 = icmp eq i32 %tagbit2, 0 +  br i1 %tagbit2eq0, label %test3, label %optional2, !prof !2 +optional2: +  call void @b() +  br label %test3 +test3: +  %tagbit3 = and i32 %tag, 4 +  %tagbit3eq0 = icmp eq i32 %tagbit3, 0 +  br i1 %tagbit3eq0, label %test4, label %optional3, !prof !2 +optional3: +  call void @c() +  br label %test4 +test4: +  %tagbit4 = and i32 %tag, 8 +  %tagbit4eq0 = icmp eq i32 %tagbit4, 0 +  br i1 %tagbit4eq0, label %exit, label %optional4, !prof !1 +optional4: +  call void @d() +  br label %exit +exit: +  ret void +} + +; Intended layout:  ; The chain-based outlining produces the layout  ; entry  ; --- Begin loop --- diff --git a/llvm/test/CodeGen/X86/cmovcmov.ll b/llvm/test/CodeGen/X86/cmovcmov.ll index 38ba308ecff..5b984d27249 100644 --- a/llvm/test/CodeGen/X86/cmovcmov.ll +++ b/llvm/test/CodeGen/X86/cmovcmov.ll @@ -249,16 +249,23 @@ attributes #0 = { nounwind }  ; CMOV-DAG: cmpl %edx, %esi  ; CMOV-DAG: movb $20, %al  ; CMOV-DAG: movb $20, %dl -; CMOV:   jl [[BB0:.LBB[0-9_]+]] +; CMOV:   jge [[BB2:.LBB[0-9_]+]] +; CMOV:   jle [[BB3:.LBB[0-9_]+]] +; CMOV: [[BB0:.LBB[0-9_]+]] +; CMOV:   testl %edi, %edi +; CMOV:   jne [[BB4:.LBB[0-9_]+]] +; CMOV: [[BB1:.LBB[0-9_]+]] +; CMOV:   movb %al, g8(%rip) +; CMOV:   retq +; CMOV: [[BB2]]:  ; CMOV:   movl %ecx, %edx -; CMOV: [[BB0]]: -; CMOV:   jg [[BB1:.LBB[0-9_]+]] +; CMOV:   jg [[BB0]] +; CMOV: [[BB3]]:  ; CMOV:   movl %edx, %eax -; CMOV: [[BB1]]:  ; CMOV:   testl %edi, %edi -; CMOV:   je [[BB2:.LBB[0-9_]+]] +; CMOV:   je [[BB1]] +; CMOV: [[BB4]]:  ; CMOV:   movl %edx, %eax -; CMOV: [[BB2]]:  ; CMOV:   movb %al, g8(%rip)  ; CMOV:   retq  define void @no_cascade_opt(i32 %v0, i32 %v1, i32 %v2, i32 %v3) {  | 

