Diffstat (limited to 'llvm')
-rw-r--r--  llvm/lib/CodeGen/PeepholeOptimizer.cpp         | 170
-rw-r--r--  llvm/lib/CodeGen/TwoAddressInstructionPass.cpp |  11
-rw-r--r--  llvm/test/CodeGen/MIR/Generic/multiRunPass.mir |   3
-rw-r--r--  llvm/test/CodeGen/X86/peephole-recurrence.mir  | 232
4 files changed, 410 insertions, 6 deletions
diff --git a/llvm/lib/CodeGen/PeepholeOptimizer.cpp b/llvm/lib/CodeGen/PeepholeOptimizer.cpp
index da8fac6d383..b13f6b68c42 100644
--- a/llvm/lib/CodeGen/PeepholeOptimizer.cpp
+++ b/llvm/lib/CodeGen/PeepholeOptimizer.cpp
@@ -76,6 +76,7 @@
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
@@ -119,6 +120,14 @@ static cl::opt<unsigned> RewritePHILimit(
"rewrite-phi-limit", cl::Hidden, cl::init(10),
cl::desc("Limit the length of PHI chains to lookup"));
+// Limit the length of the recurrence chain that is considered when evaluating
+// the benefit of commuting operands.
+static cl::opt<unsigned> MaxRecurrenceChain(
+ "recurrence-chain-limit", cl::Hidden, cl::init(3),
+ cl::desc("Maximum length of recurrence chain when evaluating the benefit "
+ "of commuting operands"));
+
+
STATISTIC(NumReuse, "Number of extension results reused");
STATISTIC(NumCmps, "Number of compares eliminated");
STATISTIC(NumImmFold, "Number of move immediate folded");
@@ -131,12 +140,14 @@ STATISTIC(NumNAPhysCopies, "Number of non-allocatable physical copies removed");
namespace {
class ValueTrackerResult;
+ class RecurrenceInstr;
class PeepholeOptimizer : public MachineFunctionPass {
const TargetInstrInfo *TII;
const TargetRegisterInfo *TRI;
MachineRegisterInfo *MRI;
MachineDominatorTree *DT; // Machine dominator tree
+ MachineLoopInfo *MLI;
public:
static char ID; // Pass identification
@@ -150,6 +161,8 @@ namespace {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
MachineFunctionPass::getAnalysisUsage(AU);
+ AU.addRequired<MachineLoopInfo>();
+ AU.addPreserved<MachineLoopInfo>();
if (Aggressive) {
AU.addRequired<MachineDominatorTree>();
AU.addPreserved<MachineDominatorTree>();
@@ -160,6 +173,9 @@ namespace {
typedef SmallDenseMap<TargetInstrInfo::RegSubRegPair, ValueTrackerResult>
RewriteMapTy;
+ /// \brief Sequence of instructions that formulate a recurrence cycle.
+ typedef SmallVector<RecurrenceInstr, 4> RecurrenceCycle;
+
private:
bool optimizeCmpInstr(MachineInstr *MI, MachineBasicBlock *MBB);
bool optimizeExtInstr(MachineInstr *MI, MachineBasicBlock *MBB,
@@ -170,6 +186,7 @@ namespace {
bool optimizeCoalescableCopy(MachineInstr *MI);
bool optimizeUncoalescableCopy(MachineInstr *MI,
SmallPtrSetImpl<MachineInstr *> &LocalMIs);
+ bool optimizeRecurrence(MachineInstr &PHI);
bool findNextSource(unsigned Reg, unsigned SubReg,
RewriteMapTy &RewriteMap);
bool isMoveImmediate(MachineInstr *MI,
@@ -178,6 +195,13 @@ namespace {
bool foldImmediate(MachineInstr *MI, MachineBasicBlock *MBB,
SmallSet<unsigned, 4> &ImmDefRegs,
DenseMap<unsigned, MachineInstr*> &ImmDefMIs);
+ /// \brief Finds recurrence cycles, but only ones that are formulated around
+ /// a def operand and a use operand that are tied. If there is a use
+ /// operand commutable with the tied use operand, the recurrence cycle
+ /// is searched along that operand as well.
+ bool findTargetRecurrence(unsigned Reg,
+ const SmallSet<unsigned, 2> &TargetReg,
+ RecurrenceCycle &RC);
/// \brief If copy instruction \p MI is a virtual register copy, track it in
/// the set \p CopySrcRegs and \p CopyMIs. If this virtual register was
@@ -222,6 +246,28 @@ namespace {
}
};
+ /// \brief Helper class to hold instructions that are inside recurrence
+ /// cycles. The recurrence cycle is formulated around 1) a def operand and its
+ /// tied use operand, or 2) a def operand and a use operand that is commutable
+ /// with another use operand which is tied to the def operand. In the latter
+ /// case, the indices of the tied use operand and the commutable use operand
+ /// are maintained in CommutePair.
+ class RecurrenceInstr {
+ public:
+ typedef std::pair<unsigned, unsigned> IndexPair;
+
+ RecurrenceInstr(MachineInstr *MI) : MI(MI) {}
+ RecurrenceInstr(MachineInstr *MI, unsigned Idx1, unsigned Idx2)
+ : MI(MI), CommutePair(std::make_pair(Idx1, Idx2)) {}
+
+ MachineInstr *getMI() const { return MI; }
+ Optional<IndexPair> getCommutePair() const { return CommutePair; }
+
+ private:
+ MachineInstr *MI;
+ Optional<IndexPair> CommutePair;
+ };
+
/// \brief Helper class to hold a reply for ValueTracker queries. Contains the
/// returned sources for a given search and the instructions where the sources
/// were tracked from.
@@ -412,6 +458,7 @@ char &llvm::PeepholeOptimizerID = PeepholeOptimizer::ID;
INITIALIZE_PASS_BEGIN(PeepholeOptimizer, DEBUG_TYPE,
"Peephole Optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
INITIALIZE_PASS_END(PeepholeOptimizer, DEBUG_TYPE,
"Peephole Optimizations", false, false)
@@ -1487,6 +1534,113 @@ bool PeepholeOptimizer::foldRedundantNAPhysCopy(
return false;
}
+/// \brief Returns true if \p MO is a virtual register operand.
+static bool isVirtualRegisterOperand(MachineOperand &MO) {
+ if (!MO.isReg())
+ return false;
+ return TargetRegisterInfo::isVirtualRegister(MO.getReg());
+}
+
+bool PeepholeOptimizer::findTargetRecurrence(
+ unsigned Reg, const SmallSet<unsigned, 2> &TargetRegs,
+ RecurrenceCycle &RC) {
+ // Recurrence found if Reg is in TargetRegs.
+ if (TargetRegs.count(Reg))
+ return true;
+
+ // TODO: Currently, we only allow the last instruction of the recurrence
+ // cycle (the instruction that feeds the PHI instruction) to have more than
+ // one use. This guarantees that commuting operands does not tie registers
+ // with overlapping live ranges. Once we have actual live range info for
+ // each register, this constraint can be relaxed.
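+ // For example (a hypothetical sketch): if Reg had a second use after the
+ // instruction in the chain that reads it, commuting that instruction so that
+ // Reg becomes its tied use would tie Reg to a def that is live at the same
+ // time as Reg's later use.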
+ if (!MRI->hasOneNonDBGUse(Reg))
+ return false;
+
+ // Give up if the recurrence chain is longer than the limit.
+ if (RC.size() >= MaxRecurrenceChain)
+ return false;
+
+ MachineInstr &MI = *(MRI->use_instr_nodbg_begin(Reg));
+ unsigned Idx = MI.findRegisterUseOperandIdx(Reg);
+
+ // Only interested in recurrences whose instructions have only one def, which
+ // is a virtual register.
+ if (MI.getDesc().getNumDefs() != 1)
+ return false;
+
+ MachineOperand &DefOp = MI.getOperand(0);
+ if (!isVirtualRegisterOperand(DefOp))
+ return false;
+
+ // Check if the def operand of MI is tied to any use operand. We are only
+ // interested in the case in which all the instructions in the recurrence
+ // chain have their def operand tied to one of the use operands.
+ unsigned TiedUseIdx;
+ if (!MI.isRegTiedToUseOperand(0, &TiedUseIdx))
+ return false;
+
+ if (Idx == TiedUseIdx) {
+ RC.push_back(RecurrenceInstr(&MI));
+ return findTargetRecurrence(DefOp.getReg(), TargetRegs, RC);
+ } else {
+ // If Idx is not TiedUseIdx, check if Idx is commutable with TiedUseIdx.
+ unsigned CommIdx = TargetInstrInfo::CommuteAnyOperandIndex;
+ if (TII->findCommutedOpIndices(MI, Idx, CommIdx) && CommIdx == TiedUseIdx) {
+ RC.push_back(RecurrenceInstr(&MI, Idx, CommIdx));
+ return findTargetRecurrence(DefOp.getReg(), TargetRegs, RC);
+ }
+ }
+
+ return false;
+}
+
+/// \brief Phi instructions will eventually be lowered to copy instructions.
+/// If the phi is in a loop header, a recurrence may be formulated around the
+/// source and destination of the phi. In such a case, commuting the operands
+/// of the instructions in the recurrence may enable coalescing of the copy
+/// instruction generated from the phi. For example, if there is a recurrence
+/// of
+///
+/// LoopHeader:
+/// %vreg1 = phi(%vreg0, %vreg100)
+/// LoopLatch:
+/// %vreg0<def, tied1> = ADD %vreg2<tied0>, %vreg1
+///
+/// then the fact that %vreg0 and %vreg2 are in the same tied operands set
+/// makes coalescing of the copy instruction generated from the phi in
+/// LoopHeader (i.e. %vreg1 = COPY %vreg0) impossible, because %vreg1 and
+/// %vreg2 have overlapping live ranges. This introduces an additional move
+/// instruction in the final assembly. However, if we commute %vreg2 and
+/// %vreg1 of the ADD instruction, the redundant move instruction can be
+/// avoided.
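+///
+/// A sketch of the same latch after commuting the ADD operands (hypothetical
+/// MIR, operand flags abbreviated):
+///
+/// LoopLatch:
+/// %vreg0<def, tied1> = ADD %vreg1<tied0>, %vreg2
+///
+/// Now %vreg1 is the tied use and its live range ends at the ADD that defines
+/// %vreg0, so the copy %vreg1 = COPY %vreg0 generated from the phi can be
+/// coalesced.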
+bool PeepholeOptimizer::optimizeRecurrence(MachineInstr &PHI) {
+ SmallSet<unsigned, 2> TargetRegs;
+ for (unsigned Idx = 1; Idx < PHI.getNumOperands(); Idx += 2) {
+ MachineOperand &MO = PHI.getOperand(Idx);
+ assert(isVirtualRegisterOperand(MO) && "Invalid PHI instruction");
+ TargetRegs.insert(MO.getReg());
+ }
+
+ bool Changed = false;
+ RecurrenceCycle RC;
+ if (findTargetRecurrence(PHI.getOperand(0).getReg(), TargetRegs, RC)) {
+ // Commutes operands of instructions in RC if necessary so that the copy to
+ // be generated from PHI can be coalesced.
+ DEBUG(dbgs() << "Optimize recurrence chain from " << PHI);
+ for (auto &RI : RC) {
+ DEBUG(dbgs() << "\tInst: " << *(RI.getMI()));
+ auto CP = RI.getCommutePair();
+ if (CP) {
+ Changed = true;
+ TII->commuteInstruction(*(RI.getMI()), false, (*CP).first,
+ (*CP).second);
+ DEBUG(dbgs() << "\t\tCommuted: " << *(RI.getMI()));
+ }
+ }
+ }
+
+ return Changed;
+}
+
bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(*MF.getFunction()))
return false;
@@ -1501,6 +1655,7 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) {
TRI = MF.getSubtarget().getRegisterInfo();
MRI = &MF.getRegInfo();
DT = Aggressive ? &getAnalysis<MachineDominatorTree>() : nullptr;
+ MLI = &getAnalysis<MachineLoopInfo>();
bool Changed = false;
@@ -1529,6 +1684,8 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) {
SmallSet<unsigned, 4> CopySrcRegs;
DenseMap<unsigned, MachineInstr *> CopySrcMIs;
+ bool IsLoopHeader = MLI->isLoopHeader(&MBB);
+
for (MachineBasicBlock::iterator MII = MBB.begin(), MIE = MBB.end();
MII != MIE; ) {
MachineInstr *MI = &*MII;
@@ -1540,9 +1697,16 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) {
if (MI->isDebugValue())
continue;
- if (MI->isPosition() || MI->isPHI())
+ if (MI->isPosition())
continue;
+ if (IsLoopHeader && MI->isPHI()) {
+ if (optimizeRecurrence(*MI)) {
+ Changed = true;
+ continue;
+ }
+ }
+
if (!MI->isCopy()) {
for (const auto &Op : MI->operands()) {
// Visit all operands: definitions can be implicit or explicit.
@@ -1667,7 +1831,7 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) {
MRI->markUsesInDebugValueAsUndef(FoldedReg);
FoldAsLoadDefCandidates.erase(FoldedReg);
++NumLoadFold;
-
+
// MI is replaced with FoldMI so we can continue trying to fold
Changed = true;
MI = FoldMI;
@@ -1675,7 +1839,7 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) {
}
}
}
-
+
// If we run into an instruction we can't fold across, discard
// the load candidates. Note: We might be able to fold *into* this
// instruction, so this needs to be after the folding logic.
diff --git a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
index 552a89f76ca..83c00e24d14 100644
--- a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -68,6 +68,13 @@ EnableRescheduling("twoaddr-reschedule",
cl::desc("Coalesce copies by rescheduling (default=true)"),
cl::init(true), cl::Hidden);
+// Limit the number of dataflow edges to traverse when evaluating the benefit
+// of commuting operands.
+static cl::opt<unsigned> MaxDataFlowEdge(
+ "dataflow-edge-limit", cl::Hidden, cl::init(3),
+ cl::desc("Maximum number of dataflow edges to traverse when evaluating "
+ "the benefit of commuting operands"));
+
namespace {
class TwoAddressInstructionPass : public MachineFunctionPass {
MachineFunction *MF;
@@ -637,10 +644,10 @@ isProfitableToCommute(unsigned regA, unsigned regB, unsigned regC,
// To more generally minimize register copies, ideally the logic of two addr
// instruction pass should be integrated with register allocation pass where
// interference graph is available.
- if (isRevCopyChain(regC, regA, 3))
+ if (isRevCopyChain(regC, regA, MaxDataFlowEdge))
return true;
- if (isRevCopyChain(regB, regA, 3))
+ if (isRevCopyChain(regB, regA, MaxDataFlowEdge))
return false;
// Since there are no intervening uses for both registers, then commute
diff --git a/llvm/test/CodeGen/MIR/Generic/multiRunPass.mir b/llvm/test/CodeGen/MIR/Generic/multiRunPass.mir
index bd1c0d0b458..e055c44205b 100644
--- a/llvm/test/CodeGen/MIR/Generic/multiRunPass.mir
+++ b/llvm/test/CodeGen/MIR/Generic/multiRunPass.mir
@@ -7,7 +7,8 @@
# This test ensures that the command line accepts
# several run passes on the same command line and
# actually create the proper pipeline for it.
-# PSEUDO_PEEPHOLE: -expand-isel-pseudos {{(-machineverifier )?}}-peephole-opt
+# PSEUDO_PEEPHOLE: -expand-isel-pseudos
+# PSEUDO_PEEPHOLE-SAME: {{(-machineverifier )?}}-peephole-opt
# PEEPHOLE_PSEUDO: -peephole-opt {{(-machineverifier )?}}-expand-isel-pseudos
# Make sure there are no other passes happening after what we asked.
diff --git a/llvm/test/CodeGen/X86/peephole-recurrence.mir b/llvm/test/CodeGen/X86/peephole-recurrence.mir
new file mode 100644
index 00000000000..af57a4fd526
--- /dev/null
+++ b/llvm/test/CodeGen/X86/peephole-recurrence.mir
@@ -0,0 +1,232 @@
+# RUN: llc -mtriple=x86_64-- -run-pass=peephole-opt -o - %s | FileCheck %s
+
+--- |
+ define i32 @foo(i32 %a) {
+ bb0:
+ br label %bb1
+
+ bb1: ; preds = %bb7, %bb0
+ %vreg0 = phi i32 [ 0, %bb0 ], [ %vreg3, %bb7 ]
+ %cond0 = icmp eq i32 %a, 0
+ br i1 %cond0, label %bb4, label %bb3
+
+ bb3: ; preds = %bb1
+ br label %bb4
+
+ bb4: ; preds = %bb1, %bb3
+ %vreg5 = phi i32 [ 2, %bb3 ], [ 1, %bb1 ]
+ %cond1 = icmp eq i32 %vreg5, 0
+ br i1 %cond1, label %bb7, label %bb6
+
+ bb6: ; preds = %bb4
+ br label %bb7
+
+ bb7: ; preds = %bb4, %bb6
+ %vreg1 = phi i32 [ 2, %bb6 ], [ 1, %bb4 ]
+ %vreg2 = add i32 %vreg5, %vreg0
+ %vreg3 = add i32 %vreg1, %vreg2
+ %cond2 = icmp slt i32 %vreg3, 10
+ br i1 %cond2, label %bb1, label %bb8
+
+ bb8: ; preds = %bb7
+ ret i32 0
+ }
+
+ define i32 @bar(i32 %a, i32* %p) {
+ bb0:
+ br label %bb1
+
+ bb1: ; preds = %bb7, %bb0
+ %vreg0 = phi i32 [ 0, %bb0 ], [ %vreg3, %bb7 ]
+ %cond0 = icmp eq i32 %a, 0
+ br i1 %cond0, label %bb4, label %bb3
+
+ bb3: ; preds = %bb1
+ br label %bb4
+
+ bb4: ; preds = %bb1, %bb3
+ %vreg5 = phi i32 [ 2, %bb3 ], [ 1, %bb1 ]
+ %cond1 = icmp eq i32 %vreg5, 0
+ br i1 %cond1, label %bb7, label %bb6
+
+ bb6: ; preds = %bb4
+ br label %bb7
+
+ bb7: ; preds = %bb4, %bb6
+ %vreg1 = phi i32 [ 2, %bb6 ], [ 1, %bb4 ]
+ %vreg2 = add i32 %vreg5, %vreg0
+ store i32 %vreg0, i32* %p
+ %vreg3 = add i32 %vreg1, %vreg2
+ %cond2 = icmp slt i32 %vreg3, 10
+ br i1 %cond2, label %bb1, label %bb8
+
+ bb8: ; preds = %bb7
+ ret i32 0
+ }
+
+...
+---
+# There is a recurrence formulated around %0, %10, and %3. Check that operands
+# are commuted for the ADD instructions in bb.5.bb7 so that the values involved
+# in the recurrence are tied. This removes a redundant copy instruction.
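+#
+# A rough sketch of the expected bb.5.bb7 after this pass (operand flags
+# omitted; see the CHECK lines below):
+#   %10 = ADD32rr %0, %1
+#   %3 = ADD32rr %10, %2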
+name: foo
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: gr32, preferred-register: '' }
+ - { id: 1, class: gr32, preferred-register: '' }
+ - { id: 2, class: gr32, preferred-register: '' }
+ - { id: 3, class: gr32, preferred-register: '' }
+ - { id: 4, class: gr32, preferred-register: '' }
+ - { id: 5, class: gr32, preferred-register: '' }
+ - { id: 6, class: gr32, preferred-register: '' }
+ - { id: 7, class: gr32, preferred-register: '' }
+ - { id: 8, class: gr32, preferred-register: '' }
+ - { id: 9, class: gr32, preferred-register: '' }
+ - { id: 10, class: gr32, preferred-register: '' }
+ - { id: 11, class: gr32, preferred-register: '' }
+ - { id: 12, class: gr32, preferred-register: '' }
+liveins:
+ - { reg: '%edi', virtual-reg: '%4' }
+body: |
+ bb.0.bb0:
+ successors: %bb.1.bb1(0x80000000)
+ liveins: %edi
+
+ %4 = COPY %edi
+ %5 = MOV32r0 implicit-def dead %eflags
+
+ bb.1.bb1:
+ successors: %bb.3.bb4(0x30000000), %bb.2.bb3(0x50000000)
+
+ ; CHECK: %0 = PHI %5, %bb.0.bb0, %3, %bb.5.bb7
+ %0 = PHI %5, %bb.0.bb0, %3, %bb.5.bb7
+ %6 = MOV32ri 1
+ TEST32rr %4, %4, implicit-def %eflags
+ JE_1 %bb.3.bb4, implicit %eflags
+ JMP_1 %bb.2.bb3
+
+ bb.2.bb3:
+ successors: %bb.3.bb4(0x80000000)
+
+ %7 = MOV32ri 2
+
+ bb.3.bb4:
+ successors: %bb.5.bb7(0x30000000), %bb.4.bb6(0x50000000)
+
+ %1 = PHI %6, %bb.1.bb1, %7, %bb.2.bb3
+ TEST32rr %1, %1, implicit-def %eflags
+ JE_1 %bb.5.bb7, implicit %eflags
+ JMP_1 %bb.4.bb6
+
+ bb.4.bb6:
+ successors: %bb.5.bb7(0x80000000)
+
+ %9 = MOV32ri 2
+
+ bb.5.bb7:
+ successors: %bb.1.bb1(0x7c000000), %bb.6.bb8(0x04000000)
+
+ %2 = PHI %6, %bb.3.bb4, %9, %bb.4.bb6
+ %10 = ADD32rr %1, %0, implicit-def dead %eflags
+ ; CHECK: %10 = ADD32rr
+ ; CHECK-SAME: %0,
+ ; CHECK-SAME: %1,
+ %3 = ADD32rr %2, killed %10, implicit-def dead %eflags
+ ; CHECK: %3 = ADD32rr
+ ; CHECK-SAME: %10,
+ ; CHECK-SAME: %2,
+ %11 = SUB32ri8 %3, 10, implicit-def %eflags
+ JL_1 %bb.1.bb1, implicit %eflags
+ JMP_1 %bb.6.bb8
+
+ bb.6.bb8:
+ %12 = MOV32r0 implicit-def dead %eflags
+ %eax = COPY %12
+ RET 0, %eax
+
+...
+---
+# Here a recurrence is formulated around %0, %11, and %3, but operands should
+# not be commuted because %0 has a use outside of the recurrence. This prevents
+# commuting from tying values that have overlapping live ranges.
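+#
+# A rough sketch of bb.5.bb7, which is expected to be left unchanged (operand
+# flags omitted; see the CHECK lines below):
+#   %11 = ADD32rr %1, %0
+#   %3 = ADD32rr %2, %11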
+name: bar
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: gr32, preferred-register: '' }
+ - { id: 1, class: gr32, preferred-register: '' }
+ - { id: 2, class: gr32, preferred-register: '' }
+ - { id: 3, class: gr32, preferred-register: '' }
+ - { id: 4, class: gr32, preferred-register: '' }
+ - { id: 5, class: gr64, preferred-register: '' }
+ - { id: 6, class: gr32, preferred-register: '' }
+ - { id: 7, class: gr32, preferred-register: '' }
+ - { id: 8, class: gr32, preferred-register: '' }
+ - { id: 9, class: gr32, preferred-register: '' }
+ - { id: 10, class: gr32, preferred-register: '' }
+ - { id: 11, class: gr32, preferred-register: '' }
+ - { id: 12, class: gr32, preferred-register: '' }
+ - { id: 13, class: gr32, preferred-register: '' }
+liveins:
+ - { reg: '%edi', virtual-reg: '%4' }
+ - { reg: '%rsi', virtual-reg: '%5' }
+body: |
+ bb.0.bb0:
+ successors: %bb.1.bb1(0x80000000)
+ liveins: %edi, %rsi
+
+ %5 = COPY %rsi
+ %4 = COPY %edi
+ %6 = MOV32r0 implicit-def dead %eflags
+
+ bb.1.bb1:
+ successors: %bb.3.bb4(0x30000000), %bb.2.bb3(0x50000000)
+
+ %0 = PHI %6, %bb.0.bb0, %3, %bb.5.bb7
+ ; CHECK: %0 = PHI %6, %bb.0.bb0, %3, %bb.5.bb7
+ %7 = MOV32ri 1
+ TEST32rr %4, %4, implicit-def %eflags
+ JE_1 %bb.3.bb4, implicit %eflags
+ JMP_1 %bb.2.bb3
+
+ bb.2.bb3:
+ successors: %bb.3.bb4(0x80000000)
+
+ %8 = MOV32ri 2
+
+ bb.3.bb4:
+ successors: %bb.5.bb7(0x30000000), %bb.4.bb6(0x50000000)
+
+ %1 = PHI %7, %bb.1.bb1, %8, %bb.2.bb3
+ TEST32rr %1, %1, implicit-def %eflags
+ JE_1 %bb.5.bb7, implicit %eflags
+ JMP_1 %bb.4.bb6
+
+ bb.4.bb6:
+ successors: %bb.5.bb7(0x80000000)
+
+ %10 = MOV32ri 2
+
+ bb.5.bb7:
+ successors: %bb.1.bb1(0x7c000000), %bb.6.bb8(0x04000000)
+
+ %2 = PHI %7, %bb.3.bb4, %10, %bb.4.bb6
+ %11 = ADD32rr %1, %0, implicit-def dead %eflags
+ ; CHECK: %11 = ADD32rr
+ ; CHECK-SAME: %1,
+ ; CHECK-SAME: %0,
+ MOV32mr %5, 1, _, 0, _, %0 :: (store 4 into %ir.p)
+ %3 = ADD32rr %2, killed %11, implicit-def dead %eflags
+ ; CHECK: %3 = ADD32rr
+ ; CHECK-SAME: %2,
+ ; CHECK-SAME: %11,
+ %12 = SUB32ri8 %3, 10, implicit-def %eflags
+ JL_1 %bb.1.bb1, implicit %eflags
+ JMP_1 %bb.6.bb8
+
+ bb.6.bb8:
+ %13 = MOV32r0 implicit-def dead %eflags
+ %eax = COPY %13
+ RET 0, %eax
+
+...