3 files changed, 116 insertions, 10 deletions
diff --git a/llvm/lib/CodeGen/BranchFolding.cpp b/llvm/lib/CodeGen/BranchFolding.cpp
index 1a783b61276..a2d304bb078 100644
--- a/llvm/lib/CodeGen/BranchFolding.cpp
+++ b/llvm/lib/CodeGen/BranchFolding.cpp
@@ -1005,6 +1005,24 @@ bool BranchFolder::TailMergeBlocks(MachineFunction &MF) {
     MachineBasicBlock *IBB = &*I;
     MachineBasicBlock *PredBB = &*std::prev(I);
     MergePotentials.clear();
+    MachineLoop *ML;
+
+    // Bail if merging after placement and IBB is the loop header because:
+    // -- If merging predecessors that belong to the same loop as IBB, the
+    // common tail of merged predecessors may become the loop top if block
+    // placement is called again and the predecessors may branch to this common
+    // tail and require more branches. This can be relaxed if
+    // MachineBlockPlacement::findBestLoopTop is more flexible.
+    // -- If merging predecessors that do not belong to the same loop as IBB,
+    // the loop info of IBB's loop and the other loops may be affected. Calling
+    // the block placement again may make a big change to the layout and
+    // eliminate the reason to do tail merging here.
+    if (AfterBlockPlacement && MLI) {
+      ML = MLI->getLoopFor(IBB);
+      if (ML && IBB == ML->getHeader())
+        continue;
+    }
+
     for (MachineBasicBlock *PBB : I->predecessors()) {
       if (MergePotentials.size() == TailMergeThreshold)
         break;
@@ -1024,16 +1042,12 @@ bool BranchFolder::TailMergeBlocks(MachineFunction &MF) {
       if (PBB->hasEHPadSuccessor())
         continue;
 
-      // Bail out if the loop header (IBB) is not the top of the loop chain
-      // after the block placement.  Otherwise, the common tail of IBB's
-      // predecessors may become the loop top if block placement is called again
-      // and the predecessors may branch to this common tail.
-      // FIXME: Relaxed this check if the algorithm of finding loop top is
-      // changed in MBP.
+      // After block placement, only consider predecessors that belong to the
+      // same loop as IBB.  The reason is the same as given above for skipping
+      // the loop header.
       if (AfterBlockPlacement && MLI)
-        if (MachineLoop *ML = MLI->getLoopFor(IBB))
-          if (IBB == ML->getHeader() && ML == MLI->getLoopFor(PBB))
-            continue;
+        if (ML != MLI->getLoopFor(PBB))
+          continue;
 
       MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
       SmallVector<MachineOperand, 4> Cond;
diff --git a/llvm/test/CodeGen/ARM/arm-and-tst-peephole.ll b/llvm/test/CodeGen/ARM/arm-and-tst-peephole.ll
index 151cc1b12ed..04eae8f9afe 100644
--- a/llvm/test/CodeGen/ARM/arm-and-tst-peephole.ll
+++ b/llvm/test/CodeGen/ARM/arm-and-tst-peephole.ll
@@ -49,7 +49,7 @@ tailrecurse.switch:                               ; preds = %tailrecurse
 ; V8-NEXT: beq
 ; V8-NEXT: %tailrecurse.switch
 ; V8: cmp
-; V8-NEXT: beq
+; V8-NEXT: bne
 ; V8-NEXT: b	
 ; The trailing space in the last line checks that the branch is unconditional
   switch i32 %and, label %sw.epilog [
diff --git a/llvm/test/CodeGen/X86/tail-merge-after-mbp.ll b/llvm/test/CodeGen/X86/tail-merge-after-mbp.ll
new file mode 100644
index 00000000000..7a8d2e8000e
--- /dev/null
+++ b/llvm/test/CodeGen/X86/tail-merge-after-mbp.ll
@@ -0,0 +1,92 @@
+; RUN: llc -mtriple=x86_64-linux -o - %s | FileCheck %s
+
+%0 = type { %1, %3* }
+%1 = type { %2* }
+%2 = type { %2*, i8* }
+%3 = type { i32, i32 (i32, i32)* }
+
+
+declare i32 @Up(...) 
+declare i32 @f(i32, i32) 
+
+; Check that loop block BB#10 is not merged with LBB0_12.
+; Check that loop block LBB0_9 is not merged with BB#11 or BB#13.
+define i32 @foo(%0* nocapture readonly, i32, i1 %c, i8* %p1, %2** %p2) {
+; CHECK-LABEL: foo:
+; CHECK:     LBB0_9:
+; CHECK-NEXT:        movq    (%r14), %rax
+; CHECK-NEXT:        testq   %rax, %rax
+; CHECK-NEXT:        je      
+; CHECK-NEXT:# BB#10:
+; CHECK-NEXT:        cmpq    $0, 8(%rax)
+; CHECK-NEXT:        jne    
+; CHECK-NEXT:# BB#11:
+; CHECK-NEXT:        movq    (%r14), %rax
+; CHECK-NEXT:        testq   %rax, %rax
+; CHECK-NEXT:        je    
+; CHECK-NEXT:LBB0_12:
+; CHECK-NEXT:        cmpq    $0, 8(%rax)
+; CHECK-NEXT:        jne  
+; CHECK-NEXT:# BB#13:
+; CHECK-NEXT:        movq    (%r14), %rax
+; CHECK-NEXT:        testq   %rax, %rax
+; CHECK-NEXT:        jne 
+  br i1 %c, label %34, label %3
+
+; <label>:3:                                      ; preds = %2
+  br i1 %c, label %7, label %4
+
+; <label>:4:                                      ; preds = %3
+  %5 = tail call i32 @f(i32 undef, i32 undef)
+  %6 = icmp eq i32 %5, 0
+  br i1 %6, label %7, label %34
+
+; <label>:7:                                      ; preds = %4, %3
+  %8 = icmp eq %2* null, null
+  br i1 %8, label %34, label %9
+
+; <label>:9:                                      ; preds = %7
+  %10 = icmp eq i8* %p1, null
+  br i1 %10, label %11, label %32
+
+; <label>:11:                                     ; preds = %9
+  %12 = load %2*, %2** %p2, align 8
+  %13 = icmp eq %2* %12, null
+  br i1 %13, label %34, label %14
+
+; <label>:14:                                     ; preds = %11
+  %15 = getelementptr inbounds %2, %2* %12, i64 0, i32 1
+  %16 = load i8*, i8** %15, align 8
+  %17 = icmp eq i8* %16, null
+  br i1 %17, label %18, label %32
+
+; <label>:18:                                     ; preds = %14
+  %19 = load %2*, %2** %p2, align 8
+  %20 = icmp eq %2* %19, null
+  br i1 %20, label %34, label %21
+
+; <label>:21:                                     ; preds = %18
+  %22 = getelementptr inbounds %2, %2* %19, i64 0, i32 1
+  %23 = load i8*, i8** %22, align 8
+  %24 = icmp eq i8* %23, null
+  br i1 %24, label %25, label %32
+
+; <label>:25:                                     ; preds = %28, %21
+  %26 = load %2*, %2** %p2, align 8
+  %27 = icmp eq %2* %26, null
+  br i1 %27, label %34, label %28
+
+; <label>:28:                                     ; preds = %25
+  %29 = getelementptr inbounds %2, %2* %26, i64 0, i32 1
+  %30 = load i8*, i8** %29, align 8
+  %31 = icmp eq i8* %30, null
+  br i1 %31, label %25, label %32
+
+; <label>:32:                                     ; preds = %28, %21, %14, %9
+  %33 = tail call i32 (...) @Up()
+  br label %34
+
+; <label>:34:                                     ; preds = %32, %25, %18, %11, %7, %4, %2
+  %35 = phi i32 [ 0, %2 ], [ %5, %4 ], [ 0, %7 ], [ 0, %11 ], [ 0, %32 ], [ 0, %18 ], [ 0, %25 ]
+  ret i32 %35
+}