10 files changed, 331 insertions, 31 deletions
diff --git a/llvm/test/CodeGen/X86/block-placement.ll b/llvm/test/CodeGen/X86/block-placement.ll
index b690316d531..39e29640724 100644
--- a/llvm/test/CodeGen/X86/block-placement.ll
+++ b/llvm/test/CodeGen/X86/block-placement.ll
@@ -177,6 +177,12 @@ exit:
   ret i32 %sum
 }
 
+; Tail duplication during layout can entirely remove body0 by duplicating it
+; into the entry block and into body1. This is a good thing but it isn't what
+; this test is looking for. So to make the blocks longer so they don't get
+; duplicated, we add some calls to dummy.
+declare void @dummy()
+
 define i32 @test_loop_rotate(i32 %i, i32* %a) {
 ; Check that we rotate conditional exits from the loop to the bottom of the
 ; loop, eliminating unconditional branches to the top.
@@ -194,6 +200,8 @@ body0:
   %base = phi i32 [ 0, %entry ], [ %sum, %body1 ]
   %next = add i32 %iv, 1
   %exitcond = icmp eq i32 %next, %i
+  call void @dummy()
+  call void @dummy()
   br i1 %exitcond, label %exit, label %body1
 
 body1:
@@ -945,7 +953,7 @@ define void @benchmark_heapsort(i32 %n, double* nocapture %ra) {
 ; First rotated loop top.
 ; CHECK: .p2align
 ; CHECK: %while.end
-; CHECK: %for.cond
+; %for.cond gets completely tail-duplicated away.
 ; CHECK: %if.then
 ; CHECK: %if.else
 ; CHECK: %if.end10
diff --git a/llvm/test/CodeGen/X86/cmov-into-branch.ll b/llvm/test/CodeGen/X86/cmov-into-branch.ll
index c0c6fc4ac22..6e4762b2e79 100644
--- a/llvm/test/CodeGen/X86/cmov-into-branch.ll
+++ b/llvm/test/CodeGen/X86/cmov-into-branch.ll
@@ -105,9 +105,11 @@ define i32 @weighted_select3(i32 %a, i32 %b) {
 ; CHECK-NEXT:    testl %edi, %edi
 ; CHECK-NEXT:    je [[LABEL_BB6:.*]]
 ; CHECK:         movl %edi, %eax
+; CHECK-NEXT:    retq
 ; CHECK:         [[LABEL_BB6]]
 ; CHECK-NEXT:    movl %esi, %edi
-; CHECK-NEXT:    jmp
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    retq
 ;
   %cmp = icmp ne i32 %a, 0
   %sel = select i1 %cmp, i32 %a, i32 %b, !prof !2
diff --git a/llvm/test/CodeGen/X86/fma-intrinsics-phi-213-to-231.ll b/llvm/test/CodeGen/X86/fma-intrinsics-phi-213-to-231.ll
index 8d0318bb93e..78e7471b886 100644
--- a/llvm/test/CodeGen/X86/fma-intrinsics-phi-213-to-231.ll
+++ b/llvm/test/CodeGen/X86/fma-intrinsics-phi-213-to-231.ll
@@ -2,7 +2,7 @@
 
 ; CHECK-LABEL: fmaddsubpd_loop_128:
 ; CHECK:   vfmaddsub231pd %xmm1, %xmm0, %xmm2
-; CHECK:   vmovaps %xmm2, %xmm0
+; CHECK:   vmovapd %xmm2, %xmm0
 ; CHECK-NEXT: retq
 define <2 x double> @fmaddsubpd_loop_128(i32 %iter, <2 x double> %a, <2 x double> %b, <2 x double> %c) {
 entry:
@@ -28,7 +28,7 @@ for.end:
 
 ; CHECK-LABEL: fmsubaddpd_loop_128:
 ; CHECK:   vfmsubadd231pd %xmm1, %xmm0, %xmm2
-; CHECK:   vmovaps %xmm2, %xmm0
+; CHECK:   vmovapd %xmm2, %xmm0
 ; CHECK-NEXT: retq
 define <2 x double> @fmsubaddpd_loop_128(i32 %iter, <2 x double> %a, <2 x double> %b, <2 x double> %c) {
 entry:
@@ -54,7 +54,7 @@ for.end:
 
 ; CHECK-LABEL: fmaddpd_loop_128:
 ; CHECK:   vfmadd231pd %xmm1, %xmm0, %xmm2
-; CHECK:   vmovaps %xmm2, %xmm0
+; CHECK:   vmovapd %xmm2, %xmm0
 ; CHECK-NEXT: retq
 define <2 x double> @fmaddpd_loop_128(i32 %iter, <2 x double> %a, <2 x double> %b, <2 x double> %c) {
 entry:
@@ -80,7 +80,7 @@ for.end:
 
 ; CHECK-LABEL: fmsubpd_loop_128:
 ; CHECK:   vfmsub231pd %xmm1, %xmm0, %xmm2
-; CHECK:   vmovaps %xmm2, %xmm0
+; CHECK:   vmovapd %xmm2, %xmm0
 ; CHECK-NEXT: retq
 define <2 x double> @fmsubpd_loop_128(i32 %iter, <2 x double> %a, <2 x double> %b, <2 x double> %c) {
 entry:
@@ -106,7 +106,7 @@ for.end:
 
 ; CHECK-LABEL: fnmaddpd_loop_128:
 ; CHECK:   vfnmadd231pd %xmm1, %xmm0, %xmm2
-; CHECK:   vmovaps %xmm2, %xmm0
+; CHECK:   vmovapd %xmm2, %xmm0
 ; CHECK-NEXT: retq
 define <2 x double> @fnmaddpd_loop_128(i32 %iter, <2 x double> %a, <2 x double> %b, <2 x double> %c) {
 entry:
@@ -132,7 +132,7 @@ for.end:
 
 ; CHECK-LABEL: fnmsubpd_loop_128:
 ; CHECK:   vfnmsub231pd %xmm1, %xmm0, %xmm2
-; CHECK:   vmovaps %xmm2, %xmm0
+; CHECK:   vmovapd %xmm2, %xmm0
 ; CHECK-NEXT: retq
 define <2 x double> @fnmsubpd_loop_128(i32 %iter, <2 x double> %a, <2 x double> %b, <2 x double> %c) {
 entry:
@@ -329,7 +329,7 @@ declare <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float>, <4 x float>, <4 x floa
 
 ; CHECK-LABEL: fmaddsubpd_loop_256:
 ; CHECK:   vfmaddsub231pd %ymm1, %ymm0, %ymm2
-; CHECK:   vmovaps %ymm2, %ymm0
+; CHECK:   vmovapd %ymm2, %ymm0
 ; CHECK-NEXT: retq
 define <4 x double> @fmaddsubpd_loop_256(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
 entry:
@@ -355,7 +355,7 @@ for.end:
 
 ; CHECK-LABEL: fmsubaddpd_loop_256:
 ; CHECK:   vfmsubadd231pd %ymm1, %ymm0, %ymm2
-; CHECK:   vmovaps %ymm2, %ymm0
+; CHECK:   vmovapd %ymm2, %ymm0
 ; CHECK-NEXT: retq
 define <4 x double> @fmsubaddpd_loop_256(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
 entry:
@@ -381,7 +381,7 @@ for.end:
 
 ; CHECK-LABEL: fmaddpd_loop_256:
 ; CHECK:   vfmadd231pd %ymm1, %ymm0, %ymm2
-; CHECK:   vmovaps %ymm2, %ymm0
+; CHECK:   vmovapd %ymm2, %ymm0
 ; CHECK-NEXT: retq
 define <4 x double> @fmaddpd_loop_256(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
 entry:
@@ -407,7 +407,7 @@ for.end:
 
 ; CHECK-LABEL: fmsubpd_loop_256:
 ; CHECK:   vfmsub231pd %ymm1, %ymm0, %ymm2
-; CHECK:   vmovaps %ymm2, %ymm0
+; CHECK:   vmovapd %ymm2, %ymm0
 ; CHECK-NEXT: retq
 define <4 x double> @fmsubpd_loop_256(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
 entry:
@@ -433,7 +433,7 @@ for.end:
 
 ; CHECK-LABEL: fnmaddpd_loop_256:
 ; CHECK:   vfnmadd231pd %ymm1, %ymm0, %ymm2
-; CHECK:   vmovaps %ymm2, %ymm0
+; CHECK:   vmovapd %ymm2, %ymm0
 ; CHECK-NEXT: retq
 define <4 x double> @fnmaddpd_loop_256(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
 entry:
@@ -459,7 +459,7 @@ for.end:
 
 ; CHECK-LABEL: fnmsubpd_loop_256:
 ; CHECK:   vfnmsub231pd %ymm1, %ymm0, %ymm2
-; CHECK:   vmovaps %ymm2, %ymm0
+; CHECK:   vmovapd %ymm2, %ymm0
 ; CHECK-NEXT: retq
 define <4 x double> @fnmsubpd_loop_256(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
 entry:
diff --git a/llvm/test/CodeGen/X86/fp-une-cmp.ll b/llvm/test/CodeGen/X86/fp-une-cmp.ll
index 653040053c2..e3b2a04060b 100644
--- a/llvm/test/CodeGen/X86/fp-une-cmp.ll
+++ b/llvm/test/CodeGen/X86/fp-une-cmp.ll
@@ -56,11 +56,11 @@ define double @profile_metadata(double %x, double %y) {
 ; CHECK-NEXT:    ucomisd %xmm1, %xmm0
 ; CHECK-NEXT:    jne .LBB1_1
 ; CHECK-NEXT:    jp .LBB1_1
-; CHECK-NEXT:  .LBB1_2: # %bb2
+; CHECK-NEXT:  # %bb2
 ; CHECK-NEXT:    retq
 ; CHECK-NEXT:  .LBB1_1: # %bb1
 ; CHECK-NEXT:    addsd {{.*}}(%rip), %xmm0
-; CHECK-NEXT:    jmp .LBB1_2
+; CHECK-NEXT:    retq
 
 entry:
   %mul = fmul double %x, %y
diff --git a/llvm/test/CodeGen/X86/pr11202.ll b/llvm/test/CodeGen/X86/pr11202.ll
index 13070d1c600..cb1a749d91f 100644
--- a/llvm/test/CodeGen/X86/pr11202.ll
+++ b/llvm/test/CodeGen/X86/pr11202.ll
@@ -15,5 +15,8 @@ l2:                                               ; preds = %l1
   br label %l1
 }
 
-; CHECK: .Ltmp0:                                 # Address of block that was removed by CodeGen
+; It is correct for either l1 or l2 to be removed.
+; If l2 is removed, the message should be "Address of block that was removed by CodeGen"
+; If l1 is removed, it should be "Block address taken."
+; CHECK: .Ltmp0:                                 # {{Address of block that was removed by CodeGen|Block address taken}}
 ; CHECK: .quad	.Ltmp0
diff --git a/llvm/test/CodeGen/X86/ragreedy-bug.ll b/llvm/test/CodeGen/X86/ragreedy-bug.ll
index e8426317f13..bfeb041f89a 100644
--- a/llvm/test/CodeGen/X86/ragreedy-bug.ll
+++ b/llvm/test/CodeGen/X86/ragreedy-bug.ll
@@ -3,16 +3,34 @@
 ; This testing case is reduced from 197.parser prune_match function.
 ; We make sure register copies are not generated on isupper.exit blocks.
 
-; CHECK: isupper.exit
+; isupper.exit and isupper.exit223 get tail-duplicated into all their
+; predecessors.
+; CHECK: cond.true.i.i
 ; CHECK-NEXT: in Loop
+; Mem-move
+; CHECK-NEXT: movl
+; CHECK-NEXT: andl
 ; CHECK-NEXT: testl
 ; CHECK-NEXT: jne
-; CHECK: isupper.exit
+; CHECK: cond.true.i.i217
 ; CHECK-NEXT: in Loop
+; Mem-move
+; CHECK-NEXT: movl
+; CHECK-NEXT: andl
 ; CHECK-NEXT: testl
 ; CHECK-NEXT: je
+; CHECK: cond.false.i.i
 ; CHECK: maskrune
+; CHECK-NEXT: movzbl
+; CHECK-NEXT: movzbl
+; CHECK-NEXT: testl
+; CHECK-NEXT: je
+; CHECK: cond.false.i.i219
 ; CHECK: maskrune
+; CHECK-NEXT: movzbl
+; CHECK-NEXT: movzbl
+; CHECK-NEXT: testl
+; CHECK-NEXT: jne
 
 %struct.List_o_links_struct = type { i32, i32, i32, %struct.List_o_links_struct* }
 %struct.Connector_struct = type { i16, i16, i8, i8, %struct.Connector_struct*, i8* }
diff --git a/llvm/test/CodeGen/X86/sse1.ll b/llvm/test/CodeGen/X86/sse1.ll
index 3ac6ea6e2b8..beedb1d2465 100644
--- a/llvm/test/CodeGen/X86/sse1.ll
+++ b/llvm/test/CodeGen/X86/sse1.ll
@@ -58,21 +58,23 @@ define <4 x float> @vselect(<4 x float>*%p, <4 x i32> %q) {
 ; X32-NEXT:    je .LBB1_1
 ; X32-NEXT:  # BB#2: # %entry
 ; X32-NEXT:    xorps %xmm1, %xmm1
-; X32-NEXT:    jmp .LBB1_3
+; X32-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    jne .LBB1_5
+; X32-NEXT:    jmp .LBB1_4
 ; X32-NEXT:  .LBB1_1:
 ; X32-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X32-NEXT:  .LBB1_3: # %entry
 ; X32-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
 ; X32-NEXT:    je .LBB1_4
-; X32-NEXT:  # BB#5: # %entry
+; X32-NEXT:  .LBB1_5: # %entry
 ; X32-NEXT:    xorps %xmm2, %xmm2
-; X32-NEXT:    jmp .LBB1_6
+; X32-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    jne .LBB1_8
+; X32-NEXT:    jmp .LBB1_7
 ; X32-NEXT:  .LBB1_4:
 ; X32-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X32-NEXT:  .LBB1_6: # %entry
 ; X32-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
 ; X32-NEXT:    je .LBB1_7
-; X32-NEXT:  # BB#8: # %entry
+; X32-NEXT:  .LBB1_8: # %entry
 ; X32-NEXT:    xorps %xmm3, %xmm3
 ; X32-NEXT:    jmp .LBB1_9
 ; X32-NEXT:  .LBB1_7:
@@ -95,21 +97,23 @@ define <4 x float> @vselect(<4 x float>*%p, <4 x i32> %q) {
 ; X64-NEXT:    je .LBB1_1
 ; X64-NEXT:  # BB#2: # %entry
 ; X64-NEXT:    xorps %xmm1, %xmm1
-; X64-NEXT:    jmp .LBB1_3
+; X64-NEXT:    testl %edx, %edx
+; X64-NEXT:    jne .LBB1_5
+; X64-NEXT:    jmp .LBB1_4
 ; X64-NEXT:  .LBB1_1:
 ; X64-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X64-NEXT:  .LBB1_3: # %entry
 ; X64-NEXT:    testl %edx, %edx
 ; X64-NEXT:    je .LBB1_4
-; X64-NEXT:  # BB#5: # %entry
+; X64-NEXT:  .LBB1_5: # %entry
 ; X64-NEXT:    xorps %xmm2, %xmm2
-; X64-NEXT:    jmp .LBB1_6
+; X64-NEXT:    testl %r8d, %r8d
+; X64-NEXT:    jne .LBB1_8
+; X64-NEXT:    jmp .LBB1_7
 ; X64-NEXT:  .LBB1_4:
 ; X64-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X64-NEXT:  .LBB1_6: # %entry
 ; X64-NEXT:    testl %r8d, %r8d
 ; X64-NEXT:    je .LBB1_7
-; X64-NEXT:  # BB#8: # %entry
+; X64-NEXT:  .LBB1_8: # %entry
 ; X64-NEXT:    xorps %xmm3, %xmm3
 ; X64-NEXT:    jmp .LBB1_9
 ; X64-NEXT:  .LBB1_7:
diff --git a/llvm/test/CodeGen/X86/tail-dup-merge-loop-headers.ll b/llvm/test/CodeGen/X86/tail-dup-merge-loop-headers.ll
new file mode 100644
index 00000000000..2c8c05b6a91
--- /dev/null
+++ b/llvm/test/CodeGen/X86/tail-dup-merge-loop-headers.ll
@@ -0,0 +1,190 @@
+; RUN: llc -O2 -o - %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: nounwind uwtable
+; CHECK-LABEL: tail_dup_merge_loops
+; CHECK: # %entry
+; CHECK-NOT: # %{{[a-zA-Z_]+}}
+; CHECK: # %inner_loop_exit
+; CHECK-NOT: # %{{[a-zA-Z_]+}}
+; CHECK: # %inner_loop_latch
+; CHECK-NOT: # %{{[a-zA-Z_]+}}
+; CHECK: # %inner_loop_test
+; CHECK-NOT: # %{{[a-zA-Z_]+}}
+; CHECK: # %exit
+define void @tail_dup_merge_loops(i32 %a, i8* %b, i8* %c) local_unnamed_addr #0 {
+entry:
+  %notlhs674.i = icmp eq i32 %a, 0
+  br label %outer_loop_top
+
+outer_loop_top:                         ; preds = %inner_loop_exit, %entry
+  %dst.0.ph.i = phi i8* [ %b, %entry ], [ %scevgep679.i, %inner_loop_exit ]
+  br i1 %notlhs674.i, label %exit, label %inner_loop_preheader
+
+inner_loop_preheader:                           ; preds = %outer_loop_top
+  br label %inner_loop_top
+
+inner_loop_top:                                     ; preds = %inner_loop_latch, %inner_loop_preheader
+  %dst.0.i = phi i8* [ %inc, %inner_loop_latch ], [ %dst.0.ph.i, %inner_loop_preheader ]
+  %var = load i8, i8* %dst.0.i
+  %tobool1.i = icmp slt i8 %var, 0
+  br label %inner_loop_test
+
+inner_loop_test:                                       ; preds = %inner_loop_top
+  br i1 %tobool1.i, label %inner_loop_exit, label %inner_loop_latch
+
+inner_loop_exit:                       ; preds = %inner_loop_test
+  %scevgep.i = getelementptr i8, i8* %dst.0.i, i64 1
+  %scevgep679.i = getelementptr i8, i8* %scevgep.i, i64 0
+  br label %outer_loop_top
+
+inner_loop_latch:                                ; preds = %inner_loop_test
+  %cmp75.i = icmp ult i8* %dst.0.i, %c
+  %inc = getelementptr i8, i8* %dst.0.i, i64 2
+  br label %inner_loop_top
+
+exit:                              ; preds = %outer_loop_top
+  ret void
+}
+
+@.str.6 = external unnamed_addr constant [23 x i8], align 1
+
+; There is an erroneus check in LoopBase::addBasicBlockToLoop(), where it
+; assumes that the header block for a loop is unique.
+; For most of compilation this assumption is true, but during layout we allow
+; this assumption to be violated. The following code will trigger the bug:
+
+; The loops in question is eventually headed by the block shared_loop_header
+;
+; During layout The block labeled outer_loop_header gets tail-duplicated into
+; outer_loop_latch, and into shared_preheader, and then removed. This leaves
+; shared_loop_header as the header of both loops. The end result
+; is that there are 2 valid loops, and that they share a header. If we re-ran
+; the loop analysis, it would classify this as a single loop.
+; So far this is fine as far as layout is concerned.
+; After layout we tail merge blocks merge_other and merge_predecessor_split.
+; We do this even though they share only a single instruction, because
+; merge_predecessor_split falls through to their shared successor:
+; outer_loop_latch.
+; The rest of the blocks in the function are noise unfortunately. Bugpoint
+; couldn't shrink the test any further.
+
+; CHECK-LABEL: loop_shared_header
+; CHECK: # %entry
+; CHECK: # %shared_preheader
+; CHECK: # %shared_loop_header
+; CHECK: # %inner_loop_body
+; CHECK: # %merge_predecessor_split
+; CHECK: # %outer_loop_latch
+; CHECK: # %outer_loop_latch
+; CHECK: # %cleanup
+define i32 @loop_shared_header(i8* %exe, i32 %exesz, i32 %headsize, i32 %min, i32 %wwprva, i32 %e_lfanew, i8* readonly %wwp, i32 %wwpsz, i16 zeroext %sects) local_unnamed_addr #0 {
+entry:
+  %0 = load i32, i32* undef, align 4
+  %mul = shl nsw i32 %0, 2
+  br i1 undef, label %if.end19, label %cleanup
+
+if.end19:                                         ; preds = %entry
+  %conv = zext i32 %mul to i64
+  %call = tail call i8* @cli_calloc(i64 %conv, i64 1)
+  %1 = icmp eq i32 %exesz, 0
+  %notrhs = icmp eq i32 %0, 0
+  %or.cond117.not = or i1 %1, %notrhs
+  %or.cond202 = or i1 %or.cond117.not, undef
+  %cmp35 = icmp ult i8* undef, %exe
+  %or.cond203 = or i1 %or.cond202, %cmp35
+  br i1 %or.cond203, label %cleanup, label %if.end50
+
+if.end50:                                         ; preds = %if.end19
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull %call, i8* undef, i64 %conv, i32 1, i1 false)
+  %cmp1.i.i = icmp ugt i32 %mul, 3
+  br i1 %cmp1.i.i, label %shared_preheader, label %wunpsect.exit.thread.loopexit391
+
+shared_preheader:                                 ; preds = %if.end50
+  br label %outer_loop_header
+
+outer_loop_header:                                ; preds = %outer_loop_latch, %shared_preheader
+  %bits.1.i = phi i8 [ 32, %shared_preheader ], [ %bits.43.i, %outer_loop_latch ]
+  %dst.0.ph.i = phi i8* [ undef, %shared_preheader ], [ %scevgep679.i, %outer_loop_latch ]
+  %2 = icmp eq i32 undef, 0
+  br i1 %2, label %while.cond.us1412.i, label %shared_loop_header
+
+while.cond.us1412.i:                              ; preds = %outer_loop_header
+  %.pre.i = add i8 %bits.1.i, -1
+  %tobool2.us1420.i = icmp eq i8 %.pre.i, 0
+  %or.cond.us1421.i = or i1 undef, %tobool2.us1420.i
+  br i1 %or.cond.us1421.i, label %if.end41.us1436.i, label %cleanup
+
+if.end41.us1436.i:                                ; preds = %while.cond.us1412.i
+  unreachable
+
+shared_loop_header:                               ; preds = %dup_early2, %dup_early1
+  %dst.0.i = phi i8* [ undef, %inner_loop_body ], [ %dst.0.ph.i, %outer_loop_header ], [ undef, %dead_block ]
+  %cmp3.i1172.i = icmp ult i8* undef, %call
+  br i1 %cmp3.i1172.i, label %wunpsect.exit.thread.loopexit389, label %inner_loop_body
+
+inner_loop_body:                                  ; preds = %shared_loop_header
+  %3 = icmp slt i32 undef, 0
+  br i1 %3, label %if.end96.i, label %shared_loop_header
+
+dead_block:                                       ; preds = %inner_loop_body
+  %cmp75.i = icmp ult i8* %dst.0.i, undef
+  br label %shared_loop_header
+
+if.end96.i:                                       ; preds = %inner_loop_body
+  %cmp97.i = icmp ugt i32 undef, 2
+  br i1 %cmp97.i, label %if.then99.i, label %if.end287.i
+
+if.then99.i:                                      ; preds = %if.end96.i
+  tail call void (i8*, ...) @cli_dbgmsg(i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str.6, i64 0, i64 0), i32 undef)
+  br label %cleanup
+
+if.end287.i:                                      ; preds = %if.end96.i
+  %cmp291.i = icmp ne i32 undef, 1
+  %conv294.i = select i1 %cmp291.i, i16 4, i16 3
+  br i1 undef, label %if.end308.i, label %outer_loop_latch
+
+if.end308.i:                                      ; preds = %if.end287.i
+  br i1 undef, label %if.end335.i, label %merge_predecessor_split
+
+merge_predecessor_split:                          ; preds = %if.end308.i
+  %4 = bitcast i8* undef to i32*
+  br label %outer_loop_latch
+
+if.end335.i:                                      ; preds = %if.end308.i
+  br i1 undef, label %outer_loop_latch, label %merge_other
+
+merge_other:                                      ; preds = %if.end335.i
+  br label %outer_loop_latch
+
+outer_loop_latch:                                 ; preds = %merge_other, %if.end335.i, %merge_predecessor_split, %if.end287.i
+  %bits.43.i = phi i8 [ undef, %if.end287.i ], [ undef, %merge_other ], [ 32, %merge_predecessor_split ], [ 0, %if.end335.i ]
+  %backsize.0.i = phi i16 [ %conv294.i, %if.end287.i ], [ 0, %merge_other ], [ 0, %merge_predecessor_split ], [ 0, %if.end335.i ]
+  %5 = add i16 %backsize.0.i, -1
+  %6 = zext i16 %5 to i64
+  %scevgep.i = getelementptr i8, i8* %dst.0.ph.i, i64 1
+  %scevgep679.i = getelementptr i8, i8* %scevgep.i, i64 %6
+  br label %outer_loop_header
+
+wunpsect.exit.thread.loopexit389:                 ; preds = %shared_loop_header
+  unreachable
+
+wunpsect.exit.thread.loopexit391:                 ; preds = %if.end50
+  unreachable
+
+cleanup:                                          ; preds = %if.then99.i, %while.cond.us1412.i, %if.end19, %entry
+  %retval.0 = phi i32 [ 0, %if.then99.i ], [ 1, %entry ], [ 1, %if.end19 ], [ 1, %while.cond.us1412.i ]
+  ret i32 %retval.0
+}
+
+; Function Attrs: nounwind
+declare void @cli_dbgmsg(i8*, ...) local_unnamed_addr #0
+
+; Function Attrs: nounwind
+declare i8* @cli_calloc(i64, i64) local_unnamed_addr #0
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32, i1) #1
+attributes #0 = { nounwind }
+attributes #1 = { argmemonly nounwind }
diff --git a/llvm/test/CodeGen/X86/tail-dup-repeat.ll b/llvm/test/CodeGen/X86/tail-dup-repeat.ll
new file mode 100644
index 00000000000..21b48e16efb
--- /dev/null
+++ b/llvm/test/CodeGen/X86/tail-dup-repeat.ll
@@ -0,0 +1,53 @@
+; RUN: llc -O2 -tail-dup-placement-threshold=4 -o - %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: uwtable
+; When tail-duplicating during placement, we work backward from blocks with
+; multiple successors. In this case, the block dup1 gets duplicated into dup2
+; and if.then64, and then the block dup2 gets duplicated into land.lhs.true
+; and if.end70
+; CHECK-LABEL: repeated_tail_dup:
+define void @repeated_tail_dup(i1 %a1, i1 %a2, i32* %a4, i32* %a5, i8* %a6) #0 align 2 {
+entry:
+  br label %for.cond
+
+; CHECK: {{^}}.[[HEADER:LBB0_[1-9]]]: # %for.cond
+for.cond:                                         ; preds = %dup1, %entry
+  br i1 %a1, label %land.lhs.true, label %if.end56
+
+land.lhs.true:                                    ; preds = %for.cond
+  store i32 10, i32* %a4, align 8
+  br label %dup2
+
+if.end56:                                         ; preds = %for.cond
+  br i1 %a2, label %if.then64, label %if.end70
+
+if.then64:                                        ; preds = %if.end56
+  store i8 1, i8* %a6, align 1
+  br label %dup1
+
+; CHECK:      # %if.end70
+; CHECK-NEXT: # in Loop:
+; CHECK-NEXT: movl $12, (%rdx)
+; CHECK-NEXT: movl $2, (%rcx)
+; CHECK-NEXT: testl %eax, %eax
+; CHECK-NEXT: je .[[HEADER]]
+if.end70:                                         ; preds = %if.end56
+  store i32 12, i32* %a4, align 8
+  br label %dup2
+
+dup2:                                             ; preds = %if.end70, %land.lhs.true
+  store i32 2, i32* %a5, align 4
+  br label %dup1
+
+dup1:                                             ; preds = %dup2, %if.then64
+  %val = load i32, i32* %a4, align 8
+  %switch = icmp ult i32 undef, 1
+  br i1 %switch, label %for.cond, label %for.end
+
+for.end:                                          ; preds = %dup1
+  ret void
+}
+
+attributes #0 = { uwtable }
diff --git a/llvm/test/CodeGen/X86/update-terminator.mir b/llvm/test/CodeGen/X86/update-terminator.mir
index 1e75c6af9eb..2e8e85b4ef6 100644
--- a/llvm/test/CodeGen/X86/update-terminator.mir
+++ b/llvm/test/CodeGen/X86/update-terminator.mir
@@ -5,17 +5,30 @@
   @a = external global i16
   @b = external global i32
 
+  declare void @dummy1()
+  declare void @dummy2()
+  declare void @dummy3()
+
   ; Function Attrs: nounwind
   define void @f2() {
     br i1 undef, label %bb1, label %bb3
 
   bb1:
+    call void @dummy1()
+    call void @dummy1()
+    call void @dummy1()
     br i1 undef, label %bb2, label %bb2
 
   bb2:
+    call void @dummy2()
+    call void @dummy2()
+    call void @dummy2()
     br label %bb4
 
   bb3:
+    call void @dummy3()
+    call void @dummy3()
+    call void @dummy3()
     br label %bb2
 
   bb4:
@@ -40,15 +53,24 @@ body:             |
   bb.1:
     successors: %bb.2(100)
 
+    CALL64pcrel32 @dummy1, csr_64, implicit %rsp, implicit-def %rsp
+    CALL64pcrel32 @dummy1, csr_64, implicit %rsp, implicit-def %rsp
+    CALL64pcrel32 @dummy1, csr_64, implicit %rsp, implicit-def %rsp
     JNE_1 %bb.2, implicit %eflags
 
   bb.2:
     successors: %bb.4(100)
 
+    CALL64pcrel32 @dummy2, csr_64, implicit %rsp, implicit-def %rsp
+    CALL64pcrel32 @dummy2, csr_64, implicit %rsp, implicit-def %rsp
+    CALL64pcrel32 @dummy2, csr_64, implicit %rsp, implicit-def %rsp
     JMP_1 %bb.4
 
   bb.3:
     successors: %bb.2(100)
+    CALL64pcrel32 @dummy3, csr_64, implicit %rsp, implicit-def %rsp
+    CALL64pcrel32 @dummy3, csr_64, implicit %rsp, implicit-def %rsp
+    CALL64pcrel32 @dummy3, csr_64, implicit %rsp, implicit-def %rsp
     JMP_1 %bb.2
 
   bb.4: