author    Kyle Butt <kyle+llvm@iteratee.net>    2016-10-07 22:33:20 +0000
committer Kyle Butt <kyle+llvm@iteratee.net>    2016-10-07 22:33:20 +0000
commit    37e676d85762d8541e2df16bfa14963f5c705118 (patch)
tree      229b8aee0fa9fd60d44e45d86a1bc073eedb81ec /llvm/test/CodeGen
parent    609e669e1afd91a00260aad5f4ba5230e75161e3 (diff)
Codegen: Tail-duplicate during placement.
The tail duplication pass uses an assumed layout when making duplication decisions. This is fine, but it passes up duplication opportunities that may arise when blocks are outlined. Because we want the updated CFG to affect subsequent placement decisions, this change must occur during placement.

In order to achieve this goal, TailDuplicationPass is split into a utility class, TailDuplicator, and the pass itself. The pass delegates nearly everything to the TailDuplicator object, except for looping over the blocks in a function. This allows the same code to be used for tail duplication in both places.

This change, in concert with outlining optional branches, allows triangle-shaped code to perform much better, especially when the taken/untaken branches are correlated, as it creates a second spine when the tests are small enough.

The issue from the previous rollback is fixed, and a new test was added for that case as well. It was a worklist/scheduling/tail-duplication issue in layout.

The issue from the second rollback is fixed, with two additional tests. It was tail merging, loop info, and tail duplication interacting badly with loops that share a header block.

Differential revision: https://reviews.llvm.org/D18226

llvm-svn: 283619
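As an illustration of the triangle-shaped code the message describes, here is a minimal sketch (not taken from the commit; @side_effect is a hypothetical helper) in which each small test block is a candidate for duplication into the preceding optional block, forming the second spine:

define void @triangle(i32 %tag) {
entry:
  %bit1 = and i32 %tag, 1
  %take1 = icmp ne i32 %bit1, 0
  br i1 %take1, label %optional1, label %test2
optional1:                        ; outlined, conditionally taken side
  call void @side_effect()
  br label %test2
test2:                            ; small test block: a duplication candidate
  %bit2 = and i32 %tag, 2
  %take2 = icmp ne i32 %bit2, 0
  br i1 %take2, label %optional2, label %exit
optional2:
  call void @side_effect()
  br label %exit
exit:
  ret void
}
declare void @side_effect()

With tail duplication during placement, %test2 can be copied to the end of %optional1, so the taken path no longer has to branch back through the other spine; the new tail-dup-layout.ll test below checks exactly this pattern on PowerPC.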
Diffstat (limited to 'llvm/test/CodeGen')
-rw-r--r--  llvm/test/CodeGen/AArch64/arm64-extload-knownzero.ll | 1
-rw-r--r--  llvm/test/CodeGen/AArch64/machine_cse.ll | 6
-rw-r--r--  llvm/test/CodeGen/AArch64/tail-dup-repeat-worklist.ll | 69
-rw-r--r--  llvm/test/CodeGen/ARM/2011-03-23-PeepholeBug.ll | 1
-rw-r--r--  llvm/test/CodeGen/PowerPC/branch-opt.ll | 16
-rw-r--r--  llvm/test/CodeGen/PowerPC/sjlj.ll | 18
-rw-r--r--  llvm/test/CodeGen/PowerPC/tail-dup-layout.ll | 100
-rw-r--r--  llvm/test/CodeGen/WebAssembly/cfg-stackify.ll | 2
-rw-r--r--  llvm/test/CodeGen/WebAssembly/mem-intrinsics.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/block-placement.ll | 10
-rw-r--r--  llvm/test/CodeGen/X86/cmov-into-branch.ll | 4
-rw-r--r--  llvm/test/CodeGen/X86/fma-intrinsics-phi-213-to-231.ll | 24
-rw-r--r--  llvm/test/CodeGen/X86/fp-une-cmp.ll | 4
-rw-r--r--  llvm/test/CodeGen/X86/pr11202.ll | 5
-rw-r--r--  llvm/test/CodeGen/X86/ragreedy-bug.ll | 22
-rw-r--r--  llvm/test/CodeGen/X86/sse1.ll | 28
-rw-r--r--  llvm/test/CodeGen/X86/tail-dup-merge-loop-headers.ll | 190
-rw-r--r--  llvm/test/CodeGen/X86/tail-dup-repeat.ll | 53
-rw-r--r--  llvm/test/CodeGen/X86/update-terminator.mir | 22
19 files changed, 530 insertions, 47 deletions
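Several of the updated tests below opt out of the new behavior with -tail-dup-placement=0 so that their existing CHECK lines remain valid. For example, the updated machine_cse.ll RUN line corresponds to an invocation like the following (the path is relative to an LLVM source tree):

llc < llvm/test/CodeGen/AArch64/machine_cse.ll -mtriple=aarch64-linux-gnuabi -O2 -tail-dup-placement=0 | FileCheck llvm/test/CodeGen/AArch64/machine_cse.ll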
diff --git a/llvm/test/CodeGen/AArch64/arm64-extload-knownzero.ll b/llvm/test/CodeGen/AArch64/arm64-extload-knownzero.ll
index 92ce2a04589..5dd8cb28232 100644
--- a/llvm/test/CodeGen/AArch64/arm64-extload-knownzero.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-extload-knownzero.ll
@@ -12,7 +12,6 @@ bb1:
%tmp2 = load i16, i16* %ptr, align 2
br label %bb2
bb2:
-; CHECK: %bb2
; CHECK-NOT: and {{w[0-9]+}}, [[REG]], #0xffff
; CHECK: cmp [[REG]], #23
%tmp3 = phi i16 [ 0, %entry ], [ %tmp2, %bb1 ]
diff --git a/llvm/test/CodeGen/AArch64/machine_cse.ll b/llvm/test/CodeGen/AArch64/machine_cse.ll
index 032199e6218..e9fa68041d9 100644
--- a/llvm/test/CodeGen/AArch64/machine_cse.ll
+++ b/llvm/test/CodeGen/AArch64/machine_cse.ll
@@ -1,4 +1,8 @@
-; RUN: llc < %s -mtriple=aarch64-linux-gnuabi -O2 | FileCheck %s
+; RUN: llc < %s -mtriple=aarch64-linux-gnuabi -O2 -tail-dup-placement=0 | FileCheck %s
+; -tail-dup-placement causes tail duplication during layout. This breaks the
+; assumptions of the test case as written (specifically, it creates an
+; additional cmp instruction, creating a false positive), so we pass
; -tail-dup-placement=0 to restore the original behavior.
; marked as external to prevent possible optimizations
@a = external global i32
diff --git a/llvm/test/CodeGen/AArch64/tail-dup-repeat-worklist.ll b/llvm/test/CodeGen/AArch64/tail-dup-repeat-worklist.ll
new file mode 100644
index 00000000000..c2997c50f4d
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/tail-dup-repeat-worklist.ll
@@ -0,0 +1,69 @@
+; RUN: llc -O3 -o - -verify-machineinstrs %s | FileCheck %s
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-gnu"
+
+%struct.s1 = type { %struct.s3*, %struct.s1* }
+%struct.s2 = type opaque
+%struct.s3 = type { i32 }
+
+; Function Attrs: nounwind
+define internal fastcc i32 @repeated_dup_worklist(%struct.s1** %pp1, %struct.s2* %p2, i32 %state, i1 %i1_1, i32 %i32_1) unnamed_addr #0 {
+entry:
+ br label %while.cond.outer
+
+; The loop gets laid out:
+; %while.cond.outer
+; %(null)
+; %(null)
+; %dup2
+; and then %dup1 gets chosen as the next block.
+; When dup2 is duplicated into dup1, %worklist could erroneously be placed on
+; the worklist, because all of its current predecessors are now scheduled.
+; However, after dup2 is tail-duplicated, %worklist can't be on the worklist
+; because it now has unscheduled predecessors.
+; CHECK-LABEL: repeated_dup_worklist
+; CHECK: // %entry
+; CHECK: // %while.cond.outer
+; first %(null) block
+; CHECK: // in Loop:
+; CHECK: ldr
+; CHECK-NEXT: tbnz
+; second %(null) block
+; CHECK: // in Loop:
+; CHECK: // %dup2
+; CHECK: // %worklist
+; CHECK: // %if.then96.i
+while.cond.outer: ; preds = %dup1, %entry
+ %progress.0.ph = phi i32 [ 0, %entry ], [ %progress.1, %dup1 ]
+ %inc77 = add nsw i32 %progress.0.ph, 1
+ %cmp = icmp slt i32 %progress.0.ph, %i32_1
+ br i1 %cmp, label %dup2, label %dup1
+
+dup2: ; preds = %if.then96.i, %worklist, %while.cond.outer
+ %progress.1.ph = phi i32 [ 0, %while.cond.outer ], [ %progress.1, %if.then96.i ], [ %progress.1, %worklist ]
+ %.pr = load %struct.s1*, %struct.s1** %pp1, align 8
+ br label %dup1
+
+dup1: ; preds = %dup2, %while.cond.outer
+ %0 = phi %struct.s1* [ %.pr, %dup2 ], [ undef, %while.cond.outer ]
+ %progress.1 = phi i32 [ %progress.1.ph, %dup2 ], [ %inc77, %while.cond.outer ]
+ br i1 %i1_1, label %while.cond.outer, label %worklist
+
+worklist: ; preds = %dup1
+ %snode94 = getelementptr inbounds %struct.s1, %struct.s1* %0, i64 0, i32 0
+ %1 = load %struct.s3*, %struct.s3** %snode94, align 8
+ %2 = getelementptr inbounds %struct.s3, %struct.s3* %1, i32 0, i32 0
+ %3 = load i32, i32* %2, align 4
+ %tobool95.i = icmp eq i32 %3, 0
+ br i1 %tobool95.i, label %if.then96.i, label %dup2
+
+if.then96.i: ; preds = %worklist
+ call fastcc void @free_s3(%struct.s2* %p2, %struct.s3* %1) #1
+ br label %dup2
+}
+
+; Function Attrs: nounwind
+declare fastcc void @free_s3(%struct.s2*, %struct.s3*) unnamed_addr #0
+
+attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a57" "target-features"="+crc,+crypto,+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind }
diff --git a/llvm/test/CodeGen/ARM/2011-03-23-PeepholeBug.ll b/llvm/test/CodeGen/ARM/2011-03-23-PeepholeBug.ll
index 15b7e9410fc..83c7676e57e 100644
--- a/llvm/test/CodeGen/ARM/2011-03-23-PeepholeBug.ll
+++ b/llvm/test/CodeGen/ARM/2011-03-23-PeepholeBug.ll
@@ -25,7 +25,6 @@ bb1: ; preds = %bb
br label %bb2
bb2: ; preds = %bb1, %entry
-; CHECK: bb2
; CHECK: cmp [[REG]], #0
; CHECK: ble
%indvar = phi i32 [ %indvar.next, %bb1 ], [ 0, %entry ]
diff --git a/llvm/test/CodeGen/PowerPC/branch-opt.ll b/llvm/test/CodeGen/PowerPC/branch-opt.ll
index b3c0dba8b85..e714972b17e 100644
--- a/llvm/test/CodeGen/PowerPC/branch-opt.ll
+++ b/llvm/test/CodeGen/PowerPC/branch-opt.ll
@@ -1,9 +1,21 @@
-; RUN: llc -verify-machineinstrs < %s -march=ppc32 | \
-; RUN: grep "b LBB.*" | count 4
+; RUN: llc -verify-machineinstrs < %s -march=ppc32 | FileCheck %s
target datalayout = "E-p:32:32"
target triple = "powerpc-apple-darwin8.7.0"
+;CHECK-LABEL: foo:
+; There are 4 inner loops (%bb, %bb12, %bb25, %bb38) that all exit to %cond_next48.
+; The last (whichever it is) should have a fallthrough exit, and the other three
+; need an unconditional branch. No other block should have an unconditional
+; branch to %cond_next48.
+; One of the blocks ends up with a loop exit block that gets a tail-duplicated copy
+; of %cond_next48, so there should only be two unconditional branches.
+
+;CHECK: b LBB0_13
+;CHECK: b LBB0_13
+;CHECK-NOT: b LBB0_13
+;CHECK: LBB0_13: ; %cond_next48
+
define void @foo(i32 %W, i32 %X, i32 %Y, i32 %Z) {
entry:
%tmp1 = and i32 %W, 1 ; <i32> [#uses=1]
diff --git a/llvm/test/CodeGen/PowerPC/sjlj.ll b/llvm/test/CodeGen/PowerPC/sjlj.ll
index 7fe31384675..f86f5345f87 100644
--- a/llvm/test/CodeGen/PowerPC/sjlj.ll
+++ b/llvm/test/CodeGen/PowerPC/sjlj.ll
@@ -74,24 +74,24 @@ return: ; preds = %if.end, %if.then
; CHECK-DAG: std [[REGA]], [[OFF:[0-9]+]](31) # 8-byte Folded Spill
; CHECK-DAG: std 1, 16([[REGA]])
; CHECK-DAG: std 2, 24([[REGA]])
-; CHECK: bcl 20, 31, .LBB1_5
+; CHECK: bcl 20, 31, .LBB1_3
; CHECK: li 3, 1
-; CHECK: #EH_SjLj_Setup .LBB1_5
+; CHECK: #EH_SjLj_Setup .LBB1_3
; CHECK: b .LBB1_1
-; CHECK: .LBB1_4:
+; CHECK: .LBB1_3:
+; CHECK: mflr [[REGL:[0-9]+]]
+; CHECK: ld [[REG2:[0-9]+]], [[OFF]](31) # 8-byte Folded Reload
+; CHECK: std [[REGL]], 8([[REG2]])
+; CHECK: li 3, 0
+
+; CHECK: .LBB1_5:
; CHECK: lfd
; CHECK: lxvd2x
; CHECK: ld
; CHECK: blr
-; CHECK: .LBB1_5:
-; CHECK: mflr [[REGL:[0-9]+]]
-; CHECK: ld [[REG2:[0-9]+]], [[OFF]](31) # 8-byte Folded Reload
-; CHECK: std [[REGL]], 8([[REG2]])
-; CHECK: li 3, 0
-
; CHECK-NOAV: @main
; CHECK-NOAV-NOT: stxvd2x
; CHECK-NOAV: bcl
diff --git a/llvm/test/CodeGen/PowerPC/tail-dup-layout.ll b/llvm/test/CodeGen/PowerPC/tail-dup-layout.ll
new file mode 100644
index 00000000000..6790aa8e944
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/tail-dup-layout.ll
@@ -0,0 +1,100 @@
+; RUN: llc -outline-optional-branches -O2 < %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-grtev4-linux-gnu"
+
+; Intended layout:
+; The outlining flag produces the layout
+; test1
+; test2
+; test3
+; test4
+; exit
+; optional1
+; optional2
+; optional3
+; optional4
+; Tail duplication puts test n+1 at the end of optional n,
+; so optional1 includes a copy of test2 at the end, and branches
+; to test3 (at the top) or falls through to optional2.
+; The CHECK statements check for the whole string of tests and exit block,
+; and then check that the correct test has been duplicated into the end of
+; the optional blocks and that the optional blocks are in the correct order.
+;CHECK-LABEL: f:
+; test1 may have been merged with entry
+;CHECK: mr [[TAGREG:[0-9]+]], 3
+;CHECK: andi. {{[0-9]+}}, [[TAGREG]], 1
+;CHECK-NEXT: bc 12, 1, [[OPT1LABEL:[._0-9A-Za-z]+]]
+;CHECK-NEXT: [[TEST2LABEL:[._0-9A-Za-z]+]]: # %test2
+;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30
+;CHECK-NEXT: bne 0, [[OPT2LABEL:[._0-9A-Za-z]+]]
+;CHECK-NEXT: [[TEST3LABEL:[._0-9A-Za-z]+]]: # %test3
+;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 29, 29
+;CHECK-NEXT: bne 0, .[[OPT3LABEL:[._0-9A-Za-z]+]]
+;CHECK-NEXT: [[TEST4LABEL:[._0-9A-Za-z]+]]: # %test4
+;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 28, 28
+;CHECK-NEXT: bne 0, .[[OPT4LABEL:[._0-9A-Za-z]+]]
+;CHECK-NEXT: [[EXITLABEL:[._0-9A-Za-z]+]]: # %exit
+;CHECK: blr
+;CHECK-NEXT: [[OPT1LABEL]]
+;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30
+;CHECK-NEXT: beq 0, [[TEST3LABEL]]
+;CHECK-NEXT: [[OPT2LABEL]]
+;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 29, 29
+;CHECK-NEXT: beq 0, [[TEST4LABEL]]
+;CHECK-NEXT: [[OPT3LABEL]]
+;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 28, 28
+;CHECK-NEXT: beq 0, [[EXITLABEL]]
+;CHECK-NEXT: [[OPT4LABEL]]
+;CHECK: b [[EXITLABEL]]
+
+define void @f(i32 %tag) {
+entry:
+ br label %test1
+test1:
+ %tagbit1 = and i32 %tag, 1
+ %tagbit1eq0 = icmp eq i32 %tagbit1, 0
+ br i1 %tagbit1eq0, label %test2, label %optional1
+optional1:
+ call void @a()
+ call void @a()
+ call void @a()
+ call void @a()
+ br label %test2
+test2:
+ %tagbit2 = and i32 %tag, 2
+ %tagbit2eq0 = icmp eq i32 %tagbit2, 0
+ br i1 %tagbit2eq0, label %test3, label %optional2
+optional2:
+ call void @b()
+ call void @b()
+ call void @b()
+ call void @b()
+ br label %test3
+test3:
+ %tagbit3 = and i32 %tag, 4
+ %tagbit3eq0 = icmp eq i32 %tagbit3, 0
+ br i1 %tagbit3eq0, label %test4, label %optional3
+optional3:
+ call void @c()
+ call void @c()
+ call void @c()
+ call void @c()
+ br label %test4
+test4:
+ %tagbit4 = and i32 %tag, 8
+ %tagbit4eq0 = icmp eq i32 %tagbit4, 0
+ br i1 %tagbit4eq0, label %exit, label %optional4
+optional4:
+ call void @d()
+ call void @d()
+ call void @d()
+ call void @d()
+ br label %exit
+exit:
+ ret void
+}
+
+declare void @a()
+declare void @b()
+declare void @c()
+declare void @d()
diff --git a/llvm/test/CodeGen/WebAssembly/cfg-stackify.ll b/llvm/test/CodeGen/WebAssembly/cfg-stackify.ll
index eb1f2368344..fcdbd7fdc57 100644
--- a/llvm/test/CodeGen/WebAssembly/cfg-stackify.ll
+++ b/llvm/test/CodeGen/WebAssembly/cfg-stackify.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -disable-block-placement -verify-machineinstrs -fast-isel=false | FileCheck %s
-; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -verify-machineinstrs -fast-isel=false | FileCheck -check-prefix=OPT %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -tail-dup-placement=0 -verify-machineinstrs -fast-isel=false | FileCheck -check-prefix=OPT %s
; Test the CFG stackifier pass.
diff --git a/llvm/test/CodeGen/WebAssembly/mem-intrinsics.ll b/llvm/test/CodeGen/WebAssembly/mem-intrinsics.ll
index 71787feb77d..0ac1e1e182c 100644
--- a/llvm/test/CodeGen/WebAssembly/mem-intrinsics.ll
+++ b/llvm/test/CodeGen/WebAssembly/mem-intrinsics.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -tail-dup-placement=0 | FileCheck %s
; Test memcpy, memmove, and memset intrinsics.
diff --git a/llvm/test/CodeGen/X86/block-placement.ll b/llvm/test/CodeGen/X86/block-placement.ll
index b690316d531..39e29640724 100644
--- a/llvm/test/CodeGen/X86/block-placement.ll
+++ b/llvm/test/CodeGen/X86/block-placement.ll
@@ -177,6 +177,12 @@ exit:
ret i32 %sum
}
+; Tail duplication during layout can entirely remove body0 by duplicating it
+; into the entry block and into body1. This is a good thing but it isn't what
+; this test is looking for. To keep the blocks long enough that they don't get
+; duplicated, we add some calls to @dummy.
+declare void @dummy()
+
define i32 @test_loop_rotate(i32 %i, i32* %a) {
; Check that we rotate conditional exits from the loop to the bottom of the
; loop, eliminating unconditional branches to the top.
@@ -194,6 +200,8 @@ body0:
%base = phi i32 [ 0, %entry ], [ %sum, %body1 ]
%next = add i32 %iv, 1
%exitcond = icmp eq i32 %next, %i
+ call void @dummy()
+ call void @dummy()
br i1 %exitcond, label %exit, label %body1
body1:
@@ -945,7 +953,7 @@ define void @benchmark_heapsort(i32 %n, double* nocapture %ra) {
; First rotated loop top.
; CHECK: .p2align
; CHECK: %while.end
-; CHECK: %for.cond
+; %for.cond gets completely tail-duplicated away.
; CHECK: %if.then
; CHECK: %if.else
; CHECK: %if.end10
diff --git a/llvm/test/CodeGen/X86/cmov-into-branch.ll b/llvm/test/CodeGen/X86/cmov-into-branch.ll
index c0c6fc4ac22..6e4762b2e79 100644
--- a/llvm/test/CodeGen/X86/cmov-into-branch.ll
+++ b/llvm/test/CodeGen/X86/cmov-into-branch.ll
@@ -105,9 +105,11 @@ define i32 @weighted_select3(i32 %a, i32 %b) {
; CHECK-NEXT: testl %edi, %edi
; CHECK-NEXT: je [[LABEL_BB6:.*]]
; CHECK: movl %edi, %eax
+; CHECK-NEXT: retq
; CHECK: [[LABEL_BB6]]
; CHECK-NEXT: movl %esi, %edi
-; CHECK-NEXT: jmp
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: retq
;
%cmp = icmp ne i32 %a, 0
%sel = select i1 %cmp, i32 %a, i32 %b, !prof !2
diff --git a/llvm/test/CodeGen/X86/fma-intrinsics-phi-213-to-231.ll b/llvm/test/CodeGen/X86/fma-intrinsics-phi-213-to-231.ll
index 8d0318bb93e..78e7471b886 100644
--- a/llvm/test/CodeGen/X86/fma-intrinsics-phi-213-to-231.ll
+++ b/llvm/test/CodeGen/X86/fma-intrinsics-phi-213-to-231.ll
@@ -2,7 +2,7 @@
; CHECK-LABEL: fmaddsubpd_loop_128:
; CHECK: vfmaddsub231pd %xmm1, %xmm0, %xmm2
-; CHECK: vmovaps %xmm2, %xmm0
+; CHECK: vmovapd %xmm2, %xmm0
; CHECK-NEXT: retq
define <2 x double> @fmaddsubpd_loop_128(i32 %iter, <2 x double> %a, <2 x double> %b, <2 x double> %c) {
entry:
@@ -28,7 +28,7 @@ for.end:
; CHECK-LABEL: fmsubaddpd_loop_128:
; CHECK: vfmsubadd231pd %xmm1, %xmm0, %xmm2
-; CHECK: vmovaps %xmm2, %xmm0
+; CHECK: vmovapd %xmm2, %xmm0
; CHECK-NEXT: retq
define <2 x double> @fmsubaddpd_loop_128(i32 %iter, <2 x double> %a, <2 x double> %b, <2 x double> %c) {
entry:
@@ -54,7 +54,7 @@ for.end:
; CHECK-LABEL: fmaddpd_loop_128:
; CHECK: vfmadd231pd %xmm1, %xmm0, %xmm2
-; CHECK: vmovaps %xmm2, %xmm0
+; CHECK: vmovapd %xmm2, %xmm0
; CHECK-NEXT: retq
define <2 x double> @fmaddpd_loop_128(i32 %iter, <2 x double> %a, <2 x double> %b, <2 x double> %c) {
entry:
@@ -80,7 +80,7 @@ for.end:
; CHECK-LABEL: fmsubpd_loop_128:
; CHECK: vfmsub231pd %xmm1, %xmm0, %xmm2
-; CHECK: vmovaps %xmm2, %xmm0
+; CHECK: vmovapd %xmm2, %xmm0
; CHECK-NEXT: retq
define <2 x double> @fmsubpd_loop_128(i32 %iter, <2 x double> %a, <2 x double> %b, <2 x double> %c) {
entry:
@@ -106,7 +106,7 @@ for.end:
; CHECK-LABEL: fnmaddpd_loop_128:
; CHECK: vfnmadd231pd %xmm1, %xmm0, %xmm2
-; CHECK: vmovaps %xmm2, %xmm0
+; CHECK: vmovapd %xmm2, %xmm0
; CHECK-NEXT: retq
define <2 x double> @fnmaddpd_loop_128(i32 %iter, <2 x double> %a, <2 x double> %b, <2 x double> %c) {
entry:
@@ -132,7 +132,7 @@ for.end:
; CHECK-LABEL: fnmsubpd_loop_128:
; CHECK: vfnmsub231pd %xmm1, %xmm0, %xmm2
-; CHECK: vmovaps %xmm2, %xmm0
+; CHECK: vmovapd %xmm2, %xmm0
; CHECK-NEXT: retq
define <2 x double> @fnmsubpd_loop_128(i32 %iter, <2 x double> %a, <2 x double> %b, <2 x double> %c) {
entry:
@@ -329,7 +329,7 @@ declare <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float>, <4 x float>, <4 x floa
; CHECK-LABEL: fmaddsubpd_loop_256:
; CHECK: vfmaddsub231pd %ymm1, %ymm0, %ymm2
-; CHECK: vmovaps %ymm2, %ymm0
+; CHECK: vmovapd %ymm2, %ymm0
; CHECK-NEXT: retq
define <4 x double> @fmaddsubpd_loop_256(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
entry:
@@ -355,7 +355,7 @@ for.end:
; CHECK-LABEL: fmsubaddpd_loop_256:
; CHECK: vfmsubadd231pd %ymm1, %ymm0, %ymm2
-; CHECK: vmovaps %ymm2, %ymm0
+; CHECK: vmovapd %ymm2, %ymm0
; CHECK-NEXT: retq
define <4 x double> @fmsubaddpd_loop_256(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
entry:
@@ -381,7 +381,7 @@ for.end:
; CHECK-LABEL: fmaddpd_loop_256:
; CHECK: vfmadd231pd %ymm1, %ymm0, %ymm2
-; CHECK: vmovaps %ymm2, %ymm0
+; CHECK: vmovapd %ymm2, %ymm0
; CHECK-NEXT: retq
define <4 x double> @fmaddpd_loop_256(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
entry:
@@ -407,7 +407,7 @@ for.end:
; CHECK-LABEL: fmsubpd_loop_256:
; CHECK: vfmsub231pd %ymm1, %ymm0, %ymm2
-; CHECK: vmovaps %ymm2, %ymm0
+; CHECK: vmovapd %ymm2, %ymm0
; CHECK-NEXT: retq
define <4 x double> @fmsubpd_loop_256(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
entry:
@@ -433,7 +433,7 @@ for.end:
; CHECK-LABEL: fnmaddpd_loop_256:
; CHECK: vfnmadd231pd %ymm1, %ymm0, %ymm2
-; CHECK: vmovaps %ymm2, %ymm0
+; CHECK: vmovapd %ymm2, %ymm0
; CHECK-NEXT: retq
define <4 x double> @fnmaddpd_loop_256(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
entry:
@@ -459,7 +459,7 @@ for.end:
; CHECK-LABEL: fnmsubpd_loop_256:
; CHECK: vfnmsub231pd %ymm1, %ymm0, %ymm2
-; CHECK: vmovaps %ymm2, %ymm0
+; CHECK: vmovapd %ymm2, %ymm0
; CHECK-NEXT: retq
define <4 x double> @fnmsubpd_loop_256(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
entry:
diff --git a/llvm/test/CodeGen/X86/fp-une-cmp.ll b/llvm/test/CodeGen/X86/fp-une-cmp.ll
index 653040053c2..e3b2a04060b 100644
--- a/llvm/test/CodeGen/X86/fp-une-cmp.ll
+++ b/llvm/test/CodeGen/X86/fp-une-cmp.ll
@@ -56,11 +56,11 @@ define double @profile_metadata(double %x, double %y) {
; CHECK-NEXT: ucomisd %xmm1, %xmm0
; CHECK-NEXT: jne .LBB1_1
; CHECK-NEXT: jp .LBB1_1
-; CHECK-NEXT: .LBB1_2: # %bb2
+; CHECK-NEXT: # %bb2
; CHECK-NEXT: retq
; CHECK-NEXT: .LBB1_1: # %bb1
; CHECK-NEXT: addsd {{.*}}(%rip), %xmm0
-; CHECK-NEXT: jmp .LBB1_2
+; CHECK-NEXT: retq
entry:
%mul = fmul double %x, %y
diff --git a/llvm/test/CodeGen/X86/pr11202.ll b/llvm/test/CodeGen/X86/pr11202.ll
index 13070d1c600..cb1a749d91f 100644
--- a/llvm/test/CodeGen/X86/pr11202.ll
+++ b/llvm/test/CodeGen/X86/pr11202.ll
@@ -15,5 +15,8 @@ l2: ; preds = %l1
br label %l1
}
-; CHECK: .Ltmp0: # Address of block that was removed by CodeGen
+; It is correct for either l1 or l2 to be removed.
+; If l2 is removed, the message should be "Address of block that was removed by CodeGen"
+; If l1 is removed, it should be "Block address taken."
+; CHECK: .Ltmp0: # {{Address of block that was removed by CodeGen|Block address taken}}
; CHECK: .quad .Ltmp0
diff --git a/llvm/test/CodeGen/X86/ragreedy-bug.ll b/llvm/test/CodeGen/X86/ragreedy-bug.ll
index e8426317f13..bfeb041f89a 100644
--- a/llvm/test/CodeGen/X86/ragreedy-bug.ll
+++ b/llvm/test/CodeGen/X86/ragreedy-bug.ll
@@ -3,16 +3,34 @@
; This testing case is reduced from 197.parser prune_match function.
; We make sure register copies are not generated on isupper.exit blocks.
-; CHECK: isupper.exit
+; isupper.exit and isupper.exit223 get tail-duplicated into all their
+; predecessors.
+; CHECK: cond.true.i.i
; CHECK-NEXT: in Loop
+; Mem-move
+; CHECK-NEXT: movl
+; CHECK-NEXT: andl
; CHECK-NEXT: testl
; CHECK-NEXT: jne
-; CHECK: isupper.exit
+; CHECK: cond.true.i.i217
; CHECK-NEXT: in Loop
+; Mem-move
+; CHECK-NEXT: movl
+; CHECK-NEXT: andl
; CHECK-NEXT: testl
; CHECK-NEXT: je
+; CHECK: cond.false.i.i
; CHECK: maskrune
+; CHECK-NEXT: movzbl
+; CHECK-NEXT: movzbl
+; CHECK-NEXT: testl
+; CHECK-NEXT: je
+; CHECK: cond.false.i.i219
; CHECK: maskrune
+; CHECK-NEXT: movzbl
+; CHECK-NEXT: movzbl
+; CHECK-NEXT: testl
+; CHECK-NEXT: jne
%struct.List_o_links_struct = type { i32, i32, i32, %struct.List_o_links_struct* }
%struct.Connector_struct = type { i16, i16, i8, i8, %struct.Connector_struct*, i8* }
diff --git a/llvm/test/CodeGen/X86/sse1.ll b/llvm/test/CodeGen/X86/sse1.ll
index 3ac6ea6e2b8..beedb1d2465 100644
--- a/llvm/test/CodeGen/X86/sse1.ll
+++ b/llvm/test/CodeGen/X86/sse1.ll
@@ -58,21 +58,23 @@ define <4 x float> @vselect(<4 x float>*%p, <4 x i32> %q) {
; X32-NEXT: je .LBB1_1
; X32-NEXT: # BB#2: # %entry
; X32-NEXT: xorps %xmm1, %xmm1
-; X32-NEXT: jmp .LBB1_3
+; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp)
+; X32-NEXT: jne .LBB1_5
+; X32-NEXT: jmp .LBB1_4
; X32-NEXT: .LBB1_1:
; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X32-NEXT: .LBB1_3: # %entry
; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp)
; X32-NEXT: je .LBB1_4
-; X32-NEXT: # BB#5: # %entry
+; X32-NEXT: .LBB1_5: # %entry
; X32-NEXT: xorps %xmm2, %xmm2
-; X32-NEXT: jmp .LBB1_6
+; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp)
+; X32-NEXT: jne .LBB1_8
+; X32-NEXT: jmp .LBB1_7
; X32-NEXT: .LBB1_4:
; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X32-NEXT: .LBB1_6: # %entry
; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp)
; X32-NEXT: je .LBB1_7
-; X32-NEXT: # BB#8: # %entry
+; X32-NEXT: .LBB1_8: # %entry
; X32-NEXT: xorps %xmm3, %xmm3
; X32-NEXT: jmp .LBB1_9
; X32-NEXT: .LBB1_7:
@@ -95,21 +97,23 @@ define <4 x float> @vselect(<4 x float>*%p, <4 x i32> %q) {
; X64-NEXT: je .LBB1_1
; X64-NEXT: # BB#2: # %entry
; X64-NEXT: xorps %xmm1, %xmm1
-; X64-NEXT: jmp .LBB1_3
+; X64-NEXT: testl %edx, %edx
+; X64-NEXT: jne .LBB1_5
+; X64-NEXT: jmp .LBB1_4
; X64-NEXT: .LBB1_1:
; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X64-NEXT: .LBB1_3: # %entry
; X64-NEXT: testl %edx, %edx
; X64-NEXT: je .LBB1_4
-; X64-NEXT: # BB#5: # %entry
+; X64-NEXT: .LBB1_5: # %entry
; X64-NEXT: xorps %xmm2, %xmm2
-; X64-NEXT: jmp .LBB1_6
+; X64-NEXT: testl %r8d, %r8d
+; X64-NEXT: jne .LBB1_8
+; X64-NEXT: jmp .LBB1_7
; X64-NEXT: .LBB1_4:
; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X64-NEXT: .LBB1_6: # %entry
; X64-NEXT: testl %r8d, %r8d
; X64-NEXT: je .LBB1_7
-; X64-NEXT: # BB#8: # %entry
+; X64-NEXT: .LBB1_8: # %entry
; X64-NEXT: xorps %xmm3, %xmm3
; X64-NEXT: jmp .LBB1_9
; X64-NEXT: .LBB1_7:
diff --git a/llvm/test/CodeGen/X86/tail-dup-merge-loop-headers.ll b/llvm/test/CodeGen/X86/tail-dup-merge-loop-headers.ll
new file mode 100644
index 00000000000..2c8c05b6a91
--- /dev/null
+++ b/llvm/test/CodeGen/X86/tail-dup-merge-loop-headers.ll
@@ -0,0 +1,190 @@
+; RUN: llc -O2 -o - %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: nounwind uwtable
+; CHECK-LABEL: tail_dup_merge_loops
+; CHECK: # %entry
+; CHECK-NOT: # %{{[a-zA-Z_]+}}
+; CHECK: # %inner_loop_exit
+; CHECK-NOT: # %{{[a-zA-Z_]+}}
+; CHECK: # %inner_loop_latch
+; CHECK-NOT: # %{{[a-zA-Z_]+}}
+; CHECK: # %inner_loop_test
+; CHECK-NOT: # %{{[a-zA-Z_]+}}
+; CHECK: # %exit
+define void @tail_dup_merge_loops(i32 %a, i8* %b, i8* %c) local_unnamed_addr #0 {
+entry:
+ %notlhs674.i = icmp eq i32 %a, 0
+ br label %outer_loop_top
+
+outer_loop_top: ; preds = %inner_loop_exit, %entry
+ %dst.0.ph.i = phi i8* [ %b, %entry ], [ %scevgep679.i, %inner_loop_exit ]
+ br i1 %notlhs674.i, label %exit, label %inner_loop_preheader
+
+inner_loop_preheader: ; preds = %outer_loop_top
+ br label %inner_loop_top
+
+inner_loop_top: ; preds = %inner_loop_latch, %inner_loop_preheader
+ %dst.0.i = phi i8* [ %inc, %inner_loop_latch ], [ %dst.0.ph.i, %inner_loop_preheader ]
+ %var = load i8, i8* %dst.0.i
+ %tobool1.i = icmp slt i8 %var, 0
+ br label %inner_loop_test
+
+inner_loop_test: ; preds = %inner_loop_top
+ br i1 %tobool1.i, label %inner_loop_exit, label %inner_loop_latch
+
+inner_loop_exit: ; preds = %inner_loop_test
+ %scevgep.i = getelementptr i8, i8* %dst.0.i, i64 1
+ %scevgep679.i = getelementptr i8, i8* %scevgep.i, i64 0
+ br label %outer_loop_top
+
+inner_loop_latch: ; preds = %inner_loop_test
+ %cmp75.i = icmp ult i8* %dst.0.i, %c
+ %inc = getelementptr i8, i8* %dst.0.i, i64 2
+ br label %inner_loop_top
+
+exit: ; preds = %outer_loop_top
+ ret void
+}
+
+@.str.6 = external unnamed_addr constant [23 x i8], align 1
+
+; There is an erroneous check in LoopBase::addBasicBlockToLoop(), where it
+; assumes that the header block for a loop is unique.
+; For most of compilation this assumption is true, but during layout we allow
+; this assumption to be violated. The following code will trigger the bug:
+
+; The loops in question are eventually headed by the block shared_loop_header
+;
+; During layout, the block labeled outer_loop_header gets tail-duplicated into
+; outer_loop_latch, and into shared_preheader, and then removed. This leaves
+; shared_loop_header as the header of both loops. The end result
+; is that there are 2 valid loops, and that they share a header. If we re-ran
+; the loop analysis, it would classify this as a single loop.
+; So far this is fine as far as layout is concerned.
+; After layout we tail merge blocks merge_other and merge_predecessor_split.
+; We do this even though they share only a single instruction, because
+; merge_predecessor_split falls through to their shared successor:
+; outer_loop_latch.
+; The rest of the blocks in the function are unfortunately just noise. Bugpoint
+; couldn't shrink the test any further.
+
+; CHECK-LABEL: loop_shared_header
+; CHECK: # %entry
+; CHECK: # %shared_preheader
+; CHECK: # %shared_loop_header
+; CHECK: # %inner_loop_body
+; CHECK: # %merge_predecessor_split
+; CHECK: # %outer_loop_latch
+; CHECK: # %outer_loop_latch
+; CHECK: # %cleanup
+define i32 @loop_shared_header(i8* %exe, i32 %exesz, i32 %headsize, i32 %min, i32 %wwprva, i32 %e_lfanew, i8* readonly %wwp, i32 %wwpsz, i16 zeroext %sects) local_unnamed_addr #0 {
+entry:
+ %0 = load i32, i32* undef, align 4
+ %mul = shl nsw i32 %0, 2
+ br i1 undef, label %if.end19, label %cleanup
+
+if.end19: ; preds = %entry
+ %conv = zext i32 %mul to i64
+ %call = tail call i8* @cli_calloc(i64 %conv, i64 1)
+ %1 = icmp eq i32 %exesz, 0
+ %notrhs = icmp eq i32 %0, 0
+ %or.cond117.not = or i1 %1, %notrhs
+ %or.cond202 = or i1 %or.cond117.not, undef
+ %cmp35 = icmp ult i8* undef, %exe
+ %or.cond203 = or i1 %or.cond202, %cmp35
+ br i1 %or.cond203, label %cleanup, label %if.end50
+
+if.end50: ; preds = %if.end19
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull %call, i8* undef, i64 %conv, i32 1, i1 false)
+ %cmp1.i.i = icmp ugt i32 %mul, 3
+ br i1 %cmp1.i.i, label %shared_preheader, label %wunpsect.exit.thread.loopexit391
+
+shared_preheader: ; preds = %if.end50
+ br label %outer_loop_header
+
+outer_loop_header: ; preds = %outer_loop_latch, %shared_preheader
+ %bits.1.i = phi i8 [ 32, %shared_preheader ], [ %bits.43.i, %outer_loop_latch ]
+ %dst.0.ph.i = phi i8* [ undef, %shared_preheader ], [ %scevgep679.i, %outer_loop_latch ]
+ %2 = icmp eq i32 undef, 0
+ br i1 %2, label %while.cond.us1412.i, label %shared_loop_header
+
+while.cond.us1412.i: ; preds = %outer_loop_header
+ %.pre.i = add i8 %bits.1.i, -1
+ %tobool2.us1420.i = icmp eq i8 %.pre.i, 0
+ %or.cond.us1421.i = or i1 undef, %tobool2.us1420.i
+ br i1 %or.cond.us1421.i, label %if.end41.us1436.i, label %cleanup
+
+if.end41.us1436.i: ; preds = %while.cond.us1412.i
+ unreachable
+
+shared_loop_header:                               ; preds = %dead_block, %inner_loop_body, %outer_loop_header
+ %dst.0.i = phi i8* [ undef, %inner_loop_body ], [ %dst.0.ph.i, %outer_loop_header ], [ undef, %dead_block ]
+ %cmp3.i1172.i = icmp ult i8* undef, %call
+ br i1 %cmp3.i1172.i, label %wunpsect.exit.thread.loopexit389, label %inner_loop_body
+
+inner_loop_body: ; preds = %shared_loop_header
+ %3 = icmp slt i32 undef, 0
+ br i1 %3, label %if.end96.i, label %shared_loop_header
+
+dead_block: ; preds = %inner_loop_body
+ %cmp75.i = icmp ult i8* %dst.0.i, undef
+ br label %shared_loop_header
+
+if.end96.i: ; preds = %inner_loop_body
+ %cmp97.i = icmp ugt i32 undef, 2
+ br i1 %cmp97.i, label %if.then99.i, label %if.end287.i
+
+if.then99.i: ; preds = %if.end96.i
+ tail call void (i8*, ...) @cli_dbgmsg(i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str.6, i64 0, i64 0), i32 undef)
+ br label %cleanup
+
+if.end287.i: ; preds = %if.end96.i
+ %cmp291.i = icmp ne i32 undef, 1
+ %conv294.i = select i1 %cmp291.i, i16 4, i16 3
+ br i1 undef, label %if.end308.i, label %outer_loop_latch
+
+if.end308.i: ; preds = %if.end287.i
+ br i1 undef, label %if.end335.i, label %merge_predecessor_split
+
+merge_predecessor_split: ; preds = %if.end308.i
+ %4 = bitcast i8* undef to i32*
+ br label %outer_loop_latch
+
+if.end335.i: ; preds = %if.end308.i
+ br i1 undef, label %outer_loop_latch, label %merge_other
+
+merge_other: ; preds = %if.end335.i
+ br label %outer_loop_latch
+
+outer_loop_latch: ; preds = %merge_other, %if.end335.i, %merge_predecessor_split, %if.end287.i
+ %bits.43.i = phi i8 [ undef, %if.end287.i ], [ undef, %merge_other ], [ 32, %merge_predecessor_split ], [ 0, %if.end335.i ]
+ %backsize.0.i = phi i16 [ %conv294.i, %if.end287.i ], [ 0, %merge_other ], [ 0, %merge_predecessor_split ], [ 0, %if.end335.i ]
+ %5 = add i16 %backsize.0.i, -1
+ %6 = zext i16 %5 to i64
+ %scevgep.i = getelementptr i8, i8* %dst.0.ph.i, i64 1
+ %scevgep679.i = getelementptr i8, i8* %scevgep.i, i64 %6
+ br label %outer_loop_header
+
+wunpsect.exit.thread.loopexit389: ; preds = %shared_loop_header
+ unreachable
+
+wunpsect.exit.thread.loopexit391: ; preds = %if.end50
+ unreachable
+
+cleanup: ; preds = %if.then99.i, %while.cond.us1412.i, %if.end19, %entry
+ %retval.0 = phi i32 [ 0, %if.then99.i ], [ 1, %entry ], [ 1, %if.end19 ], [ 1, %while.cond.us1412.i ]
+ ret i32 %retval.0
+}
+
+; Function Attrs: nounwind
+declare void @cli_dbgmsg(i8*, ...) local_unnamed_addr #0
+
+; Function Attrs: nounwind
+declare i8* @cli_calloc(i64, i64) local_unnamed_addr #0
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32, i1) #1
+attributes #0 = { nounwind }
+attributes #1 = { argmemonly nounwind }
diff --git a/llvm/test/CodeGen/X86/tail-dup-repeat.ll b/llvm/test/CodeGen/X86/tail-dup-repeat.ll
new file mode 100644
index 00000000000..21b48e16efb
--- /dev/null
+++ b/llvm/test/CodeGen/X86/tail-dup-repeat.ll
@@ -0,0 +1,53 @@
+; RUN: llc -O2 -tail-dup-placement-threshold=4 -o - %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: uwtable
+; When tail-duplicating during placement, we work backward from blocks with
+; multiple successors. In this case, the block dup1 gets duplicated into dup2
+; and if.then64, and then the block dup2 gets duplicated into land.lhs.true
+; and if.end70.
+; CHECK-LABEL: repeated_tail_dup:
+define void @repeated_tail_dup(i1 %a1, i1 %a2, i32* %a4, i32* %a5, i8* %a6) #0 align 2 {
+entry:
+ br label %for.cond
+
+; CHECK: {{^}}.[[HEADER:LBB0_[1-9]]]: # %for.cond
+for.cond: ; preds = %dup1, %entry
+ br i1 %a1, label %land.lhs.true, label %if.end56
+
+land.lhs.true: ; preds = %for.cond
+ store i32 10, i32* %a4, align 8
+ br label %dup2
+
+if.end56: ; preds = %for.cond
+ br i1 %a2, label %if.then64, label %if.end70
+
+if.then64: ; preds = %if.end56
+ store i8 1, i8* %a6, align 1
+ br label %dup1
+
+; CHECK: # %if.end70
+; CHECK-NEXT: # in Loop:
+; CHECK-NEXT: movl $12, (%rdx)
+; CHECK-NEXT: movl $2, (%rcx)
+; CHECK-NEXT: testl %eax, %eax
+; CHECK-NEXT: je .[[HEADER]]
+if.end70: ; preds = %if.end56
+ store i32 12, i32* %a4, align 8
+ br label %dup2
+
+dup2: ; preds = %if.end70, %land.lhs.true
+ store i32 2, i32* %a5, align 4
+ br label %dup1
+
+dup1: ; preds = %dup2, %if.then64
+ %val = load i32, i32* %a4, align 8
+ %switch = icmp ult i32 undef, 1
+ br i1 %switch, label %for.cond, label %for.end
+
+for.end: ; preds = %dup1
+ ret void
+}
+
+attributes #0 = { uwtable }
diff --git a/llvm/test/CodeGen/X86/update-terminator.mir b/llvm/test/CodeGen/X86/update-terminator.mir
index 1e75c6af9eb..2e8e85b4ef6 100644
--- a/llvm/test/CodeGen/X86/update-terminator.mir
+++ b/llvm/test/CodeGen/X86/update-terminator.mir
@@ -5,17 +5,30 @@
@a = external global i16
@b = external global i32
+ declare void @dummy1()
+ declare void @dummy2()
+ declare void @dummy3()
+
; Function Attrs: nounwind
define void @f2() {
br i1 undef, label %bb1, label %bb3
bb1:
+ call void @dummy1()
+ call void @dummy1()
+ call void @dummy1()
br i1 undef, label %bb2, label %bb2
bb2:
+ call void @dummy2()
+ call void @dummy2()
+ call void @dummy2()
br label %bb4
bb3:
+ call void @dummy3()
+ call void @dummy3()
+ call void @dummy3()
br label %bb2
bb4:
@@ -40,15 +53,24 @@ body: |
bb.1:
successors: %bb.2(100)
+ CALL64pcrel32 @dummy1, csr_64, implicit %rsp, implicit-def %rsp
+ CALL64pcrel32 @dummy1, csr_64, implicit %rsp, implicit-def %rsp
+ CALL64pcrel32 @dummy1, csr_64, implicit %rsp, implicit-def %rsp
JNE_1 %bb.2, implicit %eflags
bb.2:
successors: %bb.4(100)
+ CALL64pcrel32 @dummy2, csr_64, implicit %rsp, implicit-def %rsp
+ CALL64pcrel32 @dummy2, csr_64, implicit %rsp, implicit-def %rsp
+ CALL64pcrel32 @dummy2, csr_64, implicit %rsp, implicit-def %rsp
JMP_1 %bb.4
bb.3:
successors: %bb.2(100)
+ CALL64pcrel32 @dummy3, csr_64, implicit %rsp, implicit-def %rsp
+ CALL64pcrel32 @dummy3, csr_64, implicit %rsp, implicit-def %rsp
+ CALL64pcrel32 @dummy3, csr_64, implicit %rsp, implicit-def %rsp
JMP_1 %bb.2
bb.4: