diff options
Diffstat (limited to 'llvm/test')
20 files changed, 595 insertions, 47 deletions
diff --git a/llvm/test/CodeGen/AArch64/arm64-extload-knownzero.ll b/llvm/test/CodeGen/AArch64/arm64-extload-knownzero.ll index 92ce2a04589..5dd8cb28232 100644 --- a/llvm/test/CodeGen/AArch64/arm64-extload-knownzero.ll +++ b/llvm/test/CodeGen/AArch64/arm64-extload-knownzero.ll @@ -12,7 +12,6 @@ bb1: %tmp2 = load i16, i16* %ptr, align 2 br label %bb2 bb2: -; CHECK: %bb2 ; CHECK-NOT: and {{w[0-9]+}}, [[REG]], #0xffff ; CHECK: cmp [[REG]], #23 %tmp3 = phi i16 [ 0, %entry ], [ %tmp2, %bb1 ] diff --git a/llvm/test/CodeGen/AArch64/machine_cse.ll b/llvm/test/CodeGen/AArch64/machine_cse.ll index 032199e6218..e9fa68041d9 100644 --- a/llvm/test/CodeGen/AArch64/machine_cse.ll +++ b/llvm/test/CodeGen/AArch64/machine_cse.ll @@ -1,4 +1,8 @@ -; RUN: llc < %s -mtriple=aarch64-linux-gnuabi -O2 | FileCheck %s +; RUN: llc < %s -mtriple=aarch64-linux-gnuabi -O2 -tail-dup-placement=0 | FileCheck %s +; -tail-dup-placement causes tail duplication during layout. This breaks the +; assumptions of the test case as written (specifically, it creates an +; additional cmp instruction, creating a false positive), so we pass +; -tail-dup-placement=0 to restore the original behavior ; marked as external to prevent possible optimizations @a = external global i32 diff --git a/llvm/test/CodeGen/AArch64/tail-dup-repeat-worklist.ll b/llvm/test/CodeGen/AArch64/tail-dup-repeat-worklist.ll new file mode 100644 index 00000000000..c2997c50f4d --- /dev/null +++ b/llvm/test/CodeGen/AArch64/tail-dup-repeat-worklist.ll @@ -0,0 +1,69 @@ +; RUN: llc -O3 -o - -verify-machineinstrs %s | FileCheck %s +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-unknown-linux-gnu" + +%struct.s1 = type { %struct.s3*, %struct.s1* } +%struct.s2 = type opaque +%struct.s3 = type { i32 } + +; Function Attrs: nounwind +define internal fastcc i32 @repeated_dup_worklist(%struct.s1** %pp1, %struct.s2* %p2, i32 %state, i1 %i1_1, i32 %i32_1) unnamed_addr #0 { +entry: + br label 
%while.cond.outer + +; The loop gets laid out: +; %while.cond.outer +; %(null) +; %(null) +; %dup2 +; and then %dup1 gets chosen as the next block. +; When dup2 is duplicated into dup1, %worklist could erroneously be placed on +; the worklist, because all of its current predecessors are now scheduled. +; However, after dup2 is tail-duplicated, %worklist can't be on the worklist +; because it now has unscheduled predecessors. +; CHECK-LABEL: repeated_dup_worklist +; CHECK: // %entry +; CHECK: // %while.cond.outer +; first %(null) block +; CHECK: // in Loop: +; CHECK: ldr +; CHECK-NEXT: tbnz +; second %(null) block +; CHECK: // in Loop: +; CHECK: // %dup2 +; CHECK: // %worklist +; CHECK: // %if.then96.i +while.cond.outer:                                 ; preds = %dup1, %entry +  %progress.0.ph = phi i32 [ 0, %entry ], [ %progress.1, %dup1 ] +  %inc77 = add nsw i32 %progress.0.ph, 1 +  %cmp = icmp slt i32 %progress.0.ph, %i32_1 +  br i1 %cmp, label %dup2, label %dup1 + +dup2:                                             ; preds = %if.then96.i, %worklist, %while.cond.outer +  %progress.1.ph = phi i32 [ 0, %while.cond.outer ], [ %progress.1, %if.then96.i ], [ %progress.1, %worklist ] +  %.pr = load %struct.s1*, %struct.s1** %pp1, align 8 +  br label %dup1 + +dup1:                                             ; preds = %dup2, %while.cond.outer +  %0 = phi %struct.s1* [ %.pr, %dup2 ], [ undef, %while.cond.outer ] +  %progress.1 = phi i32 [ %progress.1.ph, %dup2 ], [ %inc77, %while.cond.outer ] +  br i1 %i1_1, label %while.cond.outer, label %worklist + +worklist:                                         ; preds = %dup1 +  %snode94 = getelementptr inbounds %struct.s1, %struct.s1* %0, i64 0, i32 0 +  %1 = load %struct.s3*, %struct.s3** %snode94, align 8 +  %2 = getelementptr inbounds %struct.s3, %struct.s3* %1, i32 0, i32 0 +  %3 = load i32, i32* %2, align 4 +  %tobool95.i = icmp eq i32 %3, 0 +  br i1 %tobool95.i, label %if.then96.i, label %dup2 + +if.then96.i:                                      ; preds = %worklist +  call fastcc void @free_s3(%struct.s2* %p2, %struct.s3* %1) #1 +  br label %dup2 +} + +; Function Attrs: nounwind +declare fastcc void @free_s3(%struct.s2*, %struct.s3*) 
unnamed_addr #0 + +attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a57" "target-features"="+crc,+crypto,+neon" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind } diff --git a/llvm/test/CodeGen/ARM/2011-03-23-PeepholeBug.ll b/llvm/test/CodeGen/ARM/2011-03-23-PeepholeBug.ll index 15b7e9410fc..83c7676e57e 100644 --- a/llvm/test/CodeGen/ARM/2011-03-23-PeepholeBug.ll +++ b/llvm/test/CodeGen/ARM/2011-03-23-PeepholeBug.ll @@ -25,7 +25,6 @@ bb1: ; preds = %bb br label %bb2 bb2: ; preds = %bb1, %entry -; CHECK: bb2 ; CHECK: cmp [[REG]], #0 ; CHECK: ble %indvar = phi i32 [ %indvar.next, %bb1 ], [ 0, %entry ] diff --git a/llvm/test/CodeGen/PowerPC/branch-opt.ll b/llvm/test/CodeGen/PowerPC/branch-opt.ll index b3c0dba8b85..e714972b17e 100644 --- a/llvm/test/CodeGen/PowerPC/branch-opt.ll +++ b/llvm/test/CodeGen/PowerPC/branch-opt.ll @@ -1,9 +1,21 @@ -; RUN: llc -verify-machineinstrs < %s -march=ppc32 | \ -; RUN: grep "b LBB.*" | count 4 +; RUN: llc -verify-machineinstrs < %s -march=ppc32 | FileCheck %s target datalayout = "E-p:32:32" target triple = "powerpc-apple-darwin8.7.0" +;CHECK-LABEL: foo: +; There are 4 inner loops (%bb, %bb12, %bb25, %bb38) that all exit to %cond_next48 +; The last (whichever it is) should have a fallthrough exit, and the other three +; need an unconditional branch. No other block should have an unconditional +; branch to cond_next48 +; One of the blocks ends up with a loop exit block that gets a tail-duplicated copy +; of %cond_next48, so there should only be two unconditional branches. 
+ +;CHECK: b LBB0_13 +;CHECK: b LBB0_13 +;CHECK-NOT: b LBB0_13 +;CHECK: LBB0_13: ; %cond_next48 + define void @foo(i32 %W, i32 %X, i32 %Y, i32 %Z) { entry: %tmp1 = and i32 %W, 1 ; <i32> [#uses=1] diff --git a/llvm/test/CodeGen/PowerPC/sjlj.ll b/llvm/test/CodeGen/PowerPC/sjlj.ll index 7fe31384675..f86f5345f87 100644 --- a/llvm/test/CodeGen/PowerPC/sjlj.ll +++ b/llvm/test/CodeGen/PowerPC/sjlj.ll @@ -74,24 +74,24 @@ return: ; preds = %if.end, %if.then ; CHECK-DAG: std [[REGA]], [[OFF:[0-9]+]](31) # 8-byte Folded Spill ; CHECK-DAG: std 1, 16([[REGA]]) ; CHECK-DAG: std 2, 24([[REGA]]) -; CHECK: bcl 20, 31, .LBB1_5 +; CHECK: bcl 20, 31, .LBB1_3 ; CHECK: li 3, 1 -; CHECK: #EH_SjLj_Setup .LBB1_5 +; CHECK: #EH_SjLj_Setup .LBB1_3 ; CHECK: b .LBB1_1 -; CHECK: .LBB1_4: +; CHECK: .LBB1_3: +; CHECK: mflr [[REGL:[0-9]+]] +; CHECK: ld [[REG2:[0-9]+]], [[OFF]](31) # 8-byte Folded Reload +; CHECK: std [[REGL]], 8([[REG2]]) +; CHECK: li 3, 0 + +; CHECK: .LBB1_5: ; CHECK: lfd ; CHECK: lxvd2x ; CHECK: ld ; CHECK: blr -; CHECK: .LBB1_5: -; CHECK: mflr [[REGL:[0-9]+]] -; CHECK: ld [[REG2:[0-9]+]], [[OFF]](31) # 8-byte Folded Reload -; CHECK: std [[REGL]], 8([[REG2]]) -; CHECK: li 3, 0 - ; CHECK-NOAV: @main ; CHECK-NOAV-NOT: stxvd2x ; CHECK-NOAV: bcl diff --git a/llvm/test/CodeGen/PowerPC/tail-dup-branch-to-fallthrough.ll b/llvm/test/CodeGen/PowerPC/tail-dup-branch-to-fallthrough.ll new file mode 100644 index 00000000000..5d03af801fc --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/tail-dup-branch-to-fallthrough.ll @@ -0,0 +1,65 @@ +; RUN: llc -O2 %s -o - | FileCheck %s +target datalayout = "E-m:e-i64:64-n32:64" +target triple = "powerpc64-unknown-linux-gnu" + +; Function Attrs: nounwind +declare void @llvm.lifetime.end(i64, i8* nocapture) #0 + +declare void @f1() +declare void @f2() +declare void @f3() +declare void @f4() + +; Function Attrs: nounwind +; CHECK-LABEL: tail_dup_fallthrough_with_branch +; CHECK: # %entry +; CHECK-NOT: # %{{[-_a-zA-Z0-9]+}} +; CHECK: # %entry +; CHECK-NOT: # 
%{{[-_a-zA-Z0-9]+}} +; CHECK: # %sw.0 +; CHECK-NOT: # %{{[-_a-zA-Z0-9]+}} +; CHECK: # %sw.1 +; CHECK-NOT: # %{{[-_a-zA-Z0-9]+}} +; CHECK: # %sw.default +; CHECK-NOT: # %{{[-_a-zA-Z0-9]+}} +; CHECK: # %if.then +; CHECK-NOT: # %{{[-_a-zA-Z0-9]+}} +; CHECK: # %if.else +; CHECK-NOT: # %{{[-_a-zA-Z0-9]+}} +; CHECK: .Lfunc_end0 +define fastcc void @tail_dup_fallthrough_with_branch(i32 %a, i1 %b) unnamed_addr #0 { +entry: + switch i32 %a, label %sw.default [ + i32 0, label %sw.0 + i32 1, label %sw.1 + ] + +sw.0: ; preds = %entry + call void @f1() #0 + br label %dup1 + +sw.1: ; preds = %entry + call void @f2() #0 + br label %dup1 + +sw.default: ; preds = %entry + br i1 %b, label %if.then, label %if.else + +if.then: ; preds = %sw.default + call void @f3() #0 + br label %dup2 + +if.else: ; preds = %sw.default + call void @f4() #0 + br label %dup2 + +dup1: ; preds = %sw.0, %sw.1 + call void @llvm.lifetime.end(i64 8, i8* nonnull undef) #0 + unreachable + +dup2: ; preds = %if.then, %if.else + call void @llvm.lifetime.end(i64 8, i8* nonnull undef) #0 + unreachable +} + +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/PowerPC/tail-dup-layout.ll b/llvm/test/CodeGen/PowerPC/tail-dup-layout.ll new file mode 100644 index 00000000000..6790aa8e944 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/tail-dup-layout.ll @@ -0,0 +1,100 @@ +; RUN: llc -outline-optional-branches -O2 < %s | FileCheck %s +target datalayout = "e-m:e-i64:64-n32:64" +target triple = "powerpc64le-grtev4-linux-gnu" + +; Intended layout: +; The outlining flag produces the layout +; test1 +; test2 +; test3 +; test4 +; exit +; optional1 +; optional2 +; optional3 +; optional4 +; Tail duplication puts test n+1 at the end of optional n +; so optional1 includes a copy of test2 at the end, and branches +; to test3 (at the top) or falls through to optional 2. 
+; The CHECK statements check for the whole string of tests and exit block, +; and then check that the correct test has been duplicated into the end of +; the optional blocks and that the optional blocks are in the correct order. +;CHECK-LABEL: f: +; test1 may have been merged with entry +;CHECK: mr [[TAGREG:[0-9]+]], 3 +;CHECK: andi. {{[0-9]+}}, [[TAGREG]], 1 +;CHECK-NEXT: bc 12, 1, [[OPT1LABEL:[._0-9A-Za-z]+]] +;CHECK-NEXT: [[TEST2LABEL:[._0-9A-Za-z]+]]: # %test2 +;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30 +;CHECK-NEXT: bne 0, [[OPT2LABEL:[._0-9A-Za-z]+]] +;CHECK-NEXT: [[TEST3LABEL:[._0-9A-Za-z]+]]: # %test3 +;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 29, 29 +;CHECK-NEXT: bne 0, .[[OPT3LABEL:[._0-9A-Za-z]+]] +;CHECK-NEXT: [[TEST4LABEL:[._0-9A-Za-z]+]]: # %test4 +;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 28, 28 +;CHECK-NEXT: bne 0, .[[OPT4LABEL:[._0-9A-Za-z]+]] +;CHECK-NEXT: [[EXITLABEL:[._0-9A-Za-z]+]]: # %exit +;CHECK: blr +;CHECK-NEXT: [[OPT1LABEL]] +;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30 +;CHECK-NEXT: beq 0, [[TEST3LABEL]] +;CHECK-NEXT: [[OPT2LABEL]] +;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 29, 29 +;CHECK-NEXT: beq 0, [[TEST4LABEL]] +;CHECK-NEXT: [[OPT3LABEL]] +;CHECK: rlwinm. 
{{[0-9]+}}, [[TAGREG]], 0, 28, 28 +;CHECK-NEXT: beq 0, [[EXITLABEL]] +;CHECK-NEXT: [[OPT4LABEL]] +;CHECK: b [[EXITLABEL]] + +define void @f(i32 %tag) { +entry: + br label %test1 +test1: + %tagbit1 = and i32 %tag, 1 + %tagbit1eq0 = icmp eq i32 %tagbit1, 0 + br i1 %tagbit1eq0, label %test2, label %optional1 +optional1: + call void @a() + call void @a() + call void @a() + call void @a() + br label %test2 +test2: + %tagbit2 = and i32 %tag, 2 + %tagbit2eq0 = icmp eq i32 %tagbit2, 0 + br i1 %tagbit2eq0, label %test3, label %optional2 +optional2: + call void @b() + call void @b() + call void @b() + call void @b() + br label %test3 +test3: + %tagbit3 = and i32 %tag, 4 + %tagbit3eq0 = icmp eq i32 %tagbit3, 0 + br i1 %tagbit3eq0, label %test4, label %optional3 +optional3: + call void @c() + call void @c() + call void @c() + call void @c() + br label %test4 +test4: + %tagbit4 = and i32 %tag, 8 + %tagbit4eq0 = icmp eq i32 %tagbit4, 0 + br i1 %tagbit4eq0, label %exit, label %optional4 +optional4: + call void @d() + call void @d() + call void @d() + call void @d() + br label %exit +exit: + ret void +} + +declare void @a() +declare void @b() +declare void @c() +declare void @d() diff --git a/llvm/test/CodeGen/WebAssembly/cfg-stackify.ll b/llvm/test/CodeGen/WebAssembly/cfg-stackify.ll index eb1f2368344..fcdbd7fdc57 100644 --- a/llvm/test/CodeGen/WebAssembly/cfg-stackify.ll +++ b/llvm/test/CodeGen/WebAssembly/cfg-stackify.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -disable-block-placement -verify-machineinstrs -fast-isel=false | FileCheck %s -; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -verify-machineinstrs -fast-isel=false | FileCheck -check-prefix=OPT %s +; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -tail-dup-placement=0 -verify-machineinstrs -fast-isel=false | FileCheck -check-prefix=OPT %s ; Test the CFG stackifier pass. 
diff --git a/llvm/test/CodeGen/WebAssembly/mem-intrinsics.ll b/llvm/test/CodeGen/WebAssembly/mem-intrinsics.ll index 71787feb77d..0ac1e1e182c 100644 --- a/llvm/test/CodeGen/WebAssembly/mem-intrinsics.ll +++ b/llvm/test/CodeGen/WebAssembly/mem-intrinsics.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt | FileCheck %s +; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -tail-dup-placement=0| FileCheck %s ; Test memcpy, memmove, and memset intrinsics. diff --git a/llvm/test/CodeGen/X86/block-placement.ll b/llvm/test/CodeGen/X86/block-placement.ll index b690316d531..39e29640724 100644 --- a/llvm/test/CodeGen/X86/block-placement.ll +++ b/llvm/test/CodeGen/X86/block-placement.ll @@ -177,6 +177,12 @@ exit: ret i32 %sum } +; Tail duplication during layout can entirely remove body0 by duplicating it +; into the entry block and into body1. This is a good thing but it isn't what +; this test is looking for. So to make the blocks longer so they don't get +; duplicated, we add some calls to dummy. +declare void @dummy() + define i32 @test_loop_rotate(i32 %i, i32* %a) { ; Check that we rotate conditional exits from the loop to the bottom of the ; loop, eliminating unconditional branches to the top. @@ -194,6 +200,8 @@ body0: %base = phi i32 [ 0, %entry ], [ %sum, %body1 ] %next = add i32 %iv, 1 %exitcond = icmp eq i32 %next, %i + call void @dummy() + call void @dummy() br i1 %exitcond, label %exit, label %body1 body1: @@ -945,7 +953,7 @@ define void @benchmark_heapsort(i32 %n, double* nocapture %ra) { ; First rotated loop top. ; CHECK: .p2align ; CHECK: %while.end -; CHECK: %for.cond +; %for.cond gets completely tail-duplicated away. 
; CHECK: %if.then ; CHECK: %if.else ; CHECK: %if.end10 diff --git a/llvm/test/CodeGen/X86/cmov-into-branch.ll b/llvm/test/CodeGen/X86/cmov-into-branch.ll index c0c6fc4ac22..6e4762b2e79 100644 --- a/llvm/test/CodeGen/X86/cmov-into-branch.ll +++ b/llvm/test/CodeGen/X86/cmov-into-branch.ll @@ -105,9 +105,11 @@ define i32 @weighted_select3(i32 %a, i32 %b) { ; CHECK-NEXT: testl %edi, %edi ; CHECK-NEXT: je [[LABEL_BB6:.*]] ; CHECK: movl %edi, %eax +; CHECK-NEXT: retq ; CHECK: [[LABEL_BB6]] ; CHECK-NEXT: movl %esi, %edi -; CHECK-NEXT: jmp +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: retq ; %cmp = icmp ne i32 %a, 0 %sel = select i1 %cmp, i32 %a, i32 %b, !prof !2 diff --git a/llvm/test/CodeGen/X86/fma-intrinsics-phi-213-to-231.ll b/llvm/test/CodeGen/X86/fma-intrinsics-phi-213-to-231.ll index 8d0318bb93e..78e7471b886 100644 --- a/llvm/test/CodeGen/X86/fma-intrinsics-phi-213-to-231.ll +++ b/llvm/test/CodeGen/X86/fma-intrinsics-phi-213-to-231.ll @@ -2,7 +2,7 @@ ; CHECK-LABEL: fmaddsubpd_loop_128: ; CHECK: vfmaddsub231pd %xmm1, %xmm0, %xmm2 -; CHECK: vmovaps %xmm2, %xmm0 +; CHECK: vmovapd %xmm2, %xmm0 ; CHECK-NEXT: retq define <2 x double> @fmaddsubpd_loop_128(i32 %iter, <2 x double> %a, <2 x double> %b, <2 x double> %c) { entry: @@ -28,7 +28,7 @@ for.end: ; CHECK-LABEL: fmsubaddpd_loop_128: ; CHECK: vfmsubadd231pd %xmm1, %xmm0, %xmm2 -; CHECK: vmovaps %xmm2, %xmm0 +; CHECK: vmovapd %xmm2, %xmm0 ; CHECK-NEXT: retq define <2 x double> @fmsubaddpd_loop_128(i32 %iter, <2 x double> %a, <2 x double> %b, <2 x double> %c) { entry: @@ -54,7 +54,7 @@ for.end: ; CHECK-LABEL: fmaddpd_loop_128: ; CHECK: vfmadd231pd %xmm1, %xmm0, %xmm2 -; CHECK: vmovaps %xmm2, %xmm0 +; CHECK: vmovapd %xmm2, %xmm0 ; CHECK-NEXT: retq define <2 x double> @fmaddpd_loop_128(i32 %iter, <2 x double> %a, <2 x double> %b, <2 x double> %c) { entry: @@ -80,7 +80,7 @@ for.end: ; CHECK-LABEL: fmsubpd_loop_128: ; CHECK: vfmsub231pd %xmm1, %xmm0, %xmm2 -; CHECK: vmovaps %xmm2, %xmm0 +; CHECK: vmovapd %xmm2, %xmm0 ; 
CHECK-NEXT: retq define <2 x double> @fmsubpd_loop_128(i32 %iter, <2 x double> %a, <2 x double> %b, <2 x double> %c) { entry: @@ -106,7 +106,7 @@ for.end: ; CHECK-LABEL: fnmaddpd_loop_128: ; CHECK: vfnmadd231pd %xmm1, %xmm0, %xmm2 -; CHECK: vmovaps %xmm2, %xmm0 +; CHECK: vmovapd %xmm2, %xmm0 ; CHECK-NEXT: retq define <2 x double> @fnmaddpd_loop_128(i32 %iter, <2 x double> %a, <2 x double> %b, <2 x double> %c) { entry: @@ -132,7 +132,7 @@ for.end: ; CHECK-LABEL: fnmsubpd_loop_128: ; CHECK: vfnmsub231pd %xmm1, %xmm0, %xmm2 -; CHECK: vmovaps %xmm2, %xmm0 +; CHECK: vmovapd %xmm2, %xmm0 ; CHECK-NEXT: retq define <2 x double> @fnmsubpd_loop_128(i32 %iter, <2 x double> %a, <2 x double> %b, <2 x double> %c) { entry: @@ -329,7 +329,7 @@ declare <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float>, <4 x float>, <4 x floa ; CHECK-LABEL: fmaddsubpd_loop_256: ; CHECK: vfmaddsub231pd %ymm1, %ymm0, %ymm2 -; CHECK: vmovaps %ymm2, %ymm0 +; CHECK: vmovapd %ymm2, %ymm0 ; CHECK-NEXT: retq define <4 x double> @fmaddsubpd_loop_256(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) { entry: @@ -355,7 +355,7 @@ for.end: ; CHECK-LABEL: fmsubaddpd_loop_256: ; CHECK: vfmsubadd231pd %ymm1, %ymm0, %ymm2 -; CHECK: vmovaps %ymm2, %ymm0 +; CHECK: vmovapd %ymm2, %ymm0 ; CHECK-NEXT: retq define <4 x double> @fmsubaddpd_loop_256(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) { entry: @@ -381,7 +381,7 @@ for.end: ; CHECK-LABEL: fmaddpd_loop_256: ; CHECK: vfmadd231pd %ymm1, %ymm0, %ymm2 -; CHECK: vmovaps %ymm2, %ymm0 +; CHECK: vmovapd %ymm2, %ymm0 ; CHECK-NEXT: retq define <4 x double> @fmaddpd_loop_256(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) { entry: @@ -407,7 +407,7 @@ for.end: ; CHECK-LABEL: fmsubpd_loop_256: ; CHECK: vfmsub231pd %ymm1, %ymm0, %ymm2 -; CHECK: vmovaps %ymm2, %ymm0 +; CHECK: vmovapd %ymm2, %ymm0 ; CHECK-NEXT: retq define <4 x double> @fmsubpd_loop_256(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) { entry: @@ -433,7 +433,7 @@ 
for.end: ; CHECK-LABEL: fnmaddpd_loop_256: ; CHECK: vfnmadd231pd %ymm1, %ymm0, %ymm2 -; CHECK: vmovaps %ymm2, %ymm0 +; CHECK: vmovapd %ymm2, %ymm0 ; CHECK-NEXT: retq define <4 x double> @fnmaddpd_loop_256(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) { entry: @@ -459,7 +459,7 @@ for.end: ; CHECK-LABEL: fnmsubpd_loop_256: ; CHECK: vfnmsub231pd %ymm1, %ymm0, %ymm2 -; CHECK: vmovaps %ymm2, %ymm0 +; CHECK: vmovapd %ymm2, %ymm0 ; CHECK-NEXT: retq define <4 x double> @fnmsubpd_loop_256(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) { entry: diff --git a/llvm/test/CodeGen/X86/fp-une-cmp.ll b/llvm/test/CodeGen/X86/fp-une-cmp.ll index 653040053c2..e3b2a04060b 100644 --- a/llvm/test/CodeGen/X86/fp-une-cmp.ll +++ b/llvm/test/CodeGen/X86/fp-une-cmp.ll @@ -56,11 +56,11 @@ define double @profile_metadata(double %x, double %y) { ; CHECK-NEXT: ucomisd %xmm1, %xmm0 ; CHECK-NEXT: jne .LBB1_1 ; CHECK-NEXT: jp .LBB1_1 -; CHECK-NEXT: .LBB1_2: # %bb2 +; CHECK-NEXT: # %bb2 ; CHECK-NEXT: retq ; CHECK-NEXT: .LBB1_1: # %bb1 ; CHECK-NEXT: addsd {{.*}}(%rip), %xmm0 -; CHECK-NEXT: jmp .LBB1_2 +; CHECK-NEXT: retq entry: %mul = fmul double %x, %y diff --git a/llvm/test/CodeGen/X86/pr11202.ll b/llvm/test/CodeGen/X86/pr11202.ll index 13070d1c600..cb1a749d91f 100644 --- a/llvm/test/CodeGen/X86/pr11202.ll +++ b/llvm/test/CodeGen/X86/pr11202.ll @@ -15,5 +15,8 @@ l2: ; preds = %l1 br label %l1 } -; CHECK: .Ltmp0: # Address of block that was removed by CodeGen +; It is correct for either l1 or l2 to be removed. +; If l2 is removed, the message should be "Address of block that was removed by CodeGen" +; If l1 is removed, it should be "Block address taken." 
+; CHECK: .Ltmp0: # {{Address of block that was removed by CodeGen|Block address taken}} ; CHECK: .quad .Ltmp0 diff --git a/llvm/test/CodeGen/X86/ragreedy-bug.ll b/llvm/test/CodeGen/X86/ragreedy-bug.ll index e8426317f13..bfeb041f89a 100644 --- a/llvm/test/CodeGen/X86/ragreedy-bug.ll +++ b/llvm/test/CodeGen/X86/ragreedy-bug.ll @@ -3,16 +3,34 @@ ; This testing case is reduced from 197.parser prune_match function. ; We make sure register copies are not generated on isupper.exit blocks. -; CHECK: isupper.exit +; isupper.exit and isupper.exit223 get tail-duplicated into all their +; predecessors. +; CHECK: cond.true.i.i ; CHECK-NEXT: in Loop +; Mem-move +; CHECK-NEXT: movl +; CHECK-NEXT: andl ; CHECK-NEXT: testl ; CHECK-NEXT: jne -; CHECK: isupper.exit +; CHECK: cond.true.i.i217 ; CHECK-NEXT: in Loop +; Mem-move +; CHECK-NEXT: movl +; CHECK-NEXT: andl ; CHECK-NEXT: testl ; CHECK-NEXT: je +; CHECK: cond.false.i.i ; CHECK: maskrune +; CHECK-NEXT: movzbl +; CHECK-NEXT: movzbl +; CHECK-NEXT: testl +; CHECK-NEXT: je +; CHECK: cond.false.i.i219 ; CHECK: maskrune +; CHECK-NEXT: movzbl +; CHECK-NEXT: movzbl +; CHECK-NEXT: testl +; CHECK-NEXT: jne %struct.List_o_links_struct = type { i32, i32, i32, %struct.List_o_links_struct* } %struct.Connector_struct = type { i16, i16, i8, i8, %struct.Connector_struct*, i8* } diff --git a/llvm/test/CodeGen/X86/sse1.ll b/llvm/test/CodeGen/X86/sse1.ll index 3ac6ea6e2b8..beedb1d2465 100644 --- a/llvm/test/CodeGen/X86/sse1.ll +++ b/llvm/test/CodeGen/X86/sse1.ll @@ -58,21 +58,23 @@ define <4 x float> @vselect(<4 x float>*%p, <4 x i32> %q) { ; X32-NEXT: je .LBB1_1 ; X32-NEXT: # BB#2: # %entry ; X32-NEXT: xorps %xmm1, %xmm1 -; X32-NEXT: jmp .LBB1_3 +; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; X32-NEXT: jne .LBB1_5 +; X32-NEXT: jmp .LBB1_4 ; X32-NEXT: .LBB1_1: ; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X32-NEXT: .LBB1_3: # %entry ; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp) ; X32-NEXT: je .LBB1_4 -; X32-NEXT: # BB#5: # %entry +; X32-NEXT: 
.LBB1_5: # %entry ; X32-NEXT: xorps %xmm2, %xmm2 -; X32-NEXT: jmp .LBB1_6 +; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; X32-NEXT: jne .LBB1_8 +; X32-NEXT: jmp .LBB1_7 ; X32-NEXT: .LBB1_4: ; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; X32-NEXT: .LBB1_6: # %entry ; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp) ; X32-NEXT: je .LBB1_7 -; X32-NEXT: # BB#8: # %entry +; X32-NEXT: .LBB1_8: # %entry ; X32-NEXT: xorps %xmm3, %xmm3 ; X32-NEXT: jmp .LBB1_9 ; X32-NEXT: .LBB1_7: @@ -95,21 +97,23 @@ define <4 x float> @vselect(<4 x float>*%p, <4 x i32> %q) { ; X64-NEXT: je .LBB1_1 ; X64-NEXT: # BB#2: # %entry ; X64-NEXT: xorps %xmm1, %xmm1 -; X64-NEXT: jmp .LBB1_3 +; X64-NEXT: testl %edx, %edx +; X64-NEXT: jne .LBB1_5 +; X64-NEXT: jmp .LBB1_4 ; X64-NEXT: .LBB1_1: ; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X64-NEXT: .LBB1_3: # %entry ; X64-NEXT: testl %edx, %edx ; X64-NEXT: je .LBB1_4 -; X64-NEXT: # BB#5: # %entry +; X64-NEXT: .LBB1_5: # %entry ; X64-NEXT: xorps %xmm2, %xmm2 -; X64-NEXT: jmp .LBB1_6 +; X64-NEXT: testl %r8d, %r8d +; X64-NEXT: jne .LBB1_8 +; X64-NEXT: jmp .LBB1_7 ; X64-NEXT: .LBB1_4: ; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; X64-NEXT: .LBB1_6: # %entry ; X64-NEXT: testl %r8d, %r8d ; X64-NEXT: je .LBB1_7 -; X64-NEXT: # BB#8: # %entry +; X64-NEXT: .LBB1_8: # %entry ; X64-NEXT: xorps %xmm3, %xmm3 ; X64-NEXT: jmp .LBB1_9 ; X64-NEXT: .LBB1_7: diff --git a/llvm/test/CodeGen/X86/tail-dup-merge-loop-headers.ll b/llvm/test/CodeGen/X86/tail-dup-merge-loop-headers.ll new file mode 100644 index 00000000000..2c8c05b6a91 --- /dev/null +++ b/llvm/test/CodeGen/X86/tail-dup-merge-loop-headers.ll @@ -0,0 +1,190 @@ +; RUN: llc -O2 -o - %s | FileCheck %s +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: nounwind uwtable +; CHECK-LABEL: tail_dup_merge_loops +; CHECK: # %entry +; CHECK-NOT: # %{{[a-zA-Z_]+}} +; CHECK: # %inner_loop_exit +; CHECK-NOT: # %{{[a-zA-Z_]+}} +; CHECK: # 
%inner_loop_latch +; CHECK-NOT: # %{{[a-zA-Z_]+}} +; CHECK: # %inner_loop_test +; CHECK-NOT: # %{{[a-zA-Z_]+}} +; CHECK: # %exit +define void @tail_dup_merge_loops(i32 %a, i8* %b, i8* %c) local_unnamed_addr #0 { +entry: +  %notlhs674.i = icmp eq i32 %a, 0 +  br label %outer_loop_top + +outer_loop_top:                                   ; preds = %inner_loop_exit, %entry +  %dst.0.ph.i = phi i8* [ %b, %entry ], [ %scevgep679.i, %inner_loop_exit ] +  br i1 %notlhs674.i, label %exit, label %inner_loop_preheader + +inner_loop_preheader:                             ; preds = %outer_loop_top +  br label %inner_loop_top + +inner_loop_top:                                   ; preds = %inner_loop_latch, %inner_loop_preheader +  %dst.0.i = phi i8* [ %inc, %inner_loop_latch ], [ %dst.0.ph.i, %inner_loop_preheader ] +  %var = load i8, i8* %dst.0.i +  %tobool1.i = icmp slt i8 %var, 0 +  br label %inner_loop_test + +inner_loop_test:                                  ; preds = %inner_loop_top +  br i1 %tobool1.i, label %inner_loop_exit, label %inner_loop_latch + +inner_loop_exit:                                  ; preds = %inner_loop_test +  %scevgep.i = getelementptr i8, i8* %dst.0.i, i64 1 +  %scevgep679.i = getelementptr i8, i8* %scevgep.i, i64 0 +  br label %outer_loop_top + +inner_loop_latch:                                 ; preds = %inner_loop_test +  %cmp75.i = icmp ult i8* %dst.0.i, %c +  %inc = getelementptr i8, i8* %dst.0.i, i64 2 +  br label %inner_loop_top + +exit:                                             ; preds = %outer_loop_top +  ret void +} + +@.str.6 = external unnamed_addr constant [23 x i8], align 1 + +; There is an erroneous check in LoopBase::addBasicBlockToLoop(), where it +; assumes that the header block for a loop is unique. +; For most of compilation this assumption is true, but during layout we allow +; this assumption to be violated. The following code will trigger the bug: 
+; The loops in question are eventually headed by the block shared_loop_header +; +; During layout, the block labeled outer_loop_header gets tail-duplicated into +; outer_loop_latch, and into shared_preheader, and then removed. This leaves +; shared_loop_header as the header of both loops. 
The end result +; is that there are 2 valid loops, and that they share a header. If we re-ran +; the loop analysis, it would classify this as a single loop. +; So far this is fine as far as layout is concerned. +; After layout we tail merge blocks merge_other and merge_predecessor_split. +; We do this even though they share only a single instruction, because +; merge_predecessor_split falls through to their shared successor: +; outer_loop_latch. +; The rest of the blocks in the function are noise unfortunately. Bugpoint +; couldn't shrink the test any further. + +; CHECK-LABEL: loop_shared_header +; CHECK: # %entry +; CHECK: # %shared_preheader +; CHECK: # %shared_loop_header +; CHECK: # %inner_loop_body +; CHECK: # %merge_predecessor_split +; CHECK: # %outer_loop_latch +; CHECK: # %outer_loop_latch +; CHECK: # %cleanup +define i32 @loop_shared_header(i8* %exe, i32 %exesz, i32 %headsize, i32 %min, i32 %wwprva, i32 %e_lfanew, i8* readonly %wwp, i32 %wwpsz, i16 zeroext %sects) local_unnamed_addr #0 { +entry: + %0 = load i32, i32* undef, align 4 + %mul = shl nsw i32 %0, 2 + br i1 undef, label %if.end19, label %cleanup + +if.end19: ; preds = %entry + %conv = zext i32 %mul to i64 + %call = tail call i8* @cli_calloc(i64 %conv, i64 1) + %1 = icmp eq i32 %exesz, 0 + %notrhs = icmp eq i32 %0, 0 + %or.cond117.not = or i1 %1, %notrhs + %or.cond202 = or i1 %or.cond117.not, undef + %cmp35 = icmp ult i8* undef, %exe + %or.cond203 = or i1 %or.cond202, %cmp35 + br i1 %or.cond203, label %cleanup, label %if.end50 + +if.end50: ; preds = %if.end19 + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull %call, i8* undef, i64 %conv, i32 1, i1 false) + %cmp1.i.i = icmp ugt i32 %mul, 3 + br i1 %cmp1.i.i, label %shared_preheader, label %wunpsect.exit.thread.loopexit391 + +shared_preheader: ; preds = %if.end50 + br label %outer_loop_header + +outer_loop_header: ; preds = %outer_loop_latch, %shared_preheader + %bits.1.i = phi i8 [ 32, %shared_preheader ], [ %bits.43.i, %outer_loop_latch ] + 
%dst.0.ph.i = phi i8* [ undef, %shared_preheader ], [ %scevgep679.i, %outer_loop_latch ] + %2 = icmp eq i32 undef, 0 + br i1 %2, label %while.cond.us1412.i, label %shared_loop_header + +while.cond.us1412.i: ; preds = %outer_loop_header + %.pre.i = add i8 %bits.1.i, -1 + %tobool2.us1420.i = icmp eq i8 %.pre.i, 0 + %or.cond.us1421.i = or i1 undef, %tobool2.us1420.i + br i1 %or.cond.us1421.i, label %if.end41.us1436.i, label %cleanup + +if.end41.us1436.i: ; preds = %while.cond.us1412.i + unreachable + +shared_loop_header: ; preds = %dup_early2, %dup_early1 + %dst.0.i = phi i8* [ undef, %inner_loop_body ], [ %dst.0.ph.i, %outer_loop_header ], [ undef, %dead_block ] + %cmp3.i1172.i = icmp ult i8* undef, %call + br i1 %cmp3.i1172.i, label %wunpsect.exit.thread.loopexit389, label %inner_loop_body + +inner_loop_body: ; preds = %shared_loop_header + %3 = icmp slt i32 undef, 0 + br i1 %3, label %if.end96.i, label %shared_loop_header + +dead_block: ; preds = %inner_loop_body + %cmp75.i = icmp ult i8* %dst.0.i, undef + br label %shared_loop_header + +if.end96.i: ; preds = %inner_loop_body + %cmp97.i = icmp ugt i32 undef, 2 + br i1 %cmp97.i, label %if.then99.i, label %if.end287.i + +if.then99.i: ; preds = %if.end96.i + tail call void (i8*, ...) 
@cli_dbgmsg(i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str.6, i64 0, i64 0), i32 undef) + br label %cleanup + +if.end287.i: ; preds = %if.end96.i + %cmp291.i = icmp ne i32 undef, 1 + %conv294.i = select i1 %cmp291.i, i16 4, i16 3 + br i1 undef, label %if.end308.i, label %outer_loop_latch + +if.end308.i: ; preds = %if.end287.i + br i1 undef, label %if.end335.i, label %merge_predecessor_split + +merge_predecessor_split: ; preds = %if.end308.i + %4 = bitcast i8* undef to i32* + br label %outer_loop_latch + +if.end335.i: ; preds = %if.end308.i + br i1 undef, label %outer_loop_latch, label %merge_other + +merge_other: ; preds = %if.end335.i + br label %outer_loop_latch + +outer_loop_latch: ; preds = %merge_other, %if.end335.i, %merge_predecessor_split, %if.end287.i + %bits.43.i = phi i8 [ undef, %if.end287.i ], [ undef, %merge_other ], [ 32, %merge_predecessor_split ], [ 0, %if.end335.i ] + %backsize.0.i = phi i16 [ %conv294.i, %if.end287.i ], [ 0, %merge_other ], [ 0, %merge_predecessor_split ], [ 0, %if.end335.i ] + %5 = add i16 %backsize.0.i, -1 + %6 = zext i16 %5 to i64 + %scevgep.i = getelementptr i8, i8* %dst.0.ph.i, i64 1 + %scevgep679.i = getelementptr i8, i8* %scevgep.i, i64 %6 + br label %outer_loop_header + +wunpsect.exit.thread.loopexit389: ; preds = %shared_loop_header + unreachable + +wunpsect.exit.thread.loopexit391: ; preds = %if.end50 + unreachable + +cleanup: ; preds = %if.then99.i, %while.cond.us1412.i, %if.end19, %entry + %retval.0 = phi i32 [ 0, %if.then99.i ], [ 1, %entry ], [ 1, %if.end19 ], [ 1, %while.cond.us1412.i ] + ret i32 %retval.0 +} + +; Function Attrs: nounwind +declare void @cli_dbgmsg(i8*, ...) 
local_unnamed_addr #0 + +; Function Attrs: nounwind +declare i8* @cli_calloc(i64, i64) local_unnamed_addr #0 + +; Function Attrs: argmemonly nounwind +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32, i1) #1 +attributes #0 = { nounwind } +attributes #1 = { argmemonly nounwind } diff --git a/llvm/test/CodeGen/X86/tail-dup-repeat.ll b/llvm/test/CodeGen/X86/tail-dup-repeat.ll new file mode 100644 index 00000000000..21b48e16efb --- /dev/null +++ b/llvm/test/CodeGen/X86/tail-dup-repeat.ll @@ -0,0 +1,53 @@ +; RUN: llc -O2 -tail-dup-placement-threshold=4 -o - %s | FileCheck %s +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: uwtable +; When tail-duplicating during placement, we work backward from blocks with +; multiple successors. In this case, the block dup1 gets duplicated into dup2 +; and if.then64, and then the block dup2 gets duplicated into land.lhs.true +; and if.end70 +; CHECK-LABEL: repeated_tail_dup: +define void @repeated_tail_dup(i1 %a1, i1 %a2, i32* %a4, i32* %a5, i8* %a6) #0 align 2 { +entry: + br label %for.cond + +; CHECK: {{^}}.[[HEADER:LBB0_[1-9]]]: # %for.cond +for.cond: ; preds = %dup1, %entry + br i1 %a1, label %land.lhs.true, label %if.end56 + +land.lhs.true: ; preds = %for.cond + store i32 10, i32* %a4, align 8 + br label %dup2 + +if.end56: ; preds = %for.cond + br i1 %a2, label %if.then64, label %if.end70 + +if.then64: ; preds = %if.end56 + store i8 1, i8* %a6, align 1 + br label %dup1 + +; CHECK: # %if.end70 +; CHECK-NEXT: # in Loop: +; CHECK-NEXT: movl $12, (%rdx) +; CHECK-NEXT: movl $2, (%rcx) +; CHECK-NEXT: testl %eax, %eax +; CHECK-NEXT: je .[[HEADER]] +if.end70: ; preds = %if.end56 + store i32 12, i32* %a4, align 8 + br label %dup2 + +dup2: ; preds = %if.end70, %land.lhs.true + store i32 2, i32* %a5, align 4 + br label %dup1 + +dup1: ; preds = %dup2, %if.then64 + %val = load i32, i32* %a4, align 8 + %switch = icmp ult 
i32 undef, 1 + br i1 %switch, label %for.cond, label %for.end + +for.end: ; preds = %dup1 + ret void +} + +attributes #0 = { uwtable } diff --git a/llvm/test/CodeGen/X86/update-terminator.mir b/llvm/test/CodeGen/X86/update-terminator.mir index 1e75c6af9eb..2e8e85b4ef6 100644 --- a/llvm/test/CodeGen/X86/update-terminator.mir +++ b/llvm/test/CodeGen/X86/update-terminator.mir @@ -5,17 +5,30 @@ @a = external global i16 @b = external global i32 + declare void @dummy1() + declare void @dummy2() + declare void @dummy3() + ; Function Attrs: nounwind define void @f2() { br i1 undef, label %bb1, label %bb3 bb1: + call void @dummy1() + call void @dummy1() + call void @dummy1() br i1 undef, label %bb2, label %bb2 bb2: + call void @dummy2() + call void @dummy2() + call void @dummy2() br label %bb4 bb3: + call void @dummy3() + call void @dummy3() + call void @dummy3() br label %bb2 bb4: @@ -40,15 +53,24 @@ body: | bb.1: successors: %bb.2(100) + CALL64pcrel32 @dummy1, csr_64, implicit %rsp, implicit-def %rsp + CALL64pcrel32 @dummy1, csr_64, implicit %rsp, implicit-def %rsp + CALL64pcrel32 @dummy1, csr_64, implicit %rsp, implicit-def %rsp JNE_1 %bb.2, implicit %eflags bb.2: successors: %bb.4(100) + CALL64pcrel32 @dummy2, csr_64, implicit %rsp, implicit-def %rsp + CALL64pcrel32 @dummy2, csr_64, implicit %rsp, implicit-def %rsp + CALL64pcrel32 @dummy2, csr_64, implicit %rsp, implicit-def %rsp JMP_1 %bb.4 bb.3: successors: %bb.2(100) + CALL64pcrel32 @dummy3, csr_64, implicit %rsp, implicit-def %rsp + CALL64pcrel32 @dummy3, csr_64, implicit %rsp, implicit-def %rsp + CALL64pcrel32 @dummy3, csr_64, implicit %rsp, implicit-def %rsp JMP_1 %bb.2 bb.4: |