author | Reid Kleckner <rnk@google.com> | 2017-03-01 21:42:00 +0000
---|---|---
committer | Reid Kleckner <rnk@google.com> | 2017-03-01 21:42:00 +0000
commit | f7c0980c10cf76466fdff28770ba6cca837710eb (patch) |
tree | 9f33d9c91f64ffe51335c741bce01bf1f992dd4d /llvm/test/CodeGen |
parent | b7278af54bf3c056de5702fbad4bff07fb28b6fa (diff) |
Elide argument copies during instruction selection
Summary:
Avoids tons of prologue boilerplate when arguments are passed in memory
and left in memory. This can happen in a debug build or in a release
build when an argument alloca is escaped. This will dramatically reduce
the code size of x86 debug builds, because X86 fast isel doesn't handle
arguments passed in memory at all. It only handles the x86_64 case of up
to 6 basic register parameters.
This is implemented by analyzing the entry block before ISel to identify
copy elision candidates. A copy elision candidate is an argument that is
used to fully initialize an alloca before any other possibly escaping
uses of that alloca. If an argument is a copy elision candidate, we set
a flag on the InputArg. If the target generates loads from a fixed
stack object that matches the size and alignment requirements of the
alloca, the SelectionDAG builder will delete the stack object created
for the alloca and replace it with the fixed stack object. The load is
left behind to satisfy any remaining uses of the argument value. The
store is now dead and is therefore elided. The fixed stack object is
also marked as mutable, as it may now be modified by the user, and it
would be invalid to rematerialize the initial load from it.
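
As an illustration (adapted from the @simple test added by this patch, not part of the original commit message), %x.addr below is a copy elision candidate: the incoming argument %x fully initializes the alloca before its first escaping use, so the argument store becomes dead and %x.addr can be replaced with the fixed stack object that already holds %x:

define void @simple(i32 %x) {
entry:
  %x.addr = alloca i32                     ; candidate alloca, becomes the fixed stack object for %x
  store i32 %x, i32* %x.addr               ; fully initializes the alloca; dead after elision
  call void @addrof_i32(i32* %x.addr)      ; first escaping use comes after the store
  ret void
}

declare void @addrof_i32(i32*)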
Supersedes D28388
Fixes PR26328
Reviewers: chandlerc, MatzeB, qcolombet, inglorion, hans
Subscribers: igorb, llvm-commits
Differential Revision: https://reviews.llvm.org/D29668
llvm-svn: 296683
Diffstat (limited to 'llvm/test/CodeGen')
-rw-r--r-- | llvm/test/CodeGen/AArch64/arm64-abi-varargs.ll | 6
-rw-r--r-- | llvm/test/CodeGen/ARM/arg-copy-elide.ll | 61
-rw-r--r-- | llvm/test/CodeGen/Mips/o32_cc_vararg.ll | 4
-rw-r--r-- | llvm/test/CodeGen/X86/2010-04-30-LocalAlloc-LandingPad.ll | 5
-rw-r--r-- | llvm/test/CodeGen/X86/arg-copy-elide.ll | 280
-rw-r--r-- | llvm/test/CodeGen/X86/inline-asm-tied.ll | 30
-rw-r--r-- | llvm/test/CodeGen/X86/pr30430.ll | 56
-rw-r--r-- | llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll | 8
8 files changed, 392 insertions, 58 deletions
diff --git a/llvm/test/CodeGen/AArch64/arm64-abi-varargs.ll b/llvm/test/CodeGen/AArch64/arm64-abi-varargs.ll
index a29f8c4b57a..0a796557148 100644
--- a/llvm/test/CodeGen/AArch64/arm64-abi-varargs.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-abi-varargs.ll
@@ -3,7 +3,7 @@
 ; rdar://13625505
 ; Here we have 9 fixed integer arguments the 9th argument in on stack, the
 ; varargs start right after at 8-byte alignment.
-define void @fn9(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8, i32 %a9, ...) nounwind noinline ssp {
+define void @fn9(i32* %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8, i32 %a9, ...) nounwind noinline ssp {
 ; CHECK-LABEL: fn9:
 ; 9th fixed argument
 ; CHECK: ldr {{w[0-9]+}}, [sp, #64]
@@ -30,7 +30,6 @@ define void @fn9(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7,
   %a10 = alloca i32, align 4
   %a11 = alloca i32, align 4
   %a12 = alloca i32, align 4
-  store i32 %a1, i32* %1, align 4
   store i32 %a2, i32* %2, align 4
   store i32 %a3, i32* %3, align 4
   store i32 %a4, i32* %4, align 4
@@ -39,6 +38,7 @@ define void @fn9(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7,
   store i32 %a7, i32* %7, align 4
   store i32 %a8, i32* %8, align 4
   store i32 %a9, i32* %9, align 4
+  store i32 %a9, i32* %a1
   %10 = bitcast i8** %args to i8*
   call void @llvm.va_start(i8* %10)
   %11 = va_arg i8** %args, i32
@@ -93,7 +93,7 @@ define i32 @main() nounwind ssp {
   %10 = load i32, i32* %a10, align 4
   %11 = load i32, i32* %a11, align 4
   %12 = load i32, i32* %a12, align 4
-  call void (i32, i32, i32, i32, i32, i32, i32, i32, i32, ...) @fn9(i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12)
+  call void (i32*, i32, i32, i32, i32, i32, i32, i32, i32, ...) @fn9(i32* %a1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12)
   ret i32 0
 }
 
diff --git a/llvm/test/CodeGen/ARM/arg-copy-elide.ll b/llvm/test/CodeGen/ARM/arg-copy-elide.ll
new file mode 100644
index 00000000000..739b560b083
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/arg-copy-elide.ll
@@ -0,0 +1,61 @@
+; RUN: llc -mtriple=armv7-linux < %s | FileCheck %s
+
+declare arm_aapcscc void @addrof_i32(i32*)
+declare arm_aapcscc void @addrof_i64(i64*)
+
+define arm_aapcscc void @simple(i32, i32, i32, i32, i32 %x) {
+entry:
+  %x.addr = alloca i32
+  store i32 %x, i32* %x.addr
+  call void @addrof_i32(i32* %x.addr)
+  ret void
+}
+
+; CHECK-LABEL: simple:
+; CHECK: push {r11, lr}
+; CHECK: add r0, sp, #8
+; CHECK: bl addrof_i32
+; CHECK: pop {r11, pc}
+
+
+; We need to load %x before calling addrof_i32 now because it could mutate %x in
+; place.
+
+define arm_aapcscc i32 @use_arg(i32, i32, i32, i32, i32 %x) {
+entry:
+  %x.addr = alloca i32
+  store i32 %x, i32* %x.addr
+  call void @addrof_i32(i32* %x.addr)
+  ret i32 %x
+}
+
+; CHECK-LABEL: use_arg:
+; CHECK: push {[[csr:[^ ]*]], lr}
+; CHECK: ldr [[csr]], [sp, #8]
+; CHECK: add r0, sp, #8
+; CHECK: bl addrof_i32
+; CHECK: mov r0, [[csr]]
+; CHECK: pop {[[csr]], pc}
+
+
+define arm_aapcscc i64 @split_i64(i32, i32, i32, i32, i64 %x) {
+entry:
+  %x.addr = alloca i64, align 4
+  store i64 %x, i64* %x.addr, align 4
+  call void @addrof_i64(i64* %x.addr)
+  ret i64 %x
+}
+
+; CHECK-LABEL: split_i64:
+; CHECK: push {r4, r5, r11, lr}
+; CHECK: sub sp, sp, #8
+; CHECK: ldr r4, [sp, #28]
+; CHECK: ldr r5, [sp, #24]
+; CHECK: mov r0, sp
+; CHECK: str r4, [sp, #4]
+; CHECK: str r5, [sp]
+; CHECK: bl addrof_i64
+; CHECK: mov r0, r5
+; CHECK: mov r1, r4
+; CHECK: add sp, sp, #8
+; CHECK: pop {r4, r5, r11, pc}
diff --git a/llvm/test/CodeGen/Mips/o32_cc_vararg.ll b/llvm/test/CodeGen/Mips/o32_cc_vararg.ll
index 80a1c648b78..73aad48b73e 100644
--- a/llvm/test/CodeGen/Mips/o32_cc_vararg.ll
+++ b/llvm/test/CodeGen/Mips/o32_cc_vararg.ll
@@ -236,8 +236,8 @@ entry:
   ret i32 %tmp
 
 ; CHECK-LABEL: va9:
-; CHECK: addiu $sp, $sp, -32
-; CHECK: lw $2, 52($sp)
+; CHECK: addiu $sp, $sp, -24
+; CHECK: lw $2, 44($sp)
 }
 
 ; double
diff --git a/llvm/test/CodeGen/X86/2010-04-30-LocalAlloc-LandingPad.ll b/llvm/test/CodeGen/X86/2010-04-30-LocalAlloc-LandingPad.ll
index fc5520e12ac..24abb719b0f 100644
--- a/llvm/test/CodeGen/X86/2010-04-30-LocalAlloc-LandingPad.ll
+++ b/llvm/test/CodeGen/X86/2010-04-30-LocalAlloc-LandingPad.ll
@@ -8,9 +8,10 @@
 target triple = "i386-apple-darwin10.0.0"
 @.str = internal constant [4 x i8] c"%p\0A\00" ; <[4 x i8]*> [#uses=1]
 @llvm.used = appending global [1 x i8*] [i8* bitcast (i8* (%struct.S*, i32, %struct.S*)* @_Z4test1SiS_ to i8*)], section "llvm.metadata" ; <[1 x i8*]*> [#uses=0]
-; Verify that %esi gets spilled before the call.
+; Verify that %s1 gets spilled before the call.
 ; CHECK: Z4test1SiS
-; CHECK: movl %esi,{{.*}}(%ebp)
+; CHECK: leal 8(%ebp), %[[reg:[^ ]*]]
+; CHECK: movl %[[reg]],{{.*}}(%ebp) ## 4-byte Spill
 ; CHECK: calll __Z6throwsv
 
 define i8* @_Z4test1SiS_(%struct.S* byval %s1, i32 %n, %struct.S* byval %s2) ssp personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
diff --git a/llvm/test/CodeGen/X86/arg-copy-elide.ll b/llvm/test/CodeGen/X86/arg-copy-elide.ll
new file mode 100644
index 00000000000..15afb56ed69
--- /dev/null
+++ b/llvm/test/CodeGen/X86/arg-copy-elide.ll
@@ -0,0 +1,280 @@
+; RUN: llc -mtriple=i686-windows < %s | FileCheck %s
+
+declare void @addrof_i32(i32*)
+declare void @addrof_i64(i64*)
+declare void @addrof_i128(i128*)
+declare void @addrof_i32_x3(i32*, i32*, i32*)
+
+define void @simple(i32 %x) {
+entry:
+  %x.addr = alloca i32
+  store i32 %x, i32* %x.addr
+  call void @addrof_i32(i32* %x.addr)
+  ret void
+}
+
+; CHECK-LABEL: _simple:
+; CHECK: leal 4(%esp), %[[reg:[^ ]*]]
+; CHECK: pushl %[[reg]]
+; CHECK: calll _addrof_i32
+; CHECK: retl
+
+
+; We need to load %x before calling addrof_i32 now because it could mutate %x in
+; place.
+
+define i32 @use_arg(i32 %x) {
+entry:
+  %x.addr = alloca i32
+  store i32 %x, i32* %x.addr
+  call void @addrof_i32(i32* %x.addr)
+  ret i32 %x
+}
+
+; CHECK-LABEL: _use_arg:
+; CHECK: pushl %[[csr:[^ ]*]]
+; CHECK-DAG: movl 8(%esp), %[[csr]]
+; CHECK-DAG: leal 8(%esp), %[[reg:[^ ]*]]
+; CHECK: pushl %[[reg]]
+; CHECK: calll _addrof_i32
+; CHECK: movl %[[csr]], %eax
+; CHECK: popl %[[csr]]
+; CHECK: retl
+
+
+define i64 @split_i64(i64 %x) {
+entry:
+  %x.addr = alloca i64, align 4
+  store i64 %x, i64* %x.addr, align 4
+  call void @addrof_i64(i64* %x.addr)
+  ret i64 %x
+}
+
+; CHECK-LABEL: _split_i64:
+; CHECK: pushl %ebp
+; CHECK: movl %esp, %ebp
+; CHECK: pushl %[[csr2:[^ ]*]]
+; CHECK: pushl %[[csr1:[^ ]*]]
+; CHECK: andl $-8, %esp
+; CHECK-DAG: movl 8(%ebp), %[[csr1]]
+; CHECK-DAG: movl 12(%ebp), %[[csr2]]
+; CHECK-DAG: leal 8(%ebp), %[[reg:[^ ]*]]
+; CHECK: pushl %[[reg]]
+; CHECK: calll _addrof_i64
+; CHECK-DAG: movl %[[csr1]], %eax
+; CHECK-DAG: movl %[[csr2]], %edx
+; CHECK: leal -8(%ebp), %esp
+; CHECK: popl %[[csr1]]
+; CHECK: popl %[[csr2]]
+; CHECK: popl %ebp
+; CHECK: retl
+
+
+; We can't copy elide when an i64 is split between registers and memory in a
+; fastcc function.
+
+define fastcc i64 @fastcc_split_i64(i64* %p, i64 %x) {
+entry:
+  %x.addr = alloca i64, align 4
+  store i64 %x, i64* %x.addr, align 4
+  call void @addrof_i64(i64* %x.addr)
+  ret i64 %x
+}
+
+; CHECK-LABEL: _fastcc_split_i64:
+; CHECK: pushl %ebp
+; CHECK: movl %esp, %ebp
+; CHECK-DAG: movl %edx, %[[r1:[^ ]*]]
+; CHECK-DAG: movl 8(%ebp), %[[r2:[^ ]*]]
+; CHECK-DAG: movl %[[r2]], 4(%esp)
+; CHECK-DAG: movl %[[r1]], (%esp)
+; CHECK: movl %esp, %[[reg:[^ ]*]]
+; CHECK: pushl %[[reg]]
+; CHECK: calll _addrof_i64
+; CHECK: popl %ebp
+; CHECK: retl
+
+
+; We can't copy elide when it would reduce the user requested alignment.
+
+define void @high_alignment(i32 %x) {
+entry:
+  %x.p = alloca i32, align 128
+  store i32 %x, i32* %x.p
+  call void @addrof_i32(i32* %x.p)
+  ret void
+}
+
+; CHECK-LABEL: _high_alignment:
+; CHECK: andl $-128, %esp
+; CHECK: movl 8(%ebp), %[[reg:[^ ]*]]
+; CHECK: movl %[[reg]], (%esp)
+; CHECK: movl %esp, %[[reg:[^ ]*]]
+; CHECK: pushl %[[reg]]
+; CHECK: calll _addrof_i32
+; CHECK: retl
+
+
+; We can't copy elide when it would reduce the ABI required alignment.
+; FIXME: We should lower the ABI alignment of i64 on Windows, since MSVC
+; doesn't guarantee it.
+
+define void @abi_alignment(i64 %x) {
+entry:
+  %x.p = alloca i64
+  store i64 %x, i64* %x.p
+  call void @addrof_i64(i64* %x.p)
+  ret void
+}
+
+; CHECK-LABEL: _abi_alignment:
+; CHECK: andl $-8, %esp
+; CHECK: movl 8(%ebp), %[[reg:[^ ]*]]
+; CHECK: movl %[[reg]], (%esp)
+; CHECK: movl %esp, %[[reg:[^ ]*]]
+; CHECK: pushl %[[reg]]
+; CHECK: calll _addrof_i64
+; CHECK: retl
+
+
+; The code we generate for this is unimportant. This is mostly a crash test.
+
+define void @split_i128(i128* %sret, i128 %x) {
+entry:
+  %x.addr = alloca i128
+  store i128 %x, i128* %x.addr
+  call void @addrof_i128(i128* %x.addr)
+  store i128 %x, i128* %sret
+  ret void
+}
+
+; CHECK-LABEL: _split_i128:
+; CHECK: pushl %ebp
+; CHECK: calll _addrof_i128
+; CHECK: retl
+
+
+; Check that we load all of x, y, and z before the call.
+
+define i32 @three_args(i32 %x, i32 %y, i32 %z) {
+entry:
+  %z.addr = alloca i32, align 4
+  %y.addr = alloca i32, align 4
+  %x.addr = alloca i32, align 4
+  store i32 %z, i32* %z.addr, align 4
+  store i32 %y, i32* %y.addr, align 4
+  store i32 %x, i32* %x.addr, align 4
+  call void @addrof_i32_x3(i32* %x.addr, i32* %y.addr, i32* %z.addr)
+  %s1 = add i32 %x, %y
+  %sum = add i32 %s1, %z
+  ret i32 %sum
+}
+
+; CHECK-LABEL: _three_args:
+; CHECK: pushl %[[csr:[^ ]*]]
+; CHECK-DAG: movl {{[0-9]+}}(%esp), %[[csr]]
+; CHECK-DAG: addl {{[0-9]+}}(%esp), %[[csr]]
+; CHECK-DAG: addl {{[0-9]+}}(%esp), %[[csr]]
+; CHECK-DAG: leal 8(%esp), %[[x:[^ ]*]]
+; CHECK-DAG: leal 12(%esp), %[[y:[^ ]*]]
+; CHECK-DAG: leal 16(%esp), %[[z:[^ ]*]]
+; CHECK: pushl %[[z]]
+; CHECK: pushl %[[y]]
+; CHECK: pushl %[[x]]
+; CHECK: calll _addrof_i32_x3
+; CHECK: movl %[[csr]], %eax
+; CHECK: popl %[[csr]]
+; CHECK: retl
+
+
+define void @two_args_same_alloca(i32 %x, i32 %y) {
+entry:
+  %x.addr = alloca i32
+  store i32 %x, i32* %x.addr
+  store i32 %y, i32* %x.addr
+  call void @addrof_i32(i32* %x.addr)
+  ret void
+}
+
+; CHECK-LABEL: _two_args_same_alloca:
+; CHECK: movl 8(%esp), {{.*}}
+; CHECK: movl {{.*}}, 4(%esp)
+; CHECK: leal 4(%esp), %[[reg:[^ ]*]]
+; CHECK: pushl %[[reg]]
+; CHECK: calll _addrof_i32
+; CHECK: retl
+
+
+define void @avoid_byval(i32* byval %x) {
+entry:
+  %x.p.p = alloca i32*
+  store i32* %x, i32** %x.p.p
+  call void @addrof_i32(i32* %x)
+  ret void
+}
+
+; CHECK-LABEL: _avoid_byval:
+; CHECK: leal {{[0-9]+}}(%esp), %[[reg:[^ ]*]]
+; CHECK: pushl %[[reg]]
+; CHECK: calll _addrof_i32
+; CHECK: retl
+
+
+define void @avoid_inalloca(i32* inalloca %x) {
+entry:
+  %x.p.p = alloca i32*
+  store i32* %x, i32** %x.p.p
+  call void @addrof_i32(i32* %x)
+  ret void
+}
+
+; CHECK-LABEL: _avoid_inalloca:
+; CHECK: leal {{[0-9]+}}(%esp), %[[reg:[^ ]*]]
+; CHECK: pushl %[[reg]]
+; CHECK: calll _addrof_i32
+; CHECK: retl
+
+
+; Don't elide the copy when the alloca is escaped with a store.
+
+define void @escape_with_store(i32 %x) {
+  %x1 = alloca i32
+  %x2 = alloca i32*
+  store i32* %x1, i32** %x2
+  %x3 = load i32*, i32** %x2
+  store i32 0, i32* %x3
+  store i32 %x, i32* %x1
+  call void @addrof_i32(i32* %x1)
+  ret void
+}
+
+; CHECK-LABEL: _escape_with_store:
+; CHECK-DAG: movl {{.*}}(%esp), %[[reg:[^ ]*]]
+; CHECK-DAG: movl $0, [[offs:[0-9]*]](%esp)
+; CHECK: movl %[[reg]], [[offs]](%esp)
+; CHECK: calll _addrof_i32
+
+
+; This test case exposed issues with the use of TokenFactor.
+
+define void @sret_and_elide(i32* sret %sret, i32 %v) {
+  %v.p = alloca i32
+  store i32 %v, i32* %v.p
+  call void @addrof_i32(i32* %v.p)
+  store i32 %v, i32* %sret
+  ret void
+}
+
+; CHECK-LABEL: _sret_and_elide:
+; CHECK: pushl
+; CHECK: pushl
+; CHECK: movl 12(%esp), %[[sret:[^ ]*]]
+; CHECK: movl 16(%esp), %[[v:[^ ]*]]
+; CHECK: leal 16(%esp), %[[reg:[^ ]*]]
+; CHECK: pushl %[[reg]]
+; CHECK: calll _addrof_i32
+; CHECK: movl %[[v]], (%[[sret]])
+; CHECK: movl %[[sret]], %eax
+; CHECK: popl
+; CHECK: popl
+; CHECK: retl
diff --git a/llvm/test/CodeGen/X86/inline-asm-tied.ll b/llvm/test/CodeGen/X86/inline-asm-tied.ll
index 25853579a4b..db63a804883 100644
--- a/llvm/test/CodeGen/X86/inline-asm-tied.ll
+++ b/llvm/test/CodeGen/X86/inline-asm-tied.ll
@@ -1,31 +1,27 @@
 ; RUN: llc < %s -mtriple=i386-apple-darwin9 -O0 -optimize-regalloc -regalloc=basic -no-integrated-as | FileCheck %s
 ; rdar://6992609
 
-; CHECK: movl %ecx, 4([[ESP:%e..]])
-; CHECK: movl 4([[ESP]]), [[EDX:%e..]]
-; CHECK: movl [[EDX]], 4([[ESP]])
 target triple = "i386-apple-darwin9.0"
-@llvm.used = appending global [1 x i8*] [i8* bitcast (i64 (i64)* @_OSSwapInt64 to i8*)], section "llvm.metadata" ; <[1 x i8*]*> [#uses=0]
 
 define i64 @_OSSwapInt64(i64 %_data) nounwind {
 entry:
-  %retval = alloca i64 ; <i64*> [#uses=2]
-  %_data.addr = alloca i64 ; <i64*> [#uses=4]
-  store i64 %_data, i64* %_data.addr
-  %tmp = load i64, i64* %_data.addr ; <i64> [#uses=1]
-  %0 = call i64 asm "bswap %eax\0A\09bswap %edx\0A\09xchgl %eax, %edx", "=A,0,~{dirflag},~{fpsr},~{flags}"(i64 %tmp) nounwind ; <i64> [#uses=1]
-  store i64 %0, i64* %_data.addr
-  %tmp1 = load i64, i64* %_data.addr ; <i64> [#uses=1]
-  store i64 %tmp1, i64* %retval
-  %1 = load i64, i64* %retval ; <i64> [#uses=1]
-  ret i64 %1
+  %0 = call i64 asm "bswap %eax\0A\09bswap %edx\0A\09xchgl %eax, %%edx", "=A,0,~{dirflag},~{fpsr},~{flags}"(i64 %_data) nounwind
+  ret i64 %0
 }
 
+; CHECK-LABEL: __OSSwapInt64:
+; CHECK-DAG: movl 8(%esp), %edx
+; CHECK-DAG: movl 4(%esp), %eax
+; CHECK: ## InlineAsm Start
+; CHECK: ## InlineAsm End
+; Everything is set up in EAX:EDX, return immediately.
+; CHECK-NEXT: retl
+
 ; The tied operands are not necessarily in the same order as the defs.
 ; PR13742
 define i64 @swapped(i64 %x, i64 %y) nounwind {
 entry:
-  %x0 = call { i64, i64 } asm "foo", "=r,=r,1,0,~{dirflag},~{fpsr},~{flags}"(i64 %x, i64 %y) nounwind
-  %x1 = extractvalue { i64, i64 } %x0, 0
-  ret i64 %x1
+  %x0 = call { i64, i64 } asm "foo", "=r,=r,1,0,~{dirflag},~{fpsr},~{flags}"(i64 %x, i64 %y) nounwind
+  %x1 = extractvalue { i64, i64 } %x0, 0
+  ret i64 %x1
 }
diff --git a/llvm/test/CodeGen/X86/pr30430.ll b/llvm/test/CodeGen/X86/pr30430.ll
index 6aa4c91c4a8..14d81f14fc3 100644
--- a/llvm/test/CodeGen/X86/pr30430.ll
+++ b/llvm/test/CodeGen/X86/pr30430.ll
@@ -30,14 +30,6 @@ define <16 x float> @makefloat(float %f1, float %f2, float %f3, float %f4, float
 ; CHECK-NEXT: vmovss %xmm5, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: vmovss %xmm6, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: vmovss %xmm7, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: vmovss %xmm15, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: vmovss %xmm14, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: vmovss %xmm13, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: vmovss %xmm12, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: vmovss %xmm11, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: vmovss %xmm10, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: vmovss %xmm9, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: vmovss %xmm8, (%rsp)
 ; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; CHECK-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; CHECK-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
@@ -46,14 +38,14 @@ define <16 x float> @makefloat(float %f1, float %f2, float %f3, float %f4, float
 ; CHECK-NEXT: vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero
 ; CHECK-NEXT: vmovss {{.*#+}} xmm6 = mem[0],zero,zero,zero
 ; CHECK-NEXT: vmovss {{.*#+}} xmm7 = mem[0],zero,zero,zero
-; CHECK-NEXT: vmovss {{.*#+}} xmm8 = mem[0],zero,zero,zero
-; CHECK-NEXT: vmovss {{.*#+}} xmm9 = mem[0],zero,zero,zero
-; CHECK-NEXT: vmovss {{.*#+}} xmm10 = mem[0],zero,zero,zero
-; CHECK-NEXT: vmovss {{.*#+}} xmm11 = mem[0],zero,zero,zero
-; CHECK-NEXT: vmovss {{.*#+}} xmm12 = mem[0],zero,zero,zero
-; CHECK-NEXT: vmovss {{.*#+}} xmm13 = mem[0],zero,zero,zero
-; CHECK-NEXT: vmovss {{.*#+}} xmm14 = mem[0],zero,zero,zero
-; CHECK-NEXT: vmovss {{.*#+}} xmm15 = mem[0],zero,zero,zero
+; CHECK-NEXT: vmovss {{.*#+}} xmm16 = mem[0],zero,zero,zero
+; CHECK-NEXT: vmovss {{.*#+}} xmm17 = mem[0],zero,zero,zero
+; CHECK-NEXT: vmovss {{.*#+}} xmm18 = mem[0],zero,zero,zero
+; CHECK-NEXT: vmovss {{.*#+}} xmm19 = mem[0],zero,zero,zero
+; CHECK-NEXT: vmovss {{.*#+}} xmm20 = mem[0],zero,zero,zero
+; CHECK-NEXT: vmovss {{.*#+}} xmm21 = mem[0],zero,zero,zero
+; CHECK-NEXT: vmovss {{.*#+}} xmm22 = mem[0],zero,zero,zero
+; CHECK-NEXT: vmovss {{.*#+}} xmm23 = mem[0],zero,zero,zero
 ; CHECK-NEXT: vmovss %xmm0, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: vmovss %xmm1, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: vmovss %xmm2, {{[0-9]+}}(%rsp)
@@ -62,14 +54,14 @@ define <16 x float> @makefloat(float %f1, float %f2, float %f3, float %f4, float
 ; CHECK-NEXT: vmovss %xmm5, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: vmovss %xmm6, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: vmovss %xmm7, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: vmovss %xmm8, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: vmovss %xmm9, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: vmovss %xmm10, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: vmovss %xmm11, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: vmovss %xmm12, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: vmovss %xmm13, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: vmovss %xmm14, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: vmovss %xmm15, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: vmovss %xmm16, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: vmovss %xmm17, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: vmovss %xmm18, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: vmovss %xmm19, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: vmovss %xmm20, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: vmovss %xmm21, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: vmovss %xmm22, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: vmovss %xmm23, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; CHECK-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
@@ -104,11 +96,19 @@ define <16 x float> @makefloat(float %f1, float %f2, float %f3, float %f4, float
 ; CHECK-NEXT: # implicit-def: %YMM3
 ; CHECK-NEXT: vmovaps %xmm1, %xmm3
 ; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm3
-; CHECK-NEXT: # implicit-def: %ZMM16
-; CHECK-NEXT: vmovaps %zmm3, %zmm16
-; CHECK-NEXT: vinsertf64x4 $1, %ymm2, %zmm16, %zmm16
-; CHECK-NEXT: vmovaps %zmm16, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: # implicit-def: %ZMM24
+; CHECK-NEXT: vmovaps %zmm3, %zmm24
+; CHECK-NEXT: vinsertf64x4 $1, %ymm2, %zmm24, %zmm24
+; CHECK-NEXT: vmovaps %zmm24, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %zmm0
+; CHECK-NEXT: vmovss %xmm15, {{[0-9]+}}(%rsp) # 4-byte Spill
+; CHECK-NEXT: vmovss %xmm8, {{[0-9]+}}(%rsp) # 4-byte Spill
+; CHECK-NEXT: vmovss %xmm9, {{[0-9]+}}(%rsp) # 4-byte Spill
+; CHECK-NEXT: vmovss %xmm10, {{[0-9]+}}(%rsp) # 4-byte Spill
+; CHECK-NEXT: vmovss %xmm11, {{[0-9]+}}(%rsp) # 4-byte Spill
+; CHECK-NEXT: vmovss %xmm12, {{[0-9]+}}(%rsp) # 4-byte Spill
+; CHECK-NEXT: vmovss %xmm13, {{[0-9]+}}(%rsp) # 4-byte Spill
+; CHECK-NEXT: vmovss %xmm14, (%rsp) # 4-byte Spill
 ; CHECK-NEXT: movq %rbp, %rsp
 ; CHECK-NEXT: popq %rbp
 ; CHECK-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
index 18434546262..7f040dd1a7f 100644
--- a/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
@@ -1653,12 +1653,8 @@ define <4 x float> @test_mm_set1_ps(float %a0) nounwind {
 define void @test_mm_setcsr(i32 %a0) nounwind {
 ; X32-LABEL: test_mm_setcsr:
 ; X32: # BB#0:
-; X32-NEXT: pushl %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %esp, %ecx
-; X32-NEXT: movl %eax, (%esp)
-; X32-NEXT: ldmxcsr (%ecx)
-; X32-NEXT: popl %eax
+; X32-NEXT: leal 4(%esp), %eax
+; X32-NEXT: ldmxcsr (%eax)
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: test_mm_setcsr: