author     Reid Kleckner <rnk@google.com>   2017-03-01 21:42:00 +0000
committer  Reid Kleckner <rnk@google.com>   2017-03-01 21:42:00 +0000
commit     f7c0980c10cf76466fdff28770ba6cca837710eb (patch)
tree       9f33d9c91f64ffe51335c741bce01bf1f992dd4d /llvm/test/CodeGen
parent     b7278af54bf3c056de5702fbad4bff07fb28b6fa (diff)
Elide argument copies during instruction selection
Summary:
Avoids tons of prologue boilerplate when arguments are passed in memory and left in memory. This can happen in a debug build or in a release build when an argument alloca is escaped. This will dramatically affect the code size of x86 debug builds, because X86 fast isel doesn't handle arguments passed in memory at all. It only handles the x86_64 case of up to 6 basic register parameters.

This is implemented by analyzing the entry block before ISel to identify copy elision candidates. A copy elision candidate is an argument that is used to fully initialize an alloca before any other possibly escaping uses of that alloca. If an argument is a copy elision candidate, we set a flag on the InputArg. If the target generates loads from a fixed stack object that matches the size and alignment requirements of the alloca, the SelectionDAG builder will delete the stack object created for the alloca and replace it with the fixed stack object. The load is left behind to satisfy any remaining uses of the argument value. The store is now dead and is therefore elided. The fixed stack object is also marked as mutable, as it may now be modified by the user, and it would be invalid to rematerialize the initial load from it.

Supersedes D28388

Fixes PR26328

Reviewers: chandlerc, MatzeB, qcolombet, inglorion, hans

Subscribers: igorb, llvm-commits

Differential Revision: https://reviews.llvm.org/D29668

llvm-svn: 296683
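A minimal sketch of the candidate pattern (mirroring the @simple test added below; the function and @escape callee names are illustrative, not from the commit): the argument fully initializes the alloca before the alloca's address can escape, so the alloca is replaced with the argument's fixed stack object and the store is elided.

define void @f(i32 %x) {
entry:
  %x.addr = alloca i32
  ; %x fully initializes %x.addr before any other possibly escaping use,
  ; so %x.addr qualifies for copy elision and this store becomes dead.
  store i32 %x, i32* %x.addr
  ; First possibly escaping use of the alloca; after elision it receives
  ; the address of %x's (now mutable) fixed stack object instead.
  call void @escape(i32* %x.addr)
  ret void
}

declare void @escape(i32*)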
Diffstat (limited to 'llvm/test/CodeGen')
-rw-r--r--  llvm/test/CodeGen/AArch64/arm64-abi-varargs.ll             |   6
-rw-r--r--  llvm/test/CodeGen/ARM/arg-copy-elide.ll                    |  61
-rw-r--r--  llvm/test/CodeGen/Mips/o32_cc_vararg.ll                    |   4
-rw-r--r--  llvm/test/CodeGen/X86/2010-04-30-LocalAlloc-LandingPad.ll  |   5
-rw-r--r--  llvm/test/CodeGen/X86/arg-copy-elide.ll                    | 280
-rw-r--r--  llvm/test/CodeGen/X86/inline-asm-tied.ll                   |  30
-rw-r--r--  llvm/test/CodeGen/X86/pr30430.ll                           |  56
-rw-r--r--  llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll          |   8
8 files changed, 392 insertions, 58 deletions
diff --git a/llvm/test/CodeGen/AArch64/arm64-abi-varargs.ll b/llvm/test/CodeGen/AArch64/arm64-abi-varargs.ll
index a29f8c4b57a..0a796557148 100644
--- a/llvm/test/CodeGen/AArch64/arm64-abi-varargs.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-abi-varargs.ll
@@ -3,7 +3,7 @@
; rdar://13625505
; Here we have 9 fixed integer arguments, with the 9th argument passed on the
; stack; the varargs start right after it at 8-byte alignment.
-define void @fn9(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8, i32 %a9, ...) nounwind noinline ssp {
+define void @fn9(i32* %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8, i32 %a9, ...) nounwind noinline ssp {
; CHECK-LABEL: fn9:
; 9th fixed argument
; CHECK: ldr {{w[0-9]+}}, [sp, #64]
@@ -30,7 +30,6 @@ define void @fn9(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7,
%a10 = alloca i32, align 4
%a11 = alloca i32, align 4
%a12 = alloca i32, align 4
- store i32 %a1, i32* %1, align 4
store i32 %a2, i32* %2, align 4
store i32 %a3, i32* %3, align 4
store i32 %a4, i32* %4, align 4
@@ -39,6 +38,7 @@ define void @fn9(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7,
store i32 %a7, i32* %7, align 4
store i32 %a8, i32* %8, align 4
store i32 %a9, i32* %9, align 4
+ store i32 %a9, i32* %a1
%10 = bitcast i8** %args to i8*
call void @llvm.va_start(i8* %10)
%11 = va_arg i8** %args, i32
@@ -93,7 +93,7 @@ define i32 @main() nounwind ssp {
%10 = load i32, i32* %a10, align 4
%11 = load i32, i32* %a11, align 4
%12 = load i32, i32* %a12, align 4
- call void (i32, i32, i32, i32, i32, i32, i32, i32, i32, ...) @fn9(i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12)
+ call void (i32*, i32, i32, i32, i32, i32, i32, i32, i32, ...) @fn9(i32* %a1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12)
ret i32 0
}
diff --git a/llvm/test/CodeGen/ARM/arg-copy-elide.ll b/llvm/test/CodeGen/ARM/arg-copy-elide.ll
new file mode 100644
index 00000000000..739b560b083
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/arg-copy-elide.ll
@@ -0,0 +1,61 @@
+; RUN: llc -mtriple=armv7-linux < %s | FileCheck %s
+
+declare arm_aapcscc void @addrof_i32(i32*)
+declare arm_aapcscc void @addrof_i64(i64*)
+
+define arm_aapcscc void @simple(i32, i32, i32, i32, i32 %x) {
+entry:
+ %x.addr = alloca i32
+ store i32 %x, i32* %x.addr
+ call void @addrof_i32(i32* %x.addr)
+ ret void
+}
+
+; CHECK-LABEL: simple:
+; CHECK: push {r11, lr}
+; CHECK: add r0, sp, #8
+; CHECK: bl addrof_i32
+; CHECK: pop {r11, pc}
+
+
+; We need to load %x before calling addrof_i32 now because it could mutate %x in
+; place.
+
+define arm_aapcscc i32 @use_arg(i32, i32, i32, i32, i32 %x) {
+entry:
+ %x.addr = alloca i32
+ store i32 %x, i32* %x.addr
+ call void @addrof_i32(i32* %x.addr)
+ ret i32 %x
+}
+
+; CHECK-LABEL: use_arg:
+; CHECK: push {[[csr:[^ ]*]], lr}
+; CHECK: ldr [[csr]], [sp, #8]
+; CHECK: add r0, sp, #8
+; CHECK: bl addrof_i32
+; CHECK: mov r0, [[csr]]
+; CHECK: pop {[[csr]], pc}
+
+
+define arm_aapcscc i64 @split_i64(i32, i32, i32, i32, i64 %x) {
+entry:
+ %x.addr = alloca i64, align 4
+ store i64 %x, i64* %x.addr, align 4
+ call void @addrof_i64(i64* %x.addr)
+ ret i64 %x
+}
+
+; CHECK-LABEL: split_i64:
+; CHECK: push {r4, r5, r11, lr}
+; CHECK: sub sp, sp, #8
+; CHECK: ldr r4, [sp, #28]
+; CHECK: ldr r5, [sp, #24]
+; CHECK: mov r0, sp
+; CHECK: str r4, [sp, #4]
+; CHECK: str r5, [sp]
+; CHECK: bl addrof_i64
+; CHECK: mov r0, r5
+; CHECK: mov r1, r4
+; CHECK: add sp, sp, #8
+; CHECK: pop {r4, r5, r11, pc}
diff --git a/llvm/test/CodeGen/Mips/o32_cc_vararg.ll b/llvm/test/CodeGen/Mips/o32_cc_vararg.ll
index 80a1c648b78..73aad48b73e 100644
--- a/llvm/test/CodeGen/Mips/o32_cc_vararg.ll
+++ b/llvm/test/CodeGen/Mips/o32_cc_vararg.ll
@@ -236,8 +236,8 @@ entry:
ret i32 %tmp
; CHECK-LABEL: va9:
-; CHECK: addiu $sp, $sp, -32
-; CHECK: lw $2, 52($sp)
+; CHECK: addiu $sp, $sp, -24
+; CHECK: lw $2, 44($sp)
}
; double
diff --git a/llvm/test/CodeGen/X86/2010-04-30-LocalAlloc-LandingPad.ll b/llvm/test/CodeGen/X86/2010-04-30-LocalAlloc-LandingPad.ll
index fc5520e12ac..24abb719b0f 100644
--- a/llvm/test/CodeGen/X86/2010-04-30-LocalAlloc-LandingPad.ll
+++ b/llvm/test/CodeGen/X86/2010-04-30-LocalAlloc-LandingPad.ll
@@ -8,9 +8,10 @@ target triple = "i386-apple-darwin10.0.0"
@.str = internal constant [4 x i8] c"%p\0A\00" ; <[4 x i8]*> [#uses=1]
@llvm.used = appending global [1 x i8*] [i8* bitcast (i8* (%struct.S*, i32, %struct.S*)* @_Z4test1SiS_ to i8*)], section "llvm.metadata" ; <[1 x i8*]*> [#uses=0]
-; Verify that %esi gets spilled before the call.
+; Verify that %s1 gets spilled before the call.
; CHECK: Z4test1SiS
-; CHECK: movl %esi,{{.*}}(%ebp)
+; CHECK: leal 8(%ebp), %[[reg:[^ ]*]]
+; CHECK: movl %[[reg]],{{.*}}(%ebp) ## 4-byte Spill
; CHECK: calll __Z6throwsv
define i8* @_Z4test1SiS_(%struct.S* byval %s1, i32 %n, %struct.S* byval %s2) ssp personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
diff --git a/llvm/test/CodeGen/X86/arg-copy-elide.ll b/llvm/test/CodeGen/X86/arg-copy-elide.ll
new file mode 100644
index 00000000000..15afb56ed69
--- /dev/null
+++ b/llvm/test/CodeGen/X86/arg-copy-elide.ll
@@ -0,0 +1,280 @@
+; RUN: llc -mtriple=i686-windows < %s | FileCheck %s
+
+declare void @addrof_i32(i32*)
+declare void @addrof_i64(i64*)
+declare void @addrof_i128(i128*)
+declare void @addrof_i32_x3(i32*, i32*, i32*)
+
+define void @simple(i32 %x) {
+entry:
+ %x.addr = alloca i32
+ store i32 %x, i32* %x.addr
+ call void @addrof_i32(i32* %x.addr)
+ ret void
+}
+
+; CHECK-LABEL: _simple:
+; CHECK: leal 4(%esp), %[[reg:[^ ]*]]
+; CHECK: pushl %[[reg]]
+; CHECK: calll _addrof_i32
+; CHECK: retl
+
+
+; We need to load %x before calling addrof_i32 now because it could mutate %x in
+; place.
+
+define i32 @use_arg(i32 %x) {
+entry:
+ %x.addr = alloca i32
+ store i32 %x, i32* %x.addr
+ call void @addrof_i32(i32* %x.addr)
+ ret i32 %x
+}
+
+; CHECK-LABEL: _use_arg:
+; CHECK: pushl %[[csr:[^ ]*]]
+; CHECK-DAG: movl 8(%esp), %[[csr]]
+; CHECK-DAG: leal 8(%esp), %[[reg:[^ ]*]]
+; CHECK: pushl %[[reg]]
+; CHECK: calll _addrof_i32
+; CHECK: movl %[[csr]], %eax
+; CHECK: popl %[[csr]]
+; CHECK: retl
+
+
+define i64 @split_i64(i64 %x) {
+entry:
+ %x.addr = alloca i64, align 4
+ store i64 %x, i64* %x.addr, align 4
+ call void @addrof_i64(i64* %x.addr)
+ ret i64 %x
+}
+
+; CHECK-LABEL: _split_i64:
+; CHECK: pushl %ebp
+; CHECK: movl %esp, %ebp
+; CHECK: pushl %[[csr2:[^ ]*]]
+; CHECK: pushl %[[csr1:[^ ]*]]
+; CHECK: andl $-8, %esp
+; CHECK-DAG: movl 8(%ebp), %[[csr1]]
+; CHECK-DAG: movl 12(%ebp), %[[csr2]]
+; CHECK-DAG: leal 8(%ebp), %[[reg:[^ ]*]]
+; CHECK: pushl %[[reg]]
+; CHECK: calll _addrof_i64
+; CHECK-DAG: movl %[[csr1]], %eax
+; CHECK-DAG: movl %[[csr2]], %edx
+; CHECK: leal -8(%ebp), %esp
+; CHECK: popl %[[csr1]]
+; CHECK: popl %[[csr2]]
+; CHECK: popl %ebp
+; CHECK: retl
+
+
+; We can't copy elide when an i64 is split between registers and memory in a
+; fastcc function.
+
+define fastcc i64 @fastcc_split_i64(i64* %p, i64 %x) {
+entry:
+ %x.addr = alloca i64, align 4
+ store i64 %x, i64* %x.addr, align 4
+ call void @addrof_i64(i64* %x.addr)
+ ret i64 %x
+}
+
+; CHECK-LABEL: _fastcc_split_i64:
+; CHECK: pushl %ebp
+; CHECK: movl %esp, %ebp
+; CHECK-DAG: movl %edx, %[[r1:[^ ]*]]
+; CHECK-DAG: movl 8(%ebp), %[[r2:[^ ]*]]
+; CHECK-DAG: movl %[[r2]], 4(%esp)
+; CHECK-DAG: movl %[[r1]], (%esp)
+; CHECK: movl %esp, %[[reg:[^ ]*]]
+; CHECK: pushl %[[reg]]
+; CHECK: calll _addrof_i64
+; CHECK: popl %ebp
+; CHECK: retl
+
+
+; We can't copy elide when it would reduce the user requested alignment.
+
+define void @high_alignment(i32 %x) {
+entry:
+ %x.p = alloca i32, align 128
+ store i32 %x, i32* %x.p
+ call void @addrof_i32(i32* %x.p)
+ ret void
+}
+
+; CHECK-LABEL: _high_alignment:
+; CHECK: andl $-128, %esp
+; CHECK: movl 8(%ebp), %[[reg:[^ ]*]]
+; CHECK: movl %[[reg]], (%esp)
+; CHECK: movl %esp, %[[reg:[^ ]*]]
+; CHECK: pushl %[[reg]]
+; CHECK: calll _addrof_i32
+; CHECK: retl
+
+
+; We can't copy elide when it would reduce the ABI required alignment.
+; FIXME: We should lower the ABI alignment of i64 on Windows, since MSVC
+; doesn't guarantee it.
+
+define void @abi_alignment(i64 %x) {
+entry:
+ %x.p = alloca i64
+ store i64 %x, i64* %x.p
+ call void @addrof_i64(i64* %x.p)
+ ret void
+}
+
+; CHECK-LABEL: _abi_alignment:
+; CHECK: andl $-8, %esp
+; CHECK: movl 8(%ebp), %[[reg:[^ ]*]]
+; CHECK: movl %[[reg]], (%esp)
+; CHECK: movl %esp, %[[reg:[^ ]*]]
+; CHECK: pushl %[[reg]]
+; CHECK: calll _addrof_i64
+; CHECK: retl
+
+
+; The code we generate for this is unimportant. This is mostly a crash test.
+
+define void @split_i128(i128* %sret, i128 %x) {
+entry:
+ %x.addr = alloca i128
+ store i128 %x, i128* %x.addr
+ call void @addrof_i128(i128* %x.addr)
+ store i128 %x, i128* %sret
+ ret void
+}
+
+; CHECK-LABEL: _split_i128:
+; CHECK: pushl %ebp
+; CHECK: calll _addrof_i128
+; CHECK: retl
+
+
+; Check that we load all of x, y, and z before the call.
+
+define i32 @three_args(i32 %x, i32 %y, i32 %z) {
+entry:
+ %z.addr = alloca i32, align 4
+ %y.addr = alloca i32, align 4
+ %x.addr = alloca i32, align 4
+ store i32 %z, i32* %z.addr, align 4
+ store i32 %y, i32* %y.addr, align 4
+ store i32 %x, i32* %x.addr, align 4
+ call void @addrof_i32_x3(i32* %x.addr, i32* %y.addr, i32* %z.addr)
+ %s1 = add i32 %x, %y
+ %sum = add i32 %s1, %z
+ ret i32 %sum
+}
+
+; CHECK-LABEL: _three_args:
+; CHECK: pushl %[[csr:[^ ]*]]
+; CHECK-DAG: movl {{[0-9]+}}(%esp), %[[csr]]
+; CHECK-DAG: addl {{[0-9]+}}(%esp), %[[csr]]
+; CHECK-DAG: addl {{[0-9]+}}(%esp), %[[csr]]
+; CHECK-DAG: leal 8(%esp), %[[x:[^ ]*]]
+; CHECK-DAG: leal 12(%esp), %[[y:[^ ]*]]
+; CHECK-DAG: leal 16(%esp), %[[z:[^ ]*]]
+; CHECK: pushl %[[z]]
+; CHECK: pushl %[[y]]
+; CHECK: pushl %[[x]]
+; CHECK: calll _addrof_i32_x3
+; CHECK: movl %[[csr]], %eax
+; CHECK: popl %[[csr]]
+; CHECK: retl
+
+
+define void @two_args_same_alloca(i32 %x, i32 %y) {
+entry:
+ %x.addr = alloca i32
+ store i32 %x, i32* %x.addr
+ store i32 %y, i32* %x.addr
+ call void @addrof_i32(i32* %x.addr)
+ ret void
+}
+
+; CHECK-LABEL: _two_args_same_alloca:
+; CHECK: movl 8(%esp), {{.*}}
+; CHECK: movl {{.*}}, 4(%esp)
+; CHECK: leal 4(%esp), %[[reg:[^ ]*]]
+; CHECK: pushl %[[reg]]
+; CHECK: calll _addrof_i32
+; CHECK: retl
+
+
+define void @avoid_byval(i32* byval %x) {
+entry:
+ %x.p.p = alloca i32*
+ store i32* %x, i32** %x.p.p
+ call void @addrof_i32(i32* %x)
+ ret void
+}
+
+; CHECK-LABEL: _avoid_byval:
+; CHECK: leal {{[0-9]+}}(%esp), %[[reg:[^ ]*]]
+; CHECK: pushl %[[reg]]
+; CHECK: calll _addrof_i32
+; CHECK: retl
+
+
+define void @avoid_inalloca(i32* inalloca %x) {
+entry:
+ %x.p.p = alloca i32*
+ store i32* %x, i32** %x.p.p
+ call void @addrof_i32(i32* %x)
+ ret void
+}
+
+; CHECK-LABEL: _avoid_inalloca:
+; CHECK: leal {{[0-9]+}}(%esp), %[[reg:[^ ]*]]
+; CHECK: pushl %[[reg]]
+; CHECK: calll _addrof_i32
+; CHECK: retl
+
+
+; Don't elide the copy when the alloca is escaped with a store.
+
+define void @escape_with_store(i32 %x) {
+ %x1 = alloca i32
+ %x2 = alloca i32*
+ store i32* %x1, i32** %x2
+ %x3 = load i32*, i32** %x2
+ store i32 0, i32* %x3
+ store i32 %x, i32* %x1
+ call void @addrof_i32(i32* %x1)
+ ret void
+}
+
+; CHECK-LABEL: _escape_with_store:
+; CHECK-DAG: movl {{.*}}(%esp), %[[reg:[^ ]*]]
+; CHECK-DAG: movl $0, [[offs:[0-9]*]](%esp)
+; CHECK: movl %[[reg]], [[offs]](%esp)
+; CHECK: calll _addrof_i32
+
+
+; This test case exposed issues with the use of TokenFactor.
+
+define void @sret_and_elide(i32* sret %sret, i32 %v) {
+ %v.p = alloca i32
+ store i32 %v, i32* %v.p
+ call void @addrof_i32(i32* %v.p)
+ store i32 %v, i32* %sret
+ ret void
+}
+
+; CHECK-LABEL: _sret_and_elide:
+; CHECK: pushl
+; CHECK: pushl
+; CHECK: movl 12(%esp), %[[sret:[^ ]*]]
+; CHECK: movl 16(%esp), %[[v:[^ ]*]]
+; CHECK: leal 16(%esp), %[[reg:[^ ]*]]
+; CHECK: pushl %[[reg]]
+; CHECK: calll _addrof_i32
+; CHECK: movl %[[v]], (%[[sret]])
+; CHECK: movl %[[sret]], %eax
+; CHECK: popl
+; CHECK: popl
+; CHECK: retl
diff --git a/llvm/test/CodeGen/X86/inline-asm-tied.ll b/llvm/test/CodeGen/X86/inline-asm-tied.ll
index 25853579a4b..db63a804883 100644
--- a/llvm/test/CodeGen/X86/inline-asm-tied.ll
+++ b/llvm/test/CodeGen/X86/inline-asm-tied.ll
@@ -1,31 +1,27 @@
; RUN: llc < %s -mtriple=i386-apple-darwin9 -O0 -optimize-regalloc -regalloc=basic -no-integrated-as | FileCheck %s
; rdar://6992609
-; CHECK: movl %ecx, 4([[ESP:%e..]])
-; CHECK: movl 4([[ESP]]), [[EDX:%e..]]
-; CHECK: movl [[EDX]], 4([[ESP]])
target triple = "i386-apple-darwin9.0"
-@llvm.used = appending global [1 x i8*] [i8* bitcast (i64 (i64)* @_OSSwapInt64 to i8*)], section "llvm.metadata" ; <[1 x i8*]*> [#uses=0]
define i64 @_OSSwapInt64(i64 %_data) nounwind {
entry:
- %retval = alloca i64 ; <i64*> [#uses=2]
- %_data.addr = alloca i64 ; <i64*> [#uses=4]
- store i64 %_data, i64* %_data.addr
- %tmp = load i64, i64* %_data.addr ; <i64> [#uses=1]
- %0 = call i64 asm "bswap %eax\0A\09bswap %edx\0A\09xchgl %eax, %edx", "=A,0,~{dirflag},~{fpsr},~{flags}"(i64 %tmp) nounwind ; <i64> [#uses=1]
- store i64 %0, i64* %_data.addr
- %tmp1 = load i64, i64* %_data.addr ; <i64> [#uses=1]
- store i64 %tmp1, i64* %retval
- %1 = load i64, i64* %retval ; <i64> [#uses=1]
- ret i64 %1
+ %0 = call i64 asm "bswap %eax\0A\09bswap %edx\0A\09xchgl %eax, %%edx", "=A,0,~{dirflag},~{fpsr},~{flags}"(i64 %_data) nounwind
+ ret i64 %0
}
+; CHECK-LABEL: __OSSwapInt64:
+; CHECK-DAG: movl 8(%esp), %edx
+; CHECK-DAG: movl 4(%esp), %eax
+; CHECK: ## InlineAsm Start
+; CHECK: ## InlineAsm End
+; Everything is set up in EAX:EDX, return immediately.
+; CHECK-NEXT: retl
+
; The tied operands are not necessarily in the same order as the defs.
; PR13742
define i64 @swapped(i64 %x, i64 %y) nounwind {
entry:
- %x0 = call { i64, i64 } asm "foo", "=r,=r,1,0,~{dirflag},~{fpsr},~{flags}"(i64 %x, i64 %y) nounwind
- %x1 = extractvalue { i64, i64 } %x0, 0
- ret i64 %x1
+ %x0 = call { i64, i64 } asm "foo", "=r,=r,1,0,~{dirflag},~{fpsr},~{flags}"(i64 %x, i64 %y) nounwind
+ %x1 = extractvalue { i64, i64 } %x0, 0
+ ret i64 %x1
}
diff --git a/llvm/test/CodeGen/X86/pr30430.ll b/llvm/test/CodeGen/X86/pr30430.ll
index 6aa4c91c4a8..14d81f14fc3 100644
--- a/llvm/test/CodeGen/X86/pr30430.ll
+++ b/llvm/test/CodeGen/X86/pr30430.ll
@@ -30,14 +30,6 @@ define <16 x float> @makefloat(float %f1, float %f2, float %f3, float %f4, float
; CHECK-NEXT: vmovss %xmm5, {{[0-9]+}}(%rsp)
; CHECK-NEXT: vmovss %xmm6, {{[0-9]+}}(%rsp)
; CHECK-NEXT: vmovss %xmm7, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: vmovss %xmm15, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: vmovss %xmm14, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: vmovss %xmm13, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: vmovss %xmm12, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: vmovss %xmm11, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: vmovss %xmm10, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: vmovss %xmm9, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: vmovss %xmm8, (%rsp)
; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
@@ -46,14 +38,14 @@ define <16 x float> @makefloat(float %f1, float %f2, float %f3, float %f4, float
; CHECK-NEXT: vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero
; CHECK-NEXT: vmovss {{.*#+}} xmm6 = mem[0],zero,zero,zero
; CHECK-NEXT: vmovss {{.*#+}} xmm7 = mem[0],zero,zero,zero
-; CHECK-NEXT: vmovss {{.*#+}} xmm8 = mem[0],zero,zero,zero
-; CHECK-NEXT: vmovss {{.*#+}} xmm9 = mem[0],zero,zero,zero
-; CHECK-NEXT: vmovss {{.*#+}} xmm10 = mem[0],zero,zero,zero
-; CHECK-NEXT: vmovss {{.*#+}} xmm11 = mem[0],zero,zero,zero
-; CHECK-NEXT: vmovss {{.*#+}} xmm12 = mem[0],zero,zero,zero
-; CHECK-NEXT: vmovss {{.*#+}} xmm13 = mem[0],zero,zero,zero
-; CHECK-NEXT: vmovss {{.*#+}} xmm14 = mem[0],zero,zero,zero
-; CHECK-NEXT: vmovss {{.*#+}} xmm15 = mem[0],zero,zero,zero
+; CHECK-NEXT: vmovss {{.*#+}} xmm16 = mem[0],zero,zero,zero
+; CHECK-NEXT: vmovss {{.*#+}} xmm17 = mem[0],zero,zero,zero
+; CHECK-NEXT: vmovss {{.*#+}} xmm18 = mem[0],zero,zero,zero
+; CHECK-NEXT: vmovss {{.*#+}} xmm19 = mem[0],zero,zero,zero
+; CHECK-NEXT: vmovss {{.*#+}} xmm20 = mem[0],zero,zero,zero
+; CHECK-NEXT: vmovss {{.*#+}} xmm21 = mem[0],zero,zero,zero
+; CHECK-NEXT: vmovss {{.*#+}} xmm22 = mem[0],zero,zero,zero
+; CHECK-NEXT: vmovss {{.*#+}} xmm23 = mem[0],zero,zero,zero
; CHECK-NEXT: vmovss %xmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT: vmovss %xmm1, {{[0-9]+}}(%rsp)
; CHECK-NEXT: vmovss %xmm2, {{[0-9]+}}(%rsp)
@@ -62,14 +54,14 @@ define <16 x float> @makefloat(float %f1, float %f2, float %f3, float %f4, float
; CHECK-NEXT: vmovss %xmm5, {{[0-9]+}}(%rsp)
; CHECK-NEXT: vmovss %xmm6, {{[0-9]+}}(%rsp)
; CHECK-NEXT: vmovss %xmm7, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: vmovss %xmm8, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: vmovss %xmm9, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: vmovss %xmm10, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: vmovss %xmm11, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: vmovss %xmm12, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: vmovss %xmm13, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: vmovss %xmm14, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: vmovss %xmm15, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: vmovss %xmm16, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: vmovss %xmm17, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: vmovss %xmm18, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: vmovss %xmm19, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: vmovss %xmm20, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: vmovss %xmm21, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: vmovss %xmm22, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: vmovss %xmm23, {{[0-9]+}}(%rsp)
; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
@@ -104,11 +96,19 @@ define <16 x float> @makefloat(float %f1, float %f2, float %f3, float %f4, float
; CHECK-NEXT: # implicit-def: %YMM3
; CHECK-NEXT: vmovaps %xmm1, %xmm3
; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm3
-; CHECK-NEXT: # implicit-def: %ZMM16
-; CHECK-NEXT: vmovaps %zmm3, %zmm16
-; CHECK-NEXT: vinsertf64x4 $1, %ymm2, %zmm16, %zmm16
-; CHECK-NEXT: vmovaps %zmm16, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: # implicit-def: %ZMM24
+; CHECK-NEXT: vmovaps %zmm3, %zmm24
+; CHECK-NEXT: vinsertf64x4 $1, %ymm2, %zmm24, %zmm24
+; CHECK-NEXT: vmovaps %zmm24, {{[0-9]+}}(%rsp)
; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %zmm0
+; CHECK-NEXT: vmovss %xmm15, {{[0-9]+}}(%rsp) # 4-byte Spill
+; CHECK-NEXT: vmovss %xmm8, {{[0-9]+}}(%rsp) # 4-byte Spill
+; CHECK-NEXT: vmovss %xmm9, {{[0-9]+}}(%rsp) # 4-byte Spill
+; CHECK-NEXT: vmovss %xmm10, {{[0-9]+}}(%rsp) # 4-byte Spill
+; CHECK-NEXT: vmovss %xmm11, {{[0-9]+}}(%rsp) # 4-byte Spill
+; CHECK-NEXT: vmovss %xmm12, {{[0-9]+}}(%rsp) # 4-byte Spill
+; CHECK-NEXT: vmovss %xmm13, {{[0-9]+}}(%rsp) # 4-byte Spill
+; CHECK-NEXT: vmovss %xmm14, (%rsp) # 4-byte Spill
; CHECK-NEXT: movq %rbp, %rsp
; CHECK-NEXT: popq %rbp
; CHECK-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
index 18434546262..7f040dd1a7f 100644
--- a/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
@@ -1653,12 +1653,8 @@ define <4 x float> @test_mm_set1_ps(float %a0) nounwind {
define void @test_mm_setcsr(i32 %a0) nounwind {
; X32-LABEL: test_mm_setcsr:
; X32: # BB#0:
-; X32-NEXT: pushl %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %esp, %ecx
-; X32-NEXT: movl %eax, (%esp)
-; X32-NEXT: ldmxcsr (%ecx)
-; X32-NEXT: popl %eax
+; X32-NEXT: leal 4(%esp), %eax
+; X32-NEXT: ldmxcsr (%eax)
; X32-NEXT: retl
;
; X64-LABEL: test_mm_setcsr: